add part of opencv

2020-01-27 20:20:56 +08:00
parent 0c4ac1d8bb
commit a71fa47620
6518 changed files with 3122580 additions and 0 deletions
--- a/Lib/opencv/sources/modules/dnn/src/caffe/caffe_importer.cpp
+++ b/Lib/opencv/sources/modules/dnn/src/caffe/caffe_importer.cpp
@@ -0,0 +1,534 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../precomp.hpp"
+
+#ifdef HAVE_PROTOBUF
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+#include <google/protobuf/message.h>
+#include <google/protobuf/text_format.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include "caffe_io.hpp"
+#endif
+
+namespace cv {
+namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+#ifdef HAVE_PROTOBUF
+using ::google::protobuf::RepeatedField;
+using ::google::protobuf::RepeatedPtrField;
+using ::google::protobuf::Message;
+using ::google::protobuf::Descriptor;
+using ::google::protobuf::FieldDescriptor;
+using ::google::protobuf::Reflection;
+
+namespace
+{
+
+// Formats any stream-insertable value as text using operator<<.
+template<typename T>
+static cv::String toString(const T &v)
+{
+    std::ostringstream formatter;
+    formatter << v;
+    const std::string text = formatter.str();
+    return text;
+}
+
+class CaffeImporter
+{
+    caffe::NetParameter net;
+    caffe::NetParameter netBinary;
+
+public:
+
+    // Builds an importer from files: the text network description is always
+    // read into `net`; the binary model is read into `netBinary` only when a
+    // non-empty path is supplied.
+    CaffeImporter(const char *pototxt, const char *caffeModel)
+    {
+        CV_TRACE_FUNCTION();
+
+        ReadNetParamsFromTextFileOrDie(pototxt, &net);
+
+        const bool haveModelPath = (caffeModel != NULL) && (caffeModel[0] != '\0');
+        if (haveModelPath)
+        {
+            ReadNetParamsFromBinaryFileOrDie(caffeModel, &netBinary);
+        }
+    }
+
+    // Builds an importer from in-memory buffers: the text description is always
+    // parsed into `net`; the binary model is parsed into `netBinary` only when
+    // a non-null, non-empty buffer is supplied.
+    CaffeImporter(const char *dataProto, size_t lenProto,
+                  const char *dataModel, size_t lenModel)
+    {
+        CV_TRACE_FUNCTION();
+
+        ReadNetParamsFromTextBufferOrDie(dataProto, lenProto, &net);
+
+        const bool haveModelBuffer = (dataModel != NULL) && (lenModel > 0);
+        if (haveModelBuffer)
+        {
+            ReadNetParamsFromBinaryBufferOrDie(dataModel, lenModel, &netBinary);
+        }
+    }
+
+    // Copies custom (unknown to the .proto schema) parameters into `params`.
+    //
+    // Every entry of `unknownFields` must be a group whose first two entries are
+    // length-delimited strings: entry 0 is used as the parameter name and
+    // entry 1 as its value. Anything else is rejected with an assertion instead
+    // of being read out of range.
+    void extractCustomParams(const google::protobuf::UnknownFieldSet& unknownFields, cv::dnn::LayerParams &params)
+    {
+        const int numFields = unknownFields.field_count();
+        for (int i = 0; i < numFields; ++i)
+        {
+            const google::protobuf::UnknownField& field = unknownFields.field(i);
+            CV_Assert(field.type() == google::protobuf::UnknownField::TYPE_GROUP);
+
+            const google::protobuf::UnknownFieldSet& group = field.group();
+            // field(0)/field(1) below are unchecked accessors: validate first.
+            CV_Assert(group.field_count() >= 2);
+            CV_Assert(group.field(0).type() == google::protobuf::UnknownField::TYPE_LENGTH_DELIMITED);
+            CV_Assert(group.field(1).type() == google::protobuf::UnknownField::TYPE_LENGTH_DELIMITED);
+
+            std::string fieldName = group.field(0).length_delimited();
+            std::string fieldValue = group.field(1).length_delimited();
+            params.set(fieldName, fieldValue);
+        }
+    }
+
+    // Copies one non-message protobuf field of `msg` into `params`, stored under
+    // the field's own name.
+    //
+    // Integer and bool fields become integer values, float/double fields become
+    // real values, strings stay strings and enums are stored by their symbolic
+    // name. Repeated fields are stored as arrays. Any other C++ field type
+    // raises Error::StsError.
+    void addParam(const Message &msg, const FieldDescriptor *field, cv::dnn::LayerParams &params)
+    {
+        const Reflection *refl = msg.GetReflection();
+        int type = field->cpp_type();
+        bool isRepeated = field->is_repeated();
+        const std::string &name = field->name();
+
+        #define SET_UP_FILED(getter, arrayConstr, gtype)                                    \
+            if (isRepeated) {                                                               \
+                const RepeatedField<gtype> &v = refl->GetRepeatedField<gtype>(msg, field);  \
+                params.set(name, DictValue::arrayConstr(v.begin(), (int)v.size()));                  \
+            }                                                                               \
+            else {                                                                          \
+                params.set(name, refl->getter(msg, field));                               \
+            }
+
+        // 64-bit integers: the singular getter must match the field's C++ type
+        // (GetInt64/GetUInt64), and the result is cast to int64 explicitly so
+        // that exactly one DictValue constructor is selected on every platform.
+        // uint64 values above the int64 range wrap around in that cast.
+        #define SET_UP_FILED_64(getter, gtype)                                              \
+            if (isRepeated) {                                                               \
+                const RepeatedField<gtype> &v = refl->GetRepeatedField<gtype>(msg, field);  \
+                params.set(name, DictValue::arrayInt(v.begin(), (int)v.size()));            \
+            }                                                                               \
+            else {                                                                          \
+                params.set(name, static_cast<int64>(refl->getter(msg, field)));             \
+            }
+
+        switch (type)
+        {
+        case FieldDescriptor::CPPTYPE_INT32:
+            SET_UP_FILED(GetInt32, arrayInt, ::google::protobuf::int32);
+            break;
+        case FieldDescriptor::CPPTYPE_UINT32:
+            SET_UP_FILED(GetUInt32, arrayInt, ::google::protobuf::uint32);
+            break;
+        case FieldDescriptor::CPPTYPE_INT64:
+            SET_UP_FILED_64(GetInt64, ::google::protobuf::int64);
+            break;
+        case FieldDescriptor::CPPTYPE_UINT64:
+            SET_UP_FILED_64(GetUInt64, ::google::protobuf::uint64);
+            break;
+        case FieldDescriptor::CPPTYPE_BOOL:
+            SET_UP_FILED(GetBool, arrayInt, bool);
+            break;
+        case FieldDescriptor::CPPTYPE_DOUBLE:
+            SET_UP_FILED(GetDouble, arrayReal, double);
+            break;
+        case FieldDescriptor::CPPTYPE_FLOAT:
+            SET_UP_FILED(GetFloat, arrayReal, float);
+            break;
+        case FieldDescriptor::CPPTYPE_STRING:
+            if (isRepeated) {
+                const RepeatedPtrField<std::string> &v = refl->GetRepeatedPtrField<std::string>(msg, field);
+                params.set(name, DictValue::arrayString(v.begin(), (int)v.size()));
+            }
+            else {
+                params.set(name, refl->GetString(msg, field));
+            }
+            break;
+        case FieldDescriptor::CPPTYPE_ENUM:
+            if (isRepeated) {
+                // Enums are stored by symbolic name, one string per element.
+                int size = refl->FieldSize(msg, field);
+                std::vector<cv::String> buf(size);
+                for (int i = 0; i < size; i++)
+                    buf[i] = refl->GetRepeatedEnum(msg, field, i)->name();
+                params.set(name, DictValue::arrayString(buf.begin(), size));
+            }
+            else {
+                params.set(name, refl->GetEnum(msg, field)->name());
+            }
+            break;
+        default:
+            CV_Error(Error::StsError, "Unknown type \"" + String(field->type_name()) + "\" in prototxt");
+            break;
+        }
+
+        #undef SET_UP_FILED_64
+        #undef SET_UP_FILED
+    }
+
+    // Returns true when `str` ends with the literal suffix "_param".
+    inline static bool ends_with_param(const std::string &str)
+    {
+        static const char suffix[] = "_param";
+        const size_t suffixLen = sizeof(suffix) - 1;  // exclude the terminating NUL
+
+        if (str.size() < suffixLen)
+            return false;
+
+        const size_t tailPos = str.size() - suffixLen;
+        return str.compare(tailPos, suffixLen, suffix) == 0;
+    }
+
+    // Recursively copies the parameters stored in protobuf message `msg` into
+    // `params`.
+    //
+    // At the top level (isInternal == false) only fields whose name ends with
+    // "_param" are visited; nested messages are visited with isInternal == true
+    // so that all of their fields are considered. A field is processed when it
+    // is required, is an optional field that is set, is a non-empty repeated
+    // field, or when `msg` carries unknown (custom) fields. For repeated message
+    // fields only the first element is extracted.
+    void extractLayerParams(const Message &msg, cv::dnn::LayerParams &params, bool isInternal = false)
+    {
+        const Descriptor *msgDesc = msg.GetDescriptor();
+        const Reflection *msgRefl = msg.GetReflection();
+        // The unknown-field set belongs to `msg`, not to a particular field,
+        // so it is looked up once for the whole loop.
+        const google::protobuf::UnknownFieldSet& unknownFields = msgRefl->GetUnknownFields(msg);
+
+        for (int fieldId = 0; fieldId < msgDesc->field_count(); fieldId++)
+        {
+            const FieldDescriptor *fd = msgDesc->field(fieldId);
+
+            if (!isInternal && !ends_with_param(fd->name()))
+                continue;
+
+            bool hasData =  fd->is_required() ||
+                            (fd->is_optional() && msgRefl->HasField(msg, fd)) ||
+                            (fd->is_repeated() && msgRefl->FieldSize(msg, fd) > 0) ||
+                            !unknownFields.empty();
+            if (!hasData)
+                continue;
+
+            extractCustomParams(unknownFields, params);
+            if (fd->cpp_type() == FieldDescriptor::CPPTYPE_MESSAGE)
+            {
+                if (fd->is_repeated()) //Extract only first item!
+                {
+                    // hasData may be true only because of unknown fields, in
+                    // which case the repeated field itself can still be empty:
+                    // element 0 must not be accessed then.
+                    if (msgRefl->FieldSize(msg, fd) > 0)
+                        extractLayerParams(msgRefl->GetRepeatedMessage(msg, fd, 0), params, true);
+                }
+                else
+                    extractLayerParams(msgRefl->GetMessage(msg, fd), params, true);
+            }
+            else
+            {
+                addParam(msg, fd, params);
+            }
+        }
+    }
+
+    // Fills `shape` with the dimensions described by `pbBlob`.
+    //
+    // Precedence: the legacy num/channels/height/width fields (if any of them is
+    // set, all four are used), then the `shape` message, otherwise the blob is
+    // treated as a scalar and gets the shape {1}.
+    void blobShapeFromProto(const caffe::BlobProto &pbBlob, MatShape& shape)
+    {
+        shape.clear();
+
+        const bool hasLegacyDims = pbBlob.has_num() || pbBlob.has_channels() ||
+                                   pbBlob.has_height() || pbBlob.has_width();
+        if (hasLegacyDims)
+        {
+            shape.push_back(pbBlob.num());
+            shape.push_back(pbBlob.channels());
+            shape.push_back(pbBlob.height());
+            shape.push_back(pbBlob.width());
+            return;
+        }
+
+        if (!pbBlob.has_shape())
+        {
+            shape.push_back(1);  // Is a scalar.
+            return;
+        }
+
+        const caffe::BlobShape &dims = pbBlob.shape();
+        const int numDims = dims.dim_size();
+        for (int d = 0; d < numDims; ++d)
+        {
+            shape.push_back((int)dims.dim(d));
+        }
+    }
+
+    // Converts a Caffe BlobProto into a CV_32F cv::Mat.
+    //
+    // dstBlob is (re)allocated with the shape reported by blobShapeFromProto().
+    // The values come either from the repeated `data` field or, when that field
+    // is empty, from `raw_data`, which must then be present and hold FLOAT16 or
+    // FLOAT elements. Any other raw data type raises Error::StsNotImplemented.
+    void blobFromProto(const caffe::BlobProto &pbBlob, cv::Mat &dstBlob)
+    {
+        MatShape shape;
+        blobShapeFromProto(pbBlob, shape);
+
+        dstBlob.create((int)shape.size(), &shape[0], CV_32F);
+        if (pbBlob.data_size())
+        {
+            // Single precision floats.
+            CV_Assert(pbBlob.data_size() == (int)dstBlob.total());
+
+            CV_DbgAssert(pbBlob.GetDescriptor()->FindFieldByLowercaseName("data")->cpp_type() == FieldDescriptor::CPPTYPE_FLOAT);
+            // Wrap the protobuf-owned float array in a Mat header (no copy),
+            // then copy its contents into dstBlob.
+            Mat(dstBlob.dims, &dstBlob.size[0], CV_32F, (void*)pbBlob.data().data()).copyTo(dstBlob);
+        }
+        else
+        {
+            CV_Assert(pbBlob.has_raw_data());
+            const std::string& raw_data = pbBlob.raw_data();
+            if (pbBlob.raw_data_type() == caffe::FLOAT16)
+            {
+                // Half precision floats.
+                // Two bytes per element are expected.
+                CV_Assert(raw_data.size() / 2 == (int)dstBlob.total());
+
+                // View the raw bytes as 16-bit elements and expand them to
+                // float32 directly into dstBlob.
+                Mat halfs((int)shape.size(), &shape[0], CV_16SC1, (void*)raw_data.c_str());
+                convertFp16(halfs, dstBlob);
+            }
+            else if (pbBlob.raw_data_type() == caffe::FLOAT)
+            {
+                // Four bytes per element are expected.
+                CV_Assert(raw_data.size() / 4 == (int)dstBlob.total());
+                Mat((int)shape.size(), &shape[0], CV_32FC1, (void*)raw_data.c_str()).copyTo(dstBlob);
+            }
+            else
+                CV_Error(Error::StsNotImplemented, "Unexpected blob data type");
+        }
+    }
+
+    // Loads the learned blobs of `layer` from the binary model into
+    // layerParams.blobs.
+    //
+    // The binary net is searched for the first layer that has the same name and
+    // still owns blobs. If there is none, layerParams is left untouched. Once
+    // converted, the blobs are removed from the binary net and their protobuf
+    // objects are deleted.
+    void extractBinaryLayerParams(const caffe::LayerParameter& layer, LayerParams& layerParams)
+    {
+        const std::string &name = layer.name();
+
+        int li;
+        for (li = 0; li != netBinary.layer_size(); li++)
+        {
+            const caffe::LayerParameter& binLayer = netBinary.layer(li);
+            // Break if the layer name is the same and the blobs are not cleared
+            if (binLayer.name() == name && binLayer.blobs_size() != 0)
+                break;
+        }
+
+        // No matching layer with blobs in the binary model.
+        if (li == netBinary.layer_size())
+            return;
+
+        caffe::LayerParameter* binLayer = netBinary.mutable_layer(li);
+        const int numBlobs = binLayer->blobs_size();
+        layerParams.blobs.resize(numBlobs);
+        for (int bi = 0; bi < numBlobs; bi++)
+        {
+            blobFromProto(binLayer->blobs(bi), layerParams.blobs[bi]);
+        }
+        // clear_blobs() empties the repeated field; the assertion below checks
+        // that the cleared objects are still held by the field, and the loop
+        // then takes each one back and deletes it.
+        binLayer->clear_blobs();
+        CV_Assert(numBlobs == binLayer->blobs().ClearedCount());
+        for (int bi = 0; bi < numBlobs; bi++)
+        {
+            delete binLayer->mutable_blobs()->ReleaseCleared();
+        }
+    }
+
+    // Records which layer output produced a named blob.
+    struct BlobNote
+    {
+        std::string name;  // blob name
+        int layerId;       // id of the producing layer
+        int outNum;        // output index within the producing layer
+
+        BlobNote(const std::string &blobName, int producerId, int producerOutput)
+            : name(blobName), layerId(producerId), outNum(producerOutput)
+        {
+        }
+    };
+
+    std::vector<BlobNote> addedBlobs;
+    std::map<String, int> layerCounter;
+
+    // Builds `dstNet` from the parsed Caffe description.
+    //
+    // Steps, in order:
+    //  1. register the net-level `input` names as outputs of layer 0;
+    //  2. for every layer: extract its parameters and binary blobs, make its
+    //     name unique, apply the type-specific rewrites ("Input", "BatchNorm"
+    //     without global stats, "Axpy", "ConvolutionDepthwise"), add it to
+    //     dstNet and wire its bottoms/tops;
+    //  3. publish the input names and, when the prototxt declares input shapes,
+    //     set an input Mat of that shape for each declared shape.
+    //
+    // Malformed descriptions are reported through CV_Assert / CV_Error.
+    void populateNet(Net dstNet)
+    {
+        CV_TRACE_FUNCTION();
+
+        int layersSize = net.layer_size();
+        layerCounter.clear();
+        addedBlobs.clear();
+        addedBlobs.reserve(layersSize + 1);
+
+        //setup input layer names
+        std::vector<String> netInputs(net.input_size());
+        {
+            for (int inNum = 0; inNum < net.input_size(); inNum++)
+            {
+                addedBlobs.push_back(BlobNote(net.input(inNum), 0, inNum));
+                netInputs[inNum] = net.input(inNum);
+            }
+        }
+
+        for (int li = 0; li < layersSize; li++)
+        {
+            const caffe::LayerParameter &layer = net.layer(li);
+            String name = layer.name();
+            String type = layer.type();
+            LayerParams layerParams;
+
+            extractLayerParams(layer, layerParams);
+            extractBinaryLayerParams(layer, layerParams);
+
+            // Layers sharing a name get a "_<n>" suffix from the second one on.
+            int repetitions = layerCounter[name]++;
+            if (repetitions)
+                name += String("_") + toString(repetitions);
+
+            if (type == "Input")
+            {
+                // Input layers are not added to dstNet: each top becomes a
+                // network input produced by layer 0.
+                for (int outNum = 0; outNum < layer.top_size(); outNum++)
+                {
+                    addOutput(layer, 0, outNum);
+                    addedBlobs.back().outNum = (int)netInputs.size();
+                    netInputs.push_back(addedBlobs.back().name);
+                }
+                continue;
+            }
+            else if (type == "BatchNorm")
+            {
+                if (!layerParams.get<bool>("use_global_stats", true))
+                {
+                    CV_Assert_N(layer.bottom_size() == 1, layer.top_size() == 1);
+                    // The mean and std blobs are overwritten below; without
+                    // them (e.g. no binary model was given) indexing would be
+                    // out of range.
+                    CV_Assert(layerParams.blobs.size() >= 2);
+
+                    LayerParams mvnParams;
+                    mvnParams.set("eps", layerParams.get<float>("eps", 1e-5));
+                    std::string mvnName = name + "/mvn";
+
+                    int mvnRepetitions = layerCounter[mvnName]++;
+                    if (mvnRepetitions)
+                        mvnName += String("_") + toString(mvnRepetitions);
+
+                    // Insert an MVN layer in front and feed the BatchNorm
+                    // layer from its output.
+                    int mvnId = dstNet.addLayer(mvnName, "MVN", mvnParams);
+                    addInput(layer.bottom(0), mvnId, 0, dstNet);
+                    addOutput(layer, mvnId, 0);
+                    net.mutable_layer(li)->set_bottom(0, layer.top(0));
+                    layerParams.blobs[0].setTo(0);  // mean
+                    layerParams.blobs[1].setTo(1);  // std
+                }
+            }
+            else if (type == "Axpy")
+            {
+                CV_Assert_N(layer.bottom_size() == 3, layer.top_size() == 1);
+
+                std::string scaleName = name + "/scale";
+                int scaleRepetitions = layerCounter[scaleName]++;
+                if (scaleRepetitions) {
+                    scaleName += String("_") + toString(scaleRepetitions);
+                }
+
+                // Axpy is emitted as a Scale layer (bottom 2 scaled by
+                // bottom 0) followed by an Eltwise layer over the Scale output
+                // and the remaining bottom.
+                LayerParams scaleParams;
+                scaleParams.set("axis", 1);
+                scaleParams.set("has_bias", false);
+                int scaleId = dstNet.addLayer(scaleName, "Scale", scaleParams);
+                addInput(layer.bottom(2), scaleId, 0, dstNet);
+                addInput(layer.bottom(0), scaleId, 1, dstNet);
+                addOutput(layer, scaleId, 0);
+                net.mutable_layer(li)->set_bottom(0, layer.top(0));
+                net.mutable_layer(li)->mutable_bottom()->RemoveLast();
+                type = "Eltwise";
+            }
+            else if ("ConvolutionDepthwise" == type)
+            {
+                type = "Convolution";
+            }
+
+            int id = dstNet.addLayer(name, type, layerParams);
+
+            for (int inNum = 0; inNum < layer.bottom_size(); inNum++)
+                addInput(layer.bottom(inNum), id, inNum, dstNet);
+
+            for (int outNum = 0; outNum < layer.top_size(); outNum++)
+                addOutput(layer, id, outNum);
+        }
+        dstNet.setInputsNames(netInputs);
+
+        // Collect the declared input shapes: net-level `input_shape` first,
+        // then the first layer's input_param shapes, then legacy `input_dim`.
+        std::vector<MatShape> inp_shapes;
+        if (net.input_shape_size() > 0 || (layersSize > 0 && net.layer(0).has_input_param() &&
+            net.layer(0).input_param().shape_size() > 0)) {
+
+            int size = (net.input_shape_size() > 0) ? net.input_shape_size() :
+                                                      net.layer(0).input_param().shape_size();
+            for (int inp_id = 0; inp_id < size; inp_id++)
+            {
+                const caffe::BlobShape &_input_shape = (net.input_shape_size() > 0) ?
+                                                        net.input_shape(inp_id) :
+                                                        net.layer(0).input_param().shape(inp_id);
+                MatShape shape;
+                for (int i = 0; i < _input_shape.dim_size(); i++) {
+                    shape.push_back((int)_input_shape.dim(i));
+                }
+                inp_shapes.push_back(shape);
+            }
+        }
+        else if (net.input_dim_size() > 0) {
+            MatShape shape;
+            for (int dim = 0; dim < net.input_dim_size(); dim++) {
+                shape.push_back(net.input_dim(dim));
+            }
+            inp_shapes.push_back(shape);
+        }
+
+        // Every declared shape is paired with the input of the same index, so
+        // there must be at least as many inputs as shapes.
+        CV_Assert(inp_shapes.size() <= netInputs.size());
+        for (size_t inp_id = 0; inp_id < inp_shapes.size(); inp_id++) {
+            dstNet.setInput(Mat(inp_shapes[inp_id], CV_32F), netInputs[inp_id]);
+        }
+
+        addedBlobs.clear();
+    }
+
+    // Registers top `outNum` of `layer` as output `outNum` of layer `layerId`.
+    //
+    // A blob name that has already been registered is accepted only when the
+    // layer works in place, i.e. its bottom with the same index carries the
+    // same name. Otherwise Error::StsBadArg is raised.
+    void addOutput(const caffe::LayerParameter &layer, int layerId, int outNum)
+    {
+        const std::string &name = layer.top(outNum);
+
+        // Walk back from the newest note until a blob with this name is found.
+        size_t remaining = addedBlobs.size();
+        while (remaining > 0 && !(addedBlobs[remaining - 1].name == name))
+            --remaining;
+        const bool alreadyProduced = (remaining > 0);
+
+        if (alreadyProduced)
+        {
+            const bool inPlace = (layer.bottom_size() > outNum) && (layer.bottom(outNum) == name);
+            if (!inPlace)
+                CV_Error(Error::StsBadArg, "Duplicate blobs produced by multiple sources");
+        }
+
+        addedBlobs.push_back(BlobNote(name, layerId, outNum));
+    }
+
+    // Connects the most recently registered producer of blob `name` to input
+    // `inNum` of layer `layerId` in `dstNet`. Raises Error::StsObjectNotFound
+    // when no blob with that name has been registered.
+    void addInput(const std::string &name, int layerId, int inNum, Net &dstNet)
+    {
+        // Search newest-first so that in-place layers see the latest producer.
+        int producer = (int)addedBlobs.size();
+        while (--producer >= 0)
+        {
+            if (addedBlobs[producer].name == name)
+                break;
+        }
+
+        if (producer < 0)
+        {
+            CV_Error(Error::StsObjectNotFound, "Can't find output blob \"" + name + "\"");
+            return;
+        }
+
+        const BlobNote &source = addedBlobs[producer];
+        dstNet.connect(source.layerId, source.outNum, layerId, inNum);
+    }
+};
+
+}
+
+// Creates a network from a Caffe text description file and an optional binary
+// model file (an empty `caffeModel` path means "no weights").
+Net readNetFromCaffe(const String &prototxt, const String &caffeModel /*= String()*/)
+{
+    const char* const protoPath = prototxt.c_str();
+    const char* const modelPath = caffeModel.c_str();
+
+    CaffeImporter importer(protoPath, modelPath);
+    Net result;
+    importer.populateNet(result);
+    return result;
+}
+
+// Creates a network from in-memory Caffe data: a text description buffer and
+// an optional binary model buffer.
+Net readNetFromCaffe(const char *bufferProto, size_t lenProto,
+                     const char *bufferModel, size_t lenModel)
+{
+    CaffeImporter importer(bufferProto, lenProto, bufferModel, lenModel);
+    Net result;
+    importer.populateNet(result);
+    return result;
+}
+
+// Creates a network from in-memory Caffe data held in byte vectors.
+//
+// bufferProto holds the text description, bufferModel the optional binary
+// model. Neither vector is indexed when empty: an empty description is
+// forwarded as a zero-length buffer and an empty model as NULL.
+Net readNetFromCaffe(const std::vector<uchar>& bufferProto, const std::vector<uchar>& bufferModel)
+{
+    // Taking &v[0] of an empty vector is undefined; use an empty literal so
+    // the pointer stays valid while the length is 0.
+    const char* bufferProtoPtr = bufferProto.empty() ? "" :
+                                 reinterpret_cast<const char*>(&bufferProto[0]);
+    const char* bufferModelPtr = bufferModel.empty() ? NULL :
+                                 reinterpret_cast<const char*>(&bufferModel[0]);
+    return readNetFromCaffe(bufferProtoPtr, bufferProto.size(),
+                            bufferModelPtr, bufferModel.size());
+}
+
+#endif //HAVE_PROTOBUF
+
+CV__DNN_INLINE_NS_END
+}} // namespace
--- a/Lib/opencv/sources/modules/dnn/src/caffe/caffe_io.cpp
+++ b/Lib/opencv/sources/modules/dnn/src/caffe/caffe_io.cpp
--- a/Lib/opencv/sources/modules/dnn/src/caffe/caffe_io.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/caffe/caffe_io.hpp
@@ -0,0 +1,129 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//COPYRIGHT
+//
+//All contributions by the University of California:
+//Copyright (c) 2014, The Regents of the University of California (Regents)
+//All rights reserved.
+//
+//All other contributions:
+//Copyright (c) 2014, the respective contributors
+//All rights reserved.
+//
+//Caffe uses a shared copyright model: each contributor holds copyright over
+//their contributions to Caffe. The project versioning records all such
+//contribution and copyright details. If a contributor wants to further mark
+//their specific copyright on a particular contribution, they should indicate
+//their copyright solely in the commit message of the change when it is
+//committed.
+//
+//LICENSE
+//
+//Redistribution and use in source and binary forms, with or without
+//modification, are permitted provided that the following conditions are met:
+//
+//1. Redistributions of source code must retain the above copyright notice, this
+//   list of conditions and the following disclaimer.
+//2. Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+//THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+//ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+//WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+//DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+//ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+//(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+//LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+//ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+//(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+//SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+//CONTRIBUTION AGREEMENT
+//
+//By contributing to the BVLC/caffe repository through pull-request, comment,
+//or otherwise, the contributor releases their content to the
+//license and copyright terms herein.
+//
+//M*/
+
+#ifndef __OPENCV_DNN_CAFFE_IO_HPP__
+#define __OPENCV_DNN_CAFFE_IO_HPP__
+#ifdef HAVE_PROTOBUF
+
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wsuggest-override"
+#endif
+#include "opencv-caffe.pb.h"
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic pop
+#endif
+
+namespace caffe { using namespace opencv_caffe; } // avoid massive renames from caffe proto package
+
+namespace cv {
+namespace dnn {
+
+// Read parameters from a file into a NetParameter proto message.
+// param_file is the path of the text (prototxt) or binary (caffemodel) file;
+// the parsed network is stored in *param.
+// NOTE(review): the "OrDie" suffix suggests that failures are reported by
+// raising an error rather than through a return value - the definitions are
+// not part of this header, confirm in caffe_io.cpp.
+void ReadNetParamsFromTextFileOrDie(const char* param_file,
+                                    caffe::NetParameter* param);
+void ReadNetParamsFromBinaryFileOrDie(const char* param_file,
+                                      caffe::NetParameter* param);
+
+// Read parameters from a memory buffer into a NetParameter proto message.
+// data points to len bytes of binary or text protobuf content; the parsed
+// network is stored in *param.
+void ReadNetParamsFromBinaryBufferOrDie(const char* data, size_t len,
+                                        caffe::NetParameter* param);
+void ReadNetParamsFromTextBufferOrDie(const char* data, size_t len,
+                                      caffe::NetParameter* param);
+
+// Utility functions used internally by Caffe and TensorFlow loaders
+// They parse a file or a memory buffer into an arbitrary protobuf message.
+// NOTE(review): the bool result presumably signals whether parsing succeeded;
+// confirm against the definitions.
+bool ReadProtoFromTextFile(const char* filename, ::google::protobuf::Message* proto);
+bool ReadProtoFromBinaryFile(const char* filename, ::google::protobuf::Message* proto);
+bool ReadProtoFromTextBuffer(const char* data, size_t len, ::google::protobuf::Message* proto);
+bool ReadProtoFromBinaryBuffer(const char* data, size_t len, ::google::protobuf::Message* proto);
+
+}
+}
+#endif
+#endif
--- a/Lib/opencv/sources/modules/dnn/src/caffe/caffe_shrinker.cpp
+++ b/Lib/opencv/sources/modules/dnn/src/caffe/caffe_shrinker.cpp
@@ -0,0 +1,80 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#include "../precomp.hpp"
+
+#ifdef HAVE_PROTOBUF
+#include <fstream>
+#include "caffe_io.hpp"
+#endif
+
+namespace cv { namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+#ifdef HAVE_PROTOBUF
+
+// Converts the float32 weights of a binary Caffe model to float16 and writes
+// the result to a new file.
+//
+// src         - path of the binary model to read.
+// dst         - path of the binary model to write.
+// layersTypes - layer types whose blobs are converted; when empty,
+//               "Convolution" and "InnerProduct" are used.
+//
+// Every blob of a selected layer must hold float32 `data` (asserted). An
+// error is raised when the output file cannot be opened or written.
+void shrinkCaffeModel(const String& src, const String& dst, const std::vector<String>& layersTypes)
+{
+    CV_TRACE_FUNCTION();
+
+    std::vector<String> types(layersTypes);
+    if (types.empty())
+    {
+        types.push_back("Convolution");
+        types.push_back("InnerProduct");
+    }
+
+    caffe::NetParameter net;
+    ReadNetParamsFromBinaryFileOrDie(src.c_str(), &net);
+
+    for (int i = 0; i < net.layer_size(); ++i)
+    {
+        caffe::LayerParameter* lp = net.mutable_layer(i);
+        if (std::find(types.begin(), types.end(), lp->type()) == types.end())
+        {
+            continue;
+        }
+        for (int j = 0; j < lp->blobs_size(); ++j)
+        {
+            caffe::BlobProto* blob = lp->mutable_blobs(j);
+            CV_Assert(blob->data_size() != 0);  // float32 array.
+
+            // `floats` is a header over the protobuf-owned data (no copy), so
+            // the conversion must happen before clear_data() below.
+            Mat floats(1, blob->data_size(), CV_32FC1, (void*)blob->data().data());
+            Mat halfs(1, blob->data_size(), CV_16SC1);
+            convertFp16(floats, halfs);  // Convert to float16.
+
+            blob->clear_data();  // Clear float32 data.
+
+            // Set float16 data.
+            blob->set_raw_data(halfs.data, halfs.total() * halfs.elemSize());
+            blob->set_raw_data_type(caffe::FLOAT16);
+        }
+    }
+#if GOOGLE_PROTOBUF_VERSION < 3005000
+    size_t msgSize = saturate_cast<size_t>(net.ByteSize());
+#else
+    size_t msgSize = net.ByteSizeLong();
+#endif
+    std::vector<uint8_t> output(msgSize);
+    // &output[0] is only valid for a non-empty vector.
+    if (!output.empty())
+        net.SerializeWithCachedSizesToArray(&output[0]);
+
+    std::ofstream ofs(dst.c_str(), std::ios::binary);
+    if (!ofs.is_open())
+        CV_Error(Error::StsError, "Can't open file \"" + dst + "\" for writing");
+    if (!output.empty())
+        ofs.write((const char*)&output[0], msgSize);
+    ofs.close();
+    // close() flushes; a failed write or flush leaves the stream in a failed state.
+    if (ofs.fail())
+        CV_Error(Error::StsError, "Can't write shrunk model to file \"" + dst + "\"");
+}
+
+#else
+
+// Fallback used when OpenCV is built without protobuf (HAVE_PROTOBUF is not
+// defined): the arguments are ignored and Error::StsNotImplemented is always
+// raised.
+void shrinkCaffeModel(const String& src, const String& dst, const std::vector<String>& types)
+{
+    CV_Error(cv::Error::StsNotImplemented, "libprotobuf required to import data from Caffe models");
+}
+
+#endif  // HAVE_PROTOBUF
+
+CV__DNN_INLINE_NS_END
+}} // namespace
--- a/Lib/opencv/sources/modules/dnn/src/caffe/glog_emulator.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/caffe/glog_emulator.hpp
@@ -0,0 +1,106 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef __OPENCV_DNN_CAFFE_GLOG_EMULATOR_HPP__
+#define __OPENCV_DNN_CAFFE_GLOG_EMULATOR_HPP__
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <opencv2/core.hpp>
+
+// glog-style front-end macros built on cv::dnn::GLogWrapper. Each one expands to a `for` header followed by
+// `_logger.stream()`, so a `<< ...` written after the macro at the use site is inserted into the wrapper's stream.
+// GLogWrapper::exit() returns true only until check() has run, hence the statement executes exactly once and
+// check() then acts on the collected message.
+//   CHECK(cond)    - check() calls cv::error when `cond` evaluates to false.
+//   CHECK_EQ(a, b) - same, with the condition ((a) == (b)); the reported condition text is #a "=" #b.
+//   LOG(TYPE)      - no condition; when NDEBUG is not defined check() prints the message
+//                    (std::cout for INFO, std::cerr for any other TYPE).
+#define CHECK(cond)     for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, "CHECK", #cond, cond); _logger.exit(); _logger.check()) _logger.stream()
+#define CHECK_EQ(a, b)  for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, "CHECK", #a"="#b, ((a) == (b))); _logger.exit(); _logger.check()) _logger.stream()
+#define LOG(TYPE)       for(cv::dnn::GLogWrapper _logger(__FILE__, CV_Func, __LINE__, #TYPE); _logger.exit(); _logger.check()) _logger.stream()
+
+namespace cv
+{
+namespace dnn
+{
+
+/**
+ * Small helper behind the CHECK / CHECK_EQ / LOG macros.
+ *
+ * An instance records the call site (file, function, line), the macro kind (`type`) and, for
+ * checks, the stringified condition together with its evaluated result. The message streamed by
+ * the caller is accumulated in `sstream`; check() then either raises an OpenCV error (failed
+ * check) or prints the message (LOG, only when NDEBUG is not defined).
+ */
+class GLogWrapper
+{
+    const char *file, *func, *type, *cond_str;
+    int line;
+    bool cond_status, exit_loop;
+    std::stringstream sstream;
+
+public:
+
+    /**
+     * @param _file, _func, _line  call-site information forwarded to cv::error.
+     * @param _type                macro kind: "CHECK" or the stringified LOG severity.
+     * @param _cond_str            text of the checked condition, NULL for LOG.
+     * @param _cond_status         result of the checked condition.
+     */
+    GLogWrapper(const char *_file, const char *_func, int _line,
+          const char *_type,
+          const char *_cond_str = NULL, bool _cond_status = true
+    ) :
+        file(_file), func(_func), type(_type), cond_str(_cond_str),
+        line(_line), cond_status(_cond_status), exit_loop(true) {}
+
+    /** Stream that collects the user-supplied message. */
+    std::iostream &stream()
+    {
+        return sstream;
+    }
+
+    /** Loop condition of the macros: true until check() has been executed once. */
+    bool exit()
+    {
+        return exit_loop;
+    }
+
+    /** Loop increment of the macros: ends the loop and reports the collected message. */
+    void check()
+    {
+        exit_loop = false;
+
+        if (cond_str && !cond_status)
+        {
+            // A CHECK/CHECK_EQ whose condition evaluated to false.
+            cv::error(cv::Error::StsError, "FAILED: " + String(cond_str) + ". " + sstream.str(), func, file, line);
+        }
+        else if (!cond_str && std::strcmp(type, "CHECK"))
+        {
+            // Plain LOG(TYPE): printed only when NDEBUG is not defined.
+            #ifndef NDEBUG
+            if (!std::strcmp(type, "INFO"))
+                std::cout << sstream.str() << std::endl;
+            else
+                std::cerr << sstream.str() << std::endl;
+            #endif
+        }
+    }
+};
+
+}
+}
+#endif
--- a/Lib/opencv/sources/modules/dnn/src/caffe/opencv-caffe.proto
+++ b/Lib/opencv/sources/modules/dnn/src/caffe/opencv-caffe.proto
--- a/Lib/opencv/sources/modules/dnn/src/cuda/activations.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/activations.cu
@@ -0,0 +1,521 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "types.hpp"
+#include "vector_traits.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include "../cuda4dnn/kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn  { namespace kernels {
+
+    namespace raw {
+        /* Element-wise absolute value. For every index yielded by grid_stride_range one vector_type pack is
+         * loaded from `input`, each lane is replaced by abs(lane) and the pack is stored to `output`. */
+        template <class T, std::size_t N>
+        __global__ void abs_vec(Span<T> output, View<T> input) {
+            using device::abs;
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane)
+                    pack.data[lane] = abs(pack.data[lane]);
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Element-wise tanh. For every index yielded by grid_stride_range one vector_type pack is loaded
+         * from `input`, each lane is replaced by tanh(lane) and the pack is stored to `output`. */
+        template <class T, std::size_t N>
+        __global__ void tanh_vec(Span<T> output, View<T> input) {
+            using device::tanh;
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane)
+                    pack.data[lane] = tanh(pack.data[lane]);
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Element-wise swish, computed as x * sigmoid(x), one vector_type pack per index yielded by
+         * grid_stride_range. */
+        template <class T, std::size_t N>
+        __global__ void swish_vec(Span<T> output, View<T> input) {
+            using device::sigmoid;
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane) {
+                    auto& value = pack.data[lane];
+                    value = value * sigmoid(value);
+                }
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Element-wise mish, computed as x * tanh(log1pexp(x)), one vector_type pack per index yielded by
+         * grid_stride_range. */
+        template <class T, std::size_t N>
+        __global__ void mish_vec(Span<T> output, View<T> input) {
+            using device::tanh;
+            using device::log1pexp;
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane) {
+                    auto& value = pack.data[lane];
+                    value = value * tanh(log1pexp(value));
+                }
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Element-wise sigmoid. For every index yielded by grid_stride_range one vector_type pack is loaded
+         * from `input`, each lane is replaced by sigmoid(lane) and the pack is stored to `output`. */
+        template <class T, std::size_t N>
+        __global__ void sigmoid_vec(Span<T> output, View<T> input) {
+            using device::sigmoid;
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane)
+                    pack.data[lane] = sigmoid(pack.data[lane]);
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Element-wise BNLL: x + log1pexp(-x) for x > 0 and log1pexp(x) otherwise, one vector_type pack per
+         * index yielded by grid_stride_range. */
+        template <class T, std::size_t N>
+        __global__ void bnll_vec(Span<T> output, View<T> input) {
+            using device::log1pexp;
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane) {
+                    auto& value = pack.data[lane];
+                    value = value > T(0) ? value + log1pexp(-value) : log1pexp(value);
+                }
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Element-wise ELU: values >= 0 pass through unchanged, the others become expm1(x). One
+         * vector_type pack is handled per index yielded by grid_stride_range. */
+        template <class T, std::size_t N>
+        __global__ void elu_vec(Span<T> output, View<T> input) {
+            using device::expm1;
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane) {
+                    auto& value = pack.data[lane];
+                    value = value >= T(0) ? value : expm1(value);
+                }
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Element-wise leaky ReLU: values >= 0 pass through unchanged, the others are multiplied by
+         * `slope`. One vector_type pack is handled per index yielded by grid_stride_range. */
+        template <class T, std::size_t N>
+        __global__ void relu_vec(Span<T> output, View<T> input, T slope) {
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane) {
+                    auto& value = pack.data[lane];
+                    value = value >= T(0) ? value : slope * value;
+                }
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Element-wise clipped ReLU: every lane becomes clamp(x, floor, ceiling). One vector_type pack is
+         * handled per index yielded by grid_stride_range. */
+        template <class T, std::size_t N>
+        __global__ void clipped_relu_vec(Span<T> output, View<T> input, T floor, T ceiling) {
+            using device::clamp;
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane) {
+                    pack.data[lane] = clamp(pack.data[lane], floor, ceiling);
+                }
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Leaky ReLU with a per-slice slope. `inner_size` is first converted from elements to packs; the
+         * slope index of a pack is then (pack index / inner_size) modulo slope.size(). Values > 0 pass
+         * through unchanged, the others are multiplied by the selected slope. */
+        template <class T, std::size_t N>
+        __global__ void axiswise_relu_vec(Span<T> output, View<T> input, size_type inner_size, View<T> slope) {
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            inner_size /= pack_type::size();
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                const index_type slope_idx = (pack_idx / inner_size) % static_cast<size_type>(slope.size());
+
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane) {
+                    auto& value = pack.data[lane];
+                    value = value > T(0) ? value : value * slope[slope_idx];
+                }
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+
+        /* Element-wise power: every lane becomes pow(shift + scale * x, exp). One vector_type pack is
+         * handled per index yielded by grid_stride_range. */
+        template <class T, std::size_t N>
+        __global__ void power_vec(Span<T> output, View<T> input, T exp, T scale, T shift) {
+            using device::pow;
+            using pack_type = get_vector_type_t<T, N>;
+
+            auto dst_packs = pack_type::get_pointer(output.data());
+            auto src_packs = pack_type::get_pointer(input.data());
+
+            for (auto pack_idx : grid_stride_range(output.size() / pack_type::size())) {
+                pack_type pack;
+                v_load(pack, src_packs[pack_idx]);
+                for (int lane = 0; lane < pack_type::size(); ++lane) {
+                    pack.data[lane] = pow(shift + scale * pack.data[lane], exp);
+                }
+                v_store(dst_packs[pack_idx], pack);
+            }
+        }
+    }
+
+    /* Launches raw::abs_vec<T, N> on `stream` with a policy sized for output.size() / N work items.
+     * Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_abs(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto abs_kernel = raw::abs_vec<T, N>;
+        auto launch_cfg = make_policy(abs_kernel, output.size() / N, 0, stream);
+        launch_kernel(abs_kernel, launch_cfg, output, input);
+    }
+
+    /* Host entry point for element-wise abs. Tries vector widths 4, 2 and finally 1, using the first
+     * width for which both `output` and `input` pass is_fully_aligned. */
+    template <class T>
+    void abs(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_abs<T, 4>(stream, output, input);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_abs<T, 2>(stream, output, input);
+            return;
+        }
+
+        launch_vectorized_abs<T, 1>(stream, output, input);
+    }
+
+    template void abs<__half>(const Stream& stream, Span<__half> output, View<__half> input);
+    template void abs<float>(const Stream& stream, Span<float> output, View<float> input);
+
+    /* Launches raw::tanh_vec<T, N> on `stream` with a policy sized for output.size() / N work items.
+     * Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_tanh(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto tanh_kernel = raw::tanh_vec<T, N>;
+        auto launch_cfg = make_policy(tanh_kernel, output.size() / N, 0, stream);
+        launch_kernel(tanh_kernel, launch_cfg, output, input);
+    }
+
+    /* Host entry point for element-wise tanh. Tries vector widths 4, 2 and finally 1, using the first
+     * width for which both `output` and `input` pass is_fully_aligned. */
+    template <class T>
+    void tanh(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_tanh<T, 4>(stream, output, input);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_tanh<T, 2>(stream, output, input);
+            return;
+        }
+
+        launch_vectorized_tanh<T, 1>(stream, output, input);
+    }
+
+    template void tanh<__half>(const Stream&, Span<__half>, View<__half>);
+    template void tanh<float>(const Stream&, Span<float>, View<float>);
+
+    /* Launches raw::swish_vec<T, N> on `stream` with a policy sized for output.size() / N work items.
+     * Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_swish(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto swish_kernel = raw::swish_vec<T, N>;
+        auto launch_cfg = make_policy(swish_kernel, output.size() / N, 0, stream);
+        launch_kernel(swish_kernel, launch_cfg, output, input);
+    }
+
+    /* Host entry point for element-wise swish. Tries vector widths 4, 2 and finally 1, using the first
+     * width for which both `output` and `input` pass is_fully_aligned. */
+    template <class T>
+    void swish(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_swish<T, 4>(stream, output, input);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_swish<T, 2>(stream, output, input);
+            return;
+        }
+
+        launch_vectorized_swish<T, 1>(stream, output, input);
+    }
+
+    template void swish<__half>(const Stream&, Span<__half>, View<__half>);
+    template void swish<float>(const Stream&, Span<float>, View<float>);
+
+    /* Launches raw::mish_vec<T, N> on `stream` with a policy sized for output.size() / N work items.
+     * Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_mish(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto mish_kernel = raw::mish_vec<T, N>;
+        auto launch_cfg = make_policy(mish_kernel, output.size() / N, 0, stream);
+        launch_kernel(mish_kernel, launch_cfg, output, input);
+    }
+
+    /* Host entry point for element-wise mish. Tries vector widths 4, 2 and finally 1, using the first
+     * width for which both `output` and `input` pass is_fully_aligned. */
+    template <class T>
+    void mish(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_mish<T, 4>(stream, output, input);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_mish<T, 2>(stream, output, input);
+            return;
+        }
+
+        launch_vectorized_mish<T, 1>(stream, output, input);
+    }
+
+    template void mish<__half>(const Stream&, Span<__half>, View<__half>);
+    template void mish<float>(const Stream&, Span<float>, View<float>);
+
+    /* Launches raw::sigmoid_vec<T, N> on `stream` with a policy sized for output.size() / N work items.
+     * Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_sigmoid(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto sigmoid_kernel = raw::sigmoid_vec<T, N>;
+        auto launch_cfg = make_policy(sigmoid_kernel, output.size() / N, 0, stream);
+        launch_kernel(sigmoid_kernel, launch_cfg, output, input);
+    }
+
+    /* Host entry point for element-wise sigmoid. Tries vector widths 4, 2 and finally 1, using the first
+     * width for which both `output` and `input` pass is_fully_aligned. */
+    template <class T>
+    void sigmoid(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_sigmoid<T, 4>(stream, output, input);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_sigmoid<T, 2>(stream, output, input);
+            return;
+        }
+
+        launch_vectorized_sigmoid<T, 1>(stream, output, input);
+    }
+
+    template void sigmoid<__half>(const Stream&, Span<__half>, View<__half>);
+    template void sigmoid<float>(const Stream&, Span<float>, View<float>);
+
+    /* Launches raw::bnll_vec<T, N> on `stream` with a policy sized for output.size() / N work items.
+     * Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_bnll(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto bnll_kernel = raw::bnll_vec<T, N>;
+        auto launch_cfg = make_policy(bnll_kernel, output.size() / N, 0, stream);
+        launch_kernel(bnll_kernel, launch_cfg, output, input);
+    }
+
+    /* Host entry point for element-wise BNLL. Tries vector widths 4, 2 and finally 1, using the first
+     * width for which both `output` and `input` pass is_fully_aligned. */
+    template <class T>
+    void bnll(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_bnll<T, 4>(stream, output, input);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_bnll<T, 2>(stream, output, input);
+            return;
+        }
+
+        launch_vectorized_bnll<T, 1>(stream, output, input);
+    }
+
+    template void bnll<__half>(const Stream&, Span<__half>, View<__half>);
+    template void bnll<float>(const Stream&, Span<float>, View<float>);
+
+    /* Launches raw::elu_vec<T, N> on `stream` with a policy sized for output.size() / N work items.
+     * Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_elu(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto elu_kernel = raw::elu_vec<T, N>;
+        auto launch_cfg = make_policy(elu_kernel, output.size() / N, 0, stream);
+        launch_kernel(elu_kernel, launch_cfg, output, input);
+    }
+
+    /* Host entry point for element-wise ELU. Tries vector widths 4, 2 and finally 1, using the first
+     * width for which both `output` and `input` pass is_fully_aligned. */
+    template <class T>
+    void elu(const Stream& stream, Span<T> output, View<T> input) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_elu<T, 4>(stream, output, input);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_elu<T, 2>(stream, output, input);
+            return;
+        }
+
+        launch_vectorized_elu<T, 1>(stream, output, input);
+    }
+
+    template void elu<__half>(const Stream&, Span<__half>, View<__half>);
+    template void elu<float>(const Stream&, Span<float>, View<float>);
+
+    /* Launches raw::relu_vec<T, N> on `stream` with a policy sized for output.size() / N work items,
+     * forwarding `slope` to the kernel. Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto relu_kernel = raw::relu_vec<T, N>;
+        auto launch_cfg = make_policy(relu_kernel, output.size() / N, 0, stream);
+        launch_kernel(relu_kernel, launch_cfg, output, input, slope);
+    }
+
+    /* Host entry point for leaky ReLU with a single `slope`. Tries vector widths 4, 2 and finally 1,
+     * using the first width for which both `output` and `input` pass is_fully_aligned. */
+    template <class T>
+    void relu(const Stream& stream, Span<T> output, View<T> input, T slope) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_relu<T, 4>(stream, output, input, slope);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_relu<T, 2>(stream, output, input, slope);
+            return;
+        }
+
+        launch_vectorized_relu<T, 1>(stream, output, input, slope);
+    }
+
+    template void relu<__half>(const Stream&, Span<__half>, View<__half>, __half);
+    template void relu<float>(const Stream&, Span<float>, View<float>, float);
+
+    /* Launches raw::clipped_relu_vec<T, N> on `stream` with a policy sized for output.size() / N work
+     * items, forwarding the clamp bounds. Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto clip_kernel = raw::clipped_relu_vec<T, N>;
+        auto launch_cfg = make_policy(clip_kernel, output.size() / N, 0, stream);
+        launch_kernel(clip_kernel, launch_cfg, output, input, floor, ceiling);
+    }
+
+    /* Host entry point for clipped ReLU. Requires floor <= ceiling (compared as double), then tries vector
+     * widths 4, 2 and finally 1, using the first width for which both buffers pass is_fully_aligned. */
+    template <class T>
+    void clipped_relu(const Stream& stream, Span<T> output, View<T> input, T floor, T ceiling) {
+        CV_Assert(input.size() == output.size());
+        CV_Assert(static_cast<double>(floor) <= static_cast<double>(ceiling));
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_vectorized_clipped_relu<T, 4>(stream, output, input, floor, ceiling);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_vectorized_clipped_relu<T, 2>(stream, output, input, floor, ceiling);
+            return;
+        }
+
+        launch_vectorized_clipped_relu<T, 1>(stream, output, input, floor, ceiling);
+    }
+
+    template void clipped_relu<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
+    template void clipped_relu<float>(const Stream&, Span<float>, View<float>, float, float);
+
+    /* Launches raw::axiswise_relu_vec<T, N> on `stream` with a policy sized for output.size() / N work
+     * items. Besides the is_fully_aligned checks, `inner_size` must be a multiple of N because the kernel
+     * divides it by the vector width. */
+    template <class T, std::size_t N>
+    void launch_vectorized_axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+        CV_Assert(inner_size % N == 0);
+
+        auto axis_kernel = raw::axiswise_relu_vec<T, N>;
+        auto launch_cfg = make_policy(axis_kernel, output.size() / N, 0, stream);
+        launch_kernel(axis_kernel, launch_cfg, output, input, inner_size, slope);
+    }
+
+    /* Host entry point for leaky ReLU with per-slice slopes. A vector width (4, then 2) is used only when
+     * both buffers pass is_fully_aligned and `inner_size` is divisible by that width; otherwise width 1. */
+    template <class T>
+    void axiswise_relu(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> slope) {
+        CV_Assert(input.size() == output.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
+            launch_vectorized_axiswise_relu<T, 4>(stream, output, input, inner_size, slope);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
+            launch_vectorized_axiswise_relu<T, 2>(stream, output, input, inner_size, slope);
+            return;
+        }
+
+        launch_vectorized_axiswise_relu<T, 1>(stream, output, input, inner_size, slope);
+    }
+
+    template void axiswise_relu<__half>(const Stream&, Span<__half>, View<__half>, std::size_t, View<__half>);
+    template void axiswise_relu<float>(const Stream&, Span<float>, View<float>, std::size_t, View<float>);
+
+    /* Launches raw::power_vec<T, N> on `stream` with a policy sized for output.size() / N work items,
+     * forwarding exponent, scale and shift. Both buffers must satisfy is_fully_aligned for width N. */
+    template <class T, std::size_t N>
+    void launch_vectorized_power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        auto power_kernel = raw::power_vec<T, N>;
+        auto launch_cfg = make_policy(power_kernel, output.size() / N, 0, stream);
+        launch_kernel(power_kernel, launch_cfg, output, input, exp, scale, shift);
+    }
+
+    /* Host entry point for pow(shift + scale * x, exp). An exponent equal to 1 (compared as float) is
+     * delegated to scale1_with_bias1. Otherwise vector widths 4 and 2 are tried, each requiring both
+     * buffers to pass is_fully_aligned and a non-zero output.size(), before falling back to width 1. */
+    template <class T>
+    void power(const Stream& stream, Span<T> output, View<T> input, T exp, T scale, T shift) {
+        CV_Assert(input.size() == output.size());
+
+        if (static_cast<float>(exp) == 1.0f) {
+            scale1_with_bias1(stream, output, input, scale, shift);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && output.size()) {
+            launch_vectorized_power<T, 4>(stream, output, input, exp, scale, shift);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && output.size()) {
+            launch_vectorized_power<T, 2>(stream, output, input, exp, scale, shift);
+            return;
+        }
+
+        launch_vectorized_power<T, 1>(stream, output, input, exp, scale, shift);
+    }
+
+    template void power<__half>(const Stream&, Span<__half>, View<__half>, __half, __half, __half);
+    template void power<float>(const Stream&, Span<float>, View<float>, float, float, float);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/array.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/array.hpp
@@ -0,0 +1,73 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_ARRAY_HPP
+#define OPENCV_DNN_SRC_CUDA_ARRAY_HPP
+
+#include <cuda_runtime.h>
+
+#include "types.hpp"
+
+#include <cstddef>
+#include <type_traits>
+#include <iterator>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+    /* Fixed-size aggregate of N elements of T with a std::array-like interface whose accessors are
+     * callable from both host and device code. The storage member `ptr` is public, so the type stays
+     * an aggregate. */
+    template <class T, std::size_t N>
+    struct array {
+        using value_type        = T;
+        using size_type         = device::size_type;
+        using difference_type   = std::ptrdiff_t;
+        using reference         = typename std::add_lvalue_reference<value_type>::type;
+        using const_reference   = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
+        using pointer           = typename std::add_pointer<value_type>::type;
+        using const_pointer     = typename std::add_pointer<typename std::add_const<value_type>::type>::type;
+        using iterator          = pointer;
+        using const_iterator    = const_pointer;
+        using reverse_iterator  = std::reverse_iterator<iterator>;
+        using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+        __host__ __device__ bool empty() const noexcept { return N == 0; }
+        __host__ __device__ size_type size() const noexcept { return N; }
+
+        __host__ __device__ iterator begin() noexcept { return ptr; }
+        __host__ __device__ iterator end() noexcept { return ptr + N; }
+        __host__ __device__ const_iterator begin() const noexcept { return ptr; }
+        __host__ __device__ const_iterator end() const noexcept { return ptr + N; }
+
+        __host__ __device__ const_iterator cbegin() const noexcept { return ptr; }
+        __host__ __device__ const_iterator cend() const noexcept { return ptr + N; }
+
+        /* std::reverse_iterator can only be built from its base iterator through an explicit
+         * constructor, so the adaptor is constructed explicitly: rbegin() wraps end() and rend()
+         * wraps begin().
+         * NOTE(review): using these in device code relies on the toolchain accepting the standard
+         * library's reverse_iterator constructor there - confirm against the build flags. */
+        __host__ __device__ reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
+        __host__ __device__ reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
+        __host__ __device__ const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); }
+        __host__ __device__ const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); }
+
+        __host__ __device__ const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); }
+        __host__ __device__ const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); }
+
+        /* Host-only: copies [first, last) to the start of the storage. The range length is not checked
+         * against N, so the caller must not pass more than N elements.
+         * NOTE(review): std::copy is declared in <algorithm>, which this header does not include directly. */
+        template <class InputItr>
+        __host__ void assign(InputItr first, InputItr last) {
+            std::copy(first, last, std::begin(ptr));
+        }
+
+        /* Unchecked element access. */
+        __host__ __device__ reference operator[](int idx) { return ptr[idx]; }
+        __host__ __device__ const_reference operator[](int idx) const { return ptr[idx]; }
+
+        __host__ __device__ reference front() { return ptr[0]; }
+        __host__ __device__ const_reference front() const { return ptr[0]; }
+
+        __host__ __device__ reference back() { return ptr[N - 1]; }
+        __host__ __device__ const_reference back() const { return ptr[N - 1]; }
+
+        __host__ __device__ pointer data() noexcept { return ptr; }
+        __host__ __device__ const_pointer data() const noexcept { return ptr; }
+
+        T ptr[N];
+    };
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_ARRAY_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/atomics.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/atomics.hpp
@@ -0,0 +1,32 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
+#define OPENCV_DNN_SRC_CUDA_ATOMICS_HPP
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+#else
+// Adds `val` to the __half stored at `address` using a 32-bit compare-and-swap retry loop.
+// The enclosing preprocessor guard compiles this overload only when __CUDA_ARCH__ is defined and
+// below 700. Unlike the value-returning atomicCAS it is built on, this function returns nothing.
+inline __device__ void atomicAdd(__half* address, __half val) {
+    // Pointer to the 32-bit word containing the target half: step back 2 bytes when bit 1 of the
+    // address is set, i.e. when the half occupies the upper 16 bits of that word.
+    unsigned int* address_as_ui = (unsigned int *)((char *)address - ((size_t)address & 2));
+    unsigned int old = *address_as_ui;
+    unsigned int assumed;
+
+    do {
+        // Snapshot of the word that the new value is computed from.
+        assumed = old;
+
+        // Extract the 16 bits belonging to the target half and add `val` to them.
+        __half_raw hsum;
+        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
+        __half tmpres = hsum + val;
+        hsum = __half_raw(tmpres);
+
+        // Re-insert the sum into its 16-bit slot while keeping the other half of the word intact.
+        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
+        // Try to publish the new word; `old` receives what atomicCAS returns.
+        old = atomicCAS(address_as_ui, assumed, old);
+    // A returned value different from the snapshot means the word changed in the meantime: retry.
+    } while (assumed != old);
+}
+#endif
+
+#endif /* OPENCV_DNN_SRC_CUDA_ATOMICS_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/bias_activation.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/bias_activation.cu
@@ -0,0 +1,336 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "types.hpp"
+#include "math.hpp"
+#include "vector_traits.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+namespace raw {
+
+    /* Bias followed by leaky ReLU, applied in-place on vectors of N elements.
+     *
+     * The vector at index `v` is offset by `bias[(v / (inner_size / N)) % bias.size()]`.
+     * Sums that are not `>= 0` are multiplied by `slope`; the others are kept as they are.
+     */
+    template <class T, std::size_t N>
+    __global__ void biasN_relu_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, T slope) {
+        using vector_type = get_vector_type_t<T, N>;
+        const auto lanes = vector_type::size();
+
+        auto data_vPtr = vector_type::get_pointer(inplace_output.data());
+
+        /* from here on `inner_size` counts vectors instead of scalars */
+        inner_size /= lanes;
+        for (auto vec_idx : grid_stride_range(inplace_output.size() / lanes)) {
+            /* all lanes of one vector share the same bias entry; read it once */
+            const index_type bias_idx = (vec_idx / inner_size) % static_cast<size_type>(bias.size());
+            const T bias_value = bias[bias_idx];
+
+            vector_type packed;
+            v_load(packed, data_vPtr[vec_idx]);
+            for (int lane = 0; lane < lanes; ++lane) {
+                packed.data[lane] += bias_value;
+                if (!(packed.data[lane] >= T(0)))
+                    packed.data[lane] = slope * packed.data[lane];
+            }
+            v_store(data_vPtr[vec_idx], packed);
+        }
+    }
+
+    /* Bias followed by a clamp to [`floor`, `ceil`], applied in-place on vectors of N elements.
+     *
+     * The vector at index `v` is offset by `bias[(v / (inner_size / N)) % bias.size()]`
+     * before being passed through `device::clamp`.
+     */
+    template <class T, std::size_t N>
+    __global__ void biasN_clipped_relu_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, T floor, T ceil) {
+        using device::clamp;
+        using vector_type = get_vector_type_t<T, N>;
+        const auto lanes = vector_type::size();
+
+        auto data_vPtr = vector_type::get_pointer(inplace_output.data());
+
+        /* from here on `inner_size` counts vectors instead of scalars */
+        inner_size /= lanes;
+        for (auto vec_idx : grid_stride_range(inplace_output.size() / lanes)) {
+            /* all lanes of one vector share the same bias entry; read it once */
+            const index_type bias_idx = (vec_idx / inner_size) % static_cast<size_type>(bias.size());
+            const T bias_value = bias[bias_idx];
+
+            vector_type packed;
+            v_load(packed, data_vPtr[vec_idx]);
+            for (int lane = 0; lane < lanes; ++lane)
+                packed.data[lane] = clamp(packed.data[lane] + bias_value, floor, ceil);
+            v_store(data_vPtr[vec_idx], packed);
+        }
+    }
+
+    /* Bias followed by exponentiation, applied in-place on vectors of N elements.
+     *
+     * The vector at index `v` is offset by `bias[(v / (inner_size / N)) % bias.size()]`
+     * and each sum is then raised to `power` through `device::pow`.
+     */
+    template <class T, std::size_t N>
+    __global__ void biasN_power_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, T power) {
+        using device::pow;
+        using vector_type = get_vector_type_t<T, N>;
+        const auto lanes = vector_type::size();
+
+        auto data_vPtr = vector_type::get_pointer(inplace_output.data());
+
+        /* from here on `inner_size` counts vectors instead of scalars */
+        inner_size /= lanes;
+        for (auto vec_idx : grid_stride_range(inplace_output.size() / lanes)) {
+            /* all lanes of one vector share the same bias entry; read it once */
+            const index_type bias_idx = (vec_idx / inner_size) % static_cast<size_type>(bias.size());
+            const T bias_value = bias[bias_idx];
+
+            vector_type packed;
+            v_load(packed, data_vPtr[vec_idx]);
+            for (int lane = 0; lane < lanes; ++lane)
+                packed.data[lane] = pow(packed.data[lane] + bias_value, power);
+            v_store(data_vPtr[vec_idx], packed);
+        }
+    }
+
+    /* Bias followed by tanh, applied in-place on vectors of N elements.
+     *
+     * The vector at index `v` is offset by `bias[(v / (inner_size / N)) % bias.size()]`
+     * and each sum is then passed through `device::tanh`.
+     */
+    template <class T, std::size_t N>
+    __global__ void biasN_tanh_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias) {
+        using device::tanh;
+        using vector_type = get_vector_type_t<T, N>;
+        const auto lanes = vector_type::size();
+
+        auto data_vPtr = vector_type::get_pointer(inplace_output.data());
+
+        /* from here on `inner_size` counts vectors instead of scalars */
+        inner_size /= lanes;
+        for (auto vec_idx : grid_stride_range(inplace_output.size() / lanes)) {
+            /* all lanes of one vector share the same bias entry; read it once */
+            const index_type bias_idx = (vec_idx / inner_size) % static_cast<size_type>(bias.size());
+            const T bias_value = bias[bias_idx];
+
+            vector_type packed;
+            v_load(packed, data_vPtr[vec_idx]);
+            for (int lane = 0; lane < lanes; ++lane)
+                packed.data[lane] = tanh(packed.data[lane] + bias_value);
+            v_store(data_vPtr[vec_idx], packed);
+        }
+    }
+
+    /* Bias followed by sigmoid, applied in-place on vectors of N elements.
+     *
+     * The vector at index `v` is offset by `bias[(v / (inner_size / N)) % bias.size()]`
+     * and each sum is then passed through `device::sigmoid`.
+     */
+    template <class T, std::size_t N>
+    __global__ void biasN_sigmoid_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias) {
+        using device::sigmoid;
+        using vector_type = get_vector_type_t<T, N>;
+        const auto lanes = vector_type::size();
+
+        auto data_vPtr = vector_type::get_pointer(inplace_output.data());
+
+        /* from here on `inner_size` counts vectors instead of scalars */
+        inner_size /= lanes;
+        for (auto vec_idx : grid_stride_range(inplace_output.size() / lanes)) {
+            /* all lanes of one vector share the same bias entry; read it once */
+            const index_type bias_idx = (vec_idx / inner_size) % static_cast<size_type>(bias.size());
+            const T bias_value = bias[bias_idx];
+
+            vector_type packed;
+            v_load(packed, data_vPtr[vec_idx]);
+            for (int lane = 0; lane < lanes; ++lane)
+                packed.data[lane] = sigmoid(packed.data[lane] + bias_value);
+            v_store(data_vPtr[vec_idx], packed);
+        }
+    }
+
+    /* Bias followed by swish (`x * sigmoid(x)`), applied in-place on vectors of N elements.
+     *
+     * The vector at index `v` is offset by `bias[(v / (inner_size / N)) % bias.size()]`;
+     * the biased value `x` is then replaced by `x * device::sigmoid(x)`.
+     */
+    template <class T, std::size_t N>
+    __global__ void biasN_swish_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias) {
+        using device::sigmoid;
+        using vector_type = get_vector_type_t<T, N>;
+        const auto lanes = vector_type::size();
+
+        auto data_vPtr = vector_type::get_pointer(inplace_output.data());
+
+        /* from here on `inner_size` counts vectors instead of scalars */
+        inner_size /= lanes;
+        for (auto vec_idx : grid_stride_range(inplace_output.size() / lanes)) {
+            /* all lanes of one vector share the same bias entry; read it once */
+            const index_type bias_idx = (vec_idx / inner_size) % static_cast<size_type>(bias.size());
+            const T bias_value = bias[bias_idx];
+
+            vector_type packed;
+            v_load(packed, data_vPtr[vec_idx]);
+            for (int lane = 0; lane < lanes; ++lane) {
+                packed.data[lane] += bias_value;
+                packed.data[lane] = packed.data[lane] * sigmoid(packed.data[lane]);
+            }
+            v_store(data_vPtr[vec_idx], packed);
+        }
+    }
+
+    /* Bias followed by mish (`x * tanh(log1pexp(x))`), applied in-place on vectors of N elements.
+     *
+     * The vector at index `v` is offset by `bias[(v / (inner_size / N)) % bias.size()]`;
+     * the biased value `x` is then replaced by `x * device::tanh(device::log1pexp(x))`.
+     */
+    template <class T, std::size_t N>
+    __global__ void biasN_mish_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias) {
+        using device::tanh;
+        using device::log1pexp;
+        using vector_type = get_vector_type_t<T, N>;
+        const auto lanes = vector_type::size();
+
+        auto data_vPtr = vector_type::get_pointer(inplace_output.data());
+
+        /* from here on `inner_size` counts vectors instead of scalars */
+        inner_size /= lanes;
+        for (auto vec_idx : grid_stride_range(inplace_output.size() / lanes)) {
+            /* all lanes of one vector share the same bias entry; read it once */
+            const index_type bias_idx = (vec_idx / inner_size) % static_cast<size_type>(bias.size());
+            const T bias_value = bias[bias_idx];
+
+            vector_type packed;
+            v_load(packed, data_vPtr[vec_idx]);
+            for (int lane = 0; lane < lanes; ++lane) {
+                packed.data[lane] += bias_value;
+                packed.data[lane] = packed.data[lane] * tanh(log1pexp(packed.data[lane]));
+            }
+            v_store(data_vPtr[vec_idx], packed);
+        }
+    }
+}
+
+/* Launches the N-wide bias + leaky ReLU kernel on `stream`.
+ * Asserts that `inplace_output` is fully aligned for N and that `inner_size` is a multiple of N.
+ */
+template <class T, std::size_t N> static
+void launch_biasN_relu_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T slope) {
+    CV_Assert(is_fully_aligned<T>(inplace_output, N));
+    CV_Assert(inner_size % N == 0);
+
+    /* the policy is sized by the number of vectors, not scalars */
+    const auto num_vectors = inplace_output.size() / N;
+
+    auto kernel_fn = raw::biasN_relu_inplace_vec<T, N>;
+    auto exec_policy = make_policy(kernel_fn, num_vectors, 0, stream);
+    launch_kernel(kernel_fn, exec_policy, inplace_output, inner_size, bias, slope);
+}
+
+/* Bias + leaky ReLU entry point.
+ * Tries vector widths 4, 2 and 1 in that order and launches the first one for which
+ * `inplace_output` is fully aligned and `inner_size` is divisible by the width.
+ */
+template <class T>
+void biasN_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T slope) {
+    if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
+        launch_biasN_relu_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias, slope);
+        return;
+    }
+
+    if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
+        launch_biasN_relu_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias, slope);
+        return;
+    }
+
+    /* scalar fallback */
+    launch_biasN_relu_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias, slope);
+}
+
+template void biasN_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half);
+template void biasN_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float);
+
+/* Launches the N-wide bias + clipped ReLU kernel on `stream`.
+ * Asserts that `inplace_output` is fully aligned for N and that `inner_size` is a multiple of N.
+ */
+template <class T, std::size_t N> static
+void launch_biasN_clipped_relu_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T floor, T ceil) {
+    CV_Assert(is_fully_aligned<T>(inplace_output, N));
+    CV_Assert(inner_size % N == 0);
+
+    /* the policy is sized by the number of vectors, not scalars */
+    const auto num_vectors = inplace_output.size() / N;
+
+    auto kernel_fn = raw::biasN_clipped_relu_inplace_vec<T, N>;
+    auto exec_policy = make_policy(kernel_fn, num_vectors, 0, stream);
+    launch_kernel(kernel_fn, exec_policy, inplace_output, inner_size, bias, floor, ceil);
+}
+
+/* Bias + clipped ReLU entry point.
+ * Tries vector widths 4, 2 and 1 in that order and launches the first one for which
+ * `inplace_output` is fully aligned and `inner_size` is divisible by the width.
+ */
+template <class T>
+void biasN_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T floor, T ceil) {
+    if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
+        launch_biasN_clipped_relu_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias, floor, ceil);
+        return;
+    }
+
+    if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
+        launch_biasN_clipped_relu_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias, floor, ceil);
+        return;
+    }
+
+    /* scalar fallback */
+    launch_biasN_clipped_relu_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias, floor, ceil);
+}
+
+template void biasN_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half);
+template void biasN_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float);
+
+/* Launches the N-wide bias + power kernel on `stream`.
+ * Asserts that `inplace_output` is fully aligned for N and that `inner_size` is a multiple of N.
+ */
+template <class T, std::size_t N> static
+void launch_biasN_power_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T power) {
+    CV_Assert(is_fully_aligned<T>(inplace_output, N));
+    CV_Assert(inner_size % N == 0);
+
+    /* the policy is sized by the number of vectors, not scalars */
+    const auto num_vectors = inplace_output.size() / N;
+
+    auto kernel_fn = raw::biasN_power_inplace_vec<T, N>;
+    auto exec_policy = make_policy(kernel_fn, num_vectors, 0, stream);
+    launch_kernel(kernel_fn, exec_policy, inplace_output, inner_size, bias, power);
+}
+
+/* Bias + power entry point.
+ * Tries vector widths 4, 2 and 1 in that order and launches the first one for which
+ * `inplace_output` is fully aligned and `inner_size` is divisible by the width.
+ */
+template <class T>
+void biasN_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T power) {
+    if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
+        launch_biasN_power_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias, power);
+        return;
+    }
+
+    if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
+        launch_biasN_power_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias, power);
+        return;
+    }
+
+    /* scalar fallback */
+    launch_biasN_power_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias, power);
+}
+
+template void biasN_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half);
+template void biasN_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float);
+
+/* Launches the N-wide bias + tanh kernel on `stream`.
+ * Asserts that `inplace_output` is fully aligned for N and that `inner_size` is a multiple of N.
+ */
+template <class T, std::size_t N> static
+void launch_biasN_tanh_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
+    CV_Assert(is_fully_aligned<T>(inplace_output, N));
+    CV_Assert(inner_size % N == 0);
+
+    /* the policy is sized by the number of vectors, not scalars */
+    const auto num_vectors = inplace_output.size() / N;
+
+    auto kernel_fn = raw::biasN_tanh_inplace_vec<T, N>;
+    auto exec_policy = make_policy(kernel_fn, num_vectors, 0, stream);
+    launch_kernel(kernel_fn, exec_policy, inplace_output, inner_size, bias);
+}
+
+/* Bias + tanh entry point.
+ * Tries vector widths 4, 2 and 1 in that order and launches the first one for which
+ * `inplace_output` is fully aligned and `inner_size` is divisible by the width.
+ */
+template <class T>
+void biasN_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
+    if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
+        launch_biasN_tanh_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias);
+        return;
+    }
+
+    if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
+        launch_biasN_tanh_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias);
+        return;
+    }
+
+    /* scalar fallback */
+    launch_biasN_tanh_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias);
+}
+
+template void biasN_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
+template void biasN_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
+
+/* Launches the N-wide bias + sigmoid kernel on `stream`.
+ * Asserts that `inplace_output` is fully aligned for N and that `inner_size` is a multiple of N.
+ */
+template <class T, std::size_t N> static
+void launch_biasN_sigmoid_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
+    CV_Assert(is_fully_aligned<T>(inplace_output, N));
+    CV_Assert(inner_size % N == 0);
+
+    /* the policy is sized by the number of vectors, not scalars */
+    const auto num_vectors = inplace_output.size() / N;
+
+    auto kernel_fn = raw::biasN_sigmoid_inplace_vec<T, N>;
+    auto exec_policy = make_policy(kernel_fn, num_vectors, 0, stream);
+    launch_kernel(kernel_fn, exec_policy, inplace_output, inner_size, bias);
+}
+
+/* Bias + sigmoid entry point.
+ * Tries vector widths 4, 2 and 1 in that order and launches the first one for which
+ * `inplace_output` is fully aligned and `inner_size` is divisible by the width.
+ */
+template <class T>
+void biasN_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
+    if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
+        launch_biasN_sigmoid_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias);
+        return;
+    }
+
+    if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
+        launch_biasN_sigmoid_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias);
+        return;
+    }
+
+    /* scalar fallback */
+    launch_biasN_sigmoid_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias);
+}
+
+template void biasN_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
+template void biasN_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
+
+/* Launches the N-wide bias + swish kernel on `stream`.
+ * Asserts that `inplace_output` is fully aligned for N and that `inner_size` is a multiple of N.
+ */
+template <class T, std::size_t N> static
+void launch_biasN_swish_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
+    CV_Assert(is_fully_aligned<T>(inplace_output, N));
+    CV_Assert(inner_size % N == 0);
+
+    /* the policy is sized by the number of vectors, not scalars */
+    const auto num_vectors = inplace_output.size() / N;
+
+    auto kernel_fn = raw::biasN_swish_inplace_vec<T, N>;
+    auto exec_policy = make_policy(kernel_fn, num_vectors, 0, stream);
+    launch_kernel(kernel_fn, exec_policy, inplace_output, inner_size, bias);
+}
+
+/* Bias + swish entry point.
+ * Tries vector widths 4, 2 and 1 in that order and launches the first one for which
+ * `inplace_output` is fully aligned and `inner_size` is divisible by the width.
+ */
+template <class T>
+void biasN_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
+    if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
+        launch_biasN_swish_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias);
+        return;
+    }
+
+    if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
+        launch_biasN_swish_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias);
+        return;
+    }
+
+    /* scalar fallback */
+    launch_biasN_swish_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias);
+}
+
+template void biasN_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
+template void biasN_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
+
+/* Launches the N-wide bias + mish kernel on `stream`.
+ * Asserts that `inplace_output` is fully aligned for N and that `inner_size` is a multiple of N.
+ */
+template <class T, std::size_t N> static
+void launch_biasN_mish_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
+    CV_Assert(is_fully_aligned<T>(inplace_output, N));
+    CV_Assert(inner_size % N == 0);
+
+    /* the policy is sized by the number of vectors, not scalars */
+    const auto num_vectors = inplace_output.size() / N;
+
+    auto kernel_fn = raw::biasN_mish_inplace_vec<T, N>;
+    auto exec_policy = make_policy(kernel_fn, num_vectors, 0, stream);
+    launch_kernel(kernel_fn, exec_policy, inplace_output, inner_size, bias);
+}
+
+/* Bias + mish entry point.
+ * Tries vector widths 4, 2 and 1 in that order and launches the first one for which
+ * `inplace_output` is fully aligned and `inner_size` is divisible by the width.
+ */
+template <class T>
+void biasN_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
+    if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
+        launch_biasN_mish_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias);
+        return;
+    }
+
+    if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
+        launch_biasN_mish_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias);
+        return;
+    }
+
+    /* scalar fallback */
+    launch_biasN_mish_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias);
+}
+
+template void biasN_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
+template void biasN_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/concat.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/concat.cu
@@ -0,0 +1,259 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "types.hpp"
+#include "vector_traits.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "kernel_dispatcher.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <cstddef>
+#include <vector>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        /* Copies every vector of `input` to its destination inside `output`.
+         *
+         * The input is treated as a sequence of blocks of `concat_size * input_axis_size`
+         * elements. Block number `b` lands in the output at element offset
+         * `(b * output_axis_size + output_axis_offset) * concat_size`, and the position
+         * inside the block is carried over unchanged.
+         */
+        template <class T, std::size_t N>
+        __global__ void concat_vec(
+            Span<T> output, size_type output_axis_size, index_type output_axis_offset,
+            View<T> input, size_type input_axis_size, size_type concat_size)
+        {
+            using vector_type = get_vector_type_t<T, N>;
+            const auto lanes = vector_type::size();
+
+            auto dst_vPtr = vector_type::get_pointer(output.data());
+            auto src_vPtr = vector_type::get_pointer(input.data());
+
+            /* number of elements in one block of the input */
+            const auto block_size = concat_size * input_axis_size;
+
+            for (auto src_vec : grid_stride_range(input.size() / lanes)) {
+                /* scalar index of the first element held by this vector */
+                const index_type src_elem = src_vec * lanes;
+
+                const index_type block_num = src_elem / block_size;
+                const index_type within_block = src_elem % block_size;
+
+                /* scalar index in the output at which the block begins */
+                const auto block_begin = (block_num * output_axis_size + output_axis_offset) * concat_size;
+                const index_type dst_elem = within_block + block_begin;
+
+                const auto dst_vec = dst_elem / lanes;
+
+                vector_type packed;
+                v_load(packed, src_vPtr[src_vec]);
+                v_store(dst_vPtr[dst_vec], packed);
+            }
+        }
+
+        /* Scatters every element of `input` into `output` with a per-axis offset.
+         *
+         * The flat input index is decomposed into one coordinate per axis using `in_strides`;
+         * each coordinate is shifted by `out_offset` and the shifted coordinates are
+         * recombined into a flat output index using `out_strides`.
+         */
+        template <class T, std::size_t Rank>
+        __global__ void concat_with_offsets(
+            Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> out_offset,
+            View<T> input, array<size_type, Rank> in_strides)
+        {
+            for (auto src : grid_stride_range(input.size())) {
+                /* the leading axis has no enclosing stride to take a remainder with */
+                const index_type first_in = src / in_strides[0];
+                const index_type first_out = out_offset[0] + first_in;
+                index_type dst = first_out * out_strides[0];
+
+                for (int axis = 1; axis < Rank; ++axis) {
+                    const index_type coord_in = (src % in_strides[axis - 1]) / in_strides[axis];
+                    const index_type coord_out = out_offset[axis] + coord_in;
+                    dst += coord_out * out_strides[axis];
+                }
+
+                output[dst] = input[src];
+            }
+        }
+    }
+
+    /* Launches the N-wide concat kernel on `stream`.
+     * Only pointer alignment of `output` and `input` is asserted here; the remaining
+     * conditions for a valid vectorized copy are checked by the caller, concat().
+     */
+    template <class T, std::size_t N> static
+    void launch_vectorized_concat(const Stream& stream,
+        Span<T> output, size_type output_axis_size, index_type output_axis_offset,
+        View<T> input, size_type input_axis_size, size_type concat_size)
+    {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        /* the policy is sized by the number of input vectors, not scalars */
+        const auto num_vectors = input.size() / N;
+
+        auto kernel_fn = raw::concat_vec<T, N>;
+        auto exec_policy = make_policy(kernel_fn, num_vectors, 0, stream);
+        launch_kernel(kernel_fn, exec_policy,
+            output, output_axis_size, output_axis_offset,
+            input, input_axis_size, concat_size);
+    }
+
+    /* Copies `input` into `output` along `axis`, starting `output_axis_offset` entries into
+     * that axis of the output.
+     *
+     * Seen as flat memory, the operation copies blocks of `copy_block_size` elements from the
+     * input to the output. The first block is written at `starting_offset` and consecutive
+     * blocks are `copy_block_stride` elements apart in the output.
+     *
+     * A vector width W is used only when both tensors are fully aligned for W and the block
+     * size, block stride and starting offset are all multiples of W. Widths 4, 2 and 1 are
+     * tried in that order.
+     */
+    template <class T>
+    void concat(
+        const Stream& stream,
+        TensorSpan<T> output, std::size_t output_axis_offset,
+        TensorView<T> input, std::size_t axis)
+    {
+        /* number of elements spanned by the axes that follow `axis` */
+        const std::size_t concat_size = output.size_range(axis + 1, output.rank());
+
+        const std::size_t input_axis_size = input.get_axis_size(axis);
+        const std::size_t output_axis_size = output.get_axis_size(axis);
+
+        const std::size_t copy_block_size = concat_size * input_axis_size;
+        const std::size_t copy_block_stride = concat_size * output_axis_size;
+        const std::size_t starting_offset = output_axis_offset * concat_size;
+
+        /* true when every source and destination block boundary is a multiple of `width` */
+        const auto layout_divisible_by = [=](std::size_t width) {
+            return copy_block_size % width == 0 &&
+                   copy_block_stride % width == 0 &&
+                   starting_offset % width == 0;
+        };
+
+        const bool is_aligned_4 = layout_divisible_by(4);
+        const bool is_aligned_2 = layout_divisible_by(2);
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && is_aligned_4) {
+            launch_vectorized_concat<T, 4>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && is_aligned_2) {
+            launch_vectorized_concat<T, 2>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
+            return;
+        }
+
+        /* scalar fallback */
+        launch_vectorized_concat<T, 1>(stream, output, output_axis_size, output_axis_offset, input, input_axis_size, concat_size);
+    }
+
+    template void concat<__half>(const Stream&, TensorSpan<__half>, std::size_t, TensorView<__half>, std::size_t);
+    template void concat<float>(const Stream&, TensorSpan<float>, std::size_t, TensorView<float>,  std::size_t);
+
+    /* Launches the rank-`Rank` offset-concat kernel on `stream`.
+     * The stride and offset vectors must each hold exactly `Rank` entries; they are copied
+     * into fixed-size arrays that are passed to the kernel by value.
+     */
+    template <class T, std::size_t Rank> static
+    void launch_concat_with_offsets(
+        const Stream& stream,
+        Span<T> output, const std::vector<std::size_t>& outStride, const std::vector<std::size_t>& outOffset,
+        View<T> input, const std::vector<std::size_t>& inStride)
+    {
+        CV_Assert(outStride.size() == Rank);
+        CV_Assert(outOffset.size() == Rank);
+        CV_Assert(inStride.size() == Rank);
+
+        array<size_type, Rank> out_strides_arg, in_strides_arg;
+        out_strides_arg.assign(outStride.begin(), outStride.end());
+        in_strides_arg.assign(inStride.begin(), inStride.end());
+
+        array<index_type, Rank> out_offsets_arg;
+        out_offsets_arg.assign(outOffset.begin(), outOffset.end());
+
+        auto kernel_fn = raw::concat_with_offsets<T, Rank>;
+        auto exec_policy = make_policy(kernel_fn, input.size(), 0, stream);
+        launch_kernel(kernel_fn, exec_policy, output, out_strides_arg, out_offsets_arg, input, in_strides_arg);
+    }
+
+    GENERATE_KERNEL_DISPATCHER(concat_with_offsets_dispatcher, launch_concat_with_offsets);
+
+    /* Copies `input` into `output`, shifting the coordinate of every element along axis `k`
+     * by `offsets[k]`.
+     *
+     * `output`, `input` and `offsets` must all have the same rank. Before the kernel is
+     * launched the problem is simplified in two ways (see the comments below): leading unit
+     * axes are squeezed away and runs of fully-copied axes are merged into a single axis.
+     * The remaining rank must lie in [1, CSL_MAX_TENSOR_RANK].
+     */
+    template <class T>
+    void concat_with_offsets(
+        const Stream& stream,
+        TensorSpan<T> output, TensorView<T> input,
+        std::vector<std::size_t> offsets)
+    {
+        CV_Assert(output.rank() == input.rank());
+        CV_Assert(output.rank() == offsets.size());
+
+        /* squeezable axes at the beginning of both tensors can be eliminated
+         *
+         * Reasoning:
+         * ----------
+         * Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the output
+         * tensor will be [i1 + off1, i2 + off2, ...]. The concat operation essentially copies items
+         * from the input tensor to new locations in the output tensor.
+         *
+         * If the size of the first axis of the input and output tensor is unity, the input and output
+         * indices for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...]
+         * respectively. The first index does not contribute to the element's address calculation and
+         * hence does nothing apart from eating up few cycles.
+         *
+         * At least one axis is always kept: without the rank guard, tensors made only of unit
+         * axes would be squeezed down to rank zero and the next axis query would be invalid.
+         */
+        while (input.rank() > 1 && input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
+            CV_Assert(offsets[0] == 0);
+
+            input.squeeze(0);
+            output.squeeze(0);
+            offsets.erase(std::begin(offsets));
+
+            CV_Assert(output.rank() == input.rank());
+            CV_Assert(output.rank() == offsets.size());
+        }
+
+        auto inShape = input.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        /* contiguous axes that undergo full copy can be combined into one axis
+         *
+         * Reasoning:
+         * ----------
+         * Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not undergo any
+         * concatenation. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
+         *
+         * Each axis in the contiguous axes sequence will add an offset of iN * strideN. In the above example,
+         * the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
+         * a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
+         * Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
+         */
+        for (std::size_t i = 0; i < inShape.size(); i++) {
+            /* check if axis `i` requires any slicing */
+            if (offsets[i] == 0 && inShape[i] == outShape[i]) {
+                /* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
+
+                const std::size_t j = i + 1; /* `j` is the axis which we will attempt to merge */
+                while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
+                    /* `j` axis is also copied fully; merge `i` and `j` (offsets[i] is already zero) */
+                    const auto new_size = inShape[i] * inShape[j];
+                    inShape[i] = new_size;
+                    outShape[i] = new_size;
+
+                    /* delete axis `j`; the axis that followed it now sits at position `j` */
+                    inShape.erase(std::begin(inShape) + j);
+                    outShape.erase(std::begin(outShape) + j);
+                    offsets.erase(std::begin(offsets) + j);
+
+                    /* optimizations should not break the invariants */
+                    CV_Assert(inShape.size() == outShape.size());
+                    CV_Assert(inShape.size() == offsets.size());
+                    CV_Assert(inShape[i] == outShape[i]);
+                    CV_Assert(offsets[i] == 0);
+                }
+            }
+        }
+
+        auto rank = inShape.size();
+
+        /* validate the rank before touching `back()`, which is undefined on an empty vector */
+        CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
+
+        std::vector<std::size_t> inStride(rank), outStride(rank);
+        inStride.back() = 1;
+        outStride.back() = 1;
+        /* garbage, ..., garbage, 1 */
+
+        std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
+        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
+        /* dim[1], dim[2], ..., dim[-1], 1 */
+
+        /* multiply in std::size_t; an `int` functor would narrow the running products */
+        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
+        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
+        /* stride[0], stride[1], ..., stride[-2], 1 */
+
+        concat_with_offsets_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, offsets, input, inStride);
+    }
+
+    template void concat_with_offsets(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
+    template void concat_with_offsets(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/crop_and_resize.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/crop_and_resize.cu
@@ -0,0 +1,168 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime.h>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+
+        /* Crops every box out of `input` and resamples it to `out_height` x `out_width` with
+         * bilinear interpolation, writing one [num_channels, out_height, out_width] slab per box.
+         *
+         * output   destination, indexed below as [box, channel, y, x]
+         * input    source, indexed below as [channel, y, x]
+         * boxes    flat list with 7 values per box; values 3..6 are read as left, top, right, bottom
+         *          (they are multiplied by `in_width - 1`/`in_height - 1`, so presumably normalized
+         *          coordinates - TODO confirm against the caller)
+         *
+         * CHANNELS_PER_ITER is the number of channels handled by one loop iteration. The iteration
+         * count uses the integer division `num_channels / CHANNELS_PER_ITER`, so channels left over
+         * by a non-divisor would not be processed.
+         */
+        template <class T, std::size_t CHANNELS_PER_ITER>
+        __global__ void crop_and_resize(
+            Span<T> output, size_type out_height, size_type out_width,
+            View<T> input, size_type in_height, size_type in_width,
+            View<T> boxes,
+            size_type num_channels)
+        {
+            // input [1, num_channels, in_height, in_width]
+            // output [boxes, num_channels, out_height, out_width]
+
+            const auto in_image_size = in_height * in_width;
+            const auto out_image_size = out_height * out_width;
+            const auto out_box_size = num_channels * out_image_size;
+
+            /* we have to compute the output value for every combination of (box, c, y, x) in the output
+             *
+             * the computation involving (y, x) are identical for all non-spatial dimensions
+             * the computation and memory requests involving the box are identical for remaining three axes
+             *
+             * we process multiple channels every iteration to reuse the identical computation
+             * and memory requests involved with the box and spatial dimensions
+             */
+
+            /*
+             * if we are processing `CHANNELS_PER_ITER` channels per iteration, we will need
+             * (num_channels / CHANNELS_PER_ITER) iterations per (box, x, y)
+             */
+            auto num_channel_iters_per_box_xy = num_channels / CHANNELS_PER_ITER;
+
+            /* we need `num_channel_iters_per_box_xy` iterations per (box, x, y) and there are
+             * `num_boxes` boxes and `out_image_size` combinations of (x, y)
+             */
+            auto num_boxes = boxes.size() / 7; /* 7 values per box */
+            auto iters_per_box = num_channel_iters_per_box_xy * out_image_size;
+            auto iters_required = num_boxes * iters_per_box;
+
+            for (auto iter : grid_stride_range(iters_required)) {
+                /* decompose the flat iteration index into (box, channel group, y, x) */
+                const index_type box_no = iter / iters_per_box;
+                const index_type c_start = ((iter % iters_per_box) / out_image_size) * CHANNELS_PER_ITER;
+
+                /* note here that consecutive `iter` values will often have consecutive `x` values
+                 * => stores into output will be coalesced across threads
+                 */
+                const index_type y = (iter % out_image_size) / out_width;
+                const index_type x = iter % out_width;
+
+                /* values 0..2 of the box record are not read by this kernel */
+                const index_type box_offset = box_no * 7;
+                const auto left = boxes[box_offset + 3],
+                           top = boxes[box_offset + 4],
+                           right = boxes[box_offset + 5],
+                           bottom = boxes[box_offset + 6];
+
+                const auto box_width = right - left;
+                const auto box_height = bottom - top;
+
+                /* NOTE(review): these divide by (out_height - 1) and (out_width - 1); an output extent
+                 * of 1 would make the divisor zero - confirm callers never request that
+                 */
+                const auto o2i_fy = static_cast<T>(in_height - 1) / static_cast<T>(out_height - 1);
+                const auto o2i_fx = static_cast<T>(in_width - 1) / static_cast<T>(out_width - 1);
+
+                /* input-space step per output pixel inside this box */
+                const auto height_scale = box_height * o2i_fy;
+                const auto width_scale = box_width * o2i_fx;
+
+                /* fractional input coordinate sampled for output pixel (y, x) */
+                const auto in_y = top * static_cast<T>(in_height - 1) + static_cast<T>(y) * height_scale;
+                const auto in_x = left * static_cast<T>(in_width - 1) + static_cast<T>(x) * width_scale;
+
+                /* first neighbour: the coordinate converted to index_type */
+                const auto in_y0 = static_cast<index_type>(in_y);
+                const auto in_x0 = static_cast<index_type>(in_x);
+
+                /* second neighbour: one past the first, limited to the last column/row */
+                using device::min;
+                const auto in_x1 = min<index_type>(in_x0 + 1, in_width - 1);
+                const auto in_y1 = min<index_type>(in_y0 + 1, in_height - 1);
+
+                /* row offsets for the first channel of this group; advanced by one image per channel below */
+                index_type in_offset_r0 = c_start * in_image_size + in_y0 * in_width;
+                index_type in_offset_r1 = c_start * in_image_size + in_y1 * in_width;
+                index_type out_idx = box_no * out_box_size + c_start * out_image_size + y * out_width + x;
+
+                #pragma unroll 1 /* disable unrolling */
+                for (int i = 0; i < CHANNELS_PER_ITER; i++) {
+                    auto v_00 = input[in_offset_r0 + in_x0],
+                         v_01 = input[in_offset_r0 + in_x1],
+                         v_10 = input[in_offset_r1 + in_x0],
+                         v_11 = input[in_offset_r1 + in_x1];
+
+                    /* bilinear blend of the four neighbours, weighted by the fractional parts of in_y and in_x */
+                    output[out_idx] =
+                        v_00 +
+                        T(in_y - T(in_y0)) * T(v_10 - v_00) +
+                        T(in_x - T(in_x0)) * T(v_01 - v_00) +
+                        T(in_y - T(in_y0)) * T(in_x - T(in_x0)) * T(v_11 - v_01 - v_10 + v_00);
+
+                    in_offset_r0 += in_image_size;
+                    in_offset_r1 += in_image_size;
+                    out_idx += out_image_size;
+                }
+            }
+        }
+    }
+
+    /* Host-side helper that instantiates raw::crop_and_resize for a fixed CHANNELS_PER_ITER
+     * and enqueues it on `stream`.
+     *
+     * The thread bound handed to make_policy is `output.size() / CHANNELS_PER_ITER`, which is the
+     * number of loop iterations when every iteration writes CHANNELS_PER_ITER output elements.
+     */
+    template <class T, std::size_t CHANNELS_PER_ITER>
+    static void launch_multichannel_crop_and_resize(
+        const Stream& stream,
+        Span<T> output, size_type out_height, size_type out_width,
+        View<T> input, size_type in_height, size_type in_width,
+        View<T> boxes, size_type num_channels)
+    {
+        const auto iterations = output.size() / CHANNELS_PER_ITER;
+
+        auto crop_kernel = raw::crop_and_resize<T, CHANNELS_PER_ITER>;
+        auto launch_cfg = make_policy(crop_kernel, iterations, 0, stream);
+        launch_kernel(crop_kernel, launch_cfg,
+            output, out_height, out_width,
+            input, in_height, in_width,
+            boxes, num_channels);
+    }
+
+    /* Crops the regions described by `boxes` out of `input` and resizes each into `output`.
+     *
+     * Requirements checked below: `input` has a batch size of one and the same channel count
+     * as `output`. Height and width are taken from the last two axes of each tensor.
+     *
+     * The kernel is instantiated with the largest power of two (up to 64) that divides the
+     * channel count, so every loop iteration handles a whole group of channels.
+     */
+    template <class T>
+    void crop_and_resize(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> boxes) {
+        CV_Assert(input.get_axis_size(0) == 1); /* batch not supported */
+        CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
+
+        const auto out_height = output.get_axis_size(-2);
+        const auto out_width = output.get_axis_size(-1);
+
+        const auto in_height = input.get_axis_size(-2);
+        const auto in_width = input.get_axis_size(-1);
+
+        const auto num_channels = input.get_axis_size(1);
+
+        /* try the widest channel group first and stop at the first divisor */
+        if (num_channels % 64 == 0) {
+            launch_multichannel_crop_and_resize<T, 64>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+            return;
+        }
+
+        if (num_channels % 32 == 0) {
+            launch_multichannel_crop_and_resize<T, 32>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+            return;
+        }
+
+        if (num_channels % 16 == 0) {
+            launch_multichannel_crop_and_resize<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+            return;
+        }
+
+        if (num_channels % 8 == 0) {
+            launch_multichannel_crop_and_resize<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+            return;
+        }
+
+        if (num_channels % 4 == 0) {
+            launch_multichannel_crop_and_resize<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+            return;
+        }
+
+        if (num_channels % 2 == 0) {
+            launch_multichannel_crop_and_resize<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+            return;
+        }
+
+        /* odd channel count: one channel per iteration */
+        launch_multichannel_crop_and_resize<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, boxes, num_channels);
+    }
+
+    /* explicit instantiations for the __half and float element types */
+    template void crop_and_resize<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, View<__half> boxes);
+    template void crop_and_resize<float>(const Stream&, TensorSpan<float>, TensorView<float>, View<float> boxes);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/eltwise_ops.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/eltwise_ops.cu
@@ -0,0 +1,272 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "vector_traits.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        /* Element-wise maximum: output[k] = max(x[k], y[k]).
+         *
+         * The buffers are accessed through `get_vector_type_t<T, N>`; each loop iteration handles
+         * one such vector and `output.size() / vector_type::size()` vectors are processed in total.
+         */
+        template <class T, std::size_t N>
+        __global__ void eltwise_max_2_vec(Span<T> output, View<T> x, View<T> y) {
+            using device::max;
+            using vector_type = get_vector_type_t<T, N>;
+
+            auto dst = vector_type::get_pointer(output.data());
+            auto lhs = vector_type::get_pointer(x.data());
+            auto rhs = vector_type::get_pointer(y.data());
+
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto idx : grid_stride_range(num_vectors)) {
+                vector_type acc, other;
+                v_load(acc, lhs[idx]);
+                v_load(other, rhs[idx]);
+
+                /* combine lane by lane, reusing `acc` as the result */
+                for (int lane = 0; lane < vector_type::size(); ++lane) {
+                    acc.data[lane] = max(acc.data[lane], other.data[lane]);
+                }
+
+                v_store(dst[idx], acc);
+            }
+        }
+
+        /* Element-wise sum: output[k] = x[k] + y[k].
+         *
+         * The buffers are accessed through `get_vector_type_t<T, N>`; each loop iteration handles
+         * one such vector and `output.size() / vector_type::size()` vectors are processed in total.
+         */
+        template <class T, std::size_t N>
+        __global__ void eltwise_sum_2_vec(Span<T> output, View<T> x, View<T> y) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            auto dst = vector_type::get_pointer(output.data());
+            auto lhs = vector_type::get_pointer(x.data());
+            auto rhs = vector_type::get_pointer(y.data());
+
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto idx : grid_stride_range(num_vectors)) {
+                vector_type acc, other;
+                v_load(acc, lhs[idx]);
+                v_load(other, rhs[idx]);
+
+                /* combine lane by lane, reusing `acc` as the result */
+                for (int lane = 0; lane < vector_type::size(); ++lane) {
+                    acc.data[lane] = acc.data[lane] + other.data[lane];
+                }
+
+                v_store(dst[idx], acc);
+            }
+        }
+
+        /* Element-wise weighted sum: output[k] = coeff_x * x[k] + coeff_y * y[k].
+         *
+         * The buffers are accessed through `get_vector_type_t<T, N>`; each loop iteration handles
+         * one such vector and `output.size() / vector_type::size()` vectors are processed in total.
+         */
+        template <class T, std::size_t N>
+        __global__ void eltwise_sum_coeff_2_vec(Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            auto dst = vector_type::get_pointer(output.data());
+            auto lhs = vector_type::get_pointer(x.data());
+            auto rhs = vector_type::get_pointer(y.data());
+
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto idx : grid_stride_range(num_vectors)) {
+                vector_type acc, other;
+                v_load(acc, lhs[idx]);
+                v_load(other, rhs[idx]);
+
+                /* combine lane by lane, reusing `acc` as the result */
+                for (int lane = 0; lane < vector_type::size(); ++lane) {
+                    acc.data[lane] = coeff_x * acc.data[lane] + coeff_y * other.data[lane];
+                }
+
+                v_store(dst[idx], acc);
+            }
+        }
+
+        /* Element-wise product: output[k] = x[k] * y[k].
+         *
+         * The buffers are accessed through `get_vector_type_t<T, N>`; each loop iteration handles
+         * one such vector and `output.size() / vector_type::size()` vectors are processed in total.
+         */
+        template <class T, std::size_t N>
+        __global__ void eltwise_prod_2_vec(Span<T> output, View<T> x, View<T> y) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            auto dst = vector_type::get_pointer(output.data());
+            auto lhs = vector_type::get_pointer(x.data());
+            auto rhs = vector_type::get_pointer(y.data());
+
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto idx : grid_stride_range(num_vectors)) {
+                vector_type acc, other;
+                v_load(acc, lhs[idx]);
+                v_load(other, rhs[idx]);
+
+                /* combine lane by lane, reusing `acc` as the result */
+                for (int lane = 0; lane < vector_type::size(); ++lane) {
+                    acc.data[lane] = acc.data[lane] * other.data[lane];
+                }
+
+                v_store(dst[idx], acc);
+            }
+        }
+
+        /* Element-wise quotient: output[k] = x[k] / y[k].
+         *
+         * The buffers are accessed through `get_vector_type_t<T, N>`; each loop iteration handles
+         * one such vector and `output.size() / vector_type::size()` vectors are processed in total.
+         */
+        template <class T, std::size_t N>
+        __global__ void eltwise_div_2_vec(Span<T> output, View<T> x, View<T> y) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            auto dst = vector_type::get_pointer(output.data());
+            auto lhs = vector_type::get_pointer(x.data());
+            auto rhs = vector_type::get_pointer(y.data());
+
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto idx : grid_stride_range(num_vectors)) {
+                vector_type acc, other;
+                v_load(acc, lhs[idx]);
+                v_load(other, rhs[idx]);
+
+                /* combine lane by lane, reusing `acc` as the result */
+                for (int lane = 0; lane < vector_type::size(); ++lane) {
+                    acc.data[lane] = acc.data[lane] / other.data[lane];
+                }
+
+                v_store(dst[idx], acc);
+            }
+        }
+    }
+
+    /* Enqueues raw::eltwise_max_2_vec<T, N> on `stream`.
+     *
+     * All three buffers must satisfy `is_fully_aligned<T>(..., N)`; the thread bound given to
+     * make_policy is the number of N-element groups in `output`.
+     */
+    template <class T, std::size_t N>
+    void launch_vectorized_eltwise_max_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(x, N));
+        CV_Assert(is_fully_aligned<T>(y, N));
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::eltwise_max_2_vec<T, N>;
+        auto launch_cfg = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_cfg, output, x, y);
+    }
+
+    /* Writes the element-wise maximum of `x` and `y` into `output`.
+     *
+     * All three buffers must have the same number of elements. The widest vector width
+     * (4, then 2, then 1) for which every buffer passes `is_fully_aligned` is used.
+     */
+    template <class T>
+    void eltwise_max_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(x.size() == y.size());
+        CV_Assert(x.size() == output.size());
+
+        /* true when output, x and y all pass the alignment check for `width` elements */
+        const auto all_aligned = [&](std::size_t width) {
+            return is_fully_aligned<T>(output, width) && is_fully_aligned<T>(x, width) && is_fully_aligned<T>(y, width);
+        };
+
+        if (all_aligned(4))
+            launch_vectorized_eltwise_max_2<T, 4>(stream, output, x, y);
+        else if (all_aligned(2))
+            launch_vectorized_eltwise_max_2<T, 2>(stream, output, x, y);
+        else
+            launch_vectorized_eltwise_max_2<T, 1>(stream, output, x, y);
+    }
+
+    /* explicit instantiations for the __half and float element types */
+    template void eltwise_max_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
+    template void eltwise_max_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
+
+    /* Enqueues raw::eltwise_sum_2_vec<T, N> on `stream`.
+     *
+     * All three buffers must satisfy `is_fully_aligned<T>(..., N)`; the thread bound given to
+     * make_policy is the number of N-element groups in `output`.
+     */
+    template <class T, std::size_t N>
+    void launch_vectorized_eltwise_sum_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(x, N));
+        CV_Assert(is_fully_aligned<T>(y, N));
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::eltwise_sum_2_vec<T, N>;
+        auto launch_cfg = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_cfg, output, x, y);
+    }
+
+    /* Writes the element-wise sum of `x` and `y` into `output`.
+     *
+     * All three buffers must have the same number of elements. The widest vector width
+     * (4, then 2, then 1) for which every buffer passes `is_fully_aligned` is used.
+     */
+    template <class T>
+    void eltwise_sum_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(x.size() == y.size());
+        CV_Assert(x.size() == output.size());
+
+        /* true when output, x and y all pass the alignment check for `width` elements */
+        const auto all_aligned = [&](std::size_t width) {
+            return is_fully_aligned<T>(output, width) && is_fully_aligned<T>(x, width) && is_fully_aligned<T>(y, width);
+        };
+
+        if (all_aligned(4))
+            launch_vectorized_eltwise_sum_2<T, 4>(stream, output, x, y);
+        else if (all_aligned(2))
+            launch_vectorized_eltwise_sum_2<T, 2>(stream, output, x, y);
+        else
+            launch_vectorized_eltwise_sum_2<T, 1>(stream, output, x, y);
+    }
+
+    /* explicit instantiations for the __half and float element types */
+    template void eltwise_sum_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
+    template void eltwise_sum_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
+
+    /* Enqueues raw::eltwise_sum_coeff_2_vec<T, N> on `stream`.
+     *
+     * All three buffers must satisfy `is_fully_aligned<T>(..., N)`; the thread bound given to
+     * make_policy is the number of N-element groups in `output`.
+     */
+    template <class T, std::size_t N>
+    void launch_vectorized_eltwise_sum_coeff_2(const Stream& stream, Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(x, N));
+        CV_Assert(is_fully_aligned<T>(y, N));
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::eltwise_sum_coeff_2_vec<T, N>;
+        auto launch_cfg = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_cfg, output, coeff_x, x, coeff_y, y);
+    }
+
+    /* Writes `coeff_x * x + coeff_y * y` (element-wise) into `output`.
+     *
+     * All three buffers must have the same number of elements. When both coefficients compare
+     * equal to 1.0f the work is handed to eltwise_sum_2. Otherwise the widest vector width
+     * (4, then 2, then 1) for which every buffer passes `is_fully_aligned` is used.
+     */
+    template <class T>
+    void eltwise_sum_coeff_2(const Stream& stream, Span<T> output, T coeff_x, View<T> x, T coeff_y, View<T> y) {
+        CV_Assert(x.size() == y.size());
+        CV_Assert(x.size() == output.size());
+
+        /* unit coefficients: the plain sum gives the result without the multiplications */
+        const bool unit_coeff_x = static_cast<float>(coeff_x) == 1.0f;
+        if (unit_coeff_x && static_cast<float>(coeff_y) == 1.0f) {
+            eltwise_sum_2(stream, output, x, y);
+            return;
+        }
+
+        /* true when output, x and y all pass the alignment check for `width` elements */
+        const auto all_aligned = [&](std::size_t width) {
+            return is_fully_aligned<T>(output, width) && is_fully_aligned<T>(x, width) && is_fully_aligned<T>(y, width);
+        };
+
+        if (all_aligned(4))
+            launch_vectorized_eltwise_sum_coeff_2<T, 4>(stream, output, coeff_x, x, coeff_y, y);
+        else if (all_aligned(2))
+            launch_vectorized_eltwise_sum_coeff_2<T, 2>(stream, output, coeff_x, x, coeff_y, y);
+        else
+            launch_vectorized_eltwise_sum_coeff_2<T, 1>(stream, output, coeff_x, x, coeff_y, y);
+    }
+
+    /* explicit instantiations for the __half and float element types */
+    template void eltwise_sum_coeff_2(const Stream&, Span<__half>, __half, View<__half>, __half, View<__half>);
+    template void eltwise_sum_coeff_2(const Stream&, Span<float>, float, View<float>, float, View<float>);
+
+    /* Enqueues raw::eltwise_prod_2_vec<T, N> on `stream`.
+     *
+     * All three buffers must satisfy `is_fully_aligned<T>(..., N)`; the thread bound given to
+     * make_policy is the number of N-element groups in `output`.
+     */
+    template <class T, std::size_t N>
+    void launch_vectorized_eltwise_prod_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(x, N));
+        CV_Assert(is_fully_aligned<T>(y, N));
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::eltwise_prod_2_vec<T, N>;
+        auto launch_cfg = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_cfg, output, x, y);
+    }
+
+    /* Writes the element-wise product of `x` and `y` into `output`.
+     *
+     * All three buffers must have the same number of elements. The widest vector width
+     * (4, then 2, then 1) for which every buffer passes `is_fully_aligned` is used.
+     */
+    template <class T>
+    void eltwise_prod_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(x.size() == y.size());
+        CV_Assert(x.size() == output.size());
+
+        /* true when output, x and y all pass the alignment check for `width` elements */
+        const auto all_aligned = [&](std::size_t width) {
+            return is_fully_aligned<T>(output, width) && is_fully_aligned<T>(x, width) && is_fully_aligned<T>(y, width);
+        };
+
+        if (all_aligned(4))
+            launch_vectorized_eltwise_prod_2<T, 4>(stream, output, x, y);
+        else if (all_aligned(2))
+            launch_vectorized_eltwise_prod_2<T, 2>(stream, output, x, y);
+        else
+            launch_vectorized_eltwise_prod_2<T, 1>(stream, output, x, y);
+    }
+
+    /* explicit instantiations for the __half and float element types */
+    template void eltwise_prod_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
+    template void eltwise_prod_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
+
+    /* Enqueues raw::eltwise_div_2_vec<T, N> on `stream`.
+     *
+     * All three buffers must satisfy `is_fully_aligned<T>(..., N)`; the thread bound given to
+     * make_policy is the number of N-element groups in `output`.
+     */
+    template <class T, std::size_t N>
+    void launch_vectorized_eltwise_div_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(x, N));
+        CV_Assert(is_fully_aligned<T>(y, N));
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::eltwise_div_2_vec<T, N>;
+        auto launch_cfg = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_cfg, output, x, y);
+    }
+
+    /* Writes the element-wise quotient `x / y` into `output`.
+     *
+     * All three buffers must have the same number of elements. The widest vector width
+     * (4, then 2, then 1) for which every buffer passes `is_fully_aligned` is used.
+     */
+    template <class T>
+    void eltwise_div_2(const Stream& stream, Span<T> output, View<T> x, View<T> y) {
+        CV_Assert(x.size() == y.size());
+        CV_Assert(x.size() == output.size());
+
+        /* true when output, x and y all pass the alignment check for `width` elements */
+        const auto all_aligned = [&](std::size_t width) {
+            return is_fully_aligned<T>(output, width) && is_fully_aligned<T>(x, width) && is_fully_aligned<T>(y, width);
+        };
+
+        if (all_aligned(4))
+            launch_vectorized_eltwise_div_2<T, 4>(stream, output, x, y);
+        else if (all_aligned(2))
+            launch_vectorized_eltwise_div_2<T, 2>(stream, output, x, y);
+        else
+            launch_vectorized_eltwise_div_2<T, 1>(stream, output, x, y);
+    }
+
+    /* explicit instantiations for the __half and float element types */
+    template void eltwise_div_2(const Stream& stream, Span<__half> output, View<__half> x, View<__half> y);
+    template void eltwise_div_2(const Stream& stream, Span<float> output, View<float> x, View<float> y);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/execution.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/execution.hpp
@@ -0,0 +1,81 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
+#define OPENCV_DNN_SRC_CUDA_EXECUTION_HPP
+
+#include "../cuda4dnn/csl/error.hpp"
+#include "../cuda4dnn/csl/stream.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /* Bundle of the four kernel launch parameters: grid size, block size, dynamic shared
+     * memory size and stream.
+     *
+     * Constructors that omit the shared memory size leave it at zero; constructors that omit
+     * the stream leave it as a null stream handle.
+     */
+    struct execution_policy {
+        execution_policy(dim3 grid_size, dim3 block_size)
+            : grid(grid_size), block(block_size) { }
+
+        execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem)
+            : grid(grid_size), block(block_size), sharedMem(shared_mem) { }
+
+        execution_policy(dim3 grid_size, dim3 block_size, const Stream& strm)
+            : grid(grid_size), block(block_size), stream(strm.get()) { }
+
+        execution_policy(dim3 grid_size, dim3 block_size, std::size_t shared_mem, const Stream& strm)
+            : grid(grid_size), block(block_size), sharedMem(shared_mem), stream(strm.get()) { }
+
+        dim3 grid;
+        dim3 block;
+        std::size_t sharedMem = 0;      /* used unless a constructor supplies a size */
+        cudaStream_t stream = nullptr;  /* used unless a constructor supplies a stream */
+    };
+
+    /* this overload shouldn't be necessary; we should always provide a bound on the number of threads */
+    /*
+    template <class Kernel> inline
+    execution_policy make_policy(Kernel kernel, std::size_t sharedMem = 0, const Stream& stream = 0) {
+        int grid_size, block_size;
+        CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
+        return execution_policy(grid_size, block_size, sharedMem, stream);
+    }*/
+
+    /* Builds a launch configuration for `kernel`.
+     *
+     * The starting point is whatever cudaOccupancyMaxPotentialBlockSize suggests for the kernel
+     * and `sharedMem`. If that configuration has more than `max_threads` threads, the grid is
+     * reduced to ceil(max_threads / block_size) blocks and the block is capped at `max_threads`.
+     *
+     * kernel       kernel function to be launched
+     * max_threads  upper bound on the useful number of threads; must be positive
+     * sharedMem    dynamic shared memory size, stored in the returned policy
+     * stream       stream stored in the returned policy
+     */
+    template <class Kernel> inline
+    execution_policy make_policy(Kernel kernel, std::size_t max_threads, std::size_t sharedMem = 0, const Stream& stream = 0) {
+        CV_Assert(max_threads > 0);
+
+        int grid_size = 0;
+        int block_size = 0;
+        CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, sharedMem));
+
+        const bool exceeds_bound = grid_size * block_size > max_threads;
+        if (exceeds_bound) {
+            /* the grid size must be derived before the block size is capped */
+            grid_size = (max_threads + block_size - 1) / block_size;
+            if (block_size > max_threads) {
+                block_size = max_threads;
+            }
+        }
+
+        CV_Assert(grid_size >= 1 && block_size >= 1);
+        return execution_policy(grid_size, block_size, sharedMem, stream);
+    }
+
+    /* Launches `kernel` with `args` using no dynamic shared memory and no explicit stream.
+     *
+     * No thread bound is available here, so the grid and block sizes are taken directly from
+     * cudaOccupancyMaxPotentialBlockSize. (make_policy cannot be used: it requires a
+     * `max_threads` argument.)
+     *
+     * The arguments are held by value and handed to the kernel as they are.
+     */
+    template <class Kernel, typename ...Args> inline
+    void launch_kernel(Kernel kernel, Args ...args) {
+        int grid_size = 0, block_size = 0;
+        CUDA4DNN_CHECK_CUDA(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0));
+        CV_Assert(grid_size >= 1 && block_size >= 1);
+
+        kernel <<<grid_size, block_size>>> (args...);
+    }
+
+    /* Launches `kernel` with `args` on a `grid` x `block` configuration, with no dynamic shared
+     * memory and no explicit stream.
+     *
+     * The arguments are held by value and handed to the kernel as they are; std::forward adds
+     * nothing for by-value parameters and would need <utility>, which this header does not include.
+     */
+    template <class Kernel, typename ...Args> inline
+    void launch_kernel(Kernel kernel, dim3 grid, dim3 block, Args ...args) {
+        kernel <<<grid, block>>> (args...);
+    }
+
+    /* Launches `kernel` with `args` using the grid size, block size, dynamic shared memory size
+     * and stream stored in `policy`.
+     *
+     * The arguments are held by value and handed to the kernel as they are; std::forward adds
+     * nothing for by-value parameters and would need <utility>, which this header does not include.
+     */
+    template <class Kernel, typename ...Args> inline
+    void launch_kernel(Kernel kernel, execution_policy policy, Args ...args) {
+        kernel <<<policy.grid, policy.block, policy.sharedMem, policy.stream>>> (args...);
+    }
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA_EXECUTION_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/fill.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/fill.cu
@@ -0,0 +1,58 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "vector_traits.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        /* Sets the elements of `output` to `value`.
+         *
+         * The buffer is written through `get_vector_type_t<T, N>`; each loop iteration stores one
+         * such vector and `output.size() / vector_type::size()` vectors are written in total.
+         */
+        template <class T, std::size_t N>
+        __global__ void fill_vec(Span<T> output, T value) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            auto dst = vector_type::get_pointer(output.data());
+
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto idx : grid_stride_range(num_vectors)) {
+                /* build a vector with `value` in every lane, then store it in one go */
+                vector_type splat;
+                for (int lane = 0; lane < vector_type::size(); ++lane) {
+                    splat.data[lane] = value;
+                }
+                v_store(dst[idx], splat);
+            }
+        }
+    }
+
+    /* Enqueues raw::fill_vec<T, N> on `stream`.
+     *
+     * `output` must satisfy `is_fully_aligned<T>(output, N)`; the thread bound given to
+     * make_policy is the number of N-element groups in `output`.
+     */
+    template <class T, std::size_t N>
+    static void launch_vectorized_fill(const Stream& stream, Span<T> output, T value) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::fill_vec<T, N>;
+        auto launch_cfg = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_cfg, output, value);
+    }
+
+    /* Sets every element of `output` to `value`.
+     *
+     * The widest vector width (4, then 2, then 1) for which `output` passes
+     * `is_fully_aligned` is used.
+     */
+    template <class T>
+    void fill(const Stream& stream, Span<T> output, T value) {
+        if (is_fully_aligned<T>(output, 4)) {
+            launch_vectorized_fill<T, 4>(stream, output, value);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2)) {
+            launch_vectorized_fill<T, 2>(stream, output, value);
+            return;
+        }
+
+        /* fall back to one element per store */
+        launch_vectorized_fill<T, 1>(stream, output, value);
+    }
+
+    /* explicit instantiations for the __half and float element types */
+    template void fill(const Stream&, Span<__half>, __half);
+    template void fill(const Stream&, Span<float>, float);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/grid_stride_range.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/grid_stride_range.hpp
@@ -0,0 +1,92 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
+#define OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP
+
+#include "types.hpp"
+
+#include <cuda_runtime.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+    /* Compile-time selectors for the built-in launch variables.
+     *
+     * Each getter is a function template indexed by an axis number: 0 selects the `.x`
+     * component, 1 selects `.y` and 2 selects `.z`. Only these three specializations are
+     * defined; the primary templates are declared without a body.
+     */
+    namespace detail {
+        /* gridDim component for the selected axis */
+        template <int>  __device__ auto getGridDim()->decltype(dim3::x);
+        template <> inline __device__ auto getGridDim<0>()->decltype(dim3::x) { return gridDim.x; }
+        template <> inline __device__ auto getGridDim<1>()->decltype(dim3::x) { return gridDim.y; }
+        template <> inline __device__ auto getGridDim<2>()->decltype(dim3::x) { return gridDim.z; }
+
+        /* blockDim component for the selected axis */
+        template <int> __device__ auto getBlockDim()->decltype(dim3::x);
+        template <> inline __device__ auto getBlockDim<0>()->decltype(dim3::x) { return blockDim.x; }
+        template <> inline __device__ auto getBlockDim<1>()->decltype(dim3::x) { return blockDim.y; }
+        template <> inline __device__ auto getBlockDim<2>()->decltype(dim3::x) { return blockDim.z; }
+
+        /* blockIdx component for the selected axis */
+        template <int> __device__ auto getBlockIdx()->decltype(uint3::x);
+        template <> inline __device__ auto getBlockIdx<0>()->decltype(uint3::x) { return blockIdx.x; }
+        template <> inline __device__ auto getBlockIdx<1>()->decltype(uint3::x) { return blockIdx.y; }
+        template <> inline __device__ auto getBlockIdx<2>()->decltype(uint3::x) { return blockIdx.z; }
+
+        /* threadIdx component for the selected axis */
+        template <int> __device__ auto getThreadIdx()->decltype(uint3::x);
+        template <> inline __device__ auto getThreadIdx<0>()->decltype(uint3::x) { return threadIdx.x; }
+        template <> inline __device__ auto getThreadIdx<1>()->decltype(uint3::x) { return threadIdx.y; }
+        template <> inline __device__ auto getThreadIdx<2>()->decltype(uint3::x) { return threadIdx.z; }
+    }
+
+    /* Index range for use in a range-based for loop inside a kernel.
+     *
+     * Along axis `dim`, a thread starts at `from + blockDim * blockIdx + threadIdx` and advances
+     * by `gridDim * blockDim` on every step until the index reaches or passes `to`.
+     */
+    template <int dim, class index_type = device::index_type, class size_type = device::size_type>
+    class grid_stride_range_generic {
+    public:
+        /* range [0, to_) */
+        __device__ grid_stride_range_generic(index_type to_) : first_(0), last_(to_) { }
+
+        /* range [from_, to_) */
+        __device__ grid_stride_range_generic(index_type from_, index_type to_) : first_(from_), last_(to_) { }
+
+        class iterator
+        {
+        public:
+            __device__ iterator(index_type pos_) : current(pos_) {}
+
+            /* dereferencing yields the index itself, so the loop variable of a
+             * range-based for loop is the index
+             */
+            __device__ index_type operator*() const { return current; }
+
+            /* advance by the number of threads along this axis of the grid */
+            __device__ iterator& operator++() {
+                const auto step = detail::getGridDim<dim>() * static_cast<index_type>(detail::getBlockDim<dim>());
+                current += step;
+                return *this;
+            }
+
+            /* NOTE HACK
+             * the index moves in large steps (see operator++) and may jump past the end;
+             * the range-based for loop uses != as its loop condition, so this operator has to
+             * report "not equal" only while the index is still below the end
+             */
+            __device__ bool operator!=(const iterator& other) const {
+                return current < other.current;
+            }
+
+        private:
+            index_type current;
+        };
+
+        /* first index owned by the calling thread */
+        __device__ iterator begin() const {
+            return iterator(first_ + detail::getBlockDim<dim>() * detail::getBlockIdx<dim>() + detail::getThreadIdx<dim>());
+        }
+
+        /* end marker; only its position is compared (see iterator::operator!=) */
+        __device__ iterator end() const {
+            return iterator(last_);
+        }
+
+    private:
+        index_type first_;
+        index_type last_;
+    };
+
+    /* per-axis shorthands: template argument 0, 1, 2 selects the x, y, z launch dimension respectively */
+    using grid_stride_range_x = grid_stride_range_generic<0>;
+    using grid_stride_range_y = grid_stride_range_generic<1>;
+    using grid_stride_range_z = grid_stride_range_generic<2>;
+    /* the unsuffixed name refers to the x axis */
+    using grid_stride_range = grid_stride_range_x;
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_GRID_STRIDE_RANGE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/kernel_dispatcher.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/kernel_dispatcher.hpp
@@ -0,0 +1,76 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
+#define OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP
+
+#include <cstddef>
+#include <type_traits>
+
+/* The performance of many kernels is highly dependent on the tensor rank. Instead of having
+ * one kernel which can work with the maximally ranked tensors, we make one kernel for each supported
+ * tensor rank. This is to ensure that the requirements of the maximally ranked tensors do not take a
+ * toll on the performance of the operation for low ranked tensors. Hence, many kernels take the tensor
+ * rank as a template parameter.
+ *
+ * The kernel is a template and we have different instantiations for each rank. This causes the following pattern
+ * to arise frequently:
+ *
+ * if(rank == 3)
+ *     kernel<T, 3>();
+ * else if(rank == 2)
+ *     kernel<T, 2>();
+ * else
+ *     kernel<T, 1>();
+ *
+ * The rank is a runtime variable. To facilitate creation of such structures, we use GENERATE_KERNEL_DISPATCHER.
+ * This macro creates a function which selects the correct kernel instantiation at runtime.
+ *
+ * Example:
+ *
+ * // function which sets up the kernel and launches it
+ * template <class T, std::size_t Rank>
+ * void launch_some_kernel(...);
+ *
+ * // creates the dispatcher named "some_dispatcher" which invokes the correct instantiation of "launch_some_kernel"
+ * GENERATE_KERNEL_DISPATCHER(some_dispatcher, launch_some_kernel);
+ *
+ * // internal API function
+ * template <class T>
+ * void some(...) {
+ *    // ...
+ *    auto rank = input.rank();
+ *    some_dispatcher<T, MIN_RANK, MAX_RANK>(rank, ...);
+ * }
+ */
+
+/*
+ * name     name of the dispatcher function that is generated
+ * func     template function that requires runtime selection
+ *
+ * The generated dispatcher `name<T, start, end>(selector, args...)` has the template parameters:
+ * T        first template parameter to `func`
+ * start    starting rank
+ * end      ending rank (inclusive)
+ *
+ * Executes func<T, selector> based on the runtime `selector` argument given `selector` lies
+ * within the range [start, end]. If outside the range, no instantiation of `func` is executed.
+ * The remaining arguments are perfectly forwarded to `func`.
+ *
+ * Implementation: two overloads selected with std::enable_if; the `start != end` overload tests
+ * `start` and otherwise recurses with `start + 1`, the `start == end` overload terminates the recursion.
+ *
+ * NOTE: the expansion uses std::forward; this header does not include <utility>, so the
+ * translation unit that invokes the macro must include it.
+ * NOTE: there must be no ';' after the macro parameter list: it would become part of the
+ * replacement text and emit a stray empty declaration in front of every expansion.
+ */
+#define GENERATE_KERNEL_DISPATCHER(name,func)                                           \
+    template <class T, std::size_t start, std::size_t end, class... Args> static        \
+    typename std::enable_if<start == end, void>                                         \
+    ::type name(int selector, Args&& ...args) {                                         \
+        if(selector == start)                                                           \
+            func<T, start>(std::forward<Args>(args)...);                                \
+    }                                                                                   \
+                                                                                        \
+    template <class T, std::size_t start, std::size_t end, class... Args> static        \
+    typename std::enable_if<start != end, void>                                         \
+    ::type name(int selector, Args&& ...args) {                                         \
+        if(selector == start)                                                           \
+            func<T, start>(std::forward<Args>(args)...);                                \
+        else                                                                            \
+            name<T, start + 1, end, Args...>(selector, std::forward<Args>(args)...);    \
+    }
+
+#endif /* OPENCV_DNN_SRC_CUDA_KERNEL_DISPATCHER_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/limits.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/limits.hpp
@@ -0,0 +1,34 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_LIMITS_HPP
+#define OPENCV_DNN_SRC_CUDA_LIMITS_HPP
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include <cfloat>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+    template <class T>
+    struct numeric_limits;
+
+    /* limits of the half-precision type (binary16: 5 exponent bits, 10 mantissa bits) */
+    template <>
+    struct numeric_limits<__half> {
+        /* smallest positive normal value, exactly 2^-14; it must be spelled with full precision
+         * because a truncated literal such as 0.0000610 lies below 2^-14 and converts to a subnormal
+         */
+        __device__ static __half min() { return 6.103515625e-05; }
+        /* largest finite value */
+        __device__ static __half max() { return 65504.0; }
+        /* most negative finite value */
+        __device__ static __half lowest() { return -65504.0; }
+    };
+
+    /* limits of single-precision floats, taken from the <cfloat> macros */
+    template <>
+    struct numeric_limits<float> {
+        /* FLT_MIN: smallest positive normalized value */
+        __device__ static float min() { return FLT_MIN; }
+        /* FLT_MAX: largest finite value */
+        __device__ static float max() { return FLT_MAX; }
+        /* most negative finite value */
+        __device__ static float lowest() { return -FLT_MAX; }
+    };
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_LIMITS_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/math.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/math.hpp
@@ -0,0 +1,137 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_MATH_HPP
+#define OPENCV_DNN_SRC_CUDA_MATH_HPP
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+    /* absolute value; the generic version negates values that compare less than zero */
+    template <class T> __device__ T abs(T val) {
+        if (val < T(0))
+            return -val;
+        return val;
+    }
+    /* __half2: both halves are processed independently with the scalar version */
+    template <> inline __device__ __half2 abs(__half2 val) {
+        return __half2(abs(val.x), abs(val.y));
+    }
+    /* float/double forward to the math library */
+    template <> inline __device__ float abs(float val) { return fabsf(val); }
+    template <> inline __device__ double abs(double val) { return fabs(val); }
+
+    /* exponential function; the primary template is only declared, so just the
+     * specializations below (__half, __half2, float, double) can be used
+     */
+    template <class T> __device__ T exp(T val);
+    template <> inline __device__ __half exp(__half val) { return hexp(val); }
+    template <> inline __device__ __half2 exp(__half2 val) { return h2exp(val); }
+    template <> inline __device__ float exp(float val) { return expf(val); }
+    /* `::exp` is qualified so that the global function is called instead of this template */
+    template <> inline __device__ double exp(double val) { return ::exp(val); }
+
+    /* exp(val) - 1; only the specializations below are defined */
+    template <class T> __device__ T expm1(T val);
+    /* NOTE(review): the half versions literally compute exp(val) - 1, so they do not get the
+     * extra accuracy near zero that a dedicated expm1 routine usually provides
+     */
+    template <> inline __device__ __half expm1(__half val) { return hexp(val) - __half(1); }
+    template <> inline __device__ __half2 expm1(__half2 val) { return h2exp(val) - __half2(1, 1); }
+    template <> inline __device__ float expm1(float val) { return expm1f(val); }
+    template <> inline __device__ double expm1(double val) { return ::expm1(val); }
+
+    /* maximum of two values; the generic version relies on operator> */
+    template <class T> __device__ T max(T x, T y) { return (x > y ? x : y); }
+    /* __half2: element-wise maximum; every half of the result compares the matching halves of
+     * `a` and `b` (the x half previously compared `a.x` with itself and ignored `b.x`)
+     */
+    template <> inline __device__ __half2 max(__half2 a, __half2 b) {
+        a.x = max(a.x, b.x);
+        a.y = max(a.y, b.y);
+        return a;
+    }
+    template <> inline __device__ float max(float x, float y) { return fmaxf(x, y); }
+    template <> inline __device__ double max(double x, double y) { return fmax(x, y); }
+
+    /* minimum of two values; the generic version relies on operator> */
+    template <class T> __device__ T min(T x, T y) { return (x > y ? y : x); }
+    /* __half2: element-wise minimum; every half of the result compares the matching halves of
+     * `a` and `b` (the x half previously compared `a.x` with itself and ignored `b.x`)
+     */
+    template <> inline __device__ __half2 min(__half2 a, __half2 b) {
+        a.x = min(a.x, b.x);
+        a.y = min(a.y, b.y);
+        return a;
+    }
+    template <> inline __device__ float min(float x, float y) { return fminf(x, y); }
+    template <> inline __device__ double min(double x, double y) { return fmin(x, y); }
+
+    /* log(1 + val); the primary template is only declared, so every type that is used
+     * needs a specialization below
+     */
+    template <class T> __device__ T log1p(T val);
+    /* the half versions literally compute log(1 + val) */
+    template <> inline __device__ __half log1p(__half val) { return hlog(__half(1) + val); }
+    template <> inline __device__ __half2 log1p(__half2 val) { return h2log(__half2(1, 1) + val); }
+    template <> inline __device__ float log1p(float val) { return log1pf(val); }
+    /* required by log1pexp<double>, which calls log1p with a double argument */
+    template <> inline __device__ double log1p(double val) { return ::log1p(val); }
+
+    /* log1pexp(val) = log(1 + exp(val)), evaluated piecewise:
+     *   - very negative val:  exp(val) is tiny, so log(1 + exp(val)) ~ exp(val)
+     *   - moderate val:       direct evaluation via log1p(exp(val))
+     *   - large val:          log(1 + exp(val)) = val + log(1 + exp(-val)) ~ val + exp(-val)
+     *   - very large val:     exp(-val) is negligible, so the result is val itself
+     * The cut-off points differ per type; presumably they were chosen to match the precision
+     * of each type (TODO confirm the origin of the constants).
+     */
+    template <class T> __device__ T log1pexp(T val);
+    template <> inline __device__ __half log1pexp(__half val) {
+        if (val <= __half(-4.0))
+            return exp(val);
+        else if (val <= __half(8.0))
+            return log1p(exp(val));
+        else if (val <= __half(8.7))
+            return val + exp(-val);
+        else
+            return val;
+    }
+    /* __half2: applies the scalar __half version to each half */
+    template <> inline __device__ __half2 log1pexp(__half2 val) {
+        val.x = log1pexp(val.x);
+        val.y = log1pexp(val.y);
+        return val;
+    }
+    /* NOTE(review): the thresholds 9.0 and 14.6 are double literals, so these comparisons are
+     * carried out in double precision
+     */
+    template <> inline __device__ float log1pexp(float val) {
+        if (val <= -20)
+            return expf(val);
+        else if (val <= 9.0)
+            return log1pf(expf(val));
+        else if (val <= 14.6)
+            return val + exp(-val);
+        else
+            return val;
+    }
+    /* NOTE(review): calls log1p with a double argument, for which no specialization is
+     * visible next to the log1p declaration above - confirm one exists before using this
+     */
+    template <> inline __device__ double log1pexp(double val) {
+        if (val <= -37)
+            return exp(val);
+        else if (val <= 18)
+            return log1p(exp(val));
+        else if (val <= 33.3)
+            return val + exp(-val);
+        else
+            return val;
+    }
+
+    /* hyperbolic tangent; only the specializations below are defined */
+    template <class T> __device__ T tanh(T val);
+    /* the __half version calls the float routine tanhf; presumably the argument and the result
+     * go through __half's implicit float conversions
+     */
+    template <> inline __device__ __half tanh(__half val) { return tanhf(val); }
+    /* __half2: applies the __half version to each half */
+    template <> inline __device__ __half2 tanh(__half2 val) { return __half2(tanh(val.x), tanh(val.y)); }
+    template <> inline __device__ float tanh(float val) { return tanhf(val); }
+    template <> inline __device__ double tanh(double val) { return ::tanh(val); }
+
+    /* val raised to the power exp; only the specializations below are defined.
+     * Note that the parameter named `exp` hides the exp() function inside these bodies.
+     */
+    template <class T> __device__ T pow(T val, T exp);
+    /* the __half version calls the float routine powf (presumably via implicit conversions) */
+    template <> inline __device__ __half pow(__half val, __half exp) { return powf(val, exp); }
+    /* __half2: applies the __half version to each half */
+    template <> inline __device__ __half2 pow(__half2 val, __half2 exp) { return __half2(pow(val.x, exp.x), pow(val.y, exp.y)); }
+    template <> inline __device__ float pow(float val, float exp) { return powf(val, exp); }
+    template <> inline __device__ double pow(double val, double exp) { return ::pow(val, exp); }
+
+    /* square root; only the specializations below are defined, each forwarding to the
+     * routine for its type
+     */
+    template <class T> __device__ T sqrt(T val);
+    template <> inline __device__ __half sqrt(__half val) { return hsqrt(val); }
+    template <> inline __device__ __half2 sqrt(__half2 val) { return h2sqrt(val); }
+    template <> inline __device__ float sqrt(float val) { return sqrtf(val); }
+    template <> inline __device__ double sqrt(double val) { return ::sqrt(val); }
+
+    /* reciprocal square root (1 / sqrt(val)); only the specializations below are defined,
+     * each forwarding to the routine for its type
+     */
+    template <class T> __device__ T rsqrt(T val);
+    template <> inline __device__ __half rsqrt(__half val) { return hrsqrt(val); }
+    template <> inline __device__ __half2 rsqrt(__half2 val) { return h2rsqrt(val); }
+    template <> inline __device__ float rsqrt(float val) { return rsqrtf(val); }
+    template <> inline __device__ double rsqrt(double val) { return ::rsqrt(val); }
+
+    /* logistic sigmoid 1 / (1 + exp(-val)); the generic version works for every type that
+     * has an exp specialization and supports unary minus, T(1), + and /
+     */
+    template <class T> __device__ T sigmoid(T val) { return T(1) / (T(1) + exp(-val)); }
+    /* __half2: same formula, using __hneg2 for the negation and __half2(1, 1) as the constant */
+    template <> inline __device__ __half2 sigmoid(__half2 val) { return __half2(1, 1) / (__half2(1, 1) + exp(__hneg2(val))); }
+
+    /* limits `value` to [lower, upper]: first raises it to at least `lower`, then caps it at
+     * `upper` (so `upper` wins if lower > upper)
+     */
+    template <class T> __device__ T clamp(T value, T lower, T upper) { return min(max(value, lower), upper); }
+
+    /* rounds to an integral value; only the specializations below are defined.
+     * NOTE(review): the double/float versions use round/roundf while the half versions use
+     * hrint/h2rint; per the CUDA math API these treat halfway cases differently (away from
+     * zero vs. to even) - confirm the difference is acceptable for callers.
+     */
+    template <class T> __device__ T round(T value);
+    template <> inline __device__ double round(double value) { return ::round(value); }
+    template <> inline __device__ float round(float value) { return roundf(value); }
+    template <> inline __device__ __half round(__half value) { return hrint(value); }
+    template <> inline __device__ __half2 round(__half2 value) { return h2rint(value); }
+
+    /* rounds up to an integral value; only the specializations below are defined, each
+     * forwarding to the routine for its type
+     */
+    template <class T> __device__ T ceil(T value);
+    template <> inline __device__ double ceil(double value) { return ::ceil(value); }
+    template <> inline __device__ float ceil(float value) { return ceilf(value); }
+    template <> inline __device__ __half ceil(__half value) { return hceil(value); }
+    template <> inline __device__ __half2 ceil(__half2 value) { return h2ceil(value); }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_MATH_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/max_unpooling.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/max_unpooling.cu
@@ -0,0 +1,307 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "array.hpp"
+#include "limits.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include "../cuda4dnn/kernels/fill.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <type_traits>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        /* Max pooling kernel which also records where each maximum was found.
+         *
+         * output            receives the maximum of every pooling window
+         * indices           receives, for every output element, the offset of the maximum inside
+         *                   its (sample, channel) input plane (-1 if no element was selected)
+         * input             tensor to pool; indexed as [n][c][spatial...] in row-major order
+         * channels          number of channels
+         * out_spatial_dims  spatial extents of the output
+         * in_spatial_dims   spatial extents of the input
+         * window_size, strides, padding_left   pooling parameters per spatial axis
+         *
+         * NOTE(review): indices are stored with the element type T; for T = __half, large offsets
+         * presumably cannot be represented exactly - confirm the supported plane sizes.
+         */
+        template <class T, std::size_t Order,
+        typename std::enable_if<Order == 2 || Order == 3, bool>::type = true> /* Order has been hardcoded; see code */
+        __global__ void max_pooling_with_indices(
+            Span<T> output, Span<T> indices, View<T> input, size_type channels,
+            array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
+            array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
+        {
+            /* every element in the output is mapped to a window in the input and each thread processes several windows */
+            for (auto idx : grid_stride_range(output.size())) {
+                /* decompose the flat output index into per-axis window coordinates, innermost axis first;
+                 * after the loop out_spatial_size holds the number of elements in one output plane
+                 */
+                size_type out_spatial_size = 1;
+                array<index_type, Order> window_idx;
+                for (int i = Order - 1; i >= 0; i--) {
+                    window_idx[i] = (idx / out_spatial_size) % out_spatial_dims[i];
+                    out_spatial_size *= out_spatial_dims[i];
+                }
+
+                /* sample and channel that this output element belongs to */
+                const index_type n = idx / (out_spatial_size * channels);
+                const index_type c = (idx / out_spatial_size) % channels;
+
+                /* window start in input coordinates; may be negative inside the left padding */
+                array<index_type, Order> start;
+                for(int i = 0; i < Order; i++)
+                    start[i] = window_idx[i] * strides[i] - padding_left[i];
+
+                /* window end (exclusive), clipped to the input extent; computed from the unclipped start */
+                array<index_type, Order> end;
+                for (int i = 0; i < Order; i++) {
+                    using device::min;
+                    end[i] = min<index_type>(start[i] + window_size[i], in_spatial_dims[i]);
+                }
+
+                /* clip the start so that the padding region is skipped */
+                for (int i = 0; i < Order; i++) {
+                    using device::max;
+                    start[i] = max(start[i], 0);
+                }
+
+                T max_value = numeric_limits<T>::lowest();
+                index_type max_idx = -1;
+
+                /* number of elements in one input plane */
+                size_type in_spatial_size = 1;
+                for (int i = 0; i < Order; i++)
+                    in_spatial_size *= in_spatial_dims[i];
+
+                /* offset of the (n, c) plane in the input */
+                const auto outer_offset =  (n * channels + c) * in_spatial_size;
+                /* The window scan is written out separately for two and three spatial axes.
+                 * NOTE(review): the loops terminate with `!=`, so they rely on start[i] <= end[i];
+                 * confirm the callers guarantee that every window overlaps the input.
+                 * NOTE: the inner `idx` array shadows the flat output index `idx` inside each branch.
+                 */
+                if (Order == 2) {
+                    array<index_type, Order> idx;
+                    for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
+                        for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
+                            /* row-major offset of the current position inside the input plane */
+                            index_type offset = 0;
+                            index_type stride = 1;
+                            for (int i = Order - 1; i >= 0; i--) {
+                                offset += stride * idx[i];
+                                stride *= in_spatial_dims[i];
+                            }
+
+                            /* strict comparison: the first maximum encountered is kept on ties */
+                            if (input[outer_offset + offset] > max_value) {
+                                max_idx = offset;
+                                max_value = input[outer_offset + offset];
+                            }
+                        }
+                    }
+                } else if(Order == 3) {
+                    array<index_type, Order> idx;
+                    for (idx[0] = start[0]; idx[0] != end[0]; idx[0]++) {
+                        for (idx[1] = start[1]; idx[1] != end[1]; idx[1]++) {
+                            for (idx[2] = start[2]; idx[2] != end[2]; idx[2]++) {
+                                /* row-major offset of the current position inside the input plane */
+                                index_type offset = 0;
+                                index_type stride = 1;
+                                for (int i = Order - 1; i >= 0; i--) {
+                                    offset += stride * idx[i];
+                                    stride *= in_spatial_dims[i];
+                                }
+
+                                /* strict comparison: the first maximum encountered is kept on ties */
+                                if (input[outer_offset + offset] > max_value) {
+                                    max_idx = offset;
+                                    max_value = input[outer_offset + offset];
+                                }
+                            }
+                        }
+                    }
+                }
+
+                output[idx] = max_value;
+                indices[idx] = max_idx;
+            }
+        }
+
+        /* Max unpooling kernel: scatters every input value into the output plane of the same
+         * sample and channel, at the plane-relative offset stored in `indices`.
+         *
+         * output            destination tensor; must already be zero filled by the caller
+         * input             values to scatter; indexed as [n][c][spatial...] in row-major order
+         * indices           one plane-relative output offset per input element, stored as T
+         * channels          number of channels
+         * out_spatial_dims  spatial extents of the output
+         * in_spatial_dims   spatial extents of the input
+         * window_size, strides, padding_left   not needed for the scatter; kept so that the
+         *                   kernel signature stays unchanged for existing launch code
+         */
+        template <class T, std::size_t Order>
+        __global__ void max_unpooling(
+            Span<T> output, View<T> input, View<T> indices, size_type channels,
+            array<size_type, Order> out_spatial_dims, array<size_type, Order> in_spatial_dims,
+            array<size_type, Order> window_size, array<size_type, Order> strides, array<size_type, Order> padding_left)
+        {
+            /* the output has already been zero filled */
+            /* Every input value represents a window in the output. The max unpooling operation
+             * copies the input value to exactly one location in the output window which is given
+             * by the indices tensor.
+             */
+
+            /* number of elements in one (sample, channel) plane of the input and of the output;
+             * these do not depend on the element being processed
+             */
+            size_type in_spatial_size = 1;
+            size_type out_spatial_size = 1;
+            for (int i = 0; i < Order; i++) {
+                in_spatial_size *= in_spatial_dims[i];
+                out_spatial_size *= out_spatial_dims[i];
+            }
+
+            for (auto idx : grid_stride_range(input.size())) {
+                /* sample and channel that this input element belongs to */
+                const index_type n = idx / (in_spatial_size * channels);
+                const index_type c = (idx / in_spatial_size) % channels;
+
+                /* offset of the (n, c) plane in the output */
+                const index_type outer_offset = (n * channels + c) * out_spatial_size;
+                output[outer_offset + static_cast<index_type>(indices[idx])] = input[idx];
+            }
+        }
+    }
+
+    /* Validates the per-axis parameter vectors, copies them into fixed-size arrays that can be
+     * passed to the kernel by value, and launches raw::max_pooling_with_indices<T, Order> on `stream`.
+     */
+    template <class T, std::size_t Order> static
+    void launch_max_pooling_kernel(
+        const Stream& stream,
+        Span<T> output, Span<T> indices, View<T> input, std::size_t channels,
+        const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
+        const std::vector<std::size_t>& window_size,
+        const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
+    {
+        /* every vector must have exactly one entry per spatial axis */
+        CV_Assert(indices.size() == output.size());
+        CV_Assert(out_spatial_dims.size() == Order);
+        CV_Assert(in_spatial_dims.size() == Order);
+        CV_Assert(window_size.size() == Order);
+        CV_Assert(strides.size() == Order);
+        CV_Assert(padding_left.size() == Order);
+
+        array<size_type, Order> out_dims_arr, in_dims_arr, window_arr, strides_arr, pad_left_arr;
+        out_dims_arr.assign(out_spatial_dims.begin(), out_spatial_dims.end());
+        in_dims_arr.assign(in_spatial_dims.begin(), in_spatial_dims.end());
+        window_arr.assign(window_size.begin(), window_size.end());
+        strides_arr.assign(strides.begin(), strides.end());
+        pad_left_arr.assign(padding_left.begin(), padding_left.end());
+
+        /* the launch is sized by the number of output elements */
+        auto pooling_kernel = raw::max_pooling_with_indices<T, Order>;
+        auto launch_policy = make_policy(pooling_kernel, output.size(), 0, stream);
+        launch_kernel(pooling_kernel, launch_policy, output, indices, input, channels,
+            out_dims_arr, in_dims_arr, window_arr, strides_arr, pad_left_arr);
+    }
+
+    /* Max pooling which additionally writes the position of every maximum to `indices`.
+     *
+     * stream        stream on which the kernel is launched
+     * output        pooled tensor; rank must be order + 2
+     * indices       same shape as `output`; receives the position of each maximum
+     * input         tensor to pool; rank must be order + 2 and axis 1 must match `output`
+     * window_size   pooling window per spatial axis; its length defines the order (2 or 3)
+     * strides       stride per spatial axis
+     * padding_left  leading padding per spatial axis
+     *
+     * Raises a CV_Assert failure if the shapes are inconsistent or the order is unsupported.
+     */
+    template <class T>
+    void max_pooling_with_indices(
+        const Stream& stream,
+        TensorSpan<T> output, TensorSpan<T> indices, TensorView<T> input,
+        const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
+        const std::vector<std::size_t>& padding_left)
+    {
+        CV_Assert(is_shape_same(output, indices));
+        CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
+
+        const auto order = window_size.size();
+
+        /* only max_pooling2d and max_pooling3d are supported; reject anything else before doing any work */
+        CV_Assert(2 <= order && order <= 3);
+        CV_Assert(strides.size() == order);
+        CV_Assert(padding_left.size() == order);
+        CV_Assert(output.rank() == order + 2);
+        CV_Assert(input.rank() == order + 2);
+
+        /* the spatial axes follow the first two axes (axis 1 is the channel axis) */
+        std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
+        for (int i = 0; i < static_cast<int>(order); i++) {
+            in_spatial_dims[i] = input.get_axis_size(2 + i);
+            out_spatial_dims[i] = output.get_axis_size(2 + i);
+        }
+
+        std::size_t channels = input.get_axis_size(1);
+        if (order == 3) {
+            launch_max_pooling_kernel<T, 3>(stream, output, indices, input, channels,
+                out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
+        } else if (order == 2) {
+            launch_max_pooling_kernel<T, 2>(stream, output, indices, input, channels,
+                out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
+        }
+    }
+
+    /* explicit instantiations for the supported element types */
+    template void max_pooling_with_indices(const Stream&,
+        TensorSpan<__half>, TensorSpan<__half>, TensorView<__half>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+    template void max_pooling_with_indices(const Stream&,
+        TensorSpan<float>, TensorSpan<float>, TensorView<float>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+    /* Validates the per-axis parameter vectors, copies them into fixed-size arrays that can be
+     * passed to the kernel by value, and launches raw::max_unpooling<T, Order> on `stream`.
+     */
+    template <class T, std::size_t Order> static
+    void launch_max_unpooling_kernel(
+        const Stream& stream,
+        Span<T> output, View<T> input, View<T> indices, std::size_t channels,
+        const std::vector<std::size_t>& out_spatial_dims, const std::vector<std::size_t>& in_spatial_dims,
+        const std::vector<std::size_t>& window_size,
+        const std::vector<std::size_t>& strides, const std::vector<std::size_t>& padding_left)
+    {
+        /* every vector must have exactly one entry per spatial axis */
+        CV_Assert(out_spatial_dims.size() == Order);
+        CV_Assert(in_spatial_dims.size() == Order);
+        CV_Assert(window_size.size() == Order);
+        CV_Assert(strides.size() == Order);
+        CV_Assert(padding_left.size() == Order);
+        CV_Assert(indices.size() == input.size());
+
+        array<size_type, Order> out_dims_arr, in_dims_arr, window_arr, strides_arr, pad_left_arr;
+        out_dims_arr.assign(out_spatial_dims.begin(), out_spatial_dims.end());
+        in_dims_arr.assign(in_spatial_dims.begin(), in_spatial_dims.end());
+        window_arr.assign(window_size.begin(), window_size.end());
+        strides_arr.assign(strides.begin(), strides.end());
+        pad_left_arr.assign(padding_left.begin(), padding_left.end());
+
+        /* the launch is sized by the number of input elements */
+        auto unpooling_kernel = raw::max_unpooling<T, Order>;
+        auto launch_policy = make_policy(unpooling_kernel, input.size(), 0, stream);
+        launch_kernel(unpooling_kernel, launch_policy, output, input, indices, channels,
+            out_dims_arr, in_dims_arr, window_arr, strides_arr, pad_left_arr);
+    }
+
+    /* Max unpooling: zero fills `output` and then copies every input value to the position
+     * recorded for it in `indices`.
+     *
+     * stream        stream on which the fill and the kernel are launched
+     * output        unpooled tensor; rank must be order + 2
+     * input         values to scatter; rank must be order + 2 and axis 1 must match `output`
+     * indices       same shape as `input`; position of each value in the output
+     * window_size   pooling window per spatial axis; its length defines the order (2 or 3)
+     * strides       stride per spatial axis
+     * padding_left  leading padding per spatial axis
+     *
+     * Raises a CV_Assert failure if the shapes are inconsistent or the order is unsupported.
+     */
+    template <class T>
+    void max_unpooling(
+        const Stream& stream,
+        TensorSpan<T> output, TensorView<T> input, TensorView<T> indices,
+        const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
+        const std::vector<std::size_t>& padding_left)
+    {
+        CV_Assert(is_shape_same(input, indices));
+        CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
+
+        const auto order = window_size.size();
+
+        /* only max_unpooling2d and max_unpooling3d are supported; check this before any GPU work is enqueued */
+        CV_Assert(2 <= order && order <= 3);
+        CV_Assert(strides.size() == order);
+        CV_Assert(padding_left.size() == order);
+        CV_Assert(output.rank() == order + 2);
+        CV_Assert(input.rank() == order + 2);
+
+        /* the spatial axes follow the first two axes (axis 1 is the channel axis) */
+        std::vector<std::size_t> out_spatial_dims(order), in_spatial_dims(order);
+        for (int i = 0; i < static_cast<int>(order); i++) {
+            in_spatial_dims[i] = input.get_axis_size(2 + i);
+            out_spatial_dims[i] = output.get_axis_size(2 + i);
+        }
+
+        /* the kernel only writes the selected positions, so everything else must be zero beforehand */
+        kernels::fill<T>(stream, output, 0.0);
+
+        std::size_t channels = input.get_axis_size(1);
+        if (order == 3) {
+            launch_max_unpooling_kernel<T, 3>(stream, output, input, indices, channels,
+                out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
+        } else if (order == 2) {
+            launch_max_unpooling_kernel<T, 2>(stream, output, input, indices, channels,
+                out_spatial_dims, in_spatial_dims, window_size, strides, padding_left);
+        }
+    }
+
+    /* explicit instantiations for the supported element types */
+    template void max_unpooling(const Stream&,
+        TensorSpan<__half>, TensorView<__half>, TensorView<__half>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+    template void max_unpooling(const Stream&,
+        TensorSpan<float>, TensorView<float>, TensorView<float>,
+        const std::vector<std::size_t>&, const std::vector<std::size_t>&,
+        const std::vector<std::size_t>&);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/normalize.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/normalize.cu
@@ -0,0 +1,121 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "math.hpp"
+#include "types.hpp"
+#include "atomics.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include "../cuda4dnn/kernels/fill.hpp"
+#include "../cuda4dnn/kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        /* All reduction/scaling kernels below map a flat element index `i` to the slot
+         *     (i / outer_stride) * mid_stride + (i % mid_stride)
+         * of the sums buffer; normalize() passes mid_size * inner_size as `outer_stride`
+         * and inner_size as `mid_stride`.
+         */
+
+        /* adds |input[i]| to the sums slot of element i; `output` must be zeroed beforehand */
+        template <class T>
+        __global__ void reduce_sum_abs(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
+            for (auto i : grid_stride_range(input.size())) {
+                const index_type outer = i / outer_stride;
+                const index_type inner = i % mid_stride;
+
+                const index_type slot = outer * mid_stride + inner;
+                atomicAdd(&output[slot], device::abs(input[i]));
+            }
+        }
+
+        /* in place: output[i] = 1 / (output[i] + epsilon) */
+        template <class T>
+        __global__ void reciprocal(Span<T> output, T epsilon) {
+            for (auto i : grid_stride_range(output.size())) {
+                output[i] = T(1) / (output[i] + epsilon);
+            }
+        }
+
+        /* adds input[i]^2 to the sums slot of element i; `output` must be zeroed beforehand */
+        template <class T>
+        __global__ void reduce_sum_squared(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride) {
+            for (auto i : grid_stride_range(input.size())) {
+                const index_type outer = i / outer_stride;
+                const index_type inner = i % mid_stride;
+
+                const index_type slot = outer * mid_stride + inner;
+                atomicAdd(&output[slot], input[i] * input[i]);
+            }
+        }
+
+        /* in place: output[i] = 1 / sqrt(output[i] + epsilon) */
+        template <class T>
+        __global__ void rsqrt(Span<T> output, T epsilon) {
+            for (auto i : grid_stride_range(output.size())) {
+                using device::sqrt;
+                output[i] = T(1) / sqrt(output[i] + epsilon);
+            }
+        }
+
+        /* output[i] = input[i] * sums[slot of element i] */
+        template <class T>
+        __global__ void apply_norm(Span<T> output, View<T> input, size_type outer_stride, size_type mid_stride, View<T> sums) {
+            for (auto i : grid_stride_range(output.size())) {
+                const index_type outer = i / outer_stride;
+                const index_type inner = i % mid_stride;
+
+                const index_type slot = outer * mid_stride + inner;
+                output[i] = input[i] * sums[slot];
+            }
+        }
+    }
+
+    /* Normalizes `input`, viewed as [outer_size][mid_size][inner_size], along the middle axis.
+     *
+     * stream      stream on which all kernels are launched
+     * output      receives the normalized values; same size as `input`
+     * input       values to normalize
+     * norm        1: divide by (sum of absolute values + epsilon)
+     *             2: divide by sqrt(sum of squares + epsilon)
+     * epsilon     added to the accumulated sum before inverting
+     * workspace   scratch buffer with at least outer_size * inner_size elements
+     *
+     * Steps: zero the sums, accumulate them with atomics, invert them in place, then multiply
+     * every input element by the inverted sum of its (outer, inner) position.
+     */
+    template <class T>
+    void normalize(
+        const Stream& stream,
+        Span<T> output,
+        View<T> input, std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
+        Span<T> workspace)
+    {
+        CV_Assert(output.size() == input.size());
+        CV_Assert(output.size() == outer_size * mid_size * inner_size);
+        CV_Assert(norm == 1 || norm == 2);
+        CV_Assert(workspace.size() >= outer_size * inner_size);
+
+        /* one accumulator per (outer, inner) position, carved out of the workspace */
+        auto sums = Span<T>(workspace.data(), outer_size * inner_size);
+        fill<T>(stream, sums, 0.0);
+
+        if (norm == 1) {
+            /* L1: sum of absolute values, then reciprocal */
+            auto accumulate = raw::reduce_sum_abs<T>;
+            auto accumulate_policy = make_policy(accumulate, input.size(), 0, stream);
+            launch_kernel(accumulate, accumulate_policy, sums, input, mid_size * inner_size, inner_size);
+
+            auto invert = raw::reciprocal<T>;
+            auto invert_policy = make_policy(invert, sums.size(), 0, stream);
+            launch_kernel(invert, invert_policy, sums, epsilon);
+        } else {
+            /* L2: sum of squares, then reciprocal square root */
+            auto accumulate = raw::reduce_sum_squared<T>;
+            auto accumulate_policy = make_policy(accumulate, input.size(), 0, stream);
+            launch_kernel(accumulate, accumulate_policy, sums, input, mid_size * inner_size, inner_size);
+
+            auto invert = raw::rsqrt<T>;
+            auto invert_policy = make_policy(invert, sums.size(), 0, stream);
+            launch_kernel(invert, invert_policy, sums, epsilon);
+        }
+
+        /* scale every element by the inverted sum of its position */
+        auto apply = raw::apply_norm<T>;
+        auto apply_policy = make_policy(apply, output.size(), 0, stream);
+        launch_kernel(apply, apply_policy, output, input, mid_size * inner_size, inner_size, sums);
+    }
+
+    /* explicit instantiations for the supported element types */
+    template void normalize(const Stream&, Span<__half>, View<__half>, std::size_t, std::size_t, std::size_t, std::size_t, __half, Span<__half>);
+    template void normalize(const Stream&, Span<float>, View<float>, std::size_t, std::size_t, std::size_t, std::size_t, float, Span<float>);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/padding.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/padding.cu
@@ -0,0 +1,199 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "math.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "kernel_dispatcher.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        template <class T, std::size_t Rank> /* Rank = number of axes; every array argument has one entry per axis */
+        __global__ void copy_with_reflection101( /* fills each element of `output` from `input`, mirroring indices outside [start, end) */
+            Span<T> output, array<size_type, Rank> out_strides, array<index_type, Rank> start, array<index_type, Rank> end,
+            View<T> input, array<size_type, Rank> in_strides)
+        {
+            for (auto i : grid_stride_range(output.size())) { /* one output element per iteration */
+                /* compute output axis indices corresponding to element 'i' */
+                array<index_type, Rank> out_index;
+                out_index[0] = i / out_strides[0];
+                for (int j = 1; j < Rank; j++)
+                    out_index[j] = (i % out_strides[j - 1]) / out_strides[j]; /* remainder within axis j-1, split by axis j's stride */
+
+                /* compute input axis indices corresponding to output axis indices */
+                array<index_type, Rank> in_index;
+                for (int j = 0; j < Rank; j++) {
+                    /* if out_index < start, the point is in the left reflection region
+                     * the reflected value's index is the absolute value of the difference
+                     *
+                     * otherwise, if the value is in the copy region, out_index - start gives the input index
+                     */
+                    using device::abs;
+                    in_index[j] = abs(out_index[j] - start[j]); /* start - 1 maps to input index 1: the border element is not repeated */
+
+                    /* if out_index >= end, it's in the right reflection region */
+                    if (out_index[j] >= end[j])
+                        in_index[j] = (end[j] - start[j]) - (out_index[j] - end[j]) - 2; /* out_index == end maps to (end - start) - 2 */
+                }
+
+                /* compute input element number from input axis indices */
+                index_type iidx = 0;
+                for (int j = 0; j < Rank; j++)
+                    iidx += in_index[j] * in_strides[j];
+
+                output[i] = input[iidx];
+            }
+        }
+    }
+
+    template <class T, std::size_t Rank> static /* host-side launcher for one compile-time Rank */
+    void launch_copy_with_reflection101(
+        const Stream& stream,
+        Span<T> output, const std::vector<std::size_t>& outStride,
+        View<T> input, const std::vector<std::size_t>& inStride,
+        const std::vector<std::pair<std::size_t, std::size_t>>& ranges)
+    {
+        CV_Assert(outStride.size() == Rank); /* every per-axis vector must match the compile-time rank */
+        CV_Assert(inStride.size() == Rank);
+        CV_Assert(ranges.size() == Rank);
+
+        array<size_type, Rank> outStride_k, inStride_k; /* fixed-size copies that the kernel receives by value */
+        outStride_k.assign(std::begin(outStride), std::end(outStride));
+        inStride_k.assign(std::begin(inStride), std::end(inStride));
+
+        array<index_type, Rank> start_k, end_k; /* ranges split into the kernel's `start` and `end` arguments */
+        for (int i = 0; i < Rank; i++) {
+            start_k[i] = ranges[i].first;
+            end_k[i] = ranges[i].second;
+        }
+
+        auto kernel = raw::copy_with_reflection101<T, Rank>;
+        auto policy = make_policy(kernel, output.size(), 0, stream); /* sized for one work item per output element */
+        launch_kernel(kernel, policy, output, outStride_k, start_k, end_k, input, inStride_k);
+    }
+
+    GENERATE_KERNEL_DISPATCHER(copy_with_reflection101_dispatcher, launch_copy_with_reflection101);
+
+    /* Pads `input` into `output` with reflect-101 borders. `ranges[i]` is the [first, second) span of output
+     * axis `i` that receives the input; it is taken by value because the axes are simplified in place below. */
+    template <class T>
+    void copy_with_reflection101(
+        const Stream& stream,
+        TensorSpan<T> output, TensorView<T> input,
+        std::vector<std::pair<std::size_t, std::size_t>> ranges)
+    {
+        CV_Assert(output.rank() == input.rank());
+        CV_Assert(output.rank() == ranges.size());
+
+        /* squeezable axes at the beginning of both tensors can be eliminated
+         *
+         * Reasoning:
+         * Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the
+         * output tensor will be [i1 + off1, i2 + off2, ...]. The rest of the elements in the output are padding.
+         * The padding operation essentially copies items from the input tensor to new locations in the output tensor
+         * and pads the remaining.
+         *
+         * If the size of the first axis of the input and output tensor is unity, the input and output indices
+         * for all the elements will be of the form [0, i2, ...] and [0, i2 + off2, ...] respectively. Note that
+         * there cannot be extra padding since the axes have unit size. The first index does not contribute to the
+         * element's address calculation and hence does nothing apart from eating up few cycles.
+         */
+        while (input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
+            CV_Assert(ranges[0].first == 0 && ranges[0].second == 1);
+
+            input.squeeze(0);
+            output.squeeze(0);
+            ranges.erase(std::begin(ranges));
+
+            CV_Assert(output.rank() == input.rank());
+            CV_Assert(output.rank() == ranges.size());
+        }
+
+        auto inShape = input.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        /* contiguous axes which do not have any padding can be combined into one axis
+         *
+         * Reasoning:
+         * Suppose an item's indices in the input tensor is [i1, i2, i3, ...]. Let the first two axes not have any
+         * padding. The indices in the output tensor will be [i1, i2, i3 + off3, ...].
+         *
+         * Each axis in the contiguous unpadded axes sequence will add an offset of iN * strideN. In the above example,
+         * the two axes add a total offset of `i1 * stride1 + i2 * stride2`. We can merge the two axes into one axis with
+         * a size of `size1 * size2`. The new offset added will be `i12 * stride2` as the kernel iterates through `i12`.
+         * Note that `i12` is actually `(i1 * size2 + i2)` in the original tensor.
+         */
+        for (std::size_t i = 0; i < inShape.size(); i++) {
+            /* check if axis `i` requires any padding */
+            if (ranges[i].first == 0 && ranges[i].second == inShape[i]) {
+                /* loop invariant: `i` is the first axis in the contiguous unpadded axis sequence */
+                CV_Assert(inShape[i] == outShape[i]);
+
+                /* we now iterate through the axes which follow and try to merge */
+                const std::size_t j = i + 1; /* `j` is the axis which we will attempt to merge */
+                while (j < inShape.size() && ranges[j].first == 0 && ranges[j].second == inShape[j]) {
+                    CV_Assert(inShape[j] == outShape[j]);
+
+                    /* `j` is also unpadded; merge `i` and `j` */
+                    auto new_size = inShape[i] * inShape[j];
+                    inShape[i] = new_size;
+                    outShape[i] = new_size;
+                    ranges[i].second = new_size;
+
+                    /* delete axis `j` */
+                    inShape.erase(std::begin(inShape) + j);
+                    outShape.erase(std::begin(outShape) + j);
+                    ranges.erase(std::begin(ranges) + j);
+
+                    /* optimizations should not break the invariants */
+                    CV_Assert(inShape.size() == outShape.size());
+                    CV_Assert(inShape.size() == ranges.size());
+                    CV_Assert(inShape[i] == outShape[i]);
+                    CV_Assert(ranges[i].first == 0 && ranges[i].second == inShape[i]);
+                }
+            }
+        }
+
+        auto rank = inShape.size();
+        CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK); /* checked before the rank-sized vectors are indexed */
+
+        std::vector<std::size_t> inStride(rank), outStride(rank);
+        inStride.back() = 1;
+        outStride.back() = 1;
+        /* garbage, ..., garbage, 1 */
+
+        std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
+        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
+        /* dim[1], dim[2], ..., dim[-1], 1 */
+
+        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
+        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
+        /* stride[0], stride[1], ..., stride[-2], 1 (products kept in std::size_t so large tensors do not overflow int) */
+
+        copy_with_reflection101_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, ranges);
+    }
+
+    template void copy_with_reflection101(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
+    template void copy_with_reflection101(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::pair<std::size_t, std::size_t>> ranges);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/permute.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/permute.cu
@@ -0,0 +1,143 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "kernel_dispatcher.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        template <class T, std::size_t Rank> /* Rank = number of axes; every array argument has one entry per axis */
+        __global__ void permute( /* output axis j takes its index from input axis axis_order[j] */
+            array<index_type, Rank> axis_order,
+            Span<T> output, array<size_type, Rank> outStrides,
+            View<T> input, array<size_type, Rank> inStrides)
+        {
+            for (auto i : grid_stride_range(input.size())) { /* `i` is used as the output element number */
+                index_type oldPosition = 0; /* accumulated element offset into `input` */
+                index_type newPosition = i; /* part of the output offset not yet split into axis indices */
+
+                for (int j = 0; j < Rank; j++)
+                {
+                    auto order = axis_order[j];
+                    oldPosition += (newPosition / outStrides[j]) * inStrides[order]; /* index along output axis j times the matching input stride */
+                    newPosition %= outStrides[j];
+                }
+
+                output[i] = input[oldPosition];
+            }
+        }
+    }
+
+    template <class T, std::size_t Rank> static /* host-side launcher for one compile-time Rank */
+    void launch_permute_kernel(
+        const Stream& stream,
+        const std::vector<std::size_t>& order,
+        Span<T> output, const std::vector<std::size_t>& outStride,
+        View<T> input, const std::vector<std::size_t>& inStride)
+    {
+        CV_Assert(order.size() == Rank); /* every per-axis vector must match the compile-time rank */
+        CV_Assert(outStride.size() == Rank);
+        CV_Assert(inStride.size() == Rank);
+
+        array<index_type, Rank> order_k; /* fixed-size copies that the kernel receives by value */
+        order_k.assign(std::begin(order), std::end(order));
+
+        array<size_type, Rank> outStride_k, inStride_k;
+        outStride_k.assign(std::begin(outStride), std::end(outStride));
+        inStride_k.assign(std::begin(inStride), std::end(inStride));
+
+        auto kernel = raw::permute<T, Rank>;
+        auto policy = make_policy(kernel, input.size(), 0, stream); /* sized for one work item per element */
+        launch_kernel(kernel, policy, order_k, output, outStride_k, input, inStride_k);
+    }
+
+    GENERATE_KERNEL_DISPATCHER(permute_dispatcher, launch_permute_kernel);
+
+    template <class T> /* host entry point: simplifies the axes, builds strides and dispatches on the resulting rank */
+    void permute(
+        const Stream& stream,
+        TensorSpan<T> output, TensorView<T> input,
+        std::vector<std::size_t> order) /* taken by value: it is edited while leading axes are removed */
+    {
+        CV_Assert(output.rank() == input.rank());
+        CV_Assert(input.rank() == order.size());
+        CV_Assert(input.size() == output.size());
+
+        /* squeezable axes at the beginning of both tensors which aren't permuted can be eliminated
+         *
+         * Reasoning:
+         * ----------
+         * Suppose an item's indices in the input tensor is [i1, i2, ...]. The indices in the
+         * output tensor will be some permutation of the input tensor indices. Let the output
+         * tensor indices be [o1, o2, ...]. The permutation operation essentially copies items
+         * from the input tensor to new locations in the output tensor as dictated by the indices.
+         *
+         * If the size of the first axis of the input and output tensor is one and these axes are
+         * not involved in any permutation, i.e. order[0] = 0, the input and output indices for
+         * all the elements will be of the form [0, i2, ...] and [0, o2, ...] respectively.
+         * The first index does not contribute to the element's address calculation and hence does
+         * nothing apart from eating up few cycles.
+         */
+        while (order[0] == 0 && input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
+            /* remove the axes */
+            input.squeeze(0);
+            output.squeeze(0);
+
+            /* when we remove axis zero, the axis index will be one less than the previous index
+             * for the remaining axes
+             */
+            order.erase(order.begin());
+            for (auto& axis : order)
+                axis--;
+
+            /* optimizations should not break the invariants */
+            CV_Assert(output.rank() == input.rank());
+            CV_Assert(input.rank() == order.size());
+            CV_Assert(input.size() == output.size());
+        }
+
+        auto rank = output.rank();
+        auto inShape = input.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        std::vector<std::size_t> inStride(rank), outStride(rank);
+        inStride.back() = 1;
+        outStride.back() = 1;
+        /* garbage, ..., garbage, 1 */
+
+        std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
+        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
+        /* dim[1], dim[2], ..., dim[-1], 1 */
+
+        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
+        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
+        /* stride[0], stride[1], ..., stride[-2], 1 */
+
+        CV_Assert(2 <= rank && rank <= CSL_MAX_TENSOR_RANK); /* the dispatcher below is instantiated for ranks 2..CSL_MAX_TENSOR_RANK */
+        permute_dispatcher<T, 2, CSL_MAX_TENSOR_RANK>(rank, stream, order, output, outStride, input, inStride);
+    }
+
+    template void permute(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
+    template void permute(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/prior_box.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/prior_box.cu
@@ -0,0 +1,174 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "math.hpp"
+#include "types.hpp"
+#include "vector_traits.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        template <class T, bool Normalize> /* Normalize selects at compile time whether corners are divided by the image size */
+        __global__ void prior_box(
+            Span<T> output,
+            View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
+            size_type layerWidth, size_type layerHeight,
+            size_type imageWidth, size_type imageHeight)
+        {
+            /* each box consists of two pairs of coordinates and hence 4 values in total */
+            /* since the entire output consists (first channel at least) of these boxes,
+             * we are guaranteed that the output is aligned to a boundary of 4 values
+             */
+            using vector_type = get_vector_type_t<T, 4>;
+            auto output_vPtr = vector_type::get_pointer(output.data());
+
+            /* num_points contains the number of points in the feature map of interest
+             * each iteration of the stride loop selects a point and generates prior boxes for it
+             */
+            size_type num_points = layerWidth * layerHeight;
+            for (auto idx : grid_stride_range(num_points)) {
+                const index_type x = idx % layerWidth,
+                                 y = idx / layerWidth;
+
+                index_type output_offset_v4 = idx * offsetX.size() * boxWidth.size(); /* counted in 4-value vectors: one per (box, offset) pair of this point */
+                for (int i = 0; i < boxWidth.size(); i++) {
+                    for (int j = 0; j < offsetX.size(); j++) {
+                        float center_x = (x + offsetX[j]) * stepX;
+                        float center_y = (y + offsetY[j]) * stepY;
+
+                        vector_type vec; /* (min x, min y, max x, max y) of the box */
+                        if(Normalize) {
+                            vec.data[0] = (center_x - boxWidth[i] * 0.5f) / imageWidth;
+                            vec.data[1] = (center_y - boxHeight[i] * 0.5f) / imageHeight;
+                            vec.data[2] = (center_x + boxWidth[i] * 0.5f) / imageWidth;
+                            vec.data[3] = (center_y + boxHeight[i] * 0.5f) / imageHeight;
+                        } else {
+                            vec.data[0] = center_x - boxWidth[i] * 0.5f;
+                            vec.data[1] = center_y - boxHeight[i] * 0.5f;
+                            vec.data[2] = center_x + boxWidth[i] * 0.5f - 1.0f; /* unnormalized max corner is additionally shifted by -1 */
+                            vec.data[3] = center_y + boxHeight[i] * 0.5f - 1.0f;
+                        }
+
+                        v_store(output_vPtr[output_offset_v4], vec);
+                        output_offset_v4++;
+                    }
+                }
+            }
+        }
+
+        template <class T>
+        __global__ void prior_box_clip(Span<T> output) { /* clamps every element of `output` to [0, 1] in place */
+            for (auto i : grid_stride_range(output.size())) {
+                using device::clamp;
+                output[i] = clamp<T>(output[i], 0.0, 1.0);
+            }
+        }
+
+        template <class T>
+        __global__ void prior_box_set_variance1(Span<T> output, float variance) { /* fills `output` with a single variance value */
+            using vector_type = get_vector_type_t<T, 4>;
+            auto output_vPtr = vector_type::get_pointer(output.data());
+            for (auto i : grid_stride_range(output.size() / 4)) { /* one 4-value store per iteration; a size % 4 remainder is not written */
+                vector_type vec;
+                for (int j = 0; j < 4; j++)
+                    vec.data[j] = variance;
+                v_store(output_vPtr[i], vec);
+            }
+        }
+
+        template <class T>
+        __global__ void prior_box_set_variance4(Span<T> output, array<float, 4> variance) { /* fills `output` with the repeating 4-value pattern */
+            using vector_type = get_vector_type_t<T, 4>;
+            auto output_vPtr = vector_type::get_pointer(output.data());
+            for (auto i : grid_stride_range(output.size() / 4)) { /* one 4-value store per iteration; a size % 4 remainder is not written */
+                vector_type vec;
+                for(int j = 0; j < 4; j++)
+                    vec.data[j] = variance[j];
+                v_store(output_vPtr[i], vec);
+            }
+        }
+    }
+
+    template <class T, bool Normalize> static /* host-side launcher; Normalize is forwarded to the kernel template */
+    void launch_prior_box_kernel(
+        const Stream& stream,
+        Span<T> output, View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
+        std::size_t layerWidth, std::size_t layerHeight, std::size_t imageWidth, std::size_t imageHeight)
+    {
+        auto num_points = layerWidth * layerHeight; /* the kernel iterates over feature-map points, not output elements */
+        auto kernel = raw::prior_box<T, Normalize>;
+        auto policy = make_policy(kernel, num_points, 0, stream);
+        launch_kernel(kernel, policy,
+            output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
+            layerWidth, layerHeight, imageWidth, imageHeight);
+    }
+
+    /* Generates prior boxes at the start of `output`, optionally clips the first channel, and fills the second channel with variances. */
+    template <class T>
+    void generate_prior_boxes(
+        const Stream& stream,
+        Span<T> output,
+        View<float> boxWidth, View<float> boxHeight, View<float> offsetX, View<float> offsetY, float stepX, float stepY,
+        std::vector<float> variance,
+        std::size_t numPriors,
+        std::size_t layerWidth, std::size_t layerHeight,
+        std::size_t imageWidth, std::size_t imageHeight,
+        bool normalize, bool clip)
+    {
+        std::size_t channel_size = layerHeight * layerWidth * numPriors * 4; /* 4 values per box */
+        CV_Assert(channel_size * 2 == output.size()); /* validated before any kernel is launched on `output` */
+        CV_Assert(variance.size() == 1 || variance.size() == 4); /* the only two layouts handled below */
+
+        if (normalize) {
+            launch_prior_box_kernel<T, true>(
+                stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
+                layerWidth, layerHeight, imageWidth, imageHeight);
+        } else {
+            launch_prior_box_kernel<T, false>(
+                stream, output, boxWidth, boxHeight, offsetX, offsetY, stepX, stepY,
+                layerWidth, layerHeight, imageWidth, imageHeight);
+        }
+
+        if (clip) {
+            auto output_span_c1 = Span<T>(output.data(), channel_size); /* first channel: the boxes */
+            auto kernel = raw::prior_box_clip<T>;
+            auto policy = make_policy(kernel, output_span_c1.size(), 0, stream);
+            launch_kernel(kernel, policy, output_span_c1);
+        }
+
+        auto output_span_c2 = Span<T>(output.data() + channel_size, channel_size); /* second channel: the variances */
+        if (variance.size() == 1) {
+            auto kernel = raw::prior_box_set_variance1<T>;
+            auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream); /* one work item per 4-value store */
+            launch_kernel(kernel, policy, output_span_c2, variance[0]);
+        } else {
+            array<float, 4> variance_k; /* fixed-size copy that the kernel receives by value */
+            variance_k.assign(std::begin(variance), std::end(variance));
+            auto kernel = raw::prior_box_set_variance4<T>;
+            auto policy = make_policy(kernel, output_span_c2.size() / 4, 0, stream);
+            launch_kernel(kernel, policy, output_span_c2, variance_k);
+        }
+    }
+
+    template void generate_prior_boxes(const Stream&, Span<__half>, View<float>, View<float>, View<float>, View<float>, float, float,
+        std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
+
+    template void generate_prior_boxes(const Stream&, Span<float>, View<float>, View<float>, View<float>, View<float>, float, float,
+        std::vector<float>, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool, bool);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/region.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/region.cu
@@ -0,0 +1,177 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "limits.hpp"
+#include "vector_traits.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+
+        template <class T> /* writes elements 0..4 of every box: two centre terms, two size terms and the objectness probability */
+        __global__ void region_box(
+            Span<T> output, View<T> input, View<T> bias,
+            size_type boxes_per_cell, size_type box_size,
+            size_type rows, size_type cols,
+            size_type height_norm, size_type width_norm,
+            T object_prob_cutoff)
+        {
+            using vector2_type = get_vector_type_t<T, 2>;
+            auto bias_vPtr = vector2_type::get_pointer(bias.data()); /* `bias` is read as pairs, one pair per box of a cell */
+
+            for (auto box_index : grid_stride_range(output.size() / box_size)) { /* one box (box_size consecutive elements) per iteration */
+                const auto box_of_the_cell = box_index % boxes_per_cell; /* box number within a cell */
+                const auto box_offset = box_index * box_size;
+
+                const auto batch_inner_size = rows * cols * boxes_per_cell; /* boxes are ordered as [batch][row][col][box of the cell] */
+                const auto row_inner_size = cols * boxes_per_cell;
+                const auto col_inner_size = boxes_per_cell;
+
+                const auto y = (box_index % batch_inner_size) / row_inner_size; /* row of the cell */
+                const auto x = (box_index % row_inner_size) / col_inner_size; /* column of the cell */
+
+                using device::sigmoid;
+                output[box_offset + 0] = (T(x) + sigmoid(input[box_offset + 0])) / T(cols);
+                output[box_offset + 1] = (T(y) + sigmoid(input[box_offset + 1])) / T(rows);
+
+                vector2_type bias_xy;
+                v_load(bias_xy, bias_vPtr[box_of_the_cell]);
+
+                using device::exp;
+                output[box_offset + 2] = exp(input[box_offset + 2]) * bias_xy.data[0] / T(width_norm);
+                output[box_offset + 3] = exp(input[box_offset + 3]) * bias_xy.data[1] / T(height_norm);
+
+                /* squash objectness score into a probability */
+                using device::sigmoid;
+                T objectness_prob = sigmoid(input[box_offset + 4]);
+
+                /* ignore prediction if the objectness probability is less than the cutoff */
+                if (objectness_prob < object_prob_cutoff)
+                    objectness_prob = 0;
+
+                output[box_offset + 4] = objectness_prob;
+            }
+        }
+
+        template <class T> /* writes elements 5..box_size-1 of every box; element 4 of `output` must already hold the objectness probability */
+        __global__ void region_sigmoid_class_score(Span<T> output, View<T> input, T class_prob_cutoff, size_type box_size)
+        {
+            for (auto idx : grid_stride_range(output.size())) { /* one output element per iteration */
+                const index_type box_no = idx / box_size;
+                const index_type start_of_box = box_no * box_size;
+                const index_type box_offset = idx % box_size; /* position of this element within its box */
+
+                if (box_offset < 5) {
+                    /* continue as we have already processed these in region_box */
+                    continue;
+                }
+
+                auto objectness_prob = output[start_of_box + 4];
+
+                /* the class probabilities we currently have are conditional class probabilities
+                 * given the object
+                 *
+                 * to obtain the actual class probability, we multiply the conditional probability
+                 * with the object probability
+                 */
+                auto actual_class_prob = objectness_prob * sigmoid(input[idx]);
+                if (actual_class_prob <= class_prob_cutoff) /* values at or below the cutoff are zeroed */
+                    actual_class_prob = T(0);
+                output[idx] = actual_class_prob;
+            }
+        }
+
+        template <class T> /* writes elements 5..box_size-1 of every box; element 4 of `output` must already hold the objectness probability */
+        __global__ void region_softmax_class_score(Span<T> output, View<T> input, T class_prob_cutoff, size_type box_size) {
+            for (auto box_no : grid_stride_range(output.size() / box_size)) { /* one whole box per iteration */
+                const index_type start_of_box = box_no * box_size;
+                const index_type start_idx = start_of_box + 5; /* class scores follow the first five elements of the box */
+                const index_type end_idx = start_of_box + box_size;
+
+                auto largest = numeric_limits<T>::lowest();
+                for (int idx = start_idx; idx < end_idx; idx++) {
+                    using device::max;
+                    largest = max(largest, input[idx]);
+                }
+
+                auto sum = T(0);
+                for (int idx = start_idx; idx < end_idx; idx++) {
+                    using device::exp;
+                    auto temp = exp(input[idx] - largest); /* the largest score is subtracted first, so no exponent is positive */
+                    sum += temp;
+                    output[idx] = temp; /* unnormalized value is parked in `output` and normalized in the next loop */
+                }
+
+                for (int idx = start_idx; idx < end_idx; idx++) {
+                    auto softmax_score = output[idx] / sum;
+
+                    /* the class probabilities we currently have are conditional class probabilities
+                     * given the object
+                     *
+                     * to obtain the actual class probability, we multiply the conditional probability
+                     * with the object probability
+                     */
+                    auto objectness_prob = output[start_of_box + 4];
+                    auto actual_class_prob = objectness_prob * softmax_score;
+                    if (actual_class_prob <= class_prob_cutoff) /* values at or below the cutoff are zeroed */
+                        actual_class_prob = T(0);
+                    output[idx] = actual_class_prob;
+                }
+            }
+        }
+    }
+
+    /* Runs the box kernel on elements 0..4 of every box, then the sigmoid or softmax class-score kernel on the remaining elements. */
+    template <class T>
+    void region(const Stream& stream, Span<T> output, View<T> input, View<T> bias,
+        T object_prob_cutoff, T class_prob_cutoff,
+        std::size_t boxes_per_cell, std::size_t box_size,
+        std::size_t rows, std::size_t cols,
+        std::size_t height_norm, std::size_t width_norm,
+        bool if_true_sigmoid_else_softmax /* true = sigmoid, false = softmax */)
+    {
+        CV_Assert(output.size() == input.size());
+        CV_Assert(box_size >= 5); /* the kernels index elements 0..4 of every box and divide by box_size */
+        CV_Assert(output.size() % box_size == 0);
+        CV_Assert(is_fully_aligned(bias, 2));
+
+        auto box_kernel = raw::region_box<T>;
+        auto box_policy = make_policy(box_kernel, output.size() / box_size, 0, stream); /* one work item per box */
+        launch_kernel(box_kernel, box_policy, output, input, bias, boxes_per_cell, box_size,
+            rows, cols, height_norm, width_norm, object_prob_cutoff);
+
+        if (if_true_sigmoid_else_softmax) {
+            auto kernel_score = raw::region_sigmoid_class_score<T>;
+            auto policy_score = make_policy(kernel_score, output.size(), 0, stream); /* this kernel iterates over elements */
+            launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size);
+        } else {
+            auto kernel_score = raw::region_softmax_class_score<T>;
+            auto policy_score = make_policy(kernel_score, output.size() / box_size, 0, stream); /* this kernel iterates over boxes */
+            launch_kernel(kernel_score, policy_score, output, input, class_prob_cutoff, box_size);
+        }
+    }
+
+    template void region(const Stream&, Span<__half>, View<__half>, View<__half>,
+        __half, __half, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool);
+
+    template void region(const Stream&, Span<float>, View<float>, View<float>,
+        float, float, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, std::size_t, bool);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/resize.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/resize.cu
@@ -0,0 +1,233 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        /* Nearest-neighbour resize kernel.
+         *
+         * The tensors are treated as stacks of 2d images: the last axis is the width, the last
+         * but one axis is the height and all the remaining axes together form a collection of
+         * images ("effective channels").
+         *
+         * Every iteration of the grid-stride loop handles one output location (x, y) for
+         * `CHANNELS_PER_ITER` consecutive channels so that the index computation involving the
+         * spatial dimensions is done once and reused for the whole group of channels.
+         */
+        template <class T, std::size_t CHANNELS_PER_ITER>
+        __global__ void resize_nn(
+            Span<T> output, size_type out_height, size_type out_width,
+            View<T> input, size_type in_height, size_type in_width)
+        {
+            const auto in_plane_size = in_height * in_width;
+            const auto out_plane_size = out_height * out_width;
+
+            /* number of 2d images stored in the output */
+            const auto total_channels = output.size() / out_plane_size;
+
+            /* every (x, y) location needs one iteration per group of `CHANNELS_PER_ITER` channels */
+            const auto groups_per_xy = total_channels / CHANNELS_PER_ITER;
+
+            /* there are `out_plane_size` locations; each of them requires `groups_per_xy` iterations */
+            const auto total_iters = groups_per_xy * out_plane_size;
+
+            /* o2i = output to input; the scale factors are the same for every iteration */
+            const auto o2i_fy = static_cast<float>(in_height) / out_height;
+            const auto o2i_fx = static_cast<float>(in_width) / out_width;
+
+            for (auto iter : grid_stride_range(total_iters)) {
+                const index_type first_channel = (iter / out_plane_size) * CHANNELS_PER_ITER;
+
+                /* consecutive `iter` values will often have consecutive `x` values
+                 * => stores into output will be coalesced across threads
+                 */
+                const index_type x = iter % out_width;
+                const index_type y = (iter % out_plane_size) / out_width;
+
+                /* the source pixel is found by scaling the output coordinates and truncating */
+                const auto src_y = static_cast<index_type>(y * o2i_fy);
+                const auto src_x = static_cast<index_type>(x * o2i_fx);
+
+                index_type src_idx = first_channel * in_plane_size + src_y * in_width + src_x;
+                index_type dst_idx = first_channel * out_plane_size + y * out_width + x;
+
+                /* same (x, y) location, one image further in every step */
+                for (int ch = 0; ch < CHANNELS_PER_ITER; ch++) {
+                    output[dst_idx] = input[src_idx];
+
+                    src_idx += in_plane_size;
+                    dst_idx += out_plane_size;
+                }
+            }
+        }
+
+        /* Bilinear resize kernel.
+         *
+         * The tensors are treated as stacks of 2d images: the last axis is the width, the last
+         * but one axis is the height and all the remaining axes together form a collection of
+         * images ("effective channels").
+         *
+         * `o2i_fy` and `o2i_fx` are multiplied with the output row/column index to obtain the
+         * (fractional) input row/column. Every iteration of the grid-stride loop handles one
+         * output location (x, y) for `CHANNELS_PER_ITER` consecutive channels.
+         */
+        template <class T, std::size_t CHANNELS_PER_ITER>
+        __global__ void resize_bilinear(
+            Span<T> output, size_type out_height, size_type out_width,
+            View<T> input, size_type in_height, size_type in_width,
+            float o2i_fy, float o2i_fx)
+        {
+            const auto in_plane_size = in_height * in_width;
+            const auto out_plane_size = out_height * out_width;
+
+            /* number of 2d images stored in the output */
+            const auto total_channels = output.size() / out_plane_size;
+
+            /* several channels are processed per iteration to reuse the computation that only
+             * depends on the spatial location; hence, every (x, y) location requires
+             * (total_channels / CHANNELS_PER_ITER) iterations
+             */
+            const auto groups_per_xy = total_channels / CHANNELS_PER_ITER;
+
+            /* there are `out_plane_size` locations; each of them requires `groups_per_xy` iterations */
+            const auto total_iters = groups_per_xy * out_plane_size;
+
+            for (auto iter : grid_stride_range(total_iters)) {
+                const index_type first_channel = (iter / out_plane_size) * CHANNELS_PER_ITER;
+                const index_type channel_limit = first_channel + CHANNELS_PER_ITER;
+
+                /* consecutive `iter` values will often have consecutive `x` values
+                 * => stores into output will be coalesced across threads
+                 */
+                const index_type x = iter % out_width;
+                const index_type y = (iter % out_plane_size) / out_width;
+
+                /* fractional position of the output pixel in the input image */
+                const auto src_x = x * o2i_fx;
+                const auto src_y = y * o2i_fy;
+
+                /* truncation gives the first column/row of the 2x2 neighbourhood ... */
+                const auto x0 = static_cast<index_type>(src_x);
+                const auto y0 = static_cast<index_type>(src_y);
+
+                /* ... and the second one is the next column/row, clamped to the last valid index */
+                using device::min;
+                const auto x1 = min<index_type>(x0 + 1, in_width - 1);
+                const auto y1 = min<index_type>(y0 + 1, in_height - 1);
+
+                /* offsets of the two input rows in the first channel of the group */
+                index_type row0_offset = first_channel * in_plane_size + y0 * in_width;
+                index_type row1_offset = first_channel * in_plane_size + y1 * in_width;
+                index_type dst_idx = first_channel * out_plane_size + y * out_width + x;
+
+                #pragma unroll 1 /* disable unrolling to reduce register pressure; not sure how but it works */
+                for (auto c = first_channel; c < channel_limit; c++) {
+                    const auto v_00 = input[row0_offset + x0];
+                    const auto v_01 = input[row0_offset + x1];
+                    const auto v_10 = input[row1_offset + x0];
+                    const auto v_11 = input[row1_offset + x1];
+
+                    /* bilinear blend of the four samples, weighted by the fractional parts */
+                    output[dst_idx] =
+                        v_00 +
+                        T(src_y - y0) * T(v_10 - v_00) +
+                        T(src_x - x0) * T(v_01 - v_00) +
+                        T(src_y - y0) * T(src_x - x0) * T(v_11 - v_01 - v_10 + v_00);
+
+                    /* same location in the next channel */
+                    row0_offset += in_plane_size;
+                    row1_offset += in_plane_size;
+                    dst_idx += out_plane_size;
+                }
+            }
+        }
+    }
+
+    /* Launches `raw::resize_nn` specialized to process `CHANNELS_PER_ITER` channels per iteration.
+     *
+     * The launch policy is created for `output.size() / CHANNELS_PER_ITER` items, which is the
+     * number of iterations the kernel loops over when the number of 2d images in `output` is a
+     * multiple of `CHANNELS_PER_ITER`.
+     */
+    template <class T, std::size_t CHANNELS_PER_ITER> static
+    void launch_multichannel_resize_nn(const Stream& stream,
+        Span<T> output, size_type out_height, size_type out_width,
+        View<T> input, size_type in_height, size_type in_width)
+    {
+        const auto num_work_items = output.size() / CHANNELS_PER_ITER;
+
+        auto nn_kernel = raw::resize_nn<T, CHANNELS_PER_ITER>;
+        auto launch_policy = make_policy(nn_kernel, num_work_items, 0, stream);
+        launch_kernel(nn_kernel, launch_policy,
+            output, out_height, out_width, input, in_height, in_width);
+    }
+
+    /* Resizes `input` into `output` using nearest-neighbour interpolation.
+     *
+     * The last two axes of both tensors are taken as height and width. The sizes of the two
+     * leading axes of `input` and `output` must be identical (checked with CV_Assert).
+     *
+     * The kernel is instantiated to process 32, 16, 8, 4, 2 or 1 channels per iteration: the
+     * largest group size that divides the number of channels is used, provided the total amount
+     * of work (`num_iters`) exceeds the threshold of that group size (the thresholds look like
+     * empirically tuned values).
+     */
+    template <class T>
+    void resize_nn(const Stream& stream, TensorSpan<T> output, TensorView<T> input) {
+        /* The dispatch below derives the number of channels from `input` whereas the kernel
+         * derives it from `output`. If the two disagree, the chosen group size may not divide
+         * the channel count seen by the kernel and the kernel would index `input` with channel
+         * numbers that do not exist there. Reject such tensors instead of launching.
+         */
+        CV_Assert(input.get_axis_size(0) == output.get_axis_size(0));
+        CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
+
+        auto out_height = output.get_axis_size(-2);
+        auto out_width = output.get_axis_size(-1);
+
+        auto in_height = input.get_axis_size(-2);
+        auto in_width = input.get_axis_size(-1);
+
+        auto num_effective_channels = input.size_range(0, 2);
+        auto num_iters = num_effective_channels * out_height * out_width;
+
+        if (num_effective_channels % 32 == 0 && num_iters > 655360) {
+            launch_multichannel_resize_nn<T, 32>(stream, output, out_height, out_width, input, in_height, in_width);
+        } else if (num_effective_channels % 16 == 0 && num_iters > 327680) {
+            launch_multichannel_resize_nn<T, 16>(stream, output, out_height, out_width, input, in_height, in_width);
+        } else if (num_effective_channels % 8 == 0 && num_iters > 163840) {
+            launch_multichannel_resize_nn<T, 8>(stream, output, out_height, out_width, input, in_height, in_width);
+        } else if (num_effective_channels % 4 == 0 && num_iters > 81920) {
+            launch_multichannel_resize_nn<T, 4>(stream, output, out_height, out_width, input, in_height, in_width);
+        } else if (num_effective_channels % 2 == 0) {
+            launch_multichannel_resize_nn<T, 2>(stream, output, out_height, out_width, input, in_height, in_width);
+        } else {
+            launch_multichannel_resize_nn<T, 1>(stream, output, out_height, out_width, input, in_height, in_width);
+        }
+    }
+
+    template void resize_nn<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>);
+    template void resize_nn<float>(const Stream&, TensorSpan<float>, TensorView<float>);
+
+    /* Launches `raw::resize_bilinear` specialized to process `CHANNELS_PER_ITER` channels per
+     * iteration. `scale_y` and `scale_x` are passed to the kernel as its output-to-input scale
+     * factors (`o2i_fy` and `o2i_fx`).
+     *
+     * The launch policy is created for `output.size() / CHANNELS_PER_ITER` items.
+     */
+    template <class T, std::size_t CHANNELS_PER_ITER> static
+    void launch_multichannel_resize_bilinear(const Stream& stream,
+        Span<T> output, size_type out_height, size_type out_width,
+        View<T> input, size_type in_height, size_type in_width,
+        float scale_y, float scale_x)
+    {
+        const auto num_work_items = output.size() / CHANNELS_PER_ITER;
+
+        auto bilinear_kernel = raw::resize_bilinear<T, CHANNELS_PER_ITER>;
+        auto launch_policy = make_policy(bilinear_kernel, num_work_items, 0, stream);
+        launch_kernel(bilinear_kernel, launch_policy,
+            output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+    }
+
+    /* Resizes `input` into `output` using bilinear interpolation.
+     *
+     * The last two axes of both tensors are taken as height and width. The sizes of the two
+     * leading axes of `input` and `output` must be identical (checked with CV_Assert).
+     * `scale_y` and `scale_x` are forwarded to the kernel, which multiplies them with the output
+     * row/column index to obtain the input row/column.
+     *
+     * The kernel is instantiated to process 16, 8, 4, 2 or 1 channels per iteration: the largest
+     * group size that divides the number of channels is used, provided the total amount of work
+     * (`num_iters`) exceeds the threshold of that group size (the thresholds look like
+     * empirically tuned values).
+     */
+    template <class T>
+    void resize_bilinear(const Stream& stream, TensorSpan<T> output, TensorView<T> input, float scale_y, float scale_x) {
+        /* The dispatch below derives the number of channels from `input` whereas the kernel
+         * derives it from `output`. If the two disagree, the chosen group size may not divide
+         * the channel count seen by the kernel and the kernel would index `input` with channel
+         * numbers that do not exist there. Reject such tensors instead of launching.
+         */
+        CV_Assert(input.get_axis_size(0) == output.get_axis_size(0));
+        CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
+
+        auto out_height = output.get_axis_size(-2);
+        auto out_width = output.get_axis_size(-1);
+
+        auto in_height = input.get_axis_size(-2);
+        auto in_width = input.get_axis_size(-1);
+
+        auto num_effective_channels = input.size_range(0, 2);
+        auto num_iters = num_effective_channels * out_height * out_width;
+
+        if (num_effective_channels % 16 == 0 && num_iters > 163840) {
+            launch_multichannel_resize_bilinear<T, 16>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+        } else if (num_effective_channels % 8 == 0 && num_iters > 81920) {
+            launch_multichannel_resize_bilinear<T, 8>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+        } else if (num_effective_channels % 4 == 0 && num_iters > 40960) {
+            launch_multichannel_resize_bilinear<T, 4>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+        } else if (num_effective_channels % 2 == 0) {
+            launch_multichannel_resize_bilinear<T, 2>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+        } else {
+            launch_multichannel_resize_bilinear<T, 1>(stream, output, out_height, out_width, input, in_height, in_width, scale_y, scale_x);
+        }
+    }
+
+    template void resize_bilinear<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, float, float);
+    template void resize_bilinear<float>(const Stream&, TensorSpan<float>, TensorView<float>, float, float);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/roi_pooling.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/roi_pooling.cu
@@ -0,0 +1,121 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "math.hpp"
+#include "limits.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+
+        /* ROI max-pooling kernel.
+         *
+         * input:  [1, num_channels, in_height, in_width] (the index computation below also
+         *         offsets by the batch id stored in the ROI)
+         * rois:   [num_rois, 5]; the five values are read as
+         *         (batch id, x start, y start, x end, y end)
+         * output: [num_rois, num_channels, pooled_height, pooled_width]
+         *
+         * The ROI corners are multiplied by `spatial_scale` and rounded. Every output element is
+         * the maximum over its window of the input, or zero if the window is empty.
+         */
+        template <class T>
+        __global__ void roi_pooling(
+            Span<T> output, size_type pooled_height, size_type pooled_width,
+            View<T> input, size_type in_height, size_type in_width,
+            View<T> rois, size_type num_channels, T spatial_scale)
+        {
+            const auto pooled_plane_size = pooled_height * pooled_width;
+            const auto pooled_roi_size = num_channels * pooled_plane_size;
+
+            /* every element in the output is mapped to a window in the input and each thread processes several windows */
+            for (auto idx : grid_stride_range(output.size()))
+            {
+                /* split the flat output index into (roi, channel, row, column) */
+                const auto roi_id = idx / pooled_roi_size;
+                const auto c = (idx % pooled_roi_size) / pooled_plane_size;
+                const auto py = (idx % pooled_plane_size) / pooled_width;
+                const auto px = idx % pooled_width;
+
+                /* five values are stored per ROI */
+                const index_type roi_base = roi_id * 5;
+
+                using device::round;
+                const index_type batch_id = rois[roi_base + 0];
+                const index_type roi_x0 = round(rois[roi_base + 1] * spatial_scale);
+                const index_type roi_y0 = round(rois[roi_base + 2] * spatial_scale);
+                const index_type roi_x1 = round(rois[roi_base + 3] * spatial_scale);
+                const index_type roi_y1 = round(rois[roi_base + 4] * spatial_scale);
+
+                /* the end coordinates are inclusive; the ROI is at least one pixel wide and high */
+                using device::max;
+                const auto roi_w = max<index_type>(roi_x1 - roi_x0 + 1, 1);
+                const auto roi_h = max<index_type>(roi_y1 - roi_y0 + 1, 1);
+
+                /* size of the window (in input pixels) that maps to one pooled element */
+                const auto bin_w = static_cast<T>(roi_w) / static_cast<T>(pooled_width);
+                const auto bin_h = static_cast<T>(roi_h) / static_cast<T>(pooled_height);
+
+                /* window start is truncated and window end is rounded up */
+                auto win_x0 = roi_x0 + static_cast<index_type>(static_cast<T>(px) * bin_w);
+                auto win_y0 = roi_y0 + static_cast<index_type>(static_cast<T>(py) * bin_h);
+
+                using device::ceil;
+                auto win_x1 = roi_x0 + static_cast<index_type>(ceil(static_cast<T>(px + 1) * bin_w));
+                auto win_y1 = roi_y0 + static_cast<index_type>(ceil(static_cast<T>(py + 1) * bin_h));
+
+                /* clip the window to the input image */
+                win_x0 = max<index_type>(win_x0, 0);
+                win_y0 = max<index_type>(win_y0, 0);
+
+                using device::min;
+                win_x1 = min<index_type>(win_x1, in_width);
+                win_y1 = min<index_type>(win_y1, in_height);
+
+                /* The output has to be zero if the window is empty. The loops below do not execute
+                 * even a single iteration in that case; hence, starting from zero makes the empty
+                 * window share the code of the non-empty one.
+                 */
+                const bool empty_window = (win_x0 >= win_x1 || win_y0 >= win_y1);
+                T max_val = empty_window ? T(0) : device::numeric_limits<T>::lowest();
+
+                const index_type in_offset = (batch_id * num_channels + c) * in_height * in_width;
+                for (auto iy = win_y0; iy < win_y1; iy++)
+                {
+                    for (auto ix = win_x0; ix < win_x1; ix++)
+                    {
+                        const auto in_idx = in_offset + iy * in_width + ix;
+                        max_val = max(max_val, input[in_idx]);
+                    }
+                }
+
+                output[idx] = max_val;
+            }
+        }
+    }
+
+    /* Max-pools regions of interest of `input` into `output`.
+     *
+     * The axes 1, 2 and 3 of `output` are used as the number of channels, the pooled height and
+     * the pooled width; the axes 2 and 3 of `input` are used as the input height and width.
+     * `input` and `output` must have the same number of channels.
+     *
+     * `rois` must hold five values for every entry along the first axis of `output`; the kernel
+     * reads them as (batch id, x start, y start, x end, y end). `spatial_scale` is multiplied
+     * with the ROI corners by the kernel.
+     */
+    template <class T>
+    void roi_pooling(const Stream& stream, TensorSpan<T> output, TensorView<T> input, View<T> rois, T spatial_scale)
+    {
+        CV_Assert(input.get_axis_size(1) == output.get_axis_size(1));
+
+        /* the kernel reads `rois[5 * n]` to `rois[5 * n + 4]` for every `n` along the first axis
+         * of `output`; make sure that they exist instead of reading past the end on the device
+         */
+        size_type num_rois = output.get_axis_size(0);
+        CV_Assert(rois.size() >= num_rois * 5);
+
+        size_type num_channels = output.get_axis_size(1);
+
+        size_type pooled_height = output.get_axis_size(2);
+        size_type pooled_width = output.get_axis_size(3);
+
+        size_type in_height = input.get_axis_size(2);
+        size_type in_width = input.get_axis_size(3);
+
+        auto kernel = raw::roi_pooling<T>;
+        auto policy = make_policy(kernel, output.size(), 0, stream);
+        launch_kernel(kernel, policy, output, pooled_height, pooled_width, input, in_height, in_width, rois, num_channels, spatial_scale);
+    }
+
+    template void roi_pooling(const Stream& stream, TensorSpan<__half> output, TensorView<__half> input, View<__half> rois, __half spatial_scale);
+    template void roi_pooling(const Stream& stream, TensorSpan<float> output, TensorView<float> input, View<float> rois, float spatial_scale);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/scale_shift.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/scale_shift.cu
@@ -0,0 +1,311 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "types.hpp"
+#include "vector_traits.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        /* output = input + beta (elementwise)
+         *
+         * The buffers are accessed through `get_vector_type_t<T, N>`, i.e. every iteration of the
+         * grid-stride loop loads, updates and stores `vector_type::size()` elements.
+         */
+        template <class T, std::size_t N>
+        __global__ void bias1_vec(Span<T> output, View<T> input, T beta) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            /* vector-typed pointers to the two buffers */
+            auto dst_vectors = vector_type::get_pointer(output.data());
+            auto src_vectors = vector_type::get_pointer(input.data());
+
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto i : grid_stride_range(num_vectors)) {
+                vector_type chunk;
+                v_load(chunk, src_vectors[i]);
+                for (int lane = 0; lane < chunk.size(); lane++)
+                    chunk.data[lane] = chunk.data[lane] + beta;
+                v_store(dst_vectors[i], chunk);
+            }
+        }
+
+        /* output = input + bias[k] (elementwise)
+         *
+         * `k` advances by one every `inner_size` elements and wraps around after `bias.size()`
+         * values. The buffers are accessed through `get_vector_type_t<T, N>`; `inner_size` is
+         * divided by the vector size below, so it is expected to be a multiple of it.
+         */
+        template <class T, std::size_t N>
+        __global__ void biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> bias) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            /* vector-typed pointers to the two buffers */
+            auto dst_vectors = vector_type::get_pointer(output.data());
+            auto src_vectors = vector_type::get_pointer(input.data());
+
+            /* `inner_size` counts elements but the loop index counts vectors */
+            const size_type inner_vectors = inner_size / vector_type::size();
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto i : grid_stride_range(num_vectors)) {
+                /* all the elements of a vector use the same bias value */
+                const index_type bias_idx = (i / inner_vectors) % static_cast<size_type>(bias.size());
+                const auto shift = bias[bias_idx];
+
+                vector_type chunk;
+                v_load(chunk, src_vectors[i]);
+                for (int lane = 0; lane < chunk.size(); lane++)
+                    chunk.data[lane] = chunk.data[lane] + shift;
+                v_store(dst_vectors[i], chunk);
+            }
+        }
+
+        /* output = input * alpha (elementwise)
+         *
+         * The buffers are accessed through `get_vector_type_t<T, N>`, i.e. every iteration of the
+         * grid-stride loop loads, updates and stores `vector_type::size()` elements.
+         */
+        template <class T, std::size_t N>
+        __global__ void scale1_vec(Span<T> output, View<T> input, T alpha) {
+            using vector_type = get_vector_type_t<T, N>;
+
+            /* vector-typed pointers to the two buffers */
+            auto dst_vectors = vector_type::get_pointer(output.data());
+            auto src_vectors = vector_type::get_pointer(input.data());
+
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto i : grid_stride_range(num_vectors)) {
+                vector_type chunk;
+                v_load(chunk, src_vectors[i]);
+                for (int lane = 0; lane < chunk.size(); lane++)
+                    chunk.data[lane] = chunk.data[lane] * alpha;
+                v_store(dst_vectors[i], chunk);
+            }
+        }
+
+        /* output = input * weights[k] (elementwise)
+         *
+         * `k` advances by one every `inner_size` elements and wraps around after `weights.size()`
+         * values. The buffers are accessed through `get_vector_type_t<T, N>`; `inner_size` is
+         * divided by the vector size below, so it is expected to be a multiple of it.
+         */
+        template <class T, std::size_t N>
+        __global__ void scaleN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights)
+        {
+            using vector_type = get_vector_type_t<T, N>;
+
+            /* vector-typed pointers to the two buffers */
+            auto dst_vectors = vector_type::get_pointer(output.data());
+            auto src_vectors = vector_type::get_pointer(input.data());
+
+            /* `inner_size` counts elements but the loop index counts vectors */
+            const size_type inner_vectors = inner_size / vector_type::size();
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto i : grid_stride_range(num_vectors)) {
+                /* all the elements of a vector use the same scale factor */
+                const index_type scale_idx = (i / inner_vectors) % static_cast<size_type>(weights.size());
+                const auto factor = weights[scale_idx];
+
+                vector_type chunk;
+                v_load(chunk, src_vectors[i]);
+                for (int lane = 0; lane < chunk.size(); lane++)
+                    chunk.data[lane] = chunk.data[lane] * factor;
+                v_store(dst_vectors[i], chunk);
+            }
+        }
+
+        /* output = alpha * input + beta (elementwise)
+         *
+         * The buffers are accessed through `get_vector_type_t<T, N>`, i.e. every iteration of the
+         * grid-stride loop loads, updates and stores `vector_type::size()` elements.
+         */
+        template <class T, std::size_t N>
+        __global__ void scale1_with_bias1_vec(Span<T> output, View<T> input, T alpha, T beta)
+        {
+            using vector_type = get_vector_type_t<T, N>;
+
+            /* vector-typed pointers to the two buffers */
+            auto dst_vectors = vector_type::get_pointer(output.data());
+            auto src_vectors = vector_type::get_pointer(input.data());
+
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto i : grid_stride_range(num_vectors)) {
+                vector_type chunk;
+                v_load(chunk, src_vectors[i]);
+                for (int lane = 0; lane < chunk.size(); lane++)
+                    chunk.data[lane] = alpha * chunk.data[lane] + beta;
+                v_store(dst_vectors[i], chunk);
+            }
+        }
+
+        /* output = input * weights[k] + bias[k] (elementwise)
+         *
+         * `k` advances by one every `inner_size` elements and wraps around after `weights.size()`
+         * values; the same `k` indexes `bias`. The buffers are accessed through
+         * `get_vector_type_t<T, N>`; `inner_size` is divided by the vector size below, so it is
+         * expected to be a multiple of it.
+         */
+        template <class T, std::size_t N>
+        __global__ void scaleN_with_biasN_vec(Span<T> output, View<T> input, size_type inner_size, View<T> weights, View<T> bias)
+        {
+            using vector_type = get_vector_type_t<T, N>;
+
+            /* vector-typed pointers to the two buffers */
+            auto dst_vectors = vector_type::get_pointer(output.data());
+            auto src_vectors = vector_type::get_pointer(input.data());
+
+            /* `inner_size` counts elements but the loop index counts vectors */
+            const size_type inner_vectors = inner_size / vector_type::size();
+            const auto num_vectors = output.size() / vector_type::size();
+            for (auto i : grid_stride_range(num_vectors)) {
+                /* all the elements of a vector use the same scale factor and bias value */
+                const index_type scale_idx = (i / inner_vectors) % static_cast<size_type>(weights.size());
+                const auto factor = weights[scale_idx];
+                const auto shift = bias[scale_idx];
+
+                vector_type chunk;
+                v_load(chunk, src_vectors[i]);
+                for (int lane = 0; lane < chunk.size(); lane++)
+                    chunk.data[lane] = chunk.data[lane] * factor + shift;
+                v_store(dst_vectors[i], chunk);
+            }
+        }
+    }
+
+    /* Launches `raw::bias1_vec<T, N>` on `stream`.
+     * Both buffers must pass `is_fully_aligned` for `N`; the launch policy is created for
+     * `output.size() / N` items.
+     */
+    template <class T, std::size_t N> static
+    void launch_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T beta) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::bias1_vec<T, N>;
+        auto launch_policy = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_policy, output, input, beta);
+    }
+
+    /* Adds the scalar `beta` to every element of `input` and stores the result in `output`.
+     * `input` and `output` must have the same shape.
+     *
+     * The kernel is instantiated for vectors of 4, 2 or 1 element(s); the first width for which
+     * both tensors pass `is_fully_aligned` is used.
+     */
+    template <class T>
+    void bias1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T beta) {
+        CV_Assert(is_shape_same(input, output));
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_bias1_vec_kernel<T, 4>(stream, output, input, beta);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_bias1_vec_kernel<T, 2>(stream, output, input, beta);
+            return;
+        }
+
+        /* no vectorization possible */
+        launch_bias1_vec_kernel<T, 1>(stream, output, input, beta);
+    }
+
+    template void bias1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
+    template void bias1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
+
+    /* Launches `raw::biasN_vec<T, N>` on `stream`.
+     * Both buffers must pass `is_fully_aligned` for `N` and `inner_size` must be a multiple of
+     * `N`; the launch policy is created for `output.size() / N` items.
+     */
+    template <class T, std::size_t N> static
+    void launch_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> bias){
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+        CV_Assert(inner_size % N == 0);
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::biasN_vec<T, N>;
+        auto launch_policy = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_policy, output, input, inner_size, bias);
+    }
+
+    /* Computes `output = input + bias[k]` where `k` advances by one every `inner_size` elements
+     * and wraps around after `bias.size()` values. `input` and `output` must have the same shape.
+     *
+     * The kernel is instantiated for vectors of 4, 2 or 1 element(s); the first width for which
+     * both tensors pass `is_fully_aligned` and which divides `inner_size` is used.
+     */
+    template <class T>
+    void biasN(
+        const Stream& stream,
+        TensorSpan<T> output,
+        TensorView<T> input, std::size_t inner_size,
+        TensorView<T> bias)
+    {
+        CV_Assert(is_shape_same(input, output));
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
+            launch_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, bias);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
+            launch_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, bias);
+            return;
+        }
+
+        /* no vectorization possible */
+        launch_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, bias);
+    }
+
+    template void biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
+    template void biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
+
+    /* Launches `raw::scale1_vec<T, N>` on `stream`.
+     * Both buffers must pass `is_fully_aligned` for `N`; the launch policy is created for
+     * `output.size() / N` items.
+     */
+    template <class T, std::size_t N> static
+    void launch_scale1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::scale1_vec<T, N>;
+        auto launch_policy = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_policy, output, input, alpha);
+    }
+
+    /* Multiplies every element of `input` by the scalar `alpha` and stores the result in `output`.
+     * `input` and `output` must have the same shape.
+     *
+     * The kernel is instantiated for vectors of 4, 2 or 1 element(s); the first width for which
+     * both tensors pass `is_fully_aligned` is used.
+     */
+    template <class T>
+    void scale1(const Stream& stream, TensorSpan<T> output, TensorView<T> input, T alpha) {
+        CV_Assert(is_shape_same(input, output));
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_scale1_vec_kernel<T, 4>(stream, output, input, alpha);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_scale1_vec_kernel<T, 2>(stream, output, input, alpha);
+            return;
+        }
+
+        /* no vectorization possible */
+        launch_scale1_vec_kernel<T, 1>(stream, output, input, alpha);
+    }
+
+    template void scale1<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, __half);
+    template void scale1<float>(const Stream&, TensorSpan<float>, TensorView<float>, float);
+
+    /* Launches `raw::scaleN_vec<T, N>` on `stream`.
+     * Both buffers must pass `is_fully_aligned` for `N` and `inner_size` must be a multiple of
+     * `N`; the launch policy is created for `output.size() / N` items.
+     */
+    template <class T, std::size_t N> static
+    void launch_scaleN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+        CV_Assert(inner_size % N == 0);
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::scaleN_vec<T, N>;
+        auto launch_policy = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_policy, output, input, inner_size, weights);
+    }
+
+    /* Computes `output = input * weights[k]` where `k` advances by one every `inner_size`
+     * elements and wraps around after `weights.size()` values. `input` and `output` must have
+     * the same shape.
+     *
+     * The kernel is instantiated for vectors of 4, 2 or 1 element(s); the first width for which
+     * both tensors pass `is_fully_aligned` and which divides `inner_size` is used.
+     */
+    template <class T>
+    void scaleN(
+        const Stream& stream,
+        TensorSpan<T> output,
+        TensorView<T> input, std::size_t inner_size,
+        TensorView<T> weights)
+    {
+        CV_Assert(is_shape_same(input, output));
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
+            launch_scaleN_vec_kernel<T, 4>(stream, output, input, inner_size, weights);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
+            launch_scaleN_vec_kernel<T, 2>(stream, output, input, inner_size, weights);
+            return;
+        }
+
+        /* no vectorization possible */
+        launch_scaleN_vec_kernel<T, 1>(stream, output, input, inner_size, weights);
+    }
+
+    template void scaleN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>);
+    template void scaleN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>);
+
+    /* Launches `raw::scale1_with_bias1_vec<T, N>` on `stream`.
+     * Both buffers must pass `is_fully_aligned` for `N`; the launch policy is created for
+     * `output.size() / N` items.
+     */
+    template <class T, std::size_t N> static
+    void launch_scale1_with_bias1_vec_kernel(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::scale1_with_bias1_vec<T, N>;
+        auto launch_policy = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_policy, output, input, alpha, beta);
+    }
+
+    /* Computes `output = alpha * input + beta` elementwise.
+     * `input` and `output` must have the same number of elements.
+     *
+     * The kernel is instantiated for vectors of 4, 2 or 1 element(s); the first width for which
+     * both buffers pass `is_fully_aligned` is used.
+     */
+    template <class T>
+    void scale1_with_bias1(const Stream& stream, Span<T> output, View<T> input, T alpha, T beta) {
+        CV_Assert(output.size() == input.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4)) {
+            launch_scale1_with_bias1_vec_kernel<T, 4>(stream, output, input, alpha, beta);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2)) {
+            launch_scale1_with_bias1_vec_kernel<T, 2>(stream, output, input, alpha, beta);
+            return;
+        }
+
+        /* no vectorization possible */
+        launch_scale1_with_bias1_vec_kernel<T, 1>(stream, output, input, alpha, beta);
+    }
+
+    template void scale1_with_bias1<__half>(const Stream&, Span<__half>, View<__half>, __half, __half);
+    template void scale1_with_bias1<float>(const Stream&, Span<float>, View<float>, float, float);
+
+    /* Launches `raw::scaleN_with_biasN_vec<T, N>` on `stream`.
+     * Both buffers must pass `is_fully_aligned` for `N` and `inner_size` must be a multiple of
+     * `N`; the launch policy is created for `output.size() / N` items.
+     */
+    template <class T, std::size_t N> static
+    void launch_scaleN_with_biasN_vec_kernel(const Stream& stream, Span<T> output, View<T> input, std::size_t inner_size, View<T> weights, View<T> bias) {
+        CV_Assert(is_fully_aligned<T>(output, N));
+        CV_Assert(is_fully_aligned<T>(input, N));
+        CV_Assert(inner_size % N == 0);
+
+        const auto num_vectors = output.size() / N;
+
+        auto vec_kernel = raw::scaleN_with_biasN_vec<T, N>;
+        auto launch_policy = make_policy(vec_kernel, num_vectors, 0, stream);
+        launch_kernel(vec_kernel, launch_policy, output, input, inner_size, weights, bias);
+    }
+
+    /* Computes `output = input * weights[k] + bias[k]` where `k` advances by one every
+     * `inner_size` elements and wraps around after `weights.size()` values. `input` and `output`
+     * must have the same shape; `weights` and `bias` must have the same number of elements.
+     *
+     * The kernel is instantiated for vectors of 4, 2 or 1 element(s); the first width for which
+     * both tensors pass `is_fully_aligned` and which divides `inner_size` is used.
+     */
+    template <class T>
+    void scaleN_with_biasN(
+        const Stream& stream,
+        TensorSpan<T> output,
+        TensorView<T> input, std::size_t inner_size,
+        TensorView<T> weights, TensorView<T> bias)
+    {
+        CV_Assert(is_shape_same(input, output));
+        CV_Assert(weights.size() == bias.size());
+
+        if (is_fully_aligned<T>(output, 4) && is_fully_aligned<T>(input, 4) && inner_size % 4 == 0) {
+            launch_scaleN_with_biasN_vec_kernel<T, 4>(stream, output, input, inner_size, weights, bias);
+            return;
+        }
+
+        if (is_fully_aligned<T>(output, 2) && is_fully_aligned<T>(input, 2) && inner_size % 2 == 0) {
+            launch_scaleN_with_biasN_vec_kernel<T, 2>(stream, output, input, inner_size, weights, bias);
+            return;
+        }
+
+        /* no vectorization possible */
+        launch_scaleN_with_biasN_vec_kernel<T, 1>(stream, output, input, inner_size, weights, bias);
+    }
+
+    template void scaleN_with_biasN<__half>(const Stream&, TensorSpan<__half>, TensorView<__half>, std::size_t, TensorView<__half>, TensorView<__half>);
+    template void scaleN_with_biasN<float>(const Stream&, TensorSpan<float>, TensorView<float>, std::size_t, TensorView<float>, TensorView<float>);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/slice.cu
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/slice.cu
@@ -0,0 +1,169 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+#include "array.hpp"
+#include "types.hpp"
+#include "grid_stride_range.hpp"
+#include "execution.hpp"
+#include "kernel_dispatcher.hpp"
+
+#include "../cuda4dnn/csl/stream.hpp"
+#include "../cuda4dnn/csl/tensor.hpp"
+#include "../cuda4dnn/csl/span.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <algorithm>
+#include <cstddef>
+#include <functional>
+#include <iostream>
+#include <iterator>
+#include <numeric>
+#include <vector>
+
+using namespace cv::dnn::cuda4dnn::csl;
+using namespace cv::dnn::cuda4dnn::csl::device;
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    namespace raw {
+        /* Slice kernel.
+         *
+         * Every linear index `i` of `output` is decomposed into one coordinate per axis using
+         * `out_strides`. `in_offset[axis]` is added to each coordinate and the sum is weighted by
+         * `in_strides[axis]`; the accumulated value is the linear index of the `input` element that
+         * is copied to `output[i]`.
+         */
+        template <class T, std::size_t Rank>
+        __global__ void slice(
+            Span<T> output, array<size_type, Rank> out_strides,
+            View<T> input, array<size_type, Rank> in_strides, array<index_type, Rank> in_offset)
+        {
+            for (auto i : grid_stride_range(output.size())) {
+                /* the coordinate of the leading axis needs no modulo */
+                const index_type lead_coord = i / out_strides[0];
+                index_type src_idx = (in_offset[0] + lead_coord) * in_strides[0];
+
+                for (int axis = 1; axis < Rank; axis++) {
+                    const index_type coord = (i % out_strides[axis - 1]) / out_strides[axis];
+                    src_idx += (in_offset[axis] + coord) * in_strides[axis];
+                }
+
+                output[i] = input[src_idx];
+            }
+        }
+    }
+
+    /* Packs the stride and offset vectors into fixed-size `array`s of length `Rank` and launches
+     * `raw::slice<T, Rank>` on `stream`.
+     *
+     * All three vectors must contain exactly `Rank` entries (asserted).
+     */
+    template <class T, std::size_t Rank> static
+    void launch_slice(
+        const Stream& stream,
+        Span<T> output, const std::vector<std::size_t>& outStride,
+        View<T> input, const std::vector<std::size_t>& inStride, const std::vector<std::size_t>& inOffset)
+    {
+        CV_Assert(outStride.size() == Rank);
+        CV_Assert(inStride.size() == Rank);
+        CV_Assert(inOffset.size() == Rank);
+
+        array<size_type, Rank> out_stride_arg;
+        out_stride_arg.assign(outStride.begin(), outStride.end());
+
+        array<size_type, Rank> in_stride_arg;
+        in_stride_arg.assign(inStride.begin(), inStride.end());
+
+        array<index_type, Rank> in_offset_arg;
+        in_offset_arg.assign(inOffset.begin(), inOffset.end());
+
+        auto slice_kernel = raw::slice<T, Rank>;
+        auto launch_policy = make_policy(slice_kernel, output.size(), 0, stream);
+        launch_kernel(slice_kernel, launch_policy, output, out_stride_arg, input, in_stride_arg, in_offset_arg);
+    }
+
+    GENERATE_KERNEL_DISPATCHER(slice_dispatcher, launch_slice);
+
+    /** Copies a rectangular region of `input` into `output`.
+     *
+     * The element at indices [o1, o2, ...] of `output` is read from indices
+     * [o1 + off1, o2 + off2, ...] of `input`, where `offsets` = [off1, off2, ...].
+     *
+     * Before launching the kernel the problem is simplified: leading unit axes are squeezed and
+     * runs of adjacent unsliced axes are merged, which lowers the rank the kernel has to handle.
+     *
+     * Pre-conditions (checked with CV_Assert):
+     * - `input`, `output` and `offsets` have the same rank
+     * - the rank remaining after simplification lies in [1, CSL_MAX_TENSOR_RANK]
+     */
+    template <class T>
+    void slice(const Stream& stream,
+        TensorSpan<T> output, TensorView<T> input,
+        std::vector<std::size_t> offsets)
+    {
+        CV_Assert(output.rank() == input.rank());
+        CV_Assert(output.rank() == offsets.size());
+
+        /* squeezable axes at the beginning of both tensors can be eliminated
+         *
+         * Reasoning:
+         * ----------
+         * Suppose an item's indices in the output tensor is [o1, o2, ...]. The indices in the input
+         * tensor will be [o1 + off1, o2 + off2, ...]. The rest of the elements in the input are ignored.
+         *
+         * If the size of the first axis of the input and output tensor is unity, the input and output indices
+         * for all the elements will be of the form [0, o2 + off2, ...] and [0, o2, ...] respectively. Note that
+         * there cannot be any ignored items since the axes have unit size. The first index does not contribute to the
+         * element's address calculation and hence does nothing apart from eating up few cycles.
+         *
+         * At least one axis is always kept: the kernel dispatcher below requires a rank of one or more.
+         */
+        while (input.rank() > 1 && input.get_axis_size(0) == 1 && output.get_axis_size(0) == 1) {
+            CV_Assert(offsets[0] == 0);
+
+            input.squeeze(0);
+            output.squeeze(0);
+            offsets.erase(std::begin(offsets));
+
+            CV_Assert(output.rank() == input.rank());
+            CV_Assert(output.rank() == offsets.size());
+        }
+
+        auto inShape = input.shape_as_vector();
+        auto outShape = output.shape_as_vector();
+
+        /* contiguous axes which do not undergo slicing can be combined into one axis
+         *
+         * Reasoning:
+         * ----------
+         * Suppose an item's indices in the output tensor is [o1, o2, o3, ...]. Let the first two axes not undergo any
+         * slicing. The indices in the input tensor will be [o1, o2, o3 + off3, ...].
+         *
+         * Each axis in the contiguous unsliced axes sequence will add an offset of iN * strideN. In the above example,
+         * the two axes add a total offset of `o1 * stride1 + o2 * stride2`. We can merge the two axes into one axis with
+         * a size of `size1 * size2`. The new offset added will be `o12 * stride2` as the kernel iterates through `o12`.
+         * Note that `o12` is actually `(o1 * size2 + o2)` in the original tensor.
+         */
+        for (std::size_t i = 0; i < inShape.size(); i++) {
+            /* check if axis `i` requires any slicing */
+            if (offsets[i] == 0 && inShape[i] == outShape[i]) {
+                /* loop invariant: `i` is the first axis in the contiguous unsliced axis sequence */
+
+                const std::size_t j = i + 1; /* `j` is the axis which we will attempt to merge */
+                while (j < inShape.size() && offsets[j] == 0 && inShape[j] == outShape[j]) {
+                    /* `j` axis is also unsliced; merge `i` and `j` */
+                    const auto new_size = inShape[i] * inShape[j];
+                    inShape[i] = new_size;
+                    outShape[i] = new_size;
+                    offsets[i] = 0; /* redundant */
+
+                    /* delete axis `j`; the axis that followed it becomes the new axis `j` */
+                    inShape.erase(std::begin(inShape) + j);
+                    outShape.erase(std::begin(outShape) + j);
+                    offsets.erase(std::begin(offsets) + j);
+
+                    /* optimizations should not break the invariants */
+                    CV_Assert(inShape.size() == outShape.size());
+                    CV_Assert(inShape.size() == offsets.size());
+                    CV_Assert(inShape[i] == outShape[i]);
+                    CV_Assert(offsets[i] == 0);
+                }
+            }
+        }
+
+        const auto rank = inShape.size();
+
+        /* validate the rank before touching `back()`/`begin() + 1` of the stride and shape vectors */
+        CV_Assert(1 <= rank && rank <= CSL_MAX_TENSOR_RANK);
+
+        std::vector<std::size_t> inStride(rank), outStride(rank);
+        inStride.back() = 1;
+        outStride.back() = 1;
+        /* garbage, ..., garbage, 1 */
+
+        std::copy(std::begin(inShape) + 1, std::end(inShape), std::begin(inStride));
+        std::copy(std::begin(outShape) + 1, std::end(outShape), std::begin(outStride));
+        /* dim[0], dim[1], ..., dim[-1], 1 */
+
+        std::partial_sum(inStride.rbegin(), inStride.rend(), inStride.rbegin(), std::multiplies<std::size_t>());
+        std::partial_sum(outStride.rbegin(), outStride.rend(), outStride.rbegin(), std::multiplies<std::size_t>());
+        /* stride[0], stride[1], ..., stride[-2], 1 */
+
+        slice_dispatcher<T, 1, CSL_MAX_TENSOR_RANK>(rank, stream, output, outStride, input, inStride, offsets);
+    }
+
+    template void slice(const Stream&, TensorSpan<__half>, TensorView<__half>, std::vector<std::size_t>);
+    template void slice(const Stream&, TensorSpan<float>, TensorView<float>, std::vector<std::size_t>);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/types.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/types.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_TYPES_HPP
+#define OPENCV_DNN_SRC_CUDA_TYPES_HPP
+
+#include <cstdint>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+    /* For indices, we can use 32bit variables or 64bit variables. The GPU registers are 32 bits in size.
+     * Hence, a 64bit variable requires two registers and is significantly slower than the 32bit versions.
+     *
+     * If we do not need to handle huge tensors, we can use 32-bit indices and get better performance.
+     *
+     * `size_type` and `index_type` always name the same signed integer type: plain `int` when
+     * `__CUDACC__` is defined and `std::int32_t` otherwise.
+     */
+#ifdef __CUDACC__
+    using size_type = int;
+    using index_type = int;
+#else
+    using size_type = std::int32_t;
+    using index_type = std::int32_t;
+#endif
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_TYPES_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda/vector_traits.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda/vector_traits.hpp
@@ -0,0 +1,109 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
+#define OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP
+
+#include <cuda_runtime.h>
+
+#include "types.hpp"
+
+#include "../cuda4dnn/csl/pointer.hpp"
+
+#include <type_traits>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace device {
+
+    /** \file vector_traits.hpp
+     *  \brief utility classes and functions for vectorized memory loads/stores
+     *
+     * Example:
+     * using vector_type = get_vector_type_t<float, 4>;
+     *
+     * auto input_vPtr = vector_type::get_pointer(iptr); // iptr is of type DevicePtr<const float>
+     * auto output_vPtr = vector_type::get_pointer(optr);  // optr is of type DevicePtr<float>
+     *
+     * vector_type vec;
+     * v_load(vec, input_vPtr);
+     *
+     * for(int i = 0; i < vector_type::size(); i++)
+     *      vec.data[i] = do_something(vec.data[i]);
+     *
+     * v_store(output_vPtr, vec);
+     */
+
+    namespace detail {
+        /* Maps a width in bits to a built-in vector type of that width. The primary template has
+         * no `type` member, so naming `raw_type_<Bits>::type` for an unlisted width does not compile.
+         */
+        template <size_type Bits> struct raw_type_ { };
+        template <> struct raw_type_<8> { using type = uchar1; };
+        template <> struct raw_type_<16> { using type = uchar2; };
+        template <> struct raw_type_<32> { using type = uint1; };
+        template <> struct raw_type_<64> { using type = uint2; };
+        template <> struct raw_type_<128> { using type = uint4; };
+        template <> struct raw_type_<256> { using type = ulonglong4; };
+
+        /* Checked front-end for `raw_type_`: verifies that the selected type really is `Bits` bits wide. */
+        template <size_type Bits> struct raw_type {
+            using type = typename raw_type_<Bits>::type;
+            static_assert(sizeof(type) * 8 == Bits, "");
+        };
+    }
+
+    /* Union that overlays `N` elements of type `T` with a single raw vector value of the same
+     * total width, so that all elements can be copied through the `raw` member at once.
+     *
+     * \tparam T    type of element in the vector
+     * \tparam N    "number of elements" of type T in the vector
+     */
+    template <class T, size_type N>
+    union vector_type {
+        using value_type = T;
+        using raw_type = typename detail::raw_type<N * sizeof(T) * 8>::type;
+
+        __device__ vector_type() { }
+
+        /* number of elements held by the vector */
+        __device__ static constexpr size_type size() { return N; }
+
+        raw_type raw;
+        T data[N];
+
+        /* overload selected for pointers to const elements: yields a pointer to const vector */
+        template <class Elem> static __device__
+        auto get_pointer(csl::DevicePtr<Elem> ptr)
+            -> typename std::enable_if<std::is_const<Elem>::value, const vector_type*>::type
+        {
+            return reinterpret_cast<const vector_type*>(ptr.get());
+        }
+
+        /* overload selected for pointers to non-const elements: yields a pointer to mutable vector */
+        template <class Elem> static __device__
+        auto get_pointer(csl::DevicePtr<Elem> ptr)
+            -> typename std::enable_if<!std::is_const<Elem>::value, vector_type*>::type
+        {
+            return reinterpret_cast<vector_type*>(ptr.get());
+        }
+    };
+
+    /* loads `src` into `dest` by copying the `raw` member */
+    template <class V>
+    __device__ void v_load(V& dest, const V& src) {
+        dest.raw = src.raw;
+    }
+
+    /* loads the vector pointed to by `src` into `dest` by copying the `raw` member */
+    template <class V>
+    __device__ void v_load(V& dest, const V* src) {
+        const V& source = *src;
+        dest.raw = source.raw;
+    }
+
+    /* stores `src` into the vector pointed to by `dest` by copying the `raw` member */
+    template <class V>
+    __device__ void v_store(V* dest, const V& src) {
+        V& target = *dest;
+        target.raw = src.raw;
+    }
+
+    /* stores `src` into `dest` by copying the `raw` member */
+    template <class V>
+    __device__ void v_store(V& dest, const V& src) {
+        dest.raw = src.raw;
+    }
+
+    /* metafunction whose nested `type` is `vector_type<T, N>` */
+    template <class T, size_type N>
+    struct get_vector_type {
+        using type = vector_type<T, N>;
+    };
+
+    /* shorthand for `typename get_vector_type<T, N>::type` */
+    template <class T, size_type N>
+    using get_vector_type_t = typename get_vector_type<T, N>::type;
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::device */
+
+#endif /* OPENCV_DNN_SRC_CUDA_VECTOR_TRAITS_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cublas.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cublas.hpp
@@ -0,0 +1,230 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP
+
+#include "error.hpp"
+#include "stream.hpp"
+#include "pointer.hpp"
+#include "fp16.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cublas_v2.h>
+
+#include <cstddef>
+#include <memory>
+#include <utility>
+
+#define CUDA4DNN_CHECK_CUBLAS(call) \
+    ::cv::dnn::cuda4dnn::csl::cublas::detail::check((call), CV_Func, __FILE__, __LINE__)
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cublas {
+
+    /** @brief exception class for errors thrown by the cuBLAS API
+     *
+     * Inherits every constructor of CUDAException. `detail::check` throws it whenever a cuBLAS
+     * call returns a status other than CUBLAS_STATUS_SUCCESS.
+     */
+    class cuBLASException : public CUDAException {
+    public:
+        using CUDAException::CUDAException;
+    };
+
+    namespace detail {
+        /* Throws cuBLASException when `status` is not CUBLAS_STATUS_SUCCESS.
+         *
+         * The exception carries the name of the status code together with the function, file and
+         * line supplied by the CUDA4DNN_CHECK_CUBLAS macro.
+         */
+        static void check(cublasStatus_t status, const char* func, const char* file, int line) {
+            if (status == CUBLAS_STATUS_SUCCESS)
+                return;
+
+            /* used for status codes which are not listed below */
+            const char* description = "UNKNOWN_CUBLAS_ERROR";
+            switch (status) {
+            case CUBLAS_STATUS_SUCCESS: description = "CUBLAS_STATUS_SUCCESS"; break;
+            case CUBLAS_STATUS_NOT_INITIALIZED: description = "CUBLAS_STATUS_NOT_INITIALIZED"; break;
+            case CUBLAS_STATUS_ALLOC_FAILED: description = "CUBLAS_STATUS_ALLOC_FAILED"; break;
+            case CUBLAS_STATUS_INVALID_VALUE: description = "CUBLAS_STATUS_INVALID_VALUE"; break;
+            case CUBLAS_STATUS_ARCH_MISMATCH: description = "CUBLAS_STATUS_ARCH_MISMATCH"; break;
+            case CUBLAS_STATUS_MAPPING_ERROR: description = "CUBLAS_STATUS_MAPPING_ERROR"; break;
+            case CUBLAS_STATUS_EXECUTION_FAILED: description = "CUBLAS_STATUS_EXECUTION_FAILED"; break;
+            case CUBLAS_STATUS_INTERNAL_ERROR: description = "CUBLAS_STATUS_INTERNAL_ERROR"; break;
+            case CUBLAS_STATUS_NOT_SUPPORTED: description = "CUBLAS_STATUS_NOT_SUPPORTED"; break;
+            case CUBLAS_STATUS_LICENSE_ERROR: description = "CUBLAS_STATUS_LICENSE_ERROR"; break;
+            }
+
+            throw cuBLASException(Error::GpuApiCallError, description, func, file, line);
+        }
+    }
+
+    /** noncopyable cuBLAS smart handle
+     *
+     * UniqueHandle is a smart non-sharable wrapper for cuBLAS handle which ensures that the handle
+     * is destroyed after use. The handle can be associated with a CUDA stream by specifying the
+     * stream during construction. By default, the handle is associated with the default stream.
+     *
+     * A moved-from UniqueHandle holds a null handle and destroys nothing.
+     */
+    class UniqueHandle {
+    public:
+        /** creates a new cuBLAS handle; throws cuBLASException if the creation fails */
+        UniqueHandle() { CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle)); }
+        UniqueHandle(const UniqueHandle&) = delete;
+        UniqueHandle(UniqueHandle&& other) noexcept
+            : stream(std::move(other.stream)), handle{ other.handle } {
+            other.handle = nullptr;
+        }
+
+        /** creates a new cuBLAS handle and associates it with \p strm
+         *
+         * The stream is stored in the object alongside the handle. If the association fails, the
+         * freshly created handle is destroyed before the exception is propagated.
+         */
+        UniqueHandle(Stream strm) : stream(std::move(strm)) {
+            CUDA4DNN_CHECK_CUBLAS(cublasCreate(&handle));
+            try {
+                CUDA4DNN_CHECK_CUBLAS(cublasSetStream(handle, stream.get()));
+            } catch (...) {
+                /* cublasDestroy won't throw if a valid handle is passed */
+                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
+                throw;
+            }
+        }
+
+        ~UniqueHandle() noexcept {
+            if (handle != nullptr) {
+                /* cublasDestroy won't throw if a valid handle is passed */
+                CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
+            }
+        }
+
+        UniqueHandle& operator=(const UniqueHandle&) = delete;
+
+        /** takes over the handle and stream of \p other
+         *
+         * The handle currently owned (if any) is destroyed first so that it is not leaked.
+         * Self-assignment leaves the object untouched.
+         */
+        UniqueHandle& operator=(UniqueHandle&& other) noexcept {
+            if (this != &other) {
+                if (handle != nullptr) {
+                    /* cublasDestroy won't throw if a valid handle is passed */
+                    CUDA4DNN_CHECK_CUBLAS(cublasDestroy(handle));
+                }
+
+                stream = std::move(other.stream);
+                handle = other.handle;
+                other.handle = nullptr;
+            }
+            return *this;
+        }
+
+        /** @brief returns the raw cuBLAS handle */
+        cublasHandle_t get() const noexcept { return handle; }
+
+    private:
+        Stream stream;
+        cublasHandle_t handle;
+    };
+
+    /** @brief sharable cuBLAS smart handle
+     *
+     * Handle wraps a reference-counted UniqueHandle: copies of a Handle share the same cuBLAS
+     * handle, which is destroyed once the last reference to it is gone. A stream can be supplied
+     * at construction to associate the handle with it; otherwise the handle is associated with
+     * the default stream.
+     *
+     * @note Moving a Handle object to another invalidates the former
+     */
+    class Handle {
+    public:
+        Handle() : handle(std::make_shared<UniqueHandle>()) { }
+        Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }
+
+        Handle(const Handle&) = default;
+        Handle(Handle&&) = default;
+
+        Handle& operator=(const Handle&) = default;
+        Handle& operator=(Handle&&) = default;
+
+        /** returns true if the handle is valid */
+        explicit operator bool() const noexcept { return handle != nullptr; }
+
+        /** returns the raw cuBLAS handle; the object must be valid (asserted) */
+        cublasHandle_t get() const noexcept {
+            CV_Assert(handle);
+            return handle->get();
+        }
+
+    private:
+        std::shared_ptr<UniqueHandle> handle;
+    };
+
+    /** @brief GEMM for column-major matrices
+     *
+     * \f$ C = \alpha AB + \beta C \f$
+     *
+     * @tparam          T           matrix element type (must be `half` or `float`)
+     *
+     * @param           handle      valid cuBLAS Handle
+     * @param           transa      use transposed matrix of A for computation
+     * @param           transb      use transposed matrix of B for computation
+     * @param           rows_c      number of rows in C
+     * @param           cols_c      number of columns in C
+     * @param           common_dim  common dimension of A (or trans A) and B (or trans B)
+     * @param           alpha       scale factor for AB
+     * @param[in]       A           pointer to column-major matrix A in device memory
+     * @param           lda         leading dimension of matrix A
+     * @param[in]       B           pointer to column-major matrix B in device memory
+     * @param           ldb         leading dimension of matrix B
+     * @param           beta        scale factor for C
+     * @param[in,out]   C           pointer to column-major matrix C in device memory
+     * @param           ldc         leading dimension of matrix C
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void gemm(const Handle& handle,
+        bool transa, bool transb,
+        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
+        T alpha, const DevicePtr<const T> A, std::size_t lda,
+        const DevicePtr<const T> B, std::size_t ldb,
+        T beta, const DevicePtr<T> C, std::size_t ldc);
+
+    /* `half` specialization of gemm(): converts the sizes to `int` and forwards to cublasHgemm */
+    template <> inline
+    void gemm<half>(const Handle& handle,
+        bool transa, bool transb,
+        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
+        half alpha, const DevicePtr<const half> A, std::size_t lda,
+        const DevicePtr<const half> B, std::size_t ldb,
+        half beta, const DevicePtr<half> C, std::size_t ldc)
+    {
+        CV_Assert(handle);
+
+        const cublasOperation_t op_a = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
+        const cublasOperation_t op_b = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+        /* the cuBLAS call below is given every dimension as `int` */
+        const int m = static_cast<int>(rows_c);
+        const int n = static_cast<int>(cols_c);
+        const int k = static_cast<int>(common_dim);
+        const int lead_a = static_cast<int>(lda);
+        const int lead_b = static_cast<int>(ldb);
+        const int lead_c = static_cast<int>(ldc);
+
+        CUDA4DNN_CHECK_CUBLAS(
+            cublasHgemm(
+                handle.get(),
+                op_a, op_b,
+                m, n, k,
+                &alpha, A.get(), lead_a,
+                B.get(), lead_b,
+                &beta, C.get(), lead_c
+            )
+        );
+    }
+
+    /* `float` specialization of gemm(): converts the sizes to `int` and forwards to cublasSgemm */
+    template <> inline
+    void gemm<float>(const Handle& handle,
+        bool transa, bool transb,
+        std::size_t rows_c, std::size_t cols_c, std::size_t common_dim,
+        float alpha, const DevicePtr<const float> A, std::size_t lda,
+        const DevicePtr<const float> B, std::size_t ldb,
+        float beta, const DevicePtr<float> C, std::size_t ldc)
+    {
+        CV_Assert(handle);
+
+        const cublasOperation_t op_a = transa ? CUBLAS_OP_T : CUBLAS_OP_N;
+        const cublasOperation_t op_b = transb ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+        /* the cuBLAS call below is given every dimension as `int` */
+        const int m = static_cast<int>(rows_c);
+        const int n = static_cast<int>(cols_c);
+        const int k = static_cast<int>(common_dim);
+        const int lead_a = static_cast<int>(lda);
+        const int lead_b = static_cast<int>(ldb);
+        const int lead_c = static_cast<int>(ldc);
+
+        CUDA4DNN_CHECK_CUBLAS(
+            cublasSgemm(
+                handle.get(),
+                op_a, op_b,
+                m, n, k,
+                &alpha, A.get(), lead_a,
+                B.get(), lead_b,
+                &beta, C.get(), lead_c
+            )
+        );
+    }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cublas */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUBLAS_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn.hpp
@@ -0,0 +1,10 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP
+
+#include "cudnn/cudnn.hpp"
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_CUDNN_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/convolution.hpp
@@ -0,0 +1,410 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP
+
+#include "cudnn.hpp"
+
+#include "../pointer.hpp"
+#include "../workspace.hpp"
+
+#include <cudnn.h>
+
+#include <cstddef>
+#include <array>
+#include <algorithm>
+#include <vector>
+#include <type_traits>
+#include <iterator>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+    /** describe convolution filters
+     *
+     * Move-only owner of a `cudnnFilterDescriptor_t`. A default-constructed or moved-from object
+     * holds a null descriptor and destroys nothing.
+     *
+     * @tparam  T   type of elements in the kernels
+     */
+    template <class T>
+    class FilterDescriptor {
+    public:
+        FilterDescriptor() noexcept : descriptor{ nullptr } { }
+        FilterDescriptor(const FilterDescriptor&) = delete;
+        FilterDescriptor(FilterDescriptor&& other) noexcept
+            : descriptor{ other.descriptor } {
+            other.descriptor = nullptr;
+        }
+
+        /** constructs a filter descriptor from the filter dimensions provided in \p shape
+         *
+         * Shape dimensions:
+         * 0: number of filters
+         * 1: number of input feature maps
+         * 2..n: kernel dimensions
+         *
+         * Exception Guarantee: Strong
+         */
+        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+        FilterDescriptor(const SequenceContainer& shape) {
+            constructor(shape.begin(), shape.end());
+        }
+
+        /** constructs a filter descriptor from the filter dimensions provided in [begin, end)
+         *
+         * Shape dimensions:
+         * 0: number of filters
+         * 1: number of input feature maps
+         * 2..n: kernel dimensions
+         *
+         * Exception Guarantee: Strong
+         */
+        template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
+        FilterDescriptor(ForwardItr begin, ForwardItr end) {
+            constructor(begin, end);
+        }
+
+        /** constructs a filter descriptor from the filter dimensions provided as arguments
+         *
+         * Shape dimensions:
+         * 0: number of filters
+         * 1: number of input feature maps
+         * 2..n: kernel dimensions
+         *
+         * Exception Guarantee: Strong
+         */
+        template <class ...Sizes>
+        FilterDescriptor(Sizes ...sizes) {
+            static_assert(sizeof...(Sizes) >= 3, "filter descriptors must have at least three dimensions");
+            static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
+            std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
+            constructor(std::begin(dims), std::end(dims));
+        }
+
+        ~FilterDescriptor() noexcept {
+            if (descriptor != nullptr) {
+                /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
+            }
+        }
+
+        FilterDescriptor& operator=(const FilterDescriptor&) = delete;
+
+        /** takes over the descriptor of \p other
+         *
+         * The descriptor currently owned (if any) is destroyed first so that it is not leaked.
+         * Self-assignment leaves the object untouched.
+         */
+        FilterDescriptor& operator=(FilterDescriptor&& other) noexcept {
+            if (this != &other) {
+                if (descriptor != nullptr) {
+                    /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
+                    CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
+                }
+
+                descriptor = other.descriptor;
+                other.descriptor = nullptr;
+            }
+            return *this;
+        }
+
+        /** returns the raw cuDNN filter descriptor (null if the object owns none) */
+        cudnnFilterDescriptor_t get() const noexcept { return descriptor; }
+
+    private:
+        /* Creates the descriptor and configures it from the dimensions in [start, end).
+         *
+         * The number of dimensions must lie in [3, CUDNN_DIM_MAX]. Four dimensions use the
+         * dedicated 4d setter; every other rank goes through the Nd setter. If configuring fails,
+         * the freshly created descriptor is destroyed before the exception is propagated.
+         */
+        template <class ForwardItr>
+        void constructor(ForwardItr start, ForwardItr end) {
+            CV_Assert(start != end);
+            CV_Assert(std::distance(start, end) >= 3);
+            CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);
+
+            CUDA4DNN_CHECK_CUDNN(cudnnCreateFilterDescriptor(&descriptor));
+            try {
+                const auto rank = std::distance(start, end);
+                if (rank == 4) {
+                    std::array<int, 4> dims;
+                    std::copy(start, end, std::begin(dims));
+                    CUDA4DNN_CHECK_CUDNN(
+                        cudnnSetFilter4dDescriptor(
+                            descriptor,
+                            detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
+                            dims[0], dims[1], dims[2], dims[3]
+                        )
+                    );
+                } else {
+                    std::vector<int> dims(start, end);
+                    CUDA4DNN_CHECK_CUDNN(
+                        cudnnSetFilterNdDescriptor(
+                            descriptor,
+                            detail::get_data_type<T>(), CUDNN_TENSOR_NCHW,
+                            static_cast<int>(dims.size()), dims.data()
+                        )
+                    );
+                }
+            } catch (...) {
+                /* cudnnDestroyFilterDescriptor will not fail for a valid descriptor object */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyFilterDescriptor(descriptor));
+                throw;
+            }
+        }
+
+        cudnnFilterDescriptor_t descriptor;
+    };
+
+    /** describes a convolution operation
+     *
+     * Move-only owner of a `cudnnConvolutionDescriptor_t`. A default-constructed or moved-from
+     * object holds a null descriptor and destroys nothing.
+     *
+     * @tparam  T   type of element participating in convolution
+     */
+    template <class T>
+    class ConvolutionDescriptor {
+    public:
+        ConvolutionDescriptor() noexcept : descriptor{ nullptr } { }
+        ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
+        ConvolutionDescriptor(ConvolutionDescriptor&& other) noexcept
+            : descriptor{ other.descriptor } {
+            other.descriptor = nullptr;
+        }
+
+        /** constructs a convolution descriptor
+         *
+         * Pre-conditions:
+         * - \p zero_padding, \p stride and \p dilation must have the same size
+         *
+         * The length of the containers is interpreted as the order of the convolution.
+         *
+         * Exception Guarantee: Strong
+         */
+        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+        ConvolutionDescriptor(
+            const SequenceContainer& zero_padding,
+            const SequenceContainer& stride,
+            const SequenceContainer& dilation,
+            std::size_t group_count)
+        {
+            constructor(zero_padding, stride, dilation, group_count);
+        }
+
+        ~ConvolutionDescriptor() noexcept {
+            if (descriptor != nullptr) {
+                /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
+            }
+        }
+
+        ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
+
+        /** takes over the descriptor of \p other
+         *
+         * The descriptor currently owned (if any) is destroyed first so that it is not leaked.
+         * Self-assignment leaves the object untouched.
+         */
+        ConvolutionDescriptor& operator=(ConvolutionDescriptor&& other) noexcept {
+            if (this != &other) {
+                if (descriptor != nullptr) {
+                    /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
+                    CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
+                }
+
+                descriptor = other.descriptor;
+                other.descriptor = nullptr;
+            }
+            return *this;
+        }
+
+        /** returns the raw cuDNN convolution descriptor (null if the object owns none) */
+        cudnnConvolutionDescriptor_t get() const noexcept { return descriptor; }
+
+    private:
+        /* Creates the descriptor and configures padding, stride, dilation, group count and, when
+         * `T` is `half`, the tensor-op math type.
+         *
+         * Two-element containers use the dedicated 2d setter; every other length goes through the
+         * Nd setter. If any step fails, the freshly created descriptor is destroyed before the
+         * exception is propagated.
+         */
+        template <class SequenceContainer>
+        void constructor(
+            const SequenceContainer& zero_padding,
+            const SequenceContainer& stride,
+            const SequenceContainer& dilation,
+            std::size_t group_count)
+        {
+            CV_Assert(zero_padding.size() == stride.size());
+            CV_Assert(zero_padding.size() == dilation.size());
+
+            CUDA4DNN_CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&descriptor));
+            try {
+                const auto rank = zero_padding.size();
+                if (rank == 2) {
+                    CUDA4DNN_CHECK_CUDNN(
+                        cudnnSetConvolution2dDescriptor(
+                            descriptor,
+                            zero_padding[0], zero_padding[1],
+                            stride[0], stride[1],
+                            dilation[0], dilation[1],
+                            CUDNN_CROSS_CORRELATION,
+                            detail::get_data_type<T>()
+                        )
+                    );
+                } else {
+                    std::vector<int> ipadding(std::begin(zero_padding), std::end(zero_padding));
+                    std::vector<int> istride(std::begin(stride), std::end(stride));
+                    std::vector<int> idilation(std::begin(dilation), std::end(dilation));
+                    CUDA4DNN_CHECK_CUDNN(
+                        cudnnSetConvolutionNdDescriptor(
+                            descriptor,
+                            static_cast<int>(rank), ipadding.data(), istride.data(), idilation.data(),
+                            CUDNN_CROSS_CORRELATION,
+                            detail::get_data_type<T>()
+                        )
+                    );
+                }
+                CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionGroupCount(descriptor, static_cast<int>(group_count)));
+                if (std::is_same<T, half>::value)
+                    CUDA4DNN_CHECK_CUDNN(cudnnSetConvolutionMathType(descriptor, CUDNN_TENSOR_OP_MATH));
+            } catch (...) {
+                /* cudnnDestroyConvolutionDescriptor will not fail for a valid descriptor object */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyConvolutionDescriptor(descriptor));
+                throw;
+            }
+        }
+
+        cudnnConvolutionDescriptor_t descriptor;
+    };
+
+    /** wrapper around a convolution algorithm
+     *
+     * Stores the forward convolution algorithm selected by cuDNN together with the
+     * number of bytes of workspace memory which that algorithm requires.
+     *
+     * @tparam  T   type of elements being convolved
+     */
+    template <class T>
+    class ConvolutionAlgorithm {
+    public:
+        /** creates an empty object: value-initialized algorithm and zero workspace size */
+        ConvolutionAlgorithm() noexcept : algo{ }, workspace_size{ 0 } { }
+
+        /* the copy constructor takes a const reference so that const objects can be copied too */
+        ConvolutionAlgorithm(const ConvolutionAlgorithm&) = default;
+        ConvolutionAlgorithm(ConvolutionAlgorithm&&) = default;
+
+        /** selects a good algorithm for convolution for given configuration
+         *
+         * The algorithm is queried with the "prefer fastest" preference and no memory limit;
+         * the workspace size required by the selected algorithm is queried afterwards.
+         *
+         * @param handle    cuDNN handle used for the queries
+         * @param conv      convolution descriptor
+         * @param filter    filter descriptor
+         * @param input     descriptor of the input tensor
+         * @param output    descriptor of the output tensor
+         *
+         * Exception Guarantee: Strong
+         */
+        ConvolutionAlgorithm(
+            const Handle& handle,
+            const ConvolutionDescriptor<T>& conv,
+            const FilterDescriptor<T>& filter,
+            const TensorDescriptor<T>& input,
+            const TensorDescriptor<T>& output)
+        {
+            CUDA4DNN_CHECK_CUDNN(
+                cudnnGetConvolutionForwardAlgorithm(
+                    handle.get(),
+                    input.get(), filter.get(), conv.get(), output.get(),
+                    CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
+                    0, /* no memory limit */
+                    &algo
+                )
+            );
+
+            CUDA4DNN_CHECK_CUDNN(
+                cudnnGetConvolutionForwardWorkspaceSize(
+                    handle.get(),
+                    input.get(), filter.get(), conv.get(), output.get(),
+                    algo, &workspace_size
+                )
+            );
+        }
+
+        ConvolutionAlgorithm& operator=(const ConvolutionAlgorithm&) = default;
+        ConvolutionAlgorithm& operator=(ConvolutionAlgorithm&& other) = default;
+
+        /** returns the selected cuDNN forward convolution algorithm */
+        cudnnConvolutionFwdAlgo_t get() const noexcept { return algo; }
+
+        /** number of bytes of workspace memory required by the algorithm */
+        std::size_t get_workspace_size() const noexcept { return workspace_size; }
+
+    private:
+        cudnnConvolutionFwdAlgo_t algo;
+        std::size_t workspace_size;
+    };
+
+    /** gives the shape of the output tensor of convolution
+     *
+     * @param       convDesc    convolution descriptor
+     * @param       filterDesc  filter descriptor
+     * @param       inputDesc   descriptor of the input tensor
+     * @param[out]  output      receives the output dimensions; previous contents are discarded
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void getConvolutionForwardOutputDim(
+        const ConvolutionDescriptor<T>& convDesc,
+        const FilterDescriptor<T>& filterDesc,
+        const TensorDescriptor<T>& inputDesc,
+        std::vector<int>& output)
+    {
+        /* `output` doubles as scratch space: its first element receives the rank of the input */
+        output.assign(CUDNN_DIM_MAX, 0);
+
+        /* the dimensions and strides reported by the query are not needed; both land in `scratch` */
+        std::vector<int> scratch(CUDNN_DIM_MAX);
+        cudnnDataType_t ignoredDataType;
+        int* const rankSlot = output.data();
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnGetTensorNdDescriptor(
+                inputDesc.get(),
+                CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
+                &ignoredDataType,
+                rankSlot,
+                scratch.data(),
+                scratch.data()
+            )
+        );
+
+        const int tensorRank = output.front();
+        output.resize(tensorRank);
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnGetConvolutionNdForwardOutputDim(
+                convDesc.get(), inputDesc.get(), filterDesc.get(), tensorRank, output.data()
+            )
+        );
+    }
+
+    /** @brief performs convolution
+     *
+     * dstValue = alpha * result + beta * priorDstValue
+     *
+     * @tparam          T           convolution element type (must be `half` or `float`)
+     *
+     * @param           handle      valid cuDNN Handle
+     * @param           convDesc    convolution description
+     * @param           convAlgo    algorithm to use for convolution
+     * @param           workspace   workspace memory which meets the requirements of \p convAlgo
+     * @param           filterDesc  filter descriptor
+     * @param[in]       filterPtr   pointer to device memory containing the filters
+     * @param           inputDesc   tensor descriptor describing the input
+     * @param[in]       inputPtr    pointer to input tensor in device memory
+     * @param           alpha       result scale factor
+     * @param           beta        previous value scale factor
+     * @param           outputDesc  tensor descriptor describing the output
+     * @param[out]      outputPtr   pointer to output tensor in device memory
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void convolve(
+        const Handle& handle,
+        const ConvolutionDescriptor<T>& convDesc,
+        const ConvolutionAlgorithm<T>& convAlgo,
+        WorkspaceInstance workspace,
+        const FilterDescriptor<T>& filterDesc,
+        DevicePtr<const T> filterPtr,
+        const TensorDescriptor<T>& inputDesc,
+        DevicePtr<const T> inputPtr,
+        T alpha, T beta,
+        const TensorDescriptor<T>& outputDesc,
+        DevicePtr<T> outputPtr)
+    {
+        CV_Assert(handle);
+
+        /* raw view of the workspace handed over to cuDNN */
+        void* const scratch = static_cast<void*>(workspace.get());
+        const auto scratchBytes = workspace.size_in_bytes();
+
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnConvolutionForward(
+                handle.get(),
+                &alpha, inputDesc.get(), inputPtr.get(),
+                filterDesc.get(), filterPtr.get(),
+                convDesc.get(), convAlgo.get(),
+                scratch, scratchBytes,
+                &beta, outputDesc.get(), outputPtr.get()
+            )
+        );
+    }
+
+    /* fp16 specialization of convolve: identical to the generic version except that the
+     * scaling factors are converted to `float` before being handed to cuDNN
+     */
+    template <> inline
+    void convolve(
+        const Handle& handle,
+        const ConvolutionDescriptor<half>& convDesc,
+        const ConvolutionAlgorithm<half>& convAlgo,
+        WorkspaceInstance workspace,
+        const FilterDescriptor<half>& filterDesc,
+        DevicePtr<const half> filterPtr,
+        const TensorDescriptor<half>& inputDesc,
+        DevicePtr<const half> inputPtr,
+        half alpha, half beta,
+        const TensorDescriptor<half>& outputDesc,
+        DevicePtr<half> outputPtr)
+    {
+        CV_Assert(handle);
+
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+        const float resultScale = alpha;
+        const float priorScale = beta;
+
+        /* raw view of the workspace handed over to cuDNN */
+        void* const scratch = static_cast<void*>(workspace.get());
+        const auto scratchBytes = workspace.size_in_bytes();
+
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnConvolutionForward(
+                handle.get(),
+                &resultScale, inputDesc.get(), inputPtr.get(),
+                filterDesc.get(), filterPtr.get(),
+                convDesc.get(), convAlgo.get(),
+                scratch, scratchBytes,
+                &priorScale, outputDesc.get(), outputPtr.get()
+            )
+        );
+    }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CONVOLUTION_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/cudnn.hpp
@@ -0,0 +1,280 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP
+
+#include "../fp16.hpp"
+#include "../pointer.hpp"
+
+#include <cudnn.h>
+
+#include <cstddef>
+#include <array>
+#include <algorithm>
+#include <functional>
+#include <numeric>
+#include <vector>
+#include <type_traits>
+#include <iterator>
+
+/* Evaluates a cuDNN call and forwards its status, together with the calling function name,
+ * file and line, to detail::check (defined below), which throws when the status is not success.
+ */
+#define CUDA4DNN_CHECK_CUDNN(call) \
+    ::cv::dnn::cuda4dnn::csl::cudnn::detail::check((call), CV_Func, __FILE__, __LINE__)
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+    /** @brief exception class for errors thrown by the cuDNN API
+     *
+     * Derives from CUDAException and inherits its constructors unchanged.
+     */
+    class cuDNNException : public CUDAException {
+    public:
+        using CUDAException::CUDAException;
+    };
+
+    namespace detail {
+        /** throws a cuDNNException describing \p status unless it is CUDNN_STATUS_SUCCESS
+         *
+         * @param status    status returned by a cuDNN call
+         * @param func      name of the function which made the call
+         * @param file      source file of the call
+         * @param line      source line of the call
+         */
+        inline void check(cudnnStatus_t status, const char* func, const char* file, int line) {
+            if (status == CUDNN_STATUS_SUCCESS)
+                return;
+
+            throw cuDNNException(Error::GpuApiCallError, cudnnGetErrorString(status), func, file, line);
+        }
+
+        /** get_data_type<T> returns the equivalent cudnn enumeration constant for type T
+         *
+         * Only the `half` and `float` specializations are defined.
+         */
+        template <class> auto get_data_type()->decltype(CUDNN_DATA_FLOAT);
+        template <> inline auto get_data_type<half>()->decltype(CUDNN_DATA_HALF) { return CUDNN_DATA_HALF; }
+        template <> inline auto get_data_type<float>()->decltype(CUDNN_DATA_FLOAT) { return CUDNN_DATA_FLOAT; }
+    }
+
+    /** @brief noncopyable cuDNN smart handle
+     *
+     * UniqueHandle is a smart non-sharable wrapper for cuDNN handle which ensures that the handle
+     * is destroyed after use. A moved-from object holds no handle.
+     */
+    class UniqueHandle {
+    public:
+        /** creates a cuDNN handle which executes in the default stream
+         *
+         * Exception Guarantee: Basic
+         */
+        UniqueHandle() { CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle)); }
+
+        UniqueHandle(const UniqueHandle&) = delete;
+
+        /** takes over the stream and the handle of \p other and leaves \p other without a handle */
+        UniqueHandle(UniqueHandle&& other) noexcept
+            : stream(std::move(other.stream)), handle{ other.handle } {
+            other.handle = nullptr;
+        }
+
+        /** creates a cuDNN handle and associates it with the stream specified
+         *
+         * Exception Guarantee: Basic
+         */
+        UniqueHandle(Stream strm) : stream(std::move(strm)) {
+            CUDA4DNN_CHECK_CUDNN(cudnnCreate(&handle));
+            try {
+                CUDA4DNN_CHECK_CUDNN(cudnnSetStream(handle, stream.get()));
+            } catch (...) {
+                /* cudnnDestroy won't throw if a valid handle is passed */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle));
+                throw;
+            }
+        }
+
+        ~UniqueHandle() noexcept {
+            if (handle != nullptr) {
+                /* cudnnDestroy won't throw if a valid handle is passed */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle));
+            }
+        }
+
+        UniqueHandle& operator=(const UniqueHandle&) = delete;
+
+        /** releases the handle currently owned and takes over the stream and handle of \p other */
+        UniqueHandle& operator=(UniqueHandle&& other) noexcept {
+            /* self-move must be a no-op; otherwise the handle would be reset to nullptr and leaked */
+            if (this != &other) {
+                /* destroy our own handle first (while its stream is still alive) so that it
+                 * is not leaked when it gets overwritten below
+                 */
+                if (handle != nullptr) {
+                    /* cudnnDestroy won't throw if a valid handle is passed */
+                    CUDA4DNN_CHECK_CUDNN(cudnnDestroy(handle));
+                }
+
+                stream = std::move(other.stream);
+                handle = other.handle;
+                other.handle = nullptr;
+            }
+            return *this;
+        }
+
+        /** returns the raw cuDNN handle */
+        cudnnHandle_t get() const noexcept { return handle; }
+
+    private:
+        Stream stream;
+        cudnnHandle_t handle;
+    };
+
+    /** @brief sharable cuDNN smart handle
+     *
+     * Handle keeps a shared reference to a UniqueHandle; the underlying cuDNN handle is
+     * destroyed once every Handle referring to it has been destroyed.
+     *
+     * @note Moving a Handle object to another invalidates the former
+     */
+    class Handle {
+    public:
+        /** creates a cuDNN handle which executes in the default stream
+         *
+         * Exception Guarantee: Basic
+         */
+        Handle() : handle(std::make_shared<UniqueHandle>()) { }
+
+        /** creates a cuDNN handle and associates it with the stream specified
+         *
+         * Exception Guarantee: Basic
+         */
+        Handle(Stream strm) : handle(std::make_shared<UniqueHandle>(std::move(strm))) { }
+
+        Handle(const Handle&) = default;
+        Handle(Handle&&) = default;
+
+        Handle& operator=(const Handle&) = default;
+        Handle& operator=(Handle&&) = default;
+
+        /** returns true if the handle is valid */
+        explicit operator bool() const noexcept { return handle != nullptr; }
+
+        /** returns the raw cuDNN handle of the shared UniqueHandle
+         *
+         * NOTE(review): CV_Assert is used inside a noexcept function; if a failed assertion
+         * throws, the program would terminate instead of propagating it - confirm this is intended.
+         */
+        cudnnHandle_t get() const noexcept {
+            CV_Assert(handle);
+            return handle->get();
+        }
+
+    private:
+        std::shared_ptr<UniqueHandle> handle;
+    };
+
+    /** describe a tensor
+     *
+     * Owns a cuDNN tensor descriptor and destroys it in the destructor. The class is
+     * move-only; a moved-from object holds no descriptor.
+     *
+     * @tparam  T   type of elements in the tensor
+     */
+    template <class T>
+    class TensorDescriptor {
+    public:
+        /** creates an empty object which holds no descriptor */
+        TensorDescriptor() noexcept : descriptor{ nullptr } { }
+        TensorDescriptor(const TensorDescriptor&) = delete;
+
+        /** takes over the descriptor of \p other and leaves \p other empty */
+        TensorDescriptor(TensorDescriptor&& other) noexcept
+            : descriptor{ other.descriptor } {
+            other.descriptor = nullptr;
+        }
+
+        /** constructs a tensor descriptor from the axis lengths provided in \p shape
+         *
+         * Exception Guarantee: Basic
+         */
+        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+        TensorDescriptor(const SequenceContainer& shape) {
+            constructor(shape.begin(), shape.end());
+        }
+
+        /** constructs a tensor descriptor from the axis lengths provided in [begin, end)
+         *
+         * Exception Guarantee: Basic
+         */
+        template <class ForwardItr, typename = typename std::enable_if<!std::is_integral<ForwardItr>::value, void>::type> // TODO is_iterator
+        TensorDescriptor(ForwardItr begin, ForwardItr end) {
+            constructor(begin, end);
+        }
+
+        /** constructs a tensor descriptor from the axis lengths provided as arguments
+         *
+         * Exception Guarantee: Basic
+         */
+        template <class ...Sizes>
+        TensorDescriptor(Sizes ...sizes) {
+            static_assert(sizeof...(Sizes) <= CUDNN_DIM_MAX, "required rank exceeds maximum supported rank");
+            std::array<int, sizeof...(Sizes)> dims = { static_cast<int>(sizes)... };
+            constructor(std::begin(dims), std::end(dims));
+        }
+
+        ~TensorDescriptor() noexcept {
+            if (descriptor != nullptr) {
+                /* cudnnDestroyTensorDescriptor will not fail */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor));
+            }
+        }
+
+        TensorDescriptor& operator=(const TensorDescriptor&) = delete;
+
+        /** releases the descriptor currently owned and takes over the one held by \p other */
+        TensorDescriptor& operator=(TensorDescriptor&& other) noexcept {
+            /* self-move must be a no-op; otherwise the descriptor would be reset to nullptr and leaked */
+            if (this != &other) {
+                /* destroy our own descriptor so that it is not leaked when overwritten below */
+                if (descriptor != nullptr) {
+                    /* cudnnDestroyTensorDescriptor will not fail */
+                    CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor));
+                }
+
+                descriptor = other.descriptor;
+                other.descriptor = nullptr;
+            }
+            return *this;
+        }
+
+        /** returns the raw cuDNN tensor descriptor (nullptr for an empty object) */
+        cudnnTensorDescriptor_t get() const noexcept { return descriptor; }
+
+    private:
+        /** creates and configures the descriptor from the axis lengths in [start, end)
+         *
+         * The range must be non-empty and must not contain more than CUDNN_DIM_MAX entries.
+         * If configuring the descriptor fails, the descriptor is destroyed before the
+         * exception is rethrown.
+         */
+        template <class ForwardItr>
+        void constructor(ForwardItr start, ForwardItr end) {
+            CV_Assert(start != end);
+            CV_Assert(std::distance(start, end) <= CUDNN_DIM_MAX);
+
+            CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorDescriptor(&descriptor));
+            try {
+                /* cuDNN documentation recommends using the 4d tensor API whenever possible
+                 * hence, we create a 4d tensor descriptors for 3d tensor
+                 */
+                const auto rank = std::distance(start, end);
+                if (rank <= 4) {
+                    std::array<int, 4> dims;
+                    std::fill(std::begin(dims), std::end(dims), 1);
+
+                    /* suppose we have a 3d tensor, the first axis is the batch axis and
+                     * the second axis is the channel axis (generally)
+                     *
+                     * cuDNN frequently assumes that the first axis is the batch axis and the
+                     * second axis is the channel axis; hence, we copy the shape of a lower rank
+                     * tensor to the beginning of `dims`
+                     */
+                    std::copy(start, end, std::begin(dims));
+
+                    CUDA4DNN_CHECK_CUDNN(
+                        cudnnSetTensor4dDescriptor(descriptor,
+                            CUDNN_TENSOR_NCHW, detail::get_data_type<T>(),
+                            dims[0], dims[1], dims[2], dims[3]
+                        )
+                    );
+                } else {
+                    std::vector<int> stride(rank);
+                    stride.back() = 1;
+                    /* WHAT WE HAVE NOW:
+                     * stride[-1] = 1
+                     * stride[-2] = garbage
+                     * stride[-3] = garbage
+                     * stride[-4] = garbage
+                     * ...
+                     */
+
+                    /* std::next keeps this valid for forward iterators (`start + 1` needs random access) */
+                    std::copy(std::next(start), end, stride.begin());
+                    /* WHAT WE HAVE NOW:
+                     * stride[-1] = 1
+                     * stride[-2] = dim[-1]
+                     * stride[-3] = dim[-2]
+                     * stride[-4] = dim[-3]
+                     * ...
+                     */
+
+                    std::partial_sum(stride.rbegin(), stride.rend(), stride.rbegin(), std::multiplies<int>());
+                    /* WHAT WE HAVE NOW:
+                     * stride[-1] = 1
+                     * stride[-2] = stride[-1] * dim[-1]
+                     * stride[-3] = stride[-2] * dim[-2]
+                     * stride[-4] = stride[-3] * dim[-3]
+                     * ...
+                     */
+
+                    std::vector<int> dims(start, end);
+                    CUDA4DNN_CHECK_CUDNN(
+                        cudnnSetTensorNdDescriptor(descriptor,
+                            detail::get_data_type<T>(), rank,
+                            dims.data(), stride.data()
+                        )
+                    );
+                }
+            } catch (...) {
+                /* cudnnDestroyTensorDescriptor will not fail */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorDescriptor(descriptor));
+                throw;
+            }
+        }
+
+        cudnnTensorDescriptor_t descriptor;
+    };
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_CUDNN_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/lrn.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/lrn.hpp
@@ -0,0 +1,205 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP
+
+#include "cudnn.hpp"
+
+#include "../pointer.hpp"
+#include "../workspace.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cudnn.h>
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+    /** owns a cuDNN LRN descriptor together with the normalization mode it is meant for
+     *
+     * The descriptor is destroyed in the destructor. The class is move-only; a moved-from
+     * object holds no descriptor.
+     */
+    class LRNDescriptor {
+    public:
+        enum class LRNType {
+            ACROSS_CHANNELS,
+            WITHIN_CHANNEL
+        };
+
+        /** creates an empty object which holds no descriptor
+         *
+         * `type` is given a defined value so that moving from an empty object does not
+         * read an indeterminate value.
+         */
+        LRNDescriptor() noexcept : descriptor{ nullptr }, type{ LRNType::ACROSS_CHANNELS } { }
+        LRNDescriptor(const LRNDescriptor&) = delete;
+
+        /** takes over the descriptor and type of \p other and leaves \p other without a descriptor */
+        LRNDescriptor(LRNDescriptor&& other) noexcept
+            : descriptor{ other.descriptor }, type{ other.type } {
+            other.descriptor = nullptr;
+        }
+
+        /** sets up a LRN descriptor
+         *
+         * @param local_size    size of the normalization window
+         * @param alpha         variance scaling parameter
+         * @param beta          power parameter
+         * @param k             bias parameter
+         * @param type_         normalization mode which is reported by getType()
+         *
+         * @note \p alpha is divided by the window width in across channels mode
+         * @note \p alpha is divided by the (window width)^spatialDimensions in within channel mode
+         *
+         * @note the \p alpha, \p beta and \p k will be type casted to the tensor datatype during operation
+         *
+         * Exception Guarantee: Basic
+         */
+        LRNDescriptor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
+            constructor(local_size, alpha, beta, k, type_);
+        }
+
+        ~LRNDescriptor() noexcept {
+            if (descriptor != nullptr) {
+                /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
+            }
+        }
+
+        LRNDescriptor& operator=(const LRNDescriptor&) = delete;
+
+        /** releases the descriptor currently owned and takes over the one held by \p other */
+        LRNDescriptor& operator=(LRNDescriptor&& other) noexcept {
+            /* self-move must be a no-op; otherwise the descriptor would be reset to nullptr and leaked */
+            if (this != &other) {
+                /* destroy our own descriptor so that it is not leaked when overwritten below */
+                if (descriptor != nullptr) {
+                    /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
+                    CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
+                }
+
+                descriptor = other.descriptor;
+                type = other.type;
+                other.descriptor = nullptr;
+            }
+            return *this;
+        }
+
+        /** returns the raw cuDNN LRN descriptor (nullptr for an empty object) */
+        cudnnLRNDescriptor_t get() const noexcept { return descriptor; }
+
+        /** returns the normalization mode given at construction */
+        LRNType getType() const noexcept { return type; }
+
+    private:
+        /** creates and configures the descriptor
+         *
+         * \p local_size must lie in [CUDNN_LRN_MIN_N, CUDNN_LRN_MAX_N]. If configuring the
+         * descriptor fails, the descriptor is destroyed before the exception is rethrown.
+         */
+        void constructor(std::size_t local_size, double alpha, double beta, double k, LRNType type_) {
+            CV_Assert(CUDNN_LRN_MIN_N <= local_size && local_size <= CUDNN_LRN_MAX_N);
+
+            type = type_;
+
+            CUDA4DNN_CHECK_CUDNN(cudnnCreateLRNDescriptor(&descriptor));
+            try {
+                CUDA4DNN_CHECK_CUDNN(
+                    cudnnSetLRNDescriptor(
+                        descriptor,
+                        local_size,
+                        alpha,
+                        beta,
+                        k
+                    )
+               );
+            } catch (...) {
+                /* cudnnDestroyLRNDescriptor will not fail for a valid descriptor */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyLRNDescriptor(descriptor));
+                throw;
+            }
+        }
+
+        cudnnLRNDescriptor_t descriptor;
+        LRNType type;
+    };
+
+    /** @brief performs local response normalization
+     *
+     * dstValue = alpha * result + beta * priorDstValue
+     *
+     * @tparam          T           element type (must be `half` or `float`)
+     *
+     * @param           handle      valid cuDNN Handle
+     * @param           lrnDesc     LRN description
+     * @param           inputDesc   tensor descriptor describing the input
+     * @param[in]       inputPtr    pointer to input tensor in device memory
+     * @param           alpha       result scale factor
+     * @param           beta        previous value scale factor
+     * @param           outputDesc  tensor descriptor describing the output
+     * @param[out]      outputPtr   pointer to output tensor in device memory
+     * @param           workspace   workspace memory; the within channel mode takes its temporaries from it
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void LRNForward(
+        const Handle& handle,
+        const LRNDescriptor& lrnDesc,
+        const TensorDescriptor<T>& inputDesc,
+        DevicePtr<const T> inputPtr,
+        T alpha, T beta,
+        const TensorDescriptor<T>& outputDesc,
+        DevicePtr<T> outputPtr,
+        WorkspaceInstance workspace)
+    {
+        CV_Assert(handle);
+
+        const auto mode = lrnDesc.getType();
+        if (mode == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
+            CUDA4DNN_CHECK_CUDNN(
+                cudnnLRNCrossChannelForward(
+                    handle.get(),
+                    lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
+                    &alpha, inputDesc.get(), inputPtr.get(),
+                    &beta, outputDesc.get(), outputPtr.get()
+                )
+            );
+            return;
+        }
+
+        /* any mode other than within channel is silently ignored */
+        if (mode != LRNDescriptor::LRNType::WITHIN_CHANNEL)
+            return;
+
+        std::size_t tensorBytes = 0;
+        CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &tensorBytes));
+
+        /* NOTE(review): both temporaries come from identical get_span calls, and a size in
+         * bytes is passed where get_span<half> presumably expects an element count - confirm
+         * against WorkspaceInstance that the two buffers do not alias
+         */
+        DevicePtr<void> scratchA = workspace.get_span<half>(tensorBytes).data();
+        DevicePtr<void> scratchB = workspace.get_span<half>(tensorBytes).data();
+
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnDivisiveNormalizationForward(
+                handle.get(),
+                lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
+                &alpha, inputDesc.get(), inputPtr.get(),
+                nullptr, /* no precomputed means */
+                static_cast<void*>(scratchA), static_cast<void*>(scratchB),
+                &beta, outputDesc.get(), outputPtr.get()
+            )
+        );
+    }
+
+    /* fp16 specialization of LRNForward: identical to the generic version except that the
+     * scaling factors are converted to `float` before being handed to cuDNN
+     */
+    template <> inline
+    void LRNForward(
+        const Handle& handle,
+        const LRNDescriptor& lrnDesc,
+        const TensorDescriptor<half>& inputDesc,
+        DevicePtr<const half> inputPtr,
+        half alpha, half beta,
+        const TensorDescriptor<half>& outputDesc,
+        DevicePtr<half> outputPtr,
+        WorkspaceInstance workspace)
+    {
+        CV_Assert(handle);
+
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+        const float resultScale = alpha;
+        const float priorScale = beta;
+
+        const auto mode = lrnDesc.getType();
+        if (mode == LRNDescriptor::LRNType::ACROSS_CHANNELS) {
+            CUDA4DNN_CHECK_CUDNN(
+                cudnnLRNCrossChannelForward(
+                    handle.get(),
+                    lrnDesc.get(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
+                    &resultScale, inputDesc.get(), inputPtr.get(),
+                    &priorScale, outputDesc.get(), outputPtr.get()
+                )
+            );
+            return;
+        }
+
+        /* any mode other than within channel is silently ignored */
+        if (mode != LRNDescriptor::LRNType::WITHIN_CHANNEL)
+            return;
+
+        std::size_t tensorBytes = 0;
+        CUDA4DNN_CHECK_CUDNN(cudnnGetTensorSizeInBytes(inputDesc.get(), &tensorBytes));
+
+        /* NOTE(review): both temporaries come from identical get_span calls, and a size in
+         * bytes is passed where get_span<half> presumably expects an element count - confirm
+         * against WorkspaceInstance that the two buffers do not alias
+         */
+        DevicePtr<void> scratchA = workspace.get_span<half>(tensorBytes).data();
+        DevicePtr<void> scratchB = workspace.get_span<half>(tensorBytes).data();
+
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnDivisiveNormalizationForward(
+                handle.get(),
+                lrnDesc.get(), CUDNN_DIVNORM_PRECOMPUTED_MEANS,
+                &resultScale, inputDesc.get(), inputPtr.get(),
+                nullptr, /* no precomputed means */
+                static_cast<void*>(scratchA), static_cast<void*>(scratchB),
+                &priorScale, outputDesc.get(), outputPtr.get()
+            )
+        );
+    }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_LRN_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/pooling.hpp
@@ -0,0 +1,236 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP
+
+#include "cudnn.hpp"
+
+#include "../pointer.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cudnn.h>
+
+#include <cstddef>
+#include <array>
+#include <algorithm>
+#include <vector>
+#include <type_traits>
+#include <iterator>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+    /** owns a cuDNN pooling descriptor
+     *
+     * The descriptor is destroyed in the destructor. The class is move-only; a moved-from
+     * object holds no descriptor.
+     */
+    class PoolingDescriptor {
+    public:
+        enum class PoolingType {
+            MAX,
+            MAX_DETERMINISTIC,
+            AVERAGE_EXCLUDE_PADDING,
+            AVERAGE_INCLUDE_PADDING
+        };
+
+        /** creates an empty object which holds no descriptor */
+        PoolingDescriptor() noexcept : descriptor{ nullptr } { }
+        PoolingDescriptor(const PoolingDescriptor&) = delete;
+
+        /** takes over the descriptor of \p other and leaves \p other empty */
+        PoolingDescriptor(PoolingDescriptor&& other) noexcept
+            : descriptor{ other.descriptor } {
+            other.descriptor = nullptr;
+        }
+
+        /** constructs a pooling descriptor
+         *
+         * Pre-conditions:
+         * - \p window_size, \p padding and \p stride must have the same size
+         *
+         * The length of the containers is interpreted as the order of the pooling operation.
+         *
+         * Exception Guarantee: Basic
+         */
+        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+        PoolingDescriptor(
+            const SequenceContainer& window_size,
+            const SequenceContainer& padding,
+            const SequenceContainer& stride,
+            PoolingType type)
+        {
+            constructor(window_size, padding, stride, type);
+        }
+
+        ~PoolingDescriptor() noexcept {
+            if (descriptor != nullptr) {
+                /* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor));
+            }
+        }
+
+        PoolingDescriptor& operator=(const PoolingDescriptor&) = delete;
+
+        /** releases the descriptor currently owned and takes over the one held by \p other */
+        PoolingDescriptor& operator=(PoolingDescriptor&& other) noexcept {
+            /* self-move must be a no-op; otherwise the descriptor would be reset to nullptr and leaked */
+            if (this != &other) {
+                /* destroy our own descriptor so that it is not leaked when overwritten below */
+                if (descriptor != nullptr) {
+                    /* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */
+                    CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor));
+                }
+
+                descriptor = other.descriptor;
+                other.descriptor = nullptr;
+            }
+            return *this;
+        }
+
+        /** returns the raw cuDNN pooling descriptor (nullptr for an empty object) */
+        cudnnPoolingDescriptor_t get() const noexcept { return descriptor; }
+
+    private:
+        /** creates and configures the descriptor
+         *
+         * Uses the 2d API when exactly two window dimensions are given and the Nd API
+         * otherwise. If configuring the descriptor fails, the descriptor is destroyed
+         * before the exception is rethrown.
+         */
+        template <class SequenceContainer>
+        void constructor(
+            const SequenceContainer& window_size,
+            const SequenceContainer& padding,
+            const SequenceContainer& stride,
+            PoolingType type)
+        {
+            CV_Assert(window_size.size() == padding.size());
+            CV_Assert(window_size.size() == stride.size());
+
+            /* maps PoolingType to the corresponding cuDNN pooling mode */
+            auto get_pooling_type = [] (PoolingType kind) {
+                switch (kind) {
+                case PoolingType::MAX:
+                    return CUDNN_POOLING_MAX;
+                case PoolingType::MAX_DETERMINISTIC:
+                    return CUDNN_POOLING_MAX_DETERMINISTIC;
+                case PoolingType::AVERAGE_EXCLUDE_PADDING:
+                    return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+                case PoolingType::AVERAGE_INCLUDE_PADDING:
+                    return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+                }
+                CV_Error(Error::StsBadArg, "unknown pooling type");
+            };
+
+            CUDA4DNN_CHECK_CUDNN(cudnnCreatePoolingDescriptor(&descriptor));
+            try {
+                const auto rank = window_size.size();
+                if (rank == 2) {
+                    CUDA4DNN_CHECK_CUDNN(
+                        cudnnSetPooling2dDescriptor(
+                            descriptor,
+                            get_pooling_type(type), CUDNN_PROPAGATE_NAN,
+                            window_size[0], window_size[1],
+                            padding[0], padding[1],
+                            stride[0], stride[1]
+                        )
+                    );
+                } else {
+                    /* the Nd API takes `int` arrays; copy the values into vectors of `int` */
+                    std::vector<int> iwindow_size(std::begin(window_size), std::end(window_size));
+                    std::vector<int> ipadding(std::begin(padding), std::end(padding));
+                    std::vector<int> istride(std::begin(stride), std::end(stride));
+                    CUDA4DNN_CHECK_CUDNN(
+                        cudnnSetPoolingNdDescriptor(
+                            descriptor,
+                            get_pooling_type(type), CUDNN_PROPAGATE_NAN,
+                            rank, iwindow_size.data(), ipadding.data(), istride.data()
+                        )
+                    );
+                }
+            } catch (...) {
+                /* cudnnDestroyPoolingDescriptor will not fail for a valid descriptor */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyPoolingDescriptor(descriptor));
+                throw;
+            }
+        }
+
+        cudnnPoolingDescriptor_t descriptor;
+    };
+
+    /** gives the shape of the output tensor after pooling
+     *
+     * @param       poolingDesc pooling descriptor
+     * @param       inputDesc   descriptor of the input tensor
+     * @param[out]  output_dim  receives the output dimensions; previous contents are discarded
+     *
+     * @note it's not required to enforce this shape in the output tensor; slightly different shapes will work
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T> inline
+    void getPoolingForwardOutputDim(
+        const PoolingDescriptor& poolingDesc,
+        const TensorDescriptor<T>& inputDesc,
+        std::vector<int>& output_dim)
+    {
+        /* `output_dim` doubles as scratch space: its first element receives the rank of the input */
+        output_dim.assign(CUDNN_DIM_MAX, 0);
+
+        /* the dimensions and strides reported by the query are not needed; both land in `scratch` */
+        std::vector<int> scratch(CUDNN_DIM_MAX);
+        cudnnDataType_t ignoredDataType;
+        int* const rankSlot = output_dim.data();
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnGetTensorNdDescriptor(
+                inputDesc.get(),
+                CUDNN_DIM_MAX + 1, /* according to docs, this is what we do to get the rank */
+                &ignoredDataType,
+                rankSlot,
+                scratch.data(),
+                scratch.data()
+            )
+        );
+
+        const int tensorRank = output_dim.front();
+        output_dim.resize(tensorRank);
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnGetPoolingNdForwardOutputDim(poolingDesc.get(), inputDesc.get(), tensorRank, output_dim.data())
+        );
+    }
+
+    /** @brief performs pooling operation
+     *
+     * dstValue = alpha * result + beta * priorDstValue
+     *
+     * @tparam          T           pooling element type (must be `half` or `float`)
+     *
+     * @param           handle      valid cuDNN Handle
+     * @param           poolingDesc pooling description
+     * @param           inputDesc   tensor descriptor describing the input
+     * @param[in]       inputPtr    pointer to input tensor in device memory
+     * @param           alpha       result scale factor
+     * @param           beta        previous value scale factor
+     * @param           outputDesc  tensor descriptor describing the output
+     * @param[out]      outputPtr   pointer to output tensor in device memory
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void pool(
+        const Handle& handle,
+        const PoolingDescriptor& poolingDesc,
+        const TensorDescriptor<T>& inputDesc,
+        const DevicePtr<const T> inputPtr,
+        T alpha, T beta,
+        const TensorDescriptor<T>& outputDesc,
+        DevicePtr<T> outputPtr)
+    {
+        CV_Assert(handle);
+
+        /* raw device pointers handed over to cuDNN */
+        const auto source = inputPtr.get();
+        const auto destination = outputPtr.get();
+
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnPoolingForward(
+                handle.get(),
+                poolingDesc.get(),
+                &alpha, inputDesc.get(), source,
+                &beta, outputDesc.get(), destination
+            )
+        );
+    }
+
+    /* fp16 specialization of pool: identical to the generic version except that the
+     * scaling factors are converted to `float` before being handed to cuDNN
+     */
+    template <> inline
+    void pool(
+        const Handle& handle,
+        const PoolingDescriptor& poolingDesc,
+        const TensorDescriptor<half>& inputDesc,
+        const DevicePtr<const half> inputPtr,
+        half alpha, half beta,
+        const TensorDescriptor<half>& outputDesc,
+        DevicePtr<half> outputPtr)
+    {
+        CV_Assert(handle);
+
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+        const float resultScale = alpha;
+        const float priorScale = beta;
+
+        /* raw device pointers handed over to cuDNN */
+        const auto source = inputPtr.get();
+        const auto destination = outputPtr.get();
+
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnPoolingForward(
+                handle.get(),
+                poolingDesc.get(),
+                &resultScale, inputDesc.get(), source,
+                &priorScale, outputDesc.get(), destination
+            )
+        );
+    }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_POOLING_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/softmax.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/softmax.hpp
@@ -0,0 +1,68 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP
+
+#include "cudnn.hpp"
+
+#include "../pointer.hpp"
+
+#include <cudnn.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+    /** @brief computes softmax (or log softmax)
+     *
+     * @tparam          T           element type (must be `half` or `float`)
+     *
+     * @param           handle      valid cuDNN handle
+     * @param           outputDesc  tensor descriptor describing the output
+     * @param[out]      output      pointer to output tensor in device memory
+     * @param           inputDesc   tensor descriptor describing the input
+     * @param[in]       input       pointer to input tensor in device memory
+     * @param           log         apply log on probabilities
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void softmax(const cudnn::Handle& handle,
+        const TensorDescriptor<T>& outputDesc, DevicePtr<T> output,
+        const TensorDescriptor<T>& inputDesc, DevicePtr<const T> input,
+        bool log)
+    {
+        /* scaling factors handed to cuDNN: alpha = 1 and beta = 0 */
+        T alpha = 1.0, beta = 0.0;
+        /* `log` selects CUDNN_SOFTMAX_LOG; otherwise CUDNN_SOFTMAX_ACCURATE is used */
+        cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnSoftmaxForward(
+                handle.get(),
+                algo, CUDNN_SOFTMAX_MODE_CHANNEL,
+                &alpha, inputDesc.get(), input.get(),
+                &beta, outputDesc.get(), output.get()
+            )
+        );
+    }
+
+    template <> inline
+    void softmax(const cudnn::Handle& handle,
+        const TensorDescriptor<half>& outputDesc, DevicePtr<half> output,
+        const TensorDescriptor<half>& inputDesc, DevicePtr<const half> input,
+        bool log)
+    {
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+        float alpha = 1.0, beta = 0.0;
+        /* `log` selects CUDNN_SOFTMAX_LOG; otherwise CUDNN_SOFTMAX_ACCURATE is used */
+        cudnnSoftmaxAlgorithm_t algo = log ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE;
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnSoftmaxForward(
+                handle.get(),
+                algo, CUDNN_SOFTMAX_MODE_CHANNEL,
+                &alpha, inputDesc.get(), input.get(),
+                &beta, outputDesc.get(), output.get()
+            )
+        );
+    }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_SOFTMAX_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/transform.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/transform.hpp
@@ -0,0 +1,142 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP
+
+#include "../pointer.hpp"
+
+#include "cudnn.hpp"
+
+#include <cudnn.h>
+#include <vector>
+#include <type_traits>
+#include <iterator>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+    /** describes a tensor transform operation
+     *
+     * Supported transformations:
+     * - add or remove asymmetric padding
+     */
+    class TensorTransformDescriptor {
+    public:
+        TensorTransformDescriptor() noexcept : descriptor{ nullptr } { }
+        TensorTransformDescriptor(const TensorTransformDescriptor&) = delete;
+        TensorTransformDescriptor(TensorTransformDescriptor&& other) noexcept
+            : descriptor{ other.descriptor } {
+            other.descriptor = nullptr;
+        }
+
+        /** constructs a tensor transform descriptor
+         *
+         * Pre-conditions:
+         * - \p padding_left and \p padding_right must have the same size
+         *
+         * The length of the containers is interpreted as the rank of the tensors which will be given.
+         *
+         * @note \p padding_left and \p padding_right may have negative values to remove padding
+         *
+         * Exception Guarantee: Basic
+         */
+        template <class SequenceContainer, typename = decltype(std::begin(std::declval<SequenceContainer>()))>
+        TensorTransformDescriptor(
+            const SequenceContainer& padding_left,
+            const SequenceContainer& padding_right)
+            : descriptor{ nullptr }
+        {
+            constructor(padding_left, padding_right);
+        }
+
+        ~TensorTransformDescriptor() noexcept {
+            if (descriptor != nullptr) {
+                /* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor));
+            }
+        }
+
+        TensorTransformDescriptor& operator=(const TensorTransformDescriptor&) = delete;
+
+        /** takes over the descriptor held by \p other
+         *
+         * The descriptor previously held by this object (if any) is destroyed instead of being
+         * leaked; \p other is left empty. Self-assignment is a no-op.
+         */
+        TensorTransformDescriptor& operator=(TensorTransformDescriptor&& other) noexcept {
+            if (this != &other) {
+                /* move the current descriptor into a temporary; its destructor releases it
+                 * once the new descriptor has been taken over
+                 */
+                TensorTransformDescriptor discarded(static_cast<TensorTransformDescriptor&&>(*this));
+                descriptor = other.descriptor;
+                other.descriptor = nullptr;
+            }
+            return *this;
+        }
+
+        cudnnTensorTransformDescriptor_t get() const noexcept { return descriptor; }
+
+    private:
+        /** creates the cuDNN descriptor and configures it with the given paddings
+         *
+         * If configuring the descriptor fails, the freshly created descriptor is destroyed
+         * before the exception is propagated.
+         */
+        template <class SequenceContainer>
+        void constructor(
+            const SequenceContainer& padding_left,
+            const SequenceContainer& padding_right
+        )
+        {
+            CV_Assert(padding_left.size() == padding_right.size());
+
+            /* cuDNN expects the paddings as contiguous arrays of 32-bit integers */
+            auto ipadding_left  = std::vector<int32_t>(std::begin(padding_left), std::end(padding_left));
+            auto ipadding_right = std::vector<int32_t>(std::begin(padding_right), std::end(padding_right));
+            CUDA4DNN_CHECK_CUDNN(cudnnCreateTensorTransformDescriptor(&descriptor));
+            try {
+                CUDA4DNN_CHECK_CUDNN(
+                    cudnnSetTensorTransformDescriptor(
+                        descriptor,
+                        static_cast<uint32_t>(ipadding_left.size()), CUDNN_TENSOR_NCHW,
+                        ipadding_left.data(), ipadding_right.data(),
+                        nullptr, CUDNN_TRANSFORM_FOLD
+                    )
+                );
+            } catch (...) {
+                /* cudnnDestroyTensorTransformDescriptor will not fail for a valid descriptor */
+                CUDA4DNN_CHECK_CUDNN(cudnnDestroyTensorTransformDescriptor(descriptor));
+                throw;
+            }
+        }
+
+        cudnnTensorTransformDescriptor_t descriptor;
+    };
+
+    /** applies the transformation described by \p transDesc to the input tensor and writes the result to the output tensor */
+    template <class T>
+    void transform(
+        const Handle& handle,
+        const TensorTransformDescriptor& transDesc,
+        const TensorDescriptor<T>& inputDesc,
+        DevicePtr<const T> inputPtr,
+        const TensorDescriptor<T>& outputDesc,
+        DevicePtr<T> outputPtr)
+    {
+        /* scaling factors handed to cuDNN: alpha = 1 and beta = 0 */
+        T alpha = 1.0, beta = 0.0;
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnTransformTensorEx(
+                handle.get(),
+                transDesc.get(),
+                &alpha, inputDesc.get(), inputPtr.get(),
+                &beta, outputDesc.get(), outputPtr.get()
+            )
+        );
+    }
+
+    template <> inline
+    void transform(
+        const Handle& handle,
+        const TensorTransformDescriptor& transDesc,
+        const TensorDescriptor<half>& inputDesc,
+        DevicePtr<const half> inputPtr,
+        const TensorDescriptor<half>& outputDesc,
+        DevicePtr<half> outputPtr)
+    {
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+        float alpha = 1.0, beta = 0.0;
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnTransformTensorEx(
+                handle.get(),
+                transDesc.get(),
+                &alpha, inputDesc.get(), inputPtr.get(),
+                &beta, outputDesc.get(), outputPtr.get()
+            )
+        );
+    }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSFORM_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/cudnn/transpose_convolution.hpp
@@ -0,0 +1,148 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP
+#define OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP
+
+#include "cudnn.hpp"
+#include "convolution.hpp"
+
+#include "../pointer.hpp"
+#include "../workspace.hpp"
+
+#include <cudnn.h>
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl { namespace cudnn {
+
+    /** wrapper around a transpose convolution algorithm
+     *
+     * @tparam  T   type of elements being transpose-convolved
+     */
+    template <class T>
+    class TransposeConvolutionAlgorithm {
+    public:
+        /** creates an empty object: zero-valued algorithm and no workspace requirement
+         *
+         * The algorithm member is value-initialized so that copying a default-constructed
+         * object never reads an indeterminate value.
+         */
+        TransposeConvolutionAlgorithm() noexcept : dalgo{}, workspace_size{ 0 } { }
+        TransposeConvolutionAlgorithm(const TransposeConvolutionAlgorithm&) = default;
+        TransposeConvolutionAlgorithm(TransposeConvolutionAlgorithm&&) = default;
+
+        /** queries cuDNN for a backward-data algorithm and the workspace size it requires
+         *
+         * The algorithm is requested with CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST and
+         * no memory limit.
+         *
+         * Exception Guarantee: Basic
+         */
+        TransposeConvolutionAlgorithm(
+            const Handle& handle,
+            const ConvolutionDescriptor<T>& conv,
+            const FilterDescriptor<T>& filter,
+            const TensorDescriptor<T>& input,
+            const TensorDescriptor<T>& output)
+            : dalgo{}, workspace_size{ 0 }
+        {
+            CUDA4DNN_CHECK_CUDNN(
+                cudnnGetConvolutionBackwardDataAlgorithm(
+                    handle.get(),
+                    filter.get(), input.get(), conv.get(), output.get(),
+                    CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
+                    0, /* no memory limit */
+                    &dalgo
+                )
+            );
+
+            CUDA4DNN_CHECK_CUDNN(
+                cudnnGetConvolutionBackwardDataWorkspaceSize(
+                    handle.get(),
+                    filter.get(), input.get(), conv.get(), output.get(),
+                    dalgo, &workspace_size
+                )
+            );
+        }
+
+        TransposeConvolutionAlgorithm& operator=(const TransposeConvolutionAlgorithm&) = default;
+        TransposeConvolutionAlgorithm& operator=(TransposeConvolutionAlgorithm&& other) = default;
+
+        /** returns the selected cuDNN backward-data algorithm */
+        cudnnConvolutionBwdDataAlgo_t get() const noexcept { return dalgo; }
+
+        /** returns the workspace size (in bytes) reported by cuDNN for the selected algorithm */
+        std::size_t get_workspace_size() const noexcept { return workspace_size; }
+
+    private:
+        cudnnConvolutionBwdDataAlgo_t dalgo;
+        std::size_t workspace_size;
+    };
+
+    /** @brief performs transpose convolution
+      *
+      * dstValue = alpha * result + beta * priorDstValue
+      *
+      * @tparam          T              transpose convolution element type (must be `half` or `float`)
+      *
+      * @param           handle         valid cuDNN Handle
+      * @param           convDesc       convolution description
+      * @param           transConvAlgo  algorithm to use for convolution
+      * @param           workspace      workspace memory which meets the requirements of \p transConvAlgo
+      * @param           filterDesc     filter descriptor
+      * @param[in]       filterPtr      pointer to device memory containing the filters
+      * @param           inputDesc      tensor descriptor describing the input
+      * @param[in]       inputPtr       pointer to input tensor in device memory
+      * @param           alpha          result scale factor
+      * @param           beta           previous value scale factor
+      * @param           outputDesc     tensor descriptor describing the output
+      * @param[out]      outputPtr      pointer to output tensor in device memory
+      *
+      * Exception Guarantee: Basic
+      */
+    template <class T>
+    void transpose_convolve(
+        const Handle& handle,
+        const ConvolutionDescriptor<T>& convDesc,
+        const TransposeConvolutionAlgorithm<T>& transConvAlgo,
+        WorkspaceInstance workspace,
+        const FilterDescriptor<T>& filterDesc,
+        DevicePtr<const T> filterPtr,
+        const TensorDescriptor<T>& inputDesc,
+        DevicePtr<const T> inputPtr,
+        T alpha, T beta,
+        const TensorDescriptor<T>& outputDesc,
+        DevicePtr<T> outputPtr)
+    {
+        /* transpose convolution is carried out with cuDNN's backward-data convolution; the
+         * workspace pointer and its size in bytes are forwarded as they are
+         */
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnConvolutionBackwardData(
+                handle.get(),
+                &alpha,
+                filterDesc.get(), filterPtr.get(),
+                inputDesc.get(), inputPtr.get(),
+                convDesc.get(), transConvAlgo.get(),
+                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
+                &beta, outputDesc.get(), outputPtr.get()
+            )
+        );
+    }
+
+    template <> inline
+    void transpose_convolve(
+        const Handle& handle,
+        const ConvolutionDescriptor<half>& convDesc,
+        const TransposeConvolutionAlgorithm<half>& convAlgo,
+        WorkspaceInstance workspace,
+        const FilterDescriptor<half>& filterDesc,
+        DevicePtr<const half> filterPtr,
+        const TensorDescriptor<half>& inputDesc,
+        DevicePtr<const half> inputPtr,
+        half alpha, half beta,
+        const TensorDescriptor<half>& outputDesc,
+        DevicePtr<half> outputPtr)
+    {
+        /* we specialize for fp16 as the scaling factors must be provided as `float` */
+        float alpha_ = alpha, beta_ = beta;
+        CUDA4DNN_CHECK_CUDNN(
+            cudnnConvolutionBackwardData(
+                handle.get(),
+                &alpha_,
+                filterDesc.get(), filterPtr.get(),
+                inputDesc.get(), inputPtr.get(),
+                convDesc.get(), convAlgo.get(),
+                static_cast<void*>(workspace.get()), workspace.size_in_bytes(),
+                &beta_, outputDesc.get(), outputPtr.get()
+            )
+        );
+    }
+
+}}}}} /* namespace cv::dnn::cuda4dnn::csl::cudnn */
+
+#endif /* OPENCV_DNN_CUDA4DNN_CSL_CUDNN_TRANSPOSE_CONVOLUTION_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/error.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/error.hpp
@@ -0,0 +1,30 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP
+
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime_api.h>
+
+/* evaluates `call` and passes its result to detail::check together with the call site (function, file, line) */
+#define CUDA4DNN_CHECK_CUDA(call) \
+    ::cv::dnn::cuda4dnn::csl::detail::check((call), CV_Func, __FILE__, __LINE__)
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+    /** @brief exception class for errors thrown by the CUDA APIs
+     *
+     * Inherits the constructors of cv::Exception; detail::check throws it with
+     * Error::GpuApiCallError as the error code.
+     */
+    class CUDAException : public cv::Exception {
+    public:
+        using cv::Exception::Exception;
+    };
+
+    namespace detail {
+        /** throws a CUDAException describing \p err unless \p err is `cudaSuccess`
+         *
+         * \p func, \p file and \p line identify the call site and are stored in the exception.
+         */
+        inline void check(cudaError_t err, const char* func, const char* file, int line) {
+            if (err == cudaSuccess)
+                return;
+
+            const char* const description = cudaGetErrorString(err);
+            throw CUDAException(Error::GpuApiCallError, description, func, file, line);
+        }
+    }
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_ERROR_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/event.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/event.hpp
@@ -0,0 +1,101 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP
+
+#include "error.hpp"
+#include "stream.hpp"
+
+#include <opencv2/core/utils/logger.hpp>
+
+#include <cuda_runtime_api.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /** @brief sharable CUDA event
+     *
+     * Event is a smart sharable wrapper for CUDA event handle which ensures that
+     * the handle is destroyed after use.
+     *
+     * @note Moving an Event object to another invalidates the former
+     */
+    class Event {
+    public:
+        Event() noexcept : event{ nullptr } { }
+        Event(const Event&) = delete;
+        Event(Event&& other) noexcept
+            : event{ other.event } {
+            other.event = nullptr;
+        }
+
+        /** if \p create is `true`, a new event will be created; otherwise, an empty event object is created
+         *
+         * The event is created with blocking synchronization; timing is disabled unless
+         * \p timing_event is `true`.
+         */
+        Event(bool create, bool timing_event = false) : event{nullptr} {
+            if (create) {
+                unsigned int flags = cudaEventBlockingSync | (timing_event ? 0 : cudaEventDisableTiming);
+                CUDA4DNN_CHECK_CUDA(cudaEventCreateWithFlags(&event, flags));
+            }
+        }
+
+        /** destroys the held event (if any); a failure is logged as a warning instead of being thrown */
+        ~Event() {
+            try {
+                if (event != nullptr)
+                    CUDA4DNN_CHECK_CUDA(cudaEventDestroy(event));
+            } catch (const CUDAException& ex) {
+                std::ostringstream os;
+                os << "Asynchronous exception caught during CUDA event destruction.\n";
+                os << ex.what();
+                os << "Exception will be ignored.\n";
+                CV_LOG_WARNING(0, os.str().c_str());
+            }
+        }
+
+        Event& operator=(const Event&) noexcept = delete;
+
+        /** takes over the event held by \p other
+         *
+         * The event previously held by this object (if any) is destroyed instead of being
+         * leaked; \p other is left empty. Self-assignment is a no-op.
+         */
+        Event& operator=(Event&& other) noexcept {
+            if (this != &other) {
+                /* move the current event into a temporary; its destructor destroys it (and
+                 * logs a failure) once the new event has been taken over
+                 */
+                Event discarded(static_cast<Event&&>(*this));
+                event = other.event;
+                other.event = nullptr;
+            }
+            return *this;
+        }
+
+        /** mark a point in \p stream */
+        void record(const Stream& stream) {
+            CUDA4DNN_CHECK_CUDA(cudaEventRecord(event, stream.get()));
+        }
+
+        /** blocks the caller thread until all operations before the event finish */
+        void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaEventSynchronize(event)); }
+
+        /** returns true if there are operations pending before the event completes */
+        bool busy() const {
+            auto status = cudaEventQuery(event);
+            /* cudaErrorNotReady is the expected "still pending" answer and not an error */
+            if (status == cudaErrorNotReady)
+                return true;
+            CUDA4DNN_CHECK_CUDA(status);
+            return false;
+        }
+
+        cudaEvent_t get() const noexcept { return event; }
+
+        /** returns true if the event is valid */
+        explicit operator bool() const noexcept { return event != nullptr; }
+
+    private:
+        cudaEvent_t event;
+    };
+
+    /** makes a stream wait on an event
+     *
+     * Declared `inline` because it is defined in a header; without it, including this
+     * header in more than one translation unit produces multiple definitions.
+     */
+    inline void StreamWaitOnEvent(const Stream& stream, const Event& event) {
+        CUDA4DNN_CHECK_CUDA(cudaStreamWaitEvent(stream.get(), event.get(), 0));
+    }
+
+    /** returns the time elapsed between two events in milliseconds
+     *
+     * Declared `inline` because it is defined in a header; without it, including this
+     * header in more than one translation unit produces multiple definitions.
+     */
+    inline float TimeElapsedBetweenEvents(const Event& start, const Event& end) {
+        float elapsed_ms = 0.0f;
+        CUDA4DNN_CHECK_CUDA(cudaEventElapsedTime(&elapsed_ms, start.get(), end.get()));
+        return elapsed_ms;
+    }
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_EVENT_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/fp16.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/fp16.hpp
@@ -0,0 +1,84 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_FP16_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_FP16_HPP
+
+#include "nvcc_defs.hpp"
+
+#include <cuda_fp16.h>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    namespace detail {
+        /* trait which is `std::true_type` for integral and floating-point types and
+         * `std::false_type` for everything else; it constrains the comparison operators
+         * defined in this header
+         */
+        template <class T, class = void>
+        struct is_half_convertible : std::false_type { };
+
+        template <class T>
+        struct is_half_convertible<T, typename std::enable_if<std::is_integral<T>::value, void>::type> : std::true_type { };
+
+        template <class T>
+        struct is_half_convertible<T, typename std::enable_if<std::is_floating_point<T>::value, void>::type> : std::true_type { };
+    }
+
+    /* Note: nvcc has a broken overload resolution; it considers host overloads inside device code
+    CUDA4DNN_HOST bool operator==(half lhs, half rhs) noexcept { return static_cast<float>(lhs) == static_cast<float>(rhs); }
+    CUDA4DNN_HOST bool operator!=(half lhs, half rhs) noexcept { return static_cast<float>(lhs) != static_cast<float>(rhs); }
+    CUDA4DNN_HOST bool operator<(half lhs, half rhs) noexcept { return static_cast<float>(lhs) < static_cast<float>(rhs); }
+    CUDA4DNN_HOST bool operator>(half lhs, half rhs) noexcept { return static_cast<float>(lhs) > static_cast<float>(rhs); }
+    CUDA4DNN_HOST bool operator<=(half lhs, half rhs) noexcept { return static_cast<float>(lhs) <= static_cast<float>(rhs); }
+    CUDA4DNN_HOST bool operator>=(half lhs, half rhs) noexcept { return static_cast<float>(lhs) >= static_cast<float>(rhs); }
+    */
+
+    /* host-side comparisons with `half` on the left and an arithmetic type on the right;
+     * both operands are converted to `float` before being compared
+     */
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator==(half lhs, T rhs) noexcept { return static_cast<float>(lhs) == static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator!=(half lhs, T rhs) noexcept { return static_cast<float>(lhs) != static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator<(half lhs, T rhs) noexcept { return static_cast<float>(lhs) < static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator>(half lhs, T rhs) noexcept { return static_cast<float>(lhs) > static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator<=(half lhs, T rhs) noexcept { return static_cast<float>(lhs) <= static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator>=(half lhs, T rhs) noexcept { return static_cast<float>(lhs) >= static_cast<float>(rhs); }
+
+    /* host-side comparisons with an arithmetic type on the left and `half` on the right;
+     * both operands are converted to `float` before being compared
+     */
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator==(T lhs, half rhs) noexcept { return static_cast<float>(lhs) == static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator!=(T lhs, half rhs) noexcept { return static_cast<float>(lhs) != static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator<(T lhs, half rhs) noexcept { return static_cast<float>(lhs) < static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator>(T lhs, half rhs) noexcept { return static_cast<float>(lhs) > static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator<=(T lhs, half rhs) noexcept { return static_cast<float>(lhs) <= static_cast<float>(rhs); }
+
+    template <class T> CUDA4DNN_HOST
+    typename std::enable_if<detail::is_half_convertible<T>::value, bool>
+    ::type operator>=(T lhs, half rhs) noexcept { return static_cast<float>(lhs) >= static_cast<float>(rhs); }
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_FP16_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/memory.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/memory.hpp
@@ -0,0 +1,295 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP
+
+#include "error.hpp"
+#include "pointer.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <memory>
+#include <sstream>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /** @brief smart device pointer with allocation/deallocation methods
+     *
+     * ManagedPtr is a smart shared device pointer which also handles memory allocation.
+     */
+    template <class T>
+    class ManagedPtr {
+        static_assert(!std::is_const<T>::value && !std::is_volatile<T>::value, "T cannot be cv-qualified");
+        static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
+
+    public:
+        using element_type = T;
+
+        using pointer = DevicePtr<element_type>;
+        using const_pointer = DevicePtr<typename std::add_const<element_type>::type>;
+
+        using size_type = std::size_t;
+
+        ManagedPtr() noexcept : wrapped{ nullptr }, n{ 0 }, capacity{ 0 } { }
+        ManagedPtr(const ManagedPtr&) noexcept = default;
+        ManagedPtr(ManagedPtr&& other) noexcept
+            : wrapped{ std::move(other.wrapped) }, n{ other.n }, capacity { other.capacity }
+        {
+            other.reset();
+        }
+
+        /** allocates device memory for \p count number of element */
+        ManagedPtr(size_type count) {
+            if (count <= 0) {
+                CV_Error(Error::StsBadArg, "number of elements is zero or negative");
+            }
+
+            void* temp = nullptr;
+            CUDA4DNN_CHECK_CUDA(cudaMalloc(&temp, count * sizeof(element_type)));
+
+            auto ptr = typename pointer::pointer(static_cast<element_type*>(temp));
+            wrapped.reset(ptr, [](element_type* ptr) {
+                if (ptr != nullptr) {
+                    /* contract violation for std::shared_ptr if cudaFree throws */
+                    try {
+                        CUDA4DNN_CHECK_CUDA(cudaFree(ptr));
+                    } catch (const CUDAException& ex) {
+                        std::ostringstream os;
+                        os << "Device memory deallocation failed in deleter.\n";
+                        os << ex.what();
+                        os << "Exception will be ignored.\n";
+                        CV_LOG_WARNING(0, os.str().c_str());
+                    }
+                }
+            });
+            /* std::shared_ptr<T>::reset invokes the deleter if an exception occurs; hence, we don't
+             * need to have a try-catch block to free the allocated device memory
+             */
+
+            n = capacity = count;
+        }
+
+        /** takes over the memory held by \p other and leaves \p other empty
+         *
+         * Self-assignment is a no-op; without the guard, the trailing `other.reset()` would
+         * release the memory held by this object.
+         */
+        ManagedPtr& operator=(ManagedPtr&& other) noexcept {
+            if (this != &other) {
+                wrapped = std::move(other.wrapped);
+                n = other.n;
+                capacity = other.capacity;
+
+                other.reset();
+            }
+            return *this;
+        }
+
+        /** returns the number of elements currently accessible (may be smaller than the allocated capacity) */
+        size_type size() const noexcept { return n; }
+
+        /** gives up ownership of the held memory and becomes empty */
+        void reset() noexcept { wrapped.reset(); n = capacity = 0; }
+
+        /**
+         * deallocates any previously allocated memory and allocates device memory
+         * for \p count number of elements
+         *
+         * @note no reallocation if the previously allocated memory has no owners and the requested memory size fits in it
+         * @note use move constructor to guarantee a deallocation of the previously allocated memory
+         *
+         * Exception Guarantee: Strong
+         */
+        void reset(size_type count) {
+            /* we need to fully own the memory to perform optimizations */
+            if (wrapped.use_count() == 1) {
+                /* avoid reallocation if the existing capacity is sufficient */
+                if (count <= capacity) {
+                    n = count;
+                    return;
+                }
+            }
+
+            /* no optimization performed; allocate memory */
+            ManagedPtr tmp(count);
+            swap(tmp, *this);
+        }
+
+        pointer get() const noexcept { return pointer(wrapped.get()); }
+
+        /** returns true if memory is held
+         *
+         * The cast is required: std::shared_ptr's conversion to `bool` is explicit and hence
+         * cannot be used implicitly in a return statement.
+         */
+        explicit operator bool() const noexcept { return static_cast<bool>(wrapped); }
+
+        friend bool operator==(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped == rhs.wrapped; }
+        friend bool operator!=(const ManagedPtr& lhs, const ManagedPtr& rhs) noexcept { return lhs.wrapped != rhs.wrapped; }
+
+        friend void swap(ManagedPtr& lhs, ManagedPtr& rhs) noexcept {
+            using std::swap;
+            swap(lhs.wrapped, rhs.wrapped);
+            swap(lhs.n, rhs.n);
+            swap(lhs.capacity, rhs.capacity);
+        }
+
+    private:
+        std::shared_ptr<element_type> wrapped;
+        size_type n, capacity;
+    };
+
+    /** copies entire memory block pointed by \p src to \p dest
+     *
+     * \param[in]   src     device pointer
+     * \param[out]  dest    host pointer
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(T *dest, const ManagedPtr<T>& src) {
+        /* the whole block held by src is copied */
+        const auto count = src.size();
+        memcpy<T>(dest, src.get(), count);
+    }
+
+    /** copies data from memory pointed by \p src to fully fill \p dest
+     *
+     * \param[in]   src     host pointer
+     * \param[out]  dest    device pointer
+     *
+     * Pre-conditions:
+     * - memory pointed by \p src must be at least as big as the memory block held by \p dest
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(const ManagedPtr<T>& dest, const T* src) {
+        /* as many elements as dest holds are copied */
+        const auto count = dest.size();
+        memcpy<T>(dest.get(), src, count);
+    }
+
+    /** copies data from memory pointed by \p src to \p dest
+     *
+     * if the two \p src and \p  dest have different sizes, the number of elements copied is
+     * equal to the size of the smaller memory block
+     *
+     * \param[in]   src     device pointer
+     * \param[out]  dest    device pointer
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src) {
+        /* the smaller of the two blocks decides how many elements are copied */
+        const auto count = std::min(dest.size(), src.size());
+        memcpy<T>(dest.get(), src.get(), count);
+    }
+
+    /** sets device memory block to a specific 8-bit value
+     *
+     * \param[out]  dest    device pointer
+     * \param[in]   ch      8-bit value to fill the device memory with
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memset(const ManagedPtr<T>& dest, std::int8_t ch) {
+        /* every element held by dest is filled */
+        const auto count = dest.size();
+        memset<T>(dest.get(), ch, count);
+    }
+
+    /** copies entire memory block pointed by \p src to \p dest asynchronously
+     *
+     * \param[in]   src     device pointer
+     * \param[out]  dest    host pointer
+     * \param       stream  CUDA stream that has to be used for the memory transfer
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest must be large enough to hold the entire block of memory held by \p src
+     * - \p dest points to page-locked memory
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(T *dest, const ManagedPtr<T>& src, const Stream& stream) {
+        CV_Assert(stream);
+
+        /* the whole block held by src is copied */
+        const auto count = src.size();
+        memcpy<T>(dest, src.get(), count, stream);
+    }
+
+    /** copies data from memory pointed by \p src to \p dest asynchronously
+     *
+     * \param[in]   src     host pointer
+     * \param[out]  dest    device pointer
+     * \param       stream  CUDA stream that has to be used for the memory transfer
+     *
+     * Pre-conditions:
+     * - memory pointed by \p src must be at least as big as the memory block held by \p dest
+     * - \p src points to page-locked memory
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(const ManagedPtr<T>& dest, const T* src, const Stream& stream) {
+        CV_Assert(stream);
+
+        /* as many elements as dest holds are copied */
+        const auto count = dest.size();
+        memcpy<T>(dest.get(), src, count, stream);
+    }
+
+    /** copies data from memory pointed by \p src to \p dest asynchronously
+     *
+     * \param[in]   src     device pointer
+     * \param[out]  dest    device pointer
+     * \param       stream  CUDA stream that has to be used for the memory transfer
+     *
+     * if the two \p src and \p  dest have different sizes, the number of elements copied is
+     * equal to the size of the smaller memory block
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(const ManagedPtr<T>& dest, const ManagedPtr<T>& src, const Stream& stream) {
+        /* dest is taken by const reference like in the other overloads: only the memory it
+         * points to is written, the ManagedPtr object itself is not modified
+         */
+        CV_Assert(stream);
+
+        /* the smaller of the two blocks decides how many elements are copied */
+        const auto count = std::min(dest.size(), src.size());
+        memcpy<T>(dest.get(), src.get(), count, stream);
+    }
+
+    /** sets device memory block to a specific 8-bit value asynchronously
+     *
+     * \param[out]  dest    device pointer
+     * \param[in]   ch      8-bit value to fill the device memory with
+     * \param       stream  CUDA stream that has to be used for the memory operation
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memset(const ManagedPtr<T>& dest, int ch, const Stream& stream) {
+        CV_Assert(stream);
+
+        /* every element held by dest is filled */
+        const auto count = dest.size();
+        memset<T>(dest.get(), ch, count, stream);
+    }
+
+    /** @brief registers host memory as page-locked and unregisters on destruction */
+    class MemoryLockGuard {
+    public:
+        MemoryLockGuard() noexcept : ptr { nullptr } { }
+        MemoryLockGuard(const MemoryLockGuard&) = delete;
+        MemoryLockGuard(MemoryLockGuard&& other) noexcept : ptr{ other.ptr } {
+            other.ptr = nullptr;
+        }
+
+        /** page-locks \p size_in_bytes bytes of memory starting from \p ptr_
+         *
+         * Pre-conditions:
+         * - host memory should be unregistered
+         */
+        MemoryLockGuard(void* ptr_, std::size_t size_in_bytes) : ptr{ nullptr } {
+            CUDA4DNN_CHECK_CUDA(cudaHostRegister(ptr_, size_in_bytes, cudaHostRegisterPortable));
+            ptr = ptr_;
+        }
+
+        MemoryLockGuard& operator=(const MemoryLockGuard&) = delete;
+
+        /** takes over the registration held by \p other
+         *
+         * The memory previously registered by this object (if any) is unregistered instead of
+         * staying page-locked; \p other is left empty. Self-assignment is a no-op.
+         */
+        MemoryLockGuard& operator=(MemoryLockGuard&& other) noexcept {
+            if (this != &other) {
+                /* move the current registration into a temporary; its destructor unregisters
+                 * it once the new registration has been taken over
+                 */
+                MemoryLockGuard discarded(static_cast<MemoryLockGuard&&>(*this));
+                ptr = other.ptr;
+                other.ptr = nullptr;
+            }
+            return *this;
+        }
+
+        /** unregisters the memory (if any); a failure is logged as a warning instead of being
+         * thrown since an exception escaping a destructor terminates the program
+         */
+        ~MemoryLockGuard() {
+            try {
+                if (ptr != nullptr)
+                    CUDA4DNN_CHECK_CUDA(cudaHostUnregister(ptr));
+            } catch (const CUDAException& ex) {
+                std::ostringstream os;
+                os << "Host memory unregistration failed in MemoryLockGuard destructor.\n";
+                os << ex.what();
+                os << "Exception will be ignored.\n";
+                CV_LOG_WARNING(0, os.str().c_str());
+            }
+        }
+
+    private:
+        void *ptr;
+    };
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_MEMORY_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/nvcc_defs.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/nvcc_defs.hpp
@@ -0,0 +1,20 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP
+
+#include <cuda_runtime_api.h>
+
+/* Function qualifier macros.
+ *
+ * When __CUDACC__ is defined they expand to the CUDA `__host__`/`__device__` specifiers;
+ * otherwise they expand to nothing, so declarations using them stay valid without the specifiers.
+ */
+#ifdef __CUDACC__
+#   define CUDA4DNN_HOST __host__
+#   define CUDA4DNN_DEVICE __device__
+#   define CUDA4DNN_HOST_DEVICE CUDA4DNN_HOST CUDA4DNN_DEVICE
+#else
+#   define CUDA4DNN_HOST
+#   define CUDA4DNN_DEVICE
+#   define CUDA4DNN_HOST_DEVICE
+#endif
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_NVCC_DEFS_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/pointer.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/pointer.hpp
@@ -0,0 +1,411 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP
+
+#include "nvcc_defs.hpp"
+#include "error.hpp"
+#include "stream.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+#include <ostream>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /** @brief provides a type-safe device pointer
+     *
+     * DevicePtr wraps a raw pointer and mimics its behaviour. It does not implicitly convert
+     * to a raw pointer. This ensures that accidental mixing of host and device pointers does not happen.
+     *
+     * It is meant to point to locations in device memory. Hence, it provides dereferencing or
+     * array subscript capability for device code only.
+     *
+     * A `const DevicePtr<T>` represents an immutable pointer to a mutable memory.
+     * A `DevicePtr<const T>` represents a mutable pointer to an immutable memory.
+     * A `const DevicePtr<const T>` represents an immutable pointer to an immutable memory.
+     *
+     * A `DevicePtr<T>` can implicitly convert to `DevicePtr<const T>`.
+     *
+     * Specializations:
+     * - DevicePtr<void>/DevicePtr<const void> do not support pointer arithmetic (but relational operators are provided)
+     * - any device pointer pointing to mutable memory is implicitly convertible to DevicePtr<void>
+     * - any device pointer is implicitly convertible to DevicePtr<const void>
+     * - DevicePtr<void> can be explicitly converted to any device pointer
+     * - DevicePtr<const void> can be explicitly converted to any device pointer pointing to immutable memory
+     */
+    template <class T>
+    class DevicePtr {
+        static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
+
+    public:
+        using element_type = T;
+        using difference_type = std::ptrdiff_t;
+        using pointer = typename std::add_pointer<element_type>::type;
+        using reference = typename std::add_lvalue_reference<element_type>::type;
+
+        /* defaulted: default-initialization does not assign a value to the held pointer */
+        DevicePtr() = default;
+
+        /* raw pointer to device pointer (explicit to avoid accidental host/device mixing) */
+        CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }
+
+        /* assignment and compound operators return a reference to *this like their built-in counterparts */
+        CUDA4DNN_HOST_DEVICE DevicePtr& operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }
+
+        /** returns the wrapped raw pointer */
+        CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }
+
+        /* element access is available in device code only */
+        CUDA4DNN_DEVICE reference operator[](difference_type idx) const noexcept { return get()[idx]; }
+        CUDA4DNN_DEVICE reference operator*() const noexcept { return *get(); }
+        CUDA4DNN_DEVICE pointer operator->() const noexcept { return get(); }
+
+        /* implicit conversion to the const-qualified counterpart; disabled when T is already const */
+        template<class U = T, typename std::enable_if<!std::is_const<U>::value, bool>::type = true>
+        CUDA4DNN_HOST_DEVICE operator DevicePtr<typename std::add_const<U>::type>() const noexcept {
+            return DevicePtr<typename std::add_const<U>::type>{ptr};
+        }
+
+        /* true if the held pointer is not null */
+        CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }
+
+        CUDA4DNN_HOST_DEVICE DevicePtr& operator++() noexcept {
+            ++ptr;
+            return *this;
+        }
+
+        /* postfix forms return the value held before the update */
+        CUDA4DNN_HOST_DEVICE DevicePtr operator++(int) noexcept {
+            auto tmp = DevicePtr(*this);
+            ptr++;
+            return tmp;
+        }
+
+        CUDA4DNN_HOST_DEVICE DevicePtr& operator--() noexcept {
+            --ptr;
+            return *this;
+        }
+
+        CUDA4DNN_HOST_DEVICE DevicePtr operator--(int) noexcept {
+            auto tmp = DevicePtr(*this);
+            ptr--;
+            return tmp;
+        }
+
+        CUDA4DNN_HOST_DEVICE DevicePtr& operator+=(std::ptrdiff_t offset) noexcept {
+            ptr += offset;
+            return *this;
+        }
+
+        CUDA4DNN_HOST_DEVICE DevicePtr& operator-=(std::ptrdiff_t offset) noexcept {
+            ptr -= offset;
+            return *this;
+        }
+
+        /* lhs is taken by value, so the compound operators below modify a local copy */
+        CUDA4DNN_HOST_DEVICE friend DevicePtr operator+(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
+            return lhs += offset;
+        }
+
+        CUDA4DNN_HOST_DEVICE friend DevicePtr operator-(DevicePtr lhs, std::ptrdiff_t offset) noexcept {
+            return lhs -= offset;
+        }
+
+        CUDA4DNN_HOST_DEVICE friend difference_type operator-(DevicePtr lhs, DevicePtr rhs) noexcept {
+            return lhs.ptr - rhs.ptr;
+        }
+
+        /* relational operators: only `==` and `<` touch the raw pointers; the rest are derived from them */
+        CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
+        CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
+        CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
+        CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
+        CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
+        CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }
+
+        /* explicit conversion into a raw pointer */
+        CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }
+
+        CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
+            using std::swap;
+            swap(lhs.ptr, rhs.ptr);
+        }
+
+        /* prints the raw pointer followed by " (device)" */
+        template <class U, class V>
+        CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
+            os << other.get() << " (device)";
+            return os;
+        }
+
+    private:
+        pointer ptr;
+    };
+
+    /** specialization for untyped pointers to immutable device memory
+     *
+     * No pointer arithmetic or dereferencing is provided; only comparisons and conversions.
+     */
+    template <>
+    class DevicePtr<const void> {
+    public:
+        using element_type = const void;
+        using pointer = typename std::add_pointer<element_type>::type;
+
+        /* defaulted: default-initialization does not assign a value to the held pointer */
+        DevicePtr() = default;
+
+        /* host const void pointer to const void device pointer */
+        CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }
+
+        /* allow any device pointer to be implicitly converted to const void device pointer */
+        template <class T>
+        CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr{ ptr_.get() } { }
+
+        /* returns a reference to *this like the built-in assignment */
+        CUDA4DNN_HOST_DEVICE DevicePtr& operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }
+
+        /** returns the wrapped raw pointer */
+        CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }
+
+        /* true if the held pointer is not null */
+        CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }
+
+        /* relational operators: only `==` and `<` touch the raw pointers; the rest are derived from them */
+        CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
+        CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
+        CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
+        CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
+        CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
+        CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }
+
+        /* explicit conversion into host void pointer */
+        CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }
+
+        /* const void device pointer can be explicitly casted into any const device pointer type */
+        template <class T, typename std::enable_if<std::is_const<T>::value, bool>::type = true>
+        CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
+            /* DevicePtr<T>'s raw pointer constructor is explicit, so the object must be
+             * constructed explicitly; returning the raw pointer directly would not compile
+             */
+            return DevicePtr<T>(static_cast<T*>(ptr));
+        }
+
+        CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
+            using std::swap;
+            swap(lhs.ptr, rhs.ptr);
+        }
+
+        /* prints the raw pointer followed by " (device)" */
+        template <class U, class V>
+        CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
+            os << other.get() << " (device)";
+            return os;
+        }
+
+    private:
+        pointer ptr;
+    };
+
+    /** specialization for untyped pointers to mutable device memory
+     *
+     * No pointer arithmetic or dereferencing is provided; only comparisons and conversions.
+     */
+    template <>
+    class DevicePtr<void> {
+    public:
+        using element_type = void;
+        using pointer = typename std::add_pointer<element_type>::type;
+
+        /* defaulted: default-initialization does not assign a value to the held pointer */
+        DevicePtr() = default;
+
+        /* host pointer to device pointer */
+        CUDA4DNN_HOST_DEVICE explicit DevicePtr(pointer ptr_) noexcept : ptr{ ptr_ } { }
+
+        /* allow any device pointer to mutable memory to be implicitly converted to void device pointer */
+        template <class T, typename std::enable_if<!std::is_const<T>::value, bool>::type = true>
+        CUDA4DNN_HOST_DEVICE DevicePtr(DevicePtr<T> ptr_) noexcept : ptr { ptr_.get() } { }
+
+        /* returns a reference to *this like the built-in assignment */
+        CUDA4DNN_HOST_DEVICE DevicePtr& operator=(pointer ptr_) noexcept { ptr = ptr_; return *this; }
+
+        /** returns the wrapped raw pointer */
+        CUDA4DNN_HOST_DEVICE pointer get() const noexcept { return ptr; }
+
+        /* implicit conversion to the const void device pointer */
+        CUDA4DNN_HOST_DEVICE operator DevicePtr<const void>() const noexcept { return DevicePtr<const void>{ptr}; }
+
+        /* true if the held pointer is not null */
+        CUDA4DNN_HOST_DEVICE explicit operator bool() const noexcept { return ptr; }
+
+        /* relational operators: only `==` and `<` touch the raw pointers; the rest are derived from them */
+        CUDA4DNN_HOST_DEVICE friend bool operator==(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr == rhs.ptr; }
+        CUDA4DNN_HOST_DEVICE friend bool operator!=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs == rhs); }
+        CUDA4DNN_HOST_DEVICE friend bool operator<(DevicePtr lhs, DevicePtr rhs) noexcept { return lhs.ptr < rhs.ptr; }
+        CUDA4DNN_HOST_DEVICE friend bool operator>(DevicePtr lhs, DevicePtr rhs) noexcept { return rhs < lhs; }
+        CUDA4DNN_HOST_DEVICE friend bool operator<=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(rhs < lhs); }
+        CUDA4DNN_HOST_DEVICE friend bool operator>=(DevicePtr lhs, DevicePtr rhs) noexcept { return !(lhs < rhs); }
+
+        /* explicit conversion into host void pointer */
+        CUDA4DNN_HOST_DEVICE explicit operator pointer() const noexcept { return ptr; }
+
+        /* void device pointer can be explicitly casted into any device pointer type */
+        template <class T>
+        CUDA4DNN_HOST_DEVICE explicit operator DevicePtr<T>() const noexcept {
+            return DevicePtr<T>(static_cast<T*>(ptr));
+        }
+
+        CUDA4DNN_HOST friend void swap(DevicePtr& lhs, DevicePtr& rhs) noexcept {
+            using std::swap;
+            swap(lhs.ptr, rhs.ptr);
+        }
+
+        /* prints the raw pointer followed by " (device)" */
+        template <class U, class V>
+        CUDA4DNN_HOST friend std::basic_ostream<U, V>& operator<<(std::basic_ostream<U, V>& os, DevicePtr other) {
+            os << other.get() << " (device)";
+            return os;
+        }
+
+    private:
+        pointer ptr;
+    };
+
+    /** returns true if the address held by \p ptr is a multiple of \p alignment (in bytes)
+     *
+     * Pre-conditions:
+     * - \p alignment must be non-zero (it is used as a divisor)
+     */
+    template <class T>
+    bool is_aligned(DevicePtr<const T> ptr, std::size_t alignment) {
+        /* use the unsigned pointer-sized integer so that the modulo is computed on unsigned operands */
+        const auto addr = reinterpret_cast<std::uintptr_t>(ptr.get());
+        return addr % alignment == 0;
+    }
+
+    /** copies \p n elements from device memory \p src to host memory \p dest
+     *
+     * \param[out]  dest    host pointer
+     * \param[in]   src     device pointer
+     * \param[in]   n       number of elements of type T to copy
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+     * - \p n must be non-zero (StsBadArg is raised otherwise)
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(T *dest, DevicePtr<const T> src, std::size_t n) {
+        /* n is unsigned, so zero is the only count that can be rejected here */
+        if (n == 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive");
+        }
+
+        const std::size_t size_in_bytes = n * sizeof(T);
+        CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest, src.get(), size_in_bytes, cudaMemcpyDefault));
+    }
+
+    /** copies \p n elements from host memory \p src to device memory \p dest
+     *
+     * \param[out]  dest    device pointer
+     * \param[in]   src     host pointer
+     * \param[in]   n       number of elements of type T to copy
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+     * - \p n must be non-zero (StsBadArg is raised otherwise)
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(DevicePtr<T> dest, const T* src, std::size_t n) {
+        /* n is unsigned, so zero is the only count that can be rejected here */
+        if (n == 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive");
+        }
+
+        const std::size_t size_in_bytes = n * sizeof(T);
+        CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src, size_in_bytes, cudaMemcpyDefault));
+    }
+
+    /** copies \p n elements from device memory \p src to device memory \p dest
+     *
+     * \param[out]  dest    device pointer
+     * \param[in]   src     device pointer
+     * \param[in]   n       number of elements of type T to copy
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+     * - \p n must be non-zero (StsBadArg is raised otherwise)
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n) {
+        /* n is unsigned, so zero is the only count that can be rejected here */
+        if (n == 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive");
+        }
+
+        const std::size_t size_in_bytes = n * sizeof(T);
+        CUDA4DNN_CHECK_CUDA(cudaMemcpy(dest.get(), src.get(), size_in_bytes, cudaMemcpyDefault));
+    }
+
+    /** fills the memory of \p n elements starting at \p dest with the 8-bit value \p ch
+     *
+     * \param[out]  dest    device pointer
+     * \param[in]   ch      8-bit value to fill the device memory with
+     * \param[in]   n       number of elements of type T to fill (`n * sizeof(T)` bytes are requested)
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest must be large enough to hold \p n elements
+     * - \p n must be non-zero (StsBadArg is raised otherwise)
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n) {
+        /* n is unsigned, so zero is the only count that can be rejected here */
+        if (n == 0) {
+            CV_Error(Error::StsBadArg, "number of elements to set is zero");
+        }
+
+        CUDA4DNN_CHECK_CUDA(cudaMemset(dest.get(), ch, n * sizeof(T)));
+    }
+
+    /** copies \p n elements from device memory \p src to host memory \p dest asynchronously
+     *
+     * \param[out]  dest    host pointer
+     * \param[in]   src     device pointer
+     * \param[in]   n       number of elements of type T to copy
+     * \param       stream  CUDA stream that has to be used for the memory transfer
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+     * - \p dest points to page-locked memory
+     * - \p n must be non-zero (StsBadArg is raised otherwise)
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(T *dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
+        /* n is unsigned, so zero is the only count that can be rejected here */
+        if (n == 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive");
+        }
+
+        const std::size_t size_in_bytes = n * sizeof(T);
+        CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest, src.get(), size_in_bytes, cudaMemcpyDefault, stream.get()));
+    }
+
+    /** copies \p n elements from host memory \p src to device memory \p dest asynchronously
+     *
+     * \param[out]  dest    device pointer
+     * \param[in]   src     host pointer
+     * \param[in]   n       number of elements of type T to copy
+     * \param       stream  CUDA stream that has to be used for the memory transfer
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+     * - \p src points to page-locked memory
+     * - \p n must be non-zero (StsBadArg is raised otherwise)
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(DevicePtr<T> dest, const T *src, std::size_t n, const Stream& stream) {
+        /* n is unsigned, so zero is the only count that can be rejected here */
+        if (n == 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive");
+        }
+
+        const std::size_t size_in_bytes = n * sizeof(T);
+        CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src, size_in_bytes, cudaMemcpyDefault, stream.get()));
+    }
+
+    /** copies \p n elements from device memory \p src to device memory \p dest asynchronously
+     *
+     * \param[out]  dest    device pointer
+     * \param[in]   src     device pointer
+     * \param[in]   n       number of elements of type T to copy
+     * \param       stream  CUDA stream that has to be used for the memory transfer
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest and \p src must be large enough to hold \p n elements
+     * - \p n must be non-zero (StsBadArg is raised otherwise)
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memcpy(DevicePtr<T> dest, DevicePtr<const T> src, std::size_t n, const Stream& stream) {
+        /* n is unsigned, so zero is the only count that can be rejected here */
+        if (n == 0) {
+            CV_Error(Error::StsBadArg, "number of elements to copy is zero or negtaive");
+        }
+
+        const std::size_t size_in_bytes = n * sizeof(T);
+        CUDA4DNN_CHECK_CUDA(cudaMemcpyAsync(dest.get(), src.get(), size_in_bytes, cudaMemcpyDefault, stream.get()));
+    }
+
+    /** fills the memory of \p n elements starting at \p dest with the 8-bit value \p ch asynchronously
+     *
+     * \param[out]  dest    device pointer
+     * \param[in]   ch      8-bit value to fill the device memory with
+     * \param[in]   n       number of elements of type T to fill (`n * sizeof(T)` bytes are requested)
+     * \param       stream  CUDA stream that has to be used for the memory operation
+     *
+     * Pre-conditions:
+     * - memory pointed by \p dest must be large enough to hold \p n elements
+     * - \p n must be non-zero (StsBadArg is raised otherwise)
+     *
+     * Exception Guarantee: Basic
+     */
+    template <class T>
+    void memset(DevicePtr<T> dest, std::int8_t ch, std::size_t n, const Stream& stream) {
+        /* n is unsigned, so zero is the only count that can be rejected here */
+        if (n == 0) {
+            CV_Error(Error::StsBadArg, "number of elements to set is zero");
+        }
+
+        CUDA4DNN_CHECK_CUDA(cudaMemsetAsync(dest.get(), ch, n * sizeof(T), stream.get()));
+    }
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_POINTER_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/span.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/span.hpp
@@ -0,0 +1,83 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP
+
+#include "pointer.hpp"
+#include "nvcc_defs.hpp"
+
+#include <cstddef>
+#include <type_traits>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /** @brief provides non-owning mutable access for device arrays
+     *
+     *  const Span<T>/Span<T> provides mutable access to the elements unless T is const qualified
+     *  const Span<T> makes the span immutable but not the elements
+     *
+     *  A span stores a device pointer to the first element and the number of elements.
+     */
+    template <class T>
+    class Span {
+        static_assert(std::is_standard_layout<T>::value, "T must satisfy StandardLayoutType");
+
+    public:
+        using value_type = T;
+        using size_type = std::size_t;
+        using difference_type = std::ptrdiff_t;
+
+        using pointer = DevicePtr<value_type>;
+        using const_pointer = DevicePtr<typename std::add_const<value_type>::type>;
+        using reference = typename std::add_lvalue_reference<value_type>::type;
+        /* `::type` is required: without it the alias names the trait class instead of the reference type */
+        using const_reference = typename std::add_lvalue_reference<typename std::add_const<value_type>::type>::type;
+
+        using iterator = pointer;
+        using const_iterator = const_pointer;
+
+        /** constructs an empty span (null pointer, zero elements) */
+        Span() noexcept : ptr{ nullptr }, sz{ 0 } { }
+
+        /** constructs a span over the range [first, last)
+         *
+         * Pre-conditions:
+         * - \p last must not precede \p first (the distance is converted to an unsigned size)
+         */
+        CUDA4DNN_HOST_DEVICE Span(pointer first, pointer last) noexcept
+            : ptr{ first }, sz{ static_cast<size_type>(last - first) } { }
+
+        /** constructs a span of \p count elements starting at \p first */
+        CUDA4DNN_HOST_DEVICE Span(pointer first, size_type count) noexcept : ptr{ first }, sz{ count } { }
+
+        CUDA4DNN_HOST_DEVICE size_type size() const noexcept { return sz; }
+        CUDA4DNN_HOST_DEVICE bool empty() const noexcept { return size() == 0; }
+
+        /* element access is available in device code only; no bounds check is performed */
+        CUDA4DNN_DEVICE reference operator[](difference_type index) const { return ptr[index]; }
+        CUDA4DNN_HOST_DEVICE pointer data() const noexcept { return ptr; }
+
+        /* implicit conversion to a span of const elements; disabled when T is already const */
+        template<class U = T, class V = typename std::add_const<U>::type,
+            typename std::enable_if<!std::is_const<U>::value, bool>::type = true>
+            CUDA4DNN_HOST_DEVICE operator Span<V>() const noexcept { return Span<V>{ptr, sz}; }
+
+    private:
+        pointer ptr;
+        size_type sz;
+    };
+
+    /** @brief provides non-owning immutable view for device arrays
+     *
+     * View<T> is an alias for Span<const T>.
+     */
+    template <class T>
+    using View = Span<const T>;
+
+    /** checks the alignment of the starting address of a span/view
+     *
+     * \p alignment is given in number of elements (not bytes); it is scaled by `sizeof(T)`
+     * before the address is tested.
+     */
+    template <class T>
+    bool is_address_aligned(View<T> v, std::size_t alignment) {
+        const std::size_t alignment_in_bytes = alignment * sizeof(T);
+        return is_aligned(v.data(), alignment_in_bytes);
+    }
+
+    /** checks whether the number of elements in a span/view is a multiple of \p alignment */
+    template <class T>
+    bool is_size_aligned(View<T> v, std::size_t alignment) {
+        const auto remainder = v.size() % alignment;
+        return remainder == 0;
+    }
+
+    /** @brief checks that both the starting address and the size of the span/view are aligned
+     * \p alignment refers to the number of elements (not bytes)
+     */
+    template <class T>
+    bool is_fully_aligned(View<T> v, std::size_t alignment) {
+        /* the size is examined only when the address check passes */
+        if (!is_address_aligned(v, alignment))
+            return false;
+        return is_size_aligned(v, alignment);
+    }
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_SPAN_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/stream.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/stream.hpp
@@ -0,0 +1,118 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP
+
+#include "error.hpp"
+
+#include <opencv2/core.hpp>
+#include <opencv2/core/utils/logger.hpp>
+
+#include <cuda_runtime_api.h>
+
+#include <memory>
+#include <sstream>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /** @brief noncopyable smart CUDA stream
+     *
+     * UniqueStream is a smart non-sharable wrapper for CUDA stream handle which ensures that
+     * the handle is destroyed after use. Unless explicitly specified by a constructor argument,
+     * the stream object represents the default stream.
+     *
+     * A handle value of zero is treated as "nothing to destroy".
+     */
+    class UniqueStream {
+    public:
+        UniqueStream() noexcept : stream{ 0 } { }
+        UniqueStream(const UniqueStream&) = delete;
+        UniqueStream(UniqueStream&& other) noexcept {
+            stream = other.stream;
+            other.stream = 0;
+        }
+
+        /** if \p create is `true`, a new non-blocking stream is created; otherwise the handle stays zero */
+        UniqueStream(bool create) : stream{ 0 } {
+            if (create) {
+                CUDA4DNN_CHECK_CUDA(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+            }
+        }
+
+        ~UniqueStream() {
+            /* errors are logged instead of being propagated out of the destructor */
+            try {
+                if (stream != 0)
+                    CUDA4DNN_CHECK_CUDA(cudaStreamDestroy(stream));
+            } catch (const CUDAException& ex) {
+                std::ostringstream os;
+                os << "Asynchronous exception caught during CUDA stream destruction.\n";
+                os << ex.what();
+                os << "Exception will be ignored.\n";
+                CV_LOG_WARNING(0, os.str().c_str());
+            }
+        }
+
+        UniqueStream& operator=(const UniqueStream&) = delete;
+
+        /** exchanges the handles held by `*this` and \p other
+         *
+         * The handle previously held by `*this` is not dropped; it is handed over to \p other
+         * and gets destroyed along with it. Self-assignment is a no-op.
+         */
+        UniqueStream& operator=(UniqueStream&& other) noexcept {
+            std::swap(stream, other.stream);
+            return *this;
+        }
+
+        /** returns the raw CUDA stream handle */
+        cudaStream_t get() const noexcept { return stream; }
+
+        /** waits on the stream through cudaStreamSynchronize */
+        void synchronize() const { CUDA4DNN_CHECK_CUDA(cudaStreamSynchronize(stream)); }
+
+        /** returns true if cudaStreamQuery reports cudaErrorNotReady; other statuses go through the error check */
+        bool busy() const {
+            auto status = cudaStreamQuery(stream);
+            if (status == cudaErrorNotReady)
+                return true;
+            CUDA4DNN_CHECK_CUDA(status);
+            return false;
+        }
+
+    private:
+        cudaStream_t stream;
+    };
+
+    /** @brief sharable smart CUDA stream
+     *
+     * Stream is a smart sharable wrapper for CUDA stream handle which ensures that
+     * the handle is destroyed after use. Unless explicitly specified by a constructor argument,
+     * the stream object represents the default stream.
+     *
+     * @note Moving a Stream object to another invalidates the former
+     */
+    class Stream {
+    public:
+        Stream() : stream(std::make_shared<UniqueStream>()) { }
+        Stream(const Stream&) = default;
+        Stream(Stream&&) = default;
+
+        /** if \p create is `true`, a new stream will be created instead of the otherwise default stream */
+        Stream(bool create) : stream(std::make_shared<UniqueStream>(create)) { }
+
+        Stream& operator=(const Stream&) = default;
+        Stream& operator=(Stream&&) = default;
+
+        /** blocks the caller thread until all operations in the stream are complete
+         *
+         * The object must be valid (see `operator bool`); this is asserted.
+         */
+        void synchronize() const {
+            CV_Assert(stream);
+            stream->synchronize();
+        }
+
+        /** returns true if there are operations pending in the stream
+         *
+         * The object must be valid (see `operator bool`); this is asserted.
+         */
+        bool busy() const {
+            CV_Assert(stream);
+            return stream->busy();
+        }
+
+        /** returns true if the stream is valid */
+        explicit operator bool() const noexcept { return static_cast<bool>(stream); }
+
+        /** returns the raw CUDA stream handle
+         *
+         * Not declared noexcept: the validity assertion reports failure by raising an error,
+         * which must be able to leave the function.
+         */
+        cudaStream_t get() const {
+            CV_Assert(stream);
+            return stream->get();
+        }
+
+    private:
+        std::shared_ptr<UniqueStream> stream;
+    };
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_STREAM_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/tensor.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/tensor.hpp
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/tensor_ops.hpp
@@ -0,0 +1,384 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP
+
+#include "stream.hpp"
+#include "tensor.hpp"
+#include "pointer.hpp"
+#include "cublas.hpp"
+#include "cudnn.hpp"
+#include "workspace.hpp"
+
+#include "cudnn/convolution.hpp"
+#include "cudnn/pooling.hpp"
+#include "cudnn/lrn.hpp"
+#include "cudnn/softmax.hpp"
+#include "cudnn/transform.hpp"
+#include "cudnn/transpose_convolution.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <array>
+#include <vector>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    namespace tensor_ops {
+
+        /** @brief copies the contents of \p src into \p dest
+         *
+         * The transfer is skipped when both tensors report the same device pointer.
+         *
+         * Pre-conditions:
+         * - \p dest and \p src must have the same shape
+         *
+         * Exception Guarantee: Basic
+         */
+        template <class T> inline
+        void copy(const Stream& stream, TensorSpan<T> dest, TensorView<T> src) {
+            CV_Assert(is_shape_same(dest, src));
+
+            auto destination = dest.get();
+            auto source = src.get();
+
+            /* nothing to transfer when the destination already is the source */
+            if (destination != source)
+                memcpy(destination, source, dest.size(), stream);
+        }
+
+        /** @brief performs generalized matrix-multiplication
+         *
+         * \p transa and \p transb select whether \p A and \p B take part in the product
+         * as stored or transposed.
+         *
+         * Pre-conditions:
+         * - \p A and \p B must meet the mathematical requirements for matrix multiplication
+         * - \p result must be large enough to hold the result
+         *
+         * Exception Guarantee: Basic
+         */
+        template <class T> inline
+        void gemm(const cublas::Handle& handle, T beta, TensorSpan<T> result, T alpha, bool transa, TensorView<T> A, bool transb, TensorView<T> B) {
+            /* only tensors with an effective rank of two or less can be treated as matrices */
+            CV_Assert(get_effective_rank(A) <= 2 &&
+                get_effective_rank(B) <= 2 &&
+                get_effective_rank(result) <= 2);
+
+            /* validate the operand shapes for each combination of the transpose flags */
+            if (transa) {
+                if (transb) {
+                    /* op(A) = A^T, op(B) = B^T */
+                    CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
+                    CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-1));
+                    CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
+                } else {
+                    /* op(A) = A^T, op(B) = B */
+                    CV_Assert(A.get_axis_size(-1) == result.get_axis_size(-2));
+                    CV_Assert(A.get_axis_size(-2) == B.get_axis_size(-2));
+                    CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
+                }
+            } else {
+                if (transb) {
+                    /* op(A) = A, op(B) = B^T */
+                    CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
+                    CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-1));
+                    CV_Assert(B.get_axis_size(-2) == result.get_axis_size(-1));
+                } else {
+                    /* op(A) = A, op(B) = B */
+                    CV_Assert(A.get_axis_size(-2) == result.get_axis_size(-2));
+                    CV_Assert(A.get_axis_size(-1) == B.get_axis_size(-2));
+                    CV_Assert(B.get_axis_size(-1) == result.get_axis_size(-1));
+                }
+            }
+
+            const auto out_rows = result.get_axis_size(-2);
+            const auto out_cols = result.get_axis_size(-1);
+            /* the dimension shared by op(A) and op(B) */
+            const auto inner_dim = transa ? A.get_axis_size(-2) : A.get_axis_size(-1);
+            /* row lengths of the stored matrices are handed to cuBLAS as leading dimensions */
+            const auto lda = A.get_axis_size(-1);
+            const auto ldb = B.get_axis_size(-1);
+
+            /* The tensors are row-major whereas cublas::gemm works with column-major matrices.
+             * Reading a row-major matrix as column-major yields its transpose.
+             *
+             * Wanted: C = AB
+             * Passing the operands as they are would make cuBLAS compute: C^T = A^TB^T = (BA)^T
+             *
+             * Swapping the operands instead gives:
+             * C^T = B^TA^T = (AB)^T
+             *
+             * which is C = AB when read back in row-major order.
+             */
+            cublas::gemm<T>(handle,
+                transb, transa,
+                out_cols, out_rows, inner_dim,
+                alpha, B.get(), ldb,
+                A.get(), lda,
+                beta, result.get(), out_cols);
+        }
+
+        /** @brief applies cudnn::softmax along \p channel_axis of \p input
+         *
+         * The tensors are described to cuDNN as 4D tensors of shape
+         * [outer, channel, 1, inner] where `outer` is `input.size_range(0, channel_axis)`,
+         * `channel` is the size of \p channel_axis and `inner` is the size range of the
+         * axes which follow \p channel_axis.
+         *
+         * \p log is forwarded unchanged to cudnn::softmax
+         * (presumably it selects log-softmax; TODO confirm against cudnn/softmax.hpp)
+         *
+         * Pre-conditions:
+         * - \p output and \p input must have the same shape
+         *
+         * Exception Guarantee: Basic
+         */
+        template <class T> inline
+        void softmax(const cudnn::Handle& handle, TensorSpan<T> output, TensorView<T> input, int channel_axis, bool log) {
+            CV_Assert(is_shape_same(output, input));
+
+            /* pass the axis through clamp_axis before it is used to split the shape */
+            channel_axis = clamp_axis(channel_axis, input.rank());
+
+            const std::size_t num_outer = input.size_range(0, channel_axis);
+            const auto num_channels = input.get_axis_size(channel_axis);
+            const std::size_t num_inner = input.size_range(channel_axis + 1, input.rank());
+
+            std::array<std::size_t, 4> dims = { num_outer, num_channels, 1, num_inner };
+
+            /* input and output share the same 4D description */
+            cudnn::TensorDescriptor<T> inputDesc(dims);
+            cudnn::TensorDescriptor<T> outputDesc(dims);
+            cudnn::softmax(handle, outputDesc, output.get(), inputDesc, input.get(), log);
+        }
+    }
+
+    /** @brief move-only wrapper which bundles the cuDNN descriptors required for a convolution
+     *
+     * The descriptors and the algorithm are created once in the constructor from a
+     * params_type and are reused by every call to convolve().
+     */
+    template <class T>
+    class Convolution {
+        using TensorDescriptor = cudnn::TensorDescriptor<T>;
+        using FilterDescriptor = cudnn::FilterDescriptor<T>;
+        using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
+        using ConvolutionAlgorithm = cudnn::ConvolutionAlgorithm<T>;
+
+    public:
+        /** configuration from which the descriptors are built */
+        struct params_type {
+            std::vector<std::size_t> input_shape;
+            std::vector<std::size_t> filter_shape;
+
+            std::vector<std::size_t> padding;
+            std::vector<std::size_t> stride;
+            std::vector<std::size_t> dilation;
+
+            std::size_t groups;
+        };
+
+        Convolution() = default;
+        Convolution(const Convolution&) = delete;
+        Convolution(Convolution&&) = default;
+
+        /** builds the tensor, filter and convolution descriptors from \p params and selects an algorithm */
+        Convolution(cudnn::Handle handle, const params_type& params) {
+            cudnnHandle = std::move(handle);
+
+            inputTensorDesc = TensorDescriptor(params.input_shape);
+            filterDesc = FilterDescriptor(params.filter_shape);
+            convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
+
+            /* the output shape is not part of params_type; it is obtained from the descriptors */
+            std::vector<int> output_dims;
+            getConvolutionForwardOutputDim(convDesc, filterDesc, inputTensorDesc, output_dims);
+            outputTensorDesc = TensorDescriptor(output_dims);
+
+            algo = ConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, inputTensorDesc, outputTensorDesc);
+        }
+
+        Convolution& operator=(const Convolution&) = delete;
+        Convolution& operator=(Convolution&&) = default;
+
+        /** workspace size reported by the selected algorithm */
+        std::size_t get_workspace_size() const noexcept {
+            return algo.get_workspace_size();
+        }
+
+        /** forwards the stored descriptors and the given tensors to cudnn::convolve
+         *
+         * NOTE(review): the constants 1.0 and 0.0 are presumably the alpha/beta scaling
+         * factors of cudnn::convolve; confirm against cudnn/convolution.hpp
+         */
+        void convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
+            cudnn::convolve<T>(
+                cudnnHandle,
+                convDesc, algo, scratchpad,
+                filterDesc, filters.get(),
+                inputTensorDesc, input.get(),
+                1.0, 0.0, outputTensorDesc, output.get()
+            );
+        }
+
+    private:
+        cudnn::Handle cudnnHandle;
+        TensorDescriptor inputTensorDesc, outputTensorDesc;
+        FilterDescriptor filterDesc;
+        ConvolutionDescriptor convDesc;
+        ConvolutionAlgorithm algo;
+    };
+
+    /** @brief move-only wrapper which bundles the cuDNN descriptors required for a transpose convolution
+     *
+     * The operation is described in terms of the forward convolution it is the transpose of:
+     * the output of the transpose convolution is the input of that convolution and vice versa.
+     */
+    template <class T>
+    class TransposeConvolution {
+        using TensorDescriptor = cudnn::TensorDescriptor<T>;
+        using FilterDescriptor = cudnn::FilterDescriptor<T>;
+        using ConvolutionDescriptor = cudnn::ConvolutionDescriptor<T>;
+        using TransposeConvolutionAlgorithm = cudnn::TransposeConvolutionAlgorithm<T>;
+
+    public:
+        /** configuration from which the descriptors are built */
+        struct params_type {
+            std::vector<std::size_t> input_shape;
+            std::vector<std::size_t> output_shape;
+
+            std::vector<std::size_t> filter_shape;
+
+            std::vector<std::size_t> padding;
+            std::vector<std::size_t> stride;
+            std::vector<std::size_t> dilation;
+
+            std::size_t groups;
+        };
+
+        TransposeConvolution() = default;
+        TransposeConvolution(const TransposeConvolution&) = delete;
+        TransposeConvolution(TransposeConvolution&&) = default;
+
+        /** builds the descriptors from \p params and selects an algorithm
+         *
+         * Fails an assertion if \p params.input_shape does not match the output shape
+         * of the forward convolution applied to \p params.output_shape.
+         */
+        TransposeConvolution(cudnn::Handle handle, const params_type& params) {
+            cudnnHandle = std::move(handle);
+
+            filterDesc = FilterDescriptor(params.filter_shape);
+            convDesc = ConvolutionDescriptor(params.padding, params.stride, params.dilation, params.groups);
+
+            /* input_shape is the output shape for convolution
+             * output_shape is the input shape for convolution
+             */
+            convInputTensorDesc = TensorDescriptor(params.output_shape);
+
+            std::vector<int> conv_output_dims;
+            getConvolutionForwardOutputDim(convDesc, filterDesc, convInputTensorDesc, conv_output_dims);
+
+            /* the three-iterator std::equal reads as many elements from the second range as the
+             * first range holds; make sure that the second range is long enough
+             */
+            CV_Assert(conv_output_dims.size() <= params.input_shape.size());
+
+            /* the convolution output must be identical to what cuDNN expects */
+            CV_Assert(std::equal(std::begin(conv_output_dims), std::end(conv_output_dims), std::begin(params.input_shape)));
+
+            convOutputTensorDesc = TensorDescriptor(params.input_shape);
+
+            algo = TransposeConvolutionAlgorithm(cudnnHandle, convDesc, filterDesc, convOutputTensorDesc, convInputTensorDesc);
+        }
+
+        TransposeConvolution& operator=(const TransposeConvolution&) = delete;
+        TransposeConvolution& operator=(TransposeConvolution&&) = default;
+
+        /** workspace size reported by the selected algorithm */
+        std::size_t get_workspace_size() const noexcept {
+            return algo.get_workspace_size();
+        }
+
+        /** forwards the stored descriptors and the given tensors to cudnn::transpose_convolve
+         *
+         * \p input is paired with the convolution output descriptor and \p output with the
+         * convolution input descriptor.
+         */
+        void transpose_convolve(TensorSpan<T> output, TensorView<T> input, TensorView<T> filters, WorkspaceInstance scratchpad) {
+            cudnn::transpose_convolve<T>(
+                cudnnHandle,
+                convDesc, algo, scratchpad,
+                filterDesc, filters.get(),
+                convOutputTensorDesc, input.get(),
+                1.0, 0.0, convInputTensorDesc, output.get()
+            );
+        }
+
+    private:
+        cudnn::Handle cudnnHandle;
+        TensorDescriptor convInputTensorDesc, convOutputTensorDesc;
+        FilterDescriptor filterDesc;
+        ConvolutionDescriptor convDesc;
+        TransposeConvolutionAlgorithm algo;
+    };
+
+    /** @brief move-only wrapper which bundles the cuDNN descriptors required for pooling
+     *
+     * The input, output and pooling descriptors are created once in the constructor and
+     * are reused by every call to pool().
+     */
+    template <class T>
+    class Pooling {
+        using TensorDescriptor = cudnn::TensorDescriptor<T>;
+        using PoolingDescriptor = cudnn::PoolingDescriptor;
+
+    public:
+        using PoolingType = PoolingDescriptor::PoolingType;
+
+        /** configuration from which the descriptors are built */
+        struct params_type {
+            std::vector<std::size_t> input_shape;
+            std::vector<std::size_t> output_shape;
+
+            std::vector<std::size_t> window_size;
+            std::vector<std::size_t> padding;
+            std::vector<std::size_t> stride;
+
+            PoolingType type;
+        };
+
+        Pooling() = default;
+        Pooling(const Pooling&) = delete;
+        Pooling(Pooling&&) = default;
+
+        /** builds the tensor and pooling descriptors from \p params */
+        Pooling(cudnn::Handle handle, const params_type& params) {
+            cudnnHandle = std::move(handle);
+
+            inputTensorDesc = TensorDescriptor(params.input_shape);
+            poolingDesc = PoolingDescriptor(params.window_size, params.padding, params.stride, params.type);
+
+            /* the output shape is taken from the caller as is; the disabled lines below would
+             * have queried it from the descriptors instead
+             */
+            //std::vector<int> output_dim;
+            //getPoolingForwardOutputDim(poolingDesc, inputTensorDesc, output_dim);
+            outputTensorDesc = TensorDescriptor(params.output_shape);
+        }
+
+        Pooling& operator=(const Pooling&) = delete;
+        Pooling& operator=(Pooling&&) = default;
+
+        /** forwards the stored descriptors and the given tensors to cudnn::pool
+         *
+         * @note the argument order is (input, output)
+         */
+        void pool(TensorView<T> input, TensorSpan<T> output) {
+            cudnn::pool<T>(
+                cudnnHandle,
+                poolingDesc,
+                inputTensorDesc, input.get(),
+                1.0, 0.0, outputTensorDesc, output.get()
+            );
+        }
+
+    private:
+        cudnn::Handle cudnnHandle;
+        TensorDescriptor inputTensorDesc, outputTensorDesc;
+        PoolingDescriptor poolingDesc;
+    };
+
+    /** @brief move-only wrapper around cudnn::LRNForward
+     *
+     * The LRN descriptor is created once in the constructor; the tensor descriptors are
+     * created from the shapes of the tensors on every call to normalize().
+     */
+    template <class T>
+    class LRN {
+        using LRNDescriptor = cudnn::LRNDescriptor;
+        using TensorDescriptor = cudnn::TensorDescriptor<T>;
+
+    public:
+        using LRNType = LRNDescriptor::LRNType;
+
+        LRN() = default;
+        LRN(const LRN&) = delete;
+        LRN(LRN&&) = default;
+
+        /** stores \p handle and builds the LRN descriptor from the remaining arguments */
+        LRN(cudnn::Handle handle, std::size_t local_size, T alpha, T beta, T k, LRNType type) {
+            cudnnHandle = std::move(handle);
+            lrnDesc = LRNDescriptor(local_size, alpha, beta, k, type);
+        }
+
+        LRN& operator=(const LRN&) = delete;
+        LRN& operator=(LRN&&) = default;
+
+        /** forwards the LRN descriptor, the given tensors and \p workspace to cudnn::LRNForward
+         *
+         * @note the argument order is (input, output, workspace)
+         */
+        void normalize(TensorView<T> input, TensorSpan<T> output, WorkspaceInstance workspace) {
+            cudnn::LRNForward<T>(
+                cudnnHandle,
+                lrnDesc,
+                TensorDescriptor(input.shape_as_vector()), input.get(),
+                1.0, 0.0, TensorDescriptor(output.shape_as_vector()), output.get(),
+                workspace
+            );
+        }
+
+    private:
+        cudnn::Handle cudnnHandle;
+        LRNDescriptor lrnDesc;
+    };
+
+    /** @brief move-only wrapper around cudnn::transform
+     *
+     * The transform descriptor is created once in the constructor from the left and right
+     * padding; the tensor descriptors are created from the shapes of the tensors on every
+     * call to transform().
+     */
+    template <class T>
+    class TensorTransform {
+        using TensorTransformDescriptor = cudnn::TensorTransformDescriptor;
+        using TensorDescriptor = cudnn::TensorDescriptor<T>;
+
+    public:
+        TensorTransform() = default;
+        TensorTransform(const TensorTransform&) = delete;
+        TensorTransform(TensorTransform&&) = default;
+
+        /** stores \p handle and builds the transform descriptor from \p paddingLeft and \p paddingRight
+         *
+         * Both paddings must be given as the same container type.
+         */
+        template <class SequenceContainer>
+        TensorTransform(cudnn::Handle handle, const SequenceContainer& paddingLeft, const SequenceContainer& paddingRight) {
+            cudnnHandle = std::move(handle);
+            transDesc = TensorTransformDescriptor(paddingLeft, paddingRight);
+        }
+
+        TensorTransform& operator=(const TensorTransform&) = delete;
+        TensorTransform& operator=(TensorTransform&&) = default;
+
+        /** forwards the transform descriptor and the given tensors to cudnn::transform
+         *
+         * @note the argument order is (input, output)
+         */
+        void transform(TensorView<T> input, TensorSpan<T> output) {
+            cudnn::transform<T>(
+                cudnnHandle,
+                transDesc,
+                TensorDescriptor(input.shape_as_vector()), input.get(),
+                TensorDescriptor(output.shape_as_vector()), output.get()
+            );
+        }
+
+    private:
+        cudnn::Handle cudnnHandle;
+        TensorTransformDescriptor transDesc;
+    };
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_TENSOR_OPS_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/workspace.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/csl/workspace.hpp
@@ -0,0 +1,166 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP
+
+#include "pointer.hpp"
+#include "span.hpp"
+#include "tensor.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace csl {
+
+    /** @brief maintains a single block of reusable device memory
+     *
+     * Each Workspace object is intended to be used by a single entity at a time but by
+     * different entities at different times. It maintains a single reusable block of memory which
+     * is sufficient for the largest consumer.
+     */
+    class Workspace {
+    public:
+
+        /** @brief reserve \p bytes of memory
+         *
+         * The block is reset only when \p bytes exceeds the current size; smaller requests
+         * leave the existing block untouched.
+         */
+        void require(std::size_t bytes) {
+            if (bytes > ptr.size())
+                ptr.reset(bytes);
+        }
+
+        /** @brief number of bytes reserved by the largest consumer */
+        std::size_t size() const noexcept {
+            return ptr.size();
+        }
+
+        /** @brief returns the pointer to the workspace memory */
+        DevicePtr<unsigned char> get() {
+            return ptr.get();
+        }
+
+    private:
+        /* owner of the workspace memory */
+        ManagedPtr<unsigned char> ptr;
+    };
+
+    /** used to compute total workspace size from several workspace requests
+     *
+     * Every request is rounded up to a multiple of 256 bytes before it is added to the total.
+     */
+    class WorkspaceBuilder {
+    public:
+        WorkspaceBuilder() noexcept : max_size_in_bytes(0) { }
+
+        /** request memory for \p count number of elements of the type \tparam T */
+        template <class T = std::int8_t>
+        void require(std::size_t count) noexcept {
+            const std::size_t requested_bytes = count * sizeof(T);
+            /* pad the request to the next multiple of 256 bytes */
+            const std::size_t padded_bytes = ((requested_bytes + 255) / 256) * 256;
+            max_size_in_bytes = max_size_in_bytes + padded_bytes;
+        }
+
+        /** returns the total workspace memory that is required */
+        std::size_t required_workspace_size() const noexcept {
+            return max_size_in_bytes;
+        }
+
+    private:
+        /* running sum (in bytes) of all the padded requests */
+        std::size_t max_size_in_bytes;
+    };
+
+    /** general memory block from a workspace which can be passed on to the requester
+     *
+     * The constructor is private; instances are created by WorkspaceAllocator.
+     */
+    class WorkspaceInstance {
+    public:
+
+        /** returns a device pointer to the workspace memory */
+        template <class T = void>
+        DevicePtr<T> get() const noexcept {
+            return static_cast<DevicePtr<T>>(ptr);
+        }
+
+        /** returns the size of the workspace memory in bytes */
+        std::size_t size_in_bytes() const noexcept {
+            return size_in_bytes_;
+        }
+
+        /** creates a Span<T> of \p count elements from the workspace memory
+         *
+         * If \p count is zero, the span holds as many whole elements as fit in the block.
+         * Raises Error::StsNoMem if the block cannot hold \p count elements.
+         */
+        template <class T>
+        Span<T> get_span(std::size_t count = 0) const {
+            if (count == 0)
+                count = size_in_bytes_ / sizeof(T);
+
+            if (count * sizeof(T) > size_in_bytes_)
+                CV_Error(Error::StsNoMem, "memory not sufficient");
+
+            return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
+        }
+
+        /** creates a TensorSpan<T> of the given shape from the workspace memory
+         *
+         * Raises Error::StsNoMem if the block cannot hold a tensor of the requested shape.
+         */
+        template <class T, class ForwardItr>
+        TensorSpan<T> get_tensor_span(ForwardItr shape_begin, ForwardItr shape_end) const {
+            /* std::accumulate carries the running value in the type of its initial value; seed it
+             * with std::size_t so that the element count is not computed (and overflowed) as `int`
+             */
+            const auto required_size = std::accumulate(shape_begin, shape_end, std::size_t{ 1 }, std::multiplies<std::size_t>());
+            if (required_size * sizeof(T) > size_in_bytes_)
+                CV_Error(Error::StsNoMem, "memory not sufficient");
+            return TensorSpan<T>(static_cast<DevicePtr<T>>(ptr), shape_begin, shape_end);
+        }
+
+    private:
+        DevicePtr<void> ptr;
+        std::size_t size_in_bytes_;
+
+        /* only WorkspaceAllocator may create instances */
+        friend class WorkspaceAllocator;
+        WorkspaceInstance(DevicePtr<void> ptr_, std::size_t size_in_bytes__)
+            : ptr{ ptr_ }, size_in_bytes_{ size_in_bytes__ } { }
+    };
+
+    /** used to split a single workspace into constituents
+     *
+     * Memory is handed out from the front of the workspace in multiples of 256 bytes.
+     * A default constructed allocator has no memory to hand out.
+     */
+    class WorkspaceAllocator {
+    public:
+        WorkspaceAllocator() = default;
+
+        /** starts allocating from the beginning of \p workspace
+         *
+         * The workspace memory must be aligned to 256 bytes and its size must be a multiple of 256.
+         */
+        WorkspaceAllocator(Workspace& workspace) noexcept
+            : current{ workspace.get() }, bytes_remaining { workspace.size() }
+        {
+            CV_Assert(is_aligned<void>(current, 256));
+            CV_Assert(bytes_remaining % 256 == 0);
+        }
+
+        /** allocates a Span<T> of \p count elements from the workspace memory
+         *
+         * If \p count is zero, all of the remaining memory is allocated.
+         */
+        template <class T>
+        Span<T> get_span(std::size_t count = 0) {
+            return acquire<T>(count);
+        }
+
+        /** allocates a TensorSpan<T> of the given shape from the workspace memory */
+        template <class T, class ForwardItr>
+        TensorSpan<T> get_tensor_span(ForwardItr start, ForwardItr end) {
+            /* std::accumulate carries the running value in the type of its initial value; seed it
+             * with std::size_t so that the element count is not computed (and overflowed) as `int`
+             */
+            const auto required_size = std::accumulate(start, end, std::size_t{ 1 }, std::multiplies<std::size_t>());
+            return TensorSpan<T>(acquire<T>(required_size).data(), start, end);
+        }
+
+        /** allocates a WorkspaceInstance of size \p bytes from the workspace memory
+         *
+         * If \p bytes is zero, all of the remaining memory is allocated.
+         */
+        WorkspaceInstance get_instance(std::size_t bytes = 0) {
+            auto span = acquire(bytes);
+            return WorkspaceInstance(DevicePtr<void>(span.data()), span.size());
+        }
+
+    private:
+        /** carves \p count elements of type T out of the remaining memory
+         *
+         * The amount consumed is rounded up to a multiple of 256 bytes.
+         * Raises Error::StsNoMem if the remaining memory is not sufficient.
+         */
+        template <class T = std::int8_t>
+        Span<T> acquire(std::size_t count = 0) {
+            auto ptr = current;
+
+            /* zero is the request for everything that is left */
+            if (count == 0)
+                count = bytes_remaining / sizeof(T);
+
+            auto blocks256 = (count * sizeof(T) + 255) / 256;
+            if (bytes_remaining < blocks256 * 256)
+                CV_Error(Error::StsNoMem, "out of workspace memory");
+
+            bytes_remaining -= blocks256 * 256;
+            current = static_cast<DevicePtr<std::int8_t>>(current) + blocks256 * 256;
+            return Span<T>(static_cast<DevicePtr<T>>(ptr), count);
+        }
+
+        DevicePtr<void> current;
+        /* zero-initialized so that a default constructed allocator never reads an indeterminate value */
+        std::size_t bytes_remaining = 0;
+    };
+
+}}}} /* namespace cv::dnn::cuda4dnn::csl */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CSL_WORKSPACE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/cxx_utils/is_iterator.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/cxx_utils/is_iterator.hpp
@@ -0,0 +1,31 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP
+
+#include <iterator>
+#include <type_traits>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils {
+
+    namespace detail {
+        /* primary template: used when std::iterator_traits<T> has no usable iterator_category
+         * or when that category is not derived from Category
+         */
+        template <class T, class Category, class Enable = void>
+        struct is_iterator_helper : std::false_type { };
+
+        /* used only when the iterator_category of T is Category or is derived from it */
+        template <class T, class Category>
+        struct is_iterator_helper<
+            T, Category,
+            typename std::enable_if<
+                std::is_base_of<Category, typename std::iterator_traits<T>::iterator_category>::value
+            >::type
+        > : std::true_type { };
+    }
+
+    /** `value` is true when the iterator category of T is (derived from) std::input_iterator_tag */
+    template <class T>
+    using is_iterator = detail::is_iterator_helper<T, std::input_iterator_tag>;
+
+    /** `value` is true when the iterator category of T is (derived from) std::forward_iterator_tag */
+    template <class T>
+    using is_forward_iterator = detail::is_iterator_helper<T, std::forward_iterator_tag>;
+
+}}}} /* namespace cv::dnn::cuda4dnn::cxx_utils */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_IS_ITERATOR_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/cxx_utils/resizable_static_array.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/cxx_utils/resizable_static_array.hpp
@@ -0,0 +1,110 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP
+
+#include <cstddef>
+#include <array>
+#include <cassert>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace cxx_utils {
+
+    /** @brief fixed-capacity array with a size that can be changed at run time
+     *
+     * Storage for \p maxN elements is held in a std::array member. Resizing only changes the
+     * number of elements which are considered to be in use; elements are never constructed
+     * or destroyed by a resize.
+     */
+    template <class T, std::size_t maxN>
+    class resizable_static_array {
+        using container_type = std::array<T, maxN>;
+
+    public:
+        using value_type                = typename container_type::value_type;
+        using size_type                 = typename container_type::size_type;
+        using difference_type           = typename container_type::difference_type;
+        using reference                 = typename container_type::reference;
+        using const_reference           = typename container_type::const_reference;
+        using pointer                   = typename container_type::pointer;
+        using const_pointer             = typename container_type::const_pointer;
+        using iterator                  = typename container_type::iterator;
+        using const_iterator            = typename container_type::const_iterator;
+        using reverse_iterator          = typename container_type::reverse_iterator;
+        using const_reverse_iterator    = typename container_type::const_reverse_iterator;
+
+        /** creates an array with no elements in use */
+        resizable_static_array() noexcept : size_{ 0 } { }
+
+        /** creates an array with \p sz elements in use; \p sz must not exceed the capacity */
+        explicit resizable_static_array(size_type sz) noexcept : size_{ sz } {
+            assert(sz <= capacity());
+        }
+
+        /** returns true if no elements are in use */
+        bool empty() const noexcept { return size_ == 0; }
+        size_type size() const noexcept { return size_; }
+        size_type capacity() const noexcept { return maxN; }
+
+        /** changes the number of elements in use; \p sz must not exceed the capacity */
+        void resize(size_type sz) noexcept {
+            assert(sz <= capacity());
+            size_ = sz;
+        }
+
+        void clear() noexcept { size_ = 0; }
+
+        /** replaces the contents with a copy of the range [first, last) */
+        template <class ForwardItr>
+        void assign(ForwardItr first, ForwardItr last) {
+            resize(std::distance(first, last));
+            std::copy(first, last, begin());
+        }
+
+        iterator begin() noexcept { return std::begin(arr); }
+        iterator end() noexcept { return std::begin(arr) + size(); }
+
+        const_iterator begin() const noexcept { return arr.cbegin(); }
+        const_iterator end() const noexcept { return arr.cbegin() + size(); }
+
+        const_iterator cbegin() const noexcept { return arr.cbegin(); }
+        const_iterator cend() const noexcept { return arr.cbegin() + size(); }
+
+        /* the reverse iterators must be constructed explicitly from the forward iterators:
+         * rbegin() wraps end() (one past the last element in use) and rend() wraps begin()
+         */
+        reverse_iterator rbegin() noexcept { return reverse_iterator(end()); }
+        reverse_iterator rend() noexcept { return reverse_iterator(begin()); }
+
+        const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); }
+        const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); }
+
+        const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(cend()); }
+        const_reverse_iterator crend() const noexcept { return const_reverse_iterator(cbegin()); }
+
+        reference operator[](size_type pos) {
+            assert(pos < size());
+            return arr[pos];
+        }
+
+        const_reference operator[](size_type pos) const {
+            assert(pos < size());
+            return arr[pos];
+        }
+
+        /** inserts a copy of \p value before \p pos; the array must not be full */
+        iterator insert(iterator pos, const T& value) {
+            resize(size() + 1);
+            /* shift [pos, old end) one slot to the right to open a gap at pos */
+            std::move_backward(pos, end() - 1, end());
+            *pos = value;
+            return pos;
+        }
+
+        /** inserts \p value before \p pos by moving from it; the array must not be full */
+        iterator insert(iterator pos, T&& value) {
+            resize(size() + 1);
+            std::move_backward(pos, end() - 1, end());
+            *pos = std::move(value);
+            return pos;
+        }
+
+        /** removes the element at \p pos and returns an iterator to the element which followed it */
+        iterator erase(iterator pos) {
+            /* shift the elements after pos one slot to the left */
+            std::move(pos + 1, end(), pos);
+            resize(size() - 1);
+            return pos;
+        }
+
+        pointer data() noexcept { return arr.data(); }
+        const_pointer data() const noexcept { return arr.data(); }
+
+    private:
+        /* number of elements in use; never exceeds maxN */
+        std::size_t size_;
+        container_type arr;
+    };
+
+}}}} /* namespace cv::dnn::cuda4dnn::cxx_utils */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_CXX_UTILS_RESIZABLE_STATIC_ARRAY_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/activations.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/activations.hpp
@@ -0,0 +1,50 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    /* Declarations of the activation function templates; the definitions are not part of this header.
+     *
+     * Every function takes a stream, an \p output span and an \p input view of the same element type.
+     * NOTE(review): presumably each one applies the named activation element-wise and enqueues
+     * the work on \p stream; confirm against the definitions.
+     */
+    template <class T>
+    void abs(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+    template <class T>
+    void tanh(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+    template <class T>
+    void swish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+    template <class T>
+    void mish(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+    template <class T>
+    void sigmoid(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+    template <class T>
+    void bnll(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+    template <class T>
+    void elu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input);
+
+    /* takes an additional \p slope parameter */
+    template <class T>
+    void relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T slope);
+
+    /* takes the additional \p floor and \p ceiling parameters */
+    template <class T>
+    void clipped_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T floor, T ceiling);
+
+    /* \p slope is a view here rather than a single value; \p inner_size is passed alongside it
+     * (presumably the number of consecutive elements which share one slope; TODO confirm)
+     */
+    template <class T>
+    void axiswise_relu(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, std::size_t inner_size, csl::View<T> slope);
+
+    /* takes the additional \p exp, \p scale and \p shift parameters */
+    template <class T>
+    void power(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T exp, T scale, T shift);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ACTIVATIONS_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp
@@ -0,0 +1,38 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+
+    /* Declarations of the in-place bias + activation function templates; the definitions are
+     * not part of this header.
+     *
+     * Every function takes a stream, a single \p inplace_output span, an \p inner_size and a
+     * \p bias view; some take extra activation parameters.
+     * NOTE(review): presumably the bias is added and the named activation is applied to
+     * \p inplace_output in place, with \p inner_size being the number of consecutive elements
+     * which share one bias value; confirm against the definitions.
+     */
+    template <class T>
+    void biasN_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T slope);
+
+    template <class T>
+    void biasN_clipped_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T floor, T ceiling);
+
+    template <class T>
+    void biasN_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T exp);
+
+    template <class T>
+    void biasN_tanh_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
+
+    template <class T>
+    void biasN_sigmoid_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
+
+    template <class T>
+    void biasN_swish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
+
+    template <class T>
+    void biasN_mish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/concat.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/concat.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. ConcatOp::forward calls this once per input, passing the running offset along the concat axis as `output_axis_offset`.
+    template <class T>
+    void concat(
+        const csl::Stream& stream,
+        csl::TensorSpan<T> output, std::size_t output_axis_offset,
+        csl::TensorView<T> input, std::size_t axis);
+    // Declaration only. ConcatOp::forward passes one offset per input axis in `axis_offsets`; the vector is taken by value.
+    template <class T>
+    void concat_with_offsets(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> axis_offsets);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CONCAT_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/crop_and_resize.hpp
@@ -0,0 +1,19 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/span.hpp"
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only; no definition in this header. NOTE(review): layout and meaning of `boxes` are not visible here - check the kernel source.
+    template <class T>
+    void crop_and_resize(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::View<T> boxes);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_CROP_AND_RESIZE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/eltwise_ops.hpp
@@ -0,0 +1,32 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. NOTE(review): presumably the element-wise maximum of `x` and `y` written to `output` - confirm.
+    template <class T>
+    void eltwise_max_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+    // Declaration only. NOTE(review): presumably the element-wise sum of `x` and `y` - confirm.
+    template <class T>
+    void eltwise_sum_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+    // Declaration only. NOTE(review): presumably coeff_x * x + coeff_y * y, element-wise - confirm.
+    template <class T>
+    void eltwise_sum_coeff_2(const csl::Stream& stream, csl::Span<T> output, T coeff_x, csl::View<T> x, T coeff_y, csl::View<T> y);
+    // Declaration only. NOTE(review): presumably the element-wise product of `x` and `y` - confirm.
+    template <class T>
+    void eltwise_prod_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+    // Declaration only. NOTE(review): presumably x / y, element-wise - confirm the operand order in the kernel source.
+    template <class T>
+    void eltwise_div_2(const csl::Stream& stream, csl::Span<T> output, csl::View<T> x, csl::View<T> y);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ELTWISE_OPS_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/fill.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/fill.hpp
@@ -0,0 +1,18 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. ConcatOp::forward calls this with value 0.0 on its output before copying the inputs in.
+    template <class T>
+    void fill(const csl::Stream& stream, csl::Span<T> output, T value);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_FILL_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/max_unpooling.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/max_unpooling.hpp
@@ -0,0 +1,32 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. Here `indices` is a TensorSpan next to `output`; NOTE(review): presumably both are written - confirm.
+    template <class T>
+    void max_pooling_with_indices(
+        const csl::Stream& stream,
+        csl::TensorSpan<T> output, csl::TensorSpan<T> indices, csl::TensorView<T> input,
+        const std::vector<std::size_t>& kernel_size, const std::vector<std::size_t>& strides,
+        const std::vector<std::size_t>& padding_left);
+    // Declaration only. Here `indices` is a TensorView next to `input`; note the window parameter is named `window_size`, not `kernel_size`.
+    template <class T>
+    void max_unpooling(
+        const csl::Stream& stream,
+        csl::TensorSpan<T> output, csl::TensorView<T> input, csl::TensorView<T> indices,
+        const std::vector<std::size_t>& window_size, const std::vector<std::size_t>& strides,
+        const std::vector<std::size_t>& padding_left);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_MAX_UNPOOLING_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/normalize.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/normalize.hpp
@@ -0,0 +1,24 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. The caller supplies a scratch span as `workspace`; NOTE(review): its required size and the meaning of `norm` are not visible here.
+    template <class T>
+    void normalize(
+        const csl::Stream& stream,
+        csl::Span<T> output, csl::View<T> input,
+        std::size_t outer_size, std::size_t mid_size, std::size_t inner_size, std::size_t norm, T epsilon,
+        csl::Span<T> workspace);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_NORMALIZE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/padding.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/padding.hpp
@@ -0,0 +1,25 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. `ranges` holds one (size_t, size_t) pair per entry and is taken by value; NOTE(review): presumably one pair per axis - confirm.
+    template <class T>
+    void copy_with_reflection101(
+        const csl::Stream& stream,
+        csl::TensorSpan<T> output, csl::TensorView<T> input,
+        std::vector<std::pair<std::size_t, std::size_t>> ranges);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PADDING_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/permute.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/permute.hpp
@@ -0,0 +1,21 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. `order` is taken by value; NOTE(review): presumably order[i] names the input axis placed at output axis i - confirm.
+    template <class T>
+    void permute(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, std::vector<std::size_t> order);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PERMUTE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/prior_box.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/prior_box.hpp
@@ -0,0 +1,28 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. The box and offset views are always `float`, independent of the output element type T.
+    template <class T>
+    void generate_prior_boxes(
+        const csl::Stream& stream,
+        csl::Span<T> output,
+        csl::View<float> boxWidth, csl::View<float> boxHeight, csl::View<float> offsetX, csl::View<float> offsetY, float stepX, float stepY,
+        std::vector<float> variance, // NOTE(review): std::vector is used, but <vector> is not among this header's own includes
+        std::size_t numPriors,
+        std::size_t layerWidth, std::size_t layerHeight,
+        std::size_t imageWidth, std::size_t imageHeight,
+        bool normalize, bool clip);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_PRIOR_BOX_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/region.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/region.hpp
@@ -0,0 +1,25 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. The final flag selects between two modes; its name says sigmoid when true, softmax when false.
+    template <class T>
+    void region(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, csl::View<T> bias,
+        T object_prob_cutoff, T class_prob_cutoff,
+        std::size_t boxes_per_cell, std::size_t box_size,
+        std::size_t rows, std::size_t cols,
+        std::size_t height_norm, std::size_t width_norm,
+        bool if_true_sigmoid_else_softmax);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_REGION_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/resize.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/resize.hpp
@@ -0,0 +1,21 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. Takes no scale arguments; NOTE(review): presumably nearest-neighbour resize from `input` to the shape of `output` - confirm.
+    template <class T>
+    void resize_nn(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input);
+    // Declaration only. Scales are passed explicitly as `float`, y before x; NOTE(review): presumably bilinear interpolation - confirm.
+    template <class T>
+    void resize_bilinear(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, float scale_y, float scale_x);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_RESIZE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/roi_pooling.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/roi_pooling.hpp
@@ -0,0 +1,19 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ROI_POOLING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ROI_POOLING_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/span.hpp"
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only; no definition in this header. NOTE(review): layout of `rois` and use of `spatial_scale` are not visible here - check the kernel source.
+    template <class T>
+    void roi_pooling(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, csl::View<T> rois, T spatial_scale);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_ROI_POOLING_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/scale_shift.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/scale_shift.hpp
@@ -0,0 +1,45 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. NOTE(review): presumably adds the scalar `alpha` to every element - confirm.
+    template <class T>
+    void bias1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);
+    // Declaration only. NOTE(review): presumably adds a bias selected per group of `inner_size` elements - confirm.
+    template <class T>
+    void biasN(const csl::Stream& stream,
+        csl::TensorSpan<T> output,
+        csl::TensorView<T> input, std::size_t inner_size,
+        csl::TensorView<T> bias);
+    // Declaration only. NOTE(review): presumably multiplies every element by the scalar `alpha` - confirm.
+    template <class T>
+    void scale1(const csl::Stream& stream, csl::TensorSpan<T> output, csl::TensorView<T> input, T alpha);
+    // Declaration only. NOTE(review): presumably multiplies by a weight selected per group of `inner_size` elements - confirm.
+    template <class T>
+    void scaleN(const csl::Stream& stream,
+        csl::TensorSpan<T> output,
+        csl::TensorView<T> input, std::size_t inner_size,
+        csl::TensorView<T> weights);
+    // Declaration only. Unlike its neighbours this one takes Span/View rather than TensorSpan/TensorView. NOTE(review): presumably alpha * x + beta - confirm.
+    template <class T>
+    void scale1_with_bias1(const csl::Stream& stream, csl::Span<T> output, csl::View<T> input, T alpha, T beta);
+    // Declaration only. BatchNormOp::forward calls this with inner_size = input.size_range(2, input.rank()).
+    template <class T>
+    void scaleN_with_biasN(
+        const csl::Stream& stream,
+        csl::TensorSpan<T> output,
+        csl::TensorView<T> input, std::size_t inner_size,
+        csl::TensorView<T> weights, csl::TensorView<T> bias);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SCALE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/slice.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/kernels/slice.hpp
@@ -0,0 +1,22 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include <cstddef>
+
+namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {
+    // Declaration only. `offsets` is taken by value. NOTE(review): std::vector is used, but <vector> is not among this header's own includes.
+    template <class T>
+    void slice(const csl::Stream& stream,
+        csl::TensorSpan<T> output, csl::TensorView<T> input,
+        std::vector<std::size_t> offsets);
+
+}}}} /* namespace cv::dnn::cuda4dnn::kernels */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_SLICE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/activation.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/activation.hpp
@@ -0,0 +1,346 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/activations.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    // CUDA backend node that runs kernels::relu<T>, with a fixed slope, on every input/output pair it is given.
+    template <class T>
+    class ReLUOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node and used for every kernel call; slope_ is stored and forwarded to kernels::relu
+        ReLUOp(csl::Stream stream_, T slope_)
+            : stream(std::move(stream_)), slope{ slope_ } { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::relu<T>(stream, output, input, slope);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        const T slope;
+    };
+
+    // CUDA backend node that runs kernels::clipped_relu<T>, with fixed bounds, on every input/output pair it is given.
+    template <class T>
+    class ClippedReLUOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; min_ and max_ are stored and forwarded, in that order, to kernels::clipped_relu
+        ClippedReLUOp(csl::Stream stream_, T min_, T max_)
+            : stream(std::move(stream_)), min{ min_ }, max{ max_ } { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::clipped_relu<T>(stream, output, input, min, max);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        const T min, max;
+    };
+
+    // CUDA backend node that runs kernels::axiswise_relu<T> with a slope tensor uploaded once at construction.
+    template <class T>
+    class ChannelwiseReLUOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // Rejects an empty `slope`, then creates slopeTensor from it and copies the Mat contents over using the stream.
+        ChannelwiseReLUOp(csl::Stream stream_, const Mat& slope)
+            : stream(std::move(stream_))
+        {
+            CV_Assert(!slope.empty());
+            slopeTensor = csl::makeTensorHeader<T>(slope);
+            csl::copyMatToTensor<T>(slope, slopeTensor, stream);
+        }
+
+        // Processes inputs[i] into outputs[i]; asserts equal list lengths and that axis 1 of each input matches slopeTensor.size().
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                CV_Assert(input.get_axis_size(1) == slopeTensor.size());
+                std::size_t inner_size = input.size_range(2, input.rank());
+                kernels::axiswise_relu<T>(stream, output, input, inner_size, slopeTensor);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        csl::Tensor<T> slopeTensor;
+    };
+
+    // CUDA backend node that runs kernels::tanh<T> on every input/output pair it is given.
+    template <class T>
+    class TanHOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; every kernel call in forward() is issued with it
+        TanHOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::tanh<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+    // CUDA backend node that runs kernels::swish<T> on every input/output pair it is given.
+    template <class T>
+    class SwishOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; every kernel call in forward() is issued with it
+        SwishOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::swish<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+    // CUDA backend node that runs kernels::mish<T> on every input/output pair it is given.
+    template <class T>
+    class MishOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; every kernel call in forward() is issued with it
+        MishOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::mish<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+    // CUDA backend node that runs kernels::sigmoid<T> on every input/output pair it is given.
+    template <class T>
+    class SigmoidOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; every kernel call in forward() is issued with it
+        SigmoidOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::sigmoid<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+    // CUDA backend node that runs kernels::elu<T> on every input/output pair it is given.
+    template <class T>
+    class ELUOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; every kernel call in forward() is issued with it
+        ELUOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::elu<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+    // CUDA backend node that runs kernels::abs<T> on every input/output pair it is given.
+    template <class T>
+    class AbsValOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; every kernel call in forward() is issued with it
+        AbsValOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::abs<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+    // CUDA backend node that runs kernels::bnll<T> on every input/output pair it is given.
+    template <class T>
+    class BNLLOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; every kernel call in forward() is issued with it
+        BNLLOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::bnll<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+    // CUDA backend node that runs kernels::power<T>, with fixed parameters, on every input/output pair it is given.
+    template <class T>
+    class PowerOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; exp_, scale_ and shift_ are stored and forwarded, in that order, to kernels::power
+        PowerOp(csl::Stream stream_, T exp_, T scale_, T shift_)
+            : stream(std::move(stream_)), exp{ exp_ }, scale{ scale_ }, shift{ shift_ } { }
+
+        // Processes inputs[i] into outputs[i]; asserts that both lists have equal length. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == outputs.size());
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input = inputs[i].dynamicCast<wrapper_type>()->getView();
+                auto output = outputs[i].dynamicCast<wrapper_type>()->getSpan();
+                kernels::power<T>(stream, output, input, exp, scale, shift);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        const T exp, scale, shift;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ACTIVATION_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/batch_norm.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/batch_norm.hpp
@@ -0,0 +1,58 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/scale_shift.hpp"
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    // CUDA backend node that uploads weights and bias once and applies kernels::scaleN_with_biasN<T> in forward().
+    template <class T>
+    class BatchNormOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; bias is uploaded first, then weights, both via copyMatToTensor on that stream
+        BatchNormOp(csl::Stream stream_, const cv::Mat& weights, const cv::Mat& bias)
+            : stream(std::move(stream_))
+        {
+            biasTensor = csl::makeTensorHeader<T>(bias);
+            csl::copyMatToTensor<T>(bias, biasTensor, stream);
+
+            weightsTensor = csl::makeTensorHeader<T>(weights);
+            csl::copyMatToTensor<T>(weights, weightsTensor, stream);
+        }
+
+        // Requires exactly one input and one output (asserted); `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+            auto src = inputs[0].dynamicCast<wrapper_type>()->getView();
+            auto dst = outputs[0].dynamicCast<wrapper_type>()->getSpan();
+            // NOTE(review): size_range(2, rank()) presumably yields the element count over axes 2 and up - confirm in csl/tensor.hpp
+            std::size_t inner = src.size_range(2, src.rank());
+            kernels::scaleN_with_biasN<T>(stream, dst, src, inner, weightsTensor, biasTensor);
+        }
+
+    private:
+        csl::Stream stream;
+        csl::Tensor<T> weightsTensor, biasTensor;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_BATCH_NORM_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/concat.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/concat.hpp
@@ -0,0 +1,90 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/pointer.hpp"
+
+#include "../kernels/fill.hpp"
+#include "../kernels/concat.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    // CUDA backend node that copies all inputs into a single output along `concat_axis`, optionally zero-filling the output first.
+    template <class T>
+    class ConcatOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        // stream_ is moved into the node; concat_axis and zero_padding are stored as given and select the path taken in forward()
+        ConcatOp(csl::Stream stream_, std::size_t concat_axis, bool zero_padding)
+            : stream(std::move(stream_)), concat_axis{ concat_axis }, zero_padding{ zero_padding }
+        {
+        }
+
+        // Requires exactly one output (asserted); accepts any number of inputs. `workspace` is unused.
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(outputs.size() == 1);
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            if(zero_padding)
+            {
+                auto output_shape = output_wrapper->getShape();
+                // fill the whole output with 0.0 first; the inputs are then written at the offsets computed below
+                kernels::fill<T>(stream, output, 0.0);
+
+                std::size_t output_concat_axis_offset = 0;
+                for (const cv::Ptr<BackendWrapper>& wrapper : inputs)
+                {
+                    auto input_wrapper = wrapper.dynamicCast<wrapper_type>();
+                    auto input = input_wrapper->getView();
+                    auto input_shape = input_wrapper->getShape();
+
+                    // each axis gets half the output/input size difference; the concat axis is then overwritten with the running offset
+                    std::vector<std::size_t> offsets(input_shape.size());
+                    for (std::size_t j = 0; j < offsets.size(); j++)
+                        offsets[j] = (output_shape[j] - input_shape[j]) / 2;
+                    offsets[concat_axis] = output_concat_axis_offset;
+                    kernels::concat_with_offsets(stream, output, input, offsets);
+                    output_concat_axis_offset += input.get_axis_size(concat_axis);
+                }
+            }
+            else
+            {
+                // each input is handed to kernels::concat with the summed concat-axis sizes of the inputs before it
+                std::size_t output_axis_offset = 0;
+                for (const cv::Ptr<BackendWrapper>& wrapper : inputs)
+                {
+                    auto input = wrapper.dynamicCast<wrapper_type>()->getView();
+                    kernels::concat(stream, output, output_axis_offset, input, concat_axis);
+                    output_axis_offset += input.get_axis_size(concat_axis);
+                }
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        std::size_t concat_axis;
+        bool zero_padding;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONCAT_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/const.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/const.hpp
@@ -0,0 +1,51 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    template <class T>
+    class ConstOp final : public CUDABackendNode { /* backend node that outputs a fixed tensor captured at construction */
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+        /* creates `constTensor` from `data` and copies `data` into it via csl::copyMatToTensor, passing `stream` */
+        ConstOp(csl::Stream stream_, const cv::Mat& data)
+            : stream(std::move(stream_))
+        {
+            constTensor = csl::makeTensorHeader<T>(data);
+            csl::copyMatToTensor<T>(data, constTensor, stream);
+        }
+        /* expects no inputs and exactly one output; copies `constTensor` into that output (`workspace` is not used) */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(outputs.size() == 1 && inputs.size() == 0);
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+            csl::tensor_ops::copy<T>(stream, output, constTensor);
+        }
+
+    private:
+        csl::Stream stream; /* passed to the upload in the constructor and to the copy in forward */
+        csl::Tensor<T> constTensor; /* tensor filled from the constructor's `data` argument */
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONST_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/convolution.hpp
@@ -0,0 +1,333 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+#include "../kernels/scale_shift.hpp"
+#include "../kernels/activations.hpp"
+#include "../kernels/bias_activation.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    struct ConvolutionConfiguration {
+        /* `dilations` and `strides` must have as many entries as `kernel_size` (ConvolutionOp asserts this) */
+        std::vector<std::size_t> kernel_size;
+        std::vector<std::size_t> dilations, strides;
+
+        enum class PaddingMode {
+            MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
+            VALID, /* no padding is added */
+            SAME /* TensorFlow logic is used for same padding */
+        };
+
+        /* explicit paddings are used if and only if padMode is set to manual */
+        PaddingMode padMode;
+        std::vector<std::size_t> pads_begin, pads_end; /* in MANUAL mode, one entry per `kernel_size` entry */
+
+        /* full shape inclusive of channel and batch axis; ConvolutionOp reads index 1 as the feature map count */
+        std::vector<std::size_t> input_shape;
+        std::vector<std::size_t> output_shape;
+
+        /* group count for grouped convolution */
+        std::size_t groups;
+
+        enum class ActivationType {
+            IDENTITY,
+            RELU, /* uses value provided in `relu_negative_slope` */
+            CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
+            POWER, /* scale and shift fused beforehand (fuseWeights); only `power_exp` is handled by CUDA */
+            TANH,
+            SIGMOID,
+            SWISH,
+            MISH
+        };
+
+        ActivationType activation_type; /* activation applied by ConvolutionOp after the convolution (and bias, if any) */
+        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp; /* each is read only for the activation type that names it above */
+    };
+
+    template <class T>
+    class ConvolutionOp final : public CUDABackendNode { /* convolution via csl::Convolution, then optional bias addition and activation */
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+        /* validates `config`, uploads `filters`/`bias`, splits the padding into a symmetric part plus per-side extras, and sizes the workspace */
+        ConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const ConvolutionConfiguration& config, const Mat& filters, const Mat& bias)
+            : stream(std::move(stream_)), cudnnHandle(std::move(handle))
+        {
+            const auto& kernel_size = config.kernel_size;
+            const auto& dilations = config.dilations;
+            const auto& strides = config.strides;
+
+            const auto convolution_order = kernel_size.size();
+            CV_Assert(convolution_order > 1);
+
+            CV_Assert(convolution_order == dilations.size());
+            CV_Assert(convolution_order == strides.size());
+
+            const auto& input_shape = config.input_shape;
+            const auto& output_shape = config.output_shape;
+            CV_Assert(input_shape.size() == output_shape.size());
+            CV_Assert(input_shape.size() == convolution_order + 2);
+
+            const auto groups = config.groups;
+
+            if (convolution_order > 3)
+                CV_Error(Error::StsNotImplemented, "Only 2D/3D convolution is supported.");
+
+            const auto rank = input_shape.size();
+            const auto output_feature_maps = output_shape[1];
+            const auto input_feature_maps = input_shape[1];
+            CV_Assert(groups > 0 && input_feature_maps % groups == 0); /* checked before the division below so a zero group count is rejected */
+            const auto input_feature_maps_per_group = input_feature_maps / groups;
+
+            filtersTensor = csl::makeTensorHeader<T>(filters);
+            csl::copyMatToTensor<T>(filters, filtersTensor, stream);
+            /* the bias is optional; `biasTensor` stays empty when no bias is given and forward() checks for that */
+            if (!bias.empty())
+            {
+                biasTensor = csl::makeTensorHeader<T>(bias);
+                csl::copyMatToTensor<T>(bias, biasTensor, stream);
+            }
+
+            /* left and right are misleading as the padding is applicable for any number of dimensions
+             * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
+             *
+             * `common_padding` contains the amount of padding that has to be added to both sides
+             * `padding_left` and `padding_right` contains the amount of padding that needs to be added
+             * to a particular side in addition to the common padding
+             */
+            std::vector<std::size_t> common_padding(rank, 0);
+            std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
+            if (config.padMode == ConvolutionConfiguration::PaddingMode::MANUAL)
+            {
+                const auto& pads_begin = config.pads_begin;
+                const auto& pads_end = config.pads_end;
+
+                CV_Assert(convolution_order == pads_begin.size());
+                CV_Assert(convolution_order == pads_end.size());
+                /* axes 0 and 1 are never padded; padding entry `i - 2` belongs to tensor axis `i` */
+                for (std::size_t i = 2; i < common_padding.size(); i++)
+                {
+                    common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
+                    padding_left[i] = pads_begin[i - 2] - common_padding[i];
+                    padding_right[i] = pads_end[i - 2] - common_padding[i];
+                }
+            }
+            else if (config.padMode == ConvolutionConfiguration::PaddingMode::VALID)
+            {
+                /* nothing to do as the paddings are already preset to zero */
+            }
+            else if (config.padMode == ConvolutionConfiguration::PaddingMode::SAME)
+            {
+                /* TensorFlow Logic:
+                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
+                 *
+                 * if total padding is odd, the extra is added towards the end
+                 */
+                for (std::size_t i = 2; i < rank; i++)
+                {
+                    const auto j = i - 2; /* filter index */
+                    const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
+                    const auto required_total_padding =
+                        std::max<std::int64_t>(0, (output_shape[i] - 1) * strides[j] + effective_kernel_size - input_shape[i]);
+
+                    common_padding[i] = required_total_padding / 2;
+                    padding_left[i] = 0;
+                    padding_right[i] = required_total_padding % 2;
+                }
+            }
+
+            /* in some scenarios, the extra padding at the end may not change the output at all */
+            for (std::size_t i = 2; i < rank; i++) {
+                const auto j = i - 2; /* filter idx */
+                const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
+                const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
+                std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];
+
+                /* the output shape doesn't change if we decrease the total padding by at most `rem`
+                 * provided that we decrease from the right
+                 */
+                if (rem && padding_right[i] > 0)
+                    padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
+            }
+
+            auto is_not_zero = [](std::size_t i) { return i != 0; };
+            if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
+               std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
+            {
+                /* csl::Convolution supports symmetric padding only; hence, we deal with asymmetric padding by
+                 * copying the input to a bigger tensor and padding the ends manually
+                 */
+                transformed_shape = input_shape;
+                for (std::size_t i = 0; i < rank; i++)
+                    transformed_shape[i] += padding_left[i] + padding_right[i];
+
+                inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
+            }
+
+            typename csl::Convolution<T>::params_type params;
+            if (transformed_shape.empty())
+            {
+                params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+            }
+            else
+            {
+                /* the convolution operation will be seeing the transformed input */
+                params.input_shape.assign(std::begin(transformed_shape), std::end(transformed_shape));
+            }
+            /* filter shape: (output feature maps, input feature maps per group, kernel extents...) */
+            auto& fshape = params.filter_shape;
+            fshape.resize(rank);
+            fshape[0] = output_feature_maps;
+            fshape[1] = input_feature_maps_per_group;
+            std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
+            CV_Assert(fshape.size() == kernel_size.size() + 2);
+            /* only the symmetric part of the padding is handed to csl::Convolution */
+            params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
+            params.stride = strides;
+            params.dilation = dilations;
+            params.groups = config.groups;
+
+            convoluter = csl::Convolution<T>(cudnnHandle, params);
+
+            activation = config.activation_type;
+            relu_negative_slope = config.relu_negative_slope;
+            crelu_floor = config.crelu_floor;
+            crelu_ceil = config.crelu_ceil;
+            power_exp = config.power_exp;
+            /* a power of exactly 1 leaves values unchanged, so it is handled as IDENTITY */
+            if (activation == ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f)
+                activation = ConvolutionConfiguration::ActivationType::IDENTITY;
+
+            csl::WorkspaceBuilder builder;
+            if (!transformed_shape.empty()) { /* room for the padded copy of the input that forward() builds */
+                auto& shape = transformed_shape;
+                auto sz = std::accumulate(std::begin(shape), std::end(shape), std::size_t{1}, std::multiplies<std::size_t>()); /* size_t seed keeps the running product out of int */
+                builder.require<T>(sz);
+            }
+            builder.require(convoluter.get_workspace_size());
+            scratch_mem_in_bytes = builder.required_workspace_size();
+        }
+        /* expects one input and one output; pads the input into workspace memory if required, convolves, then applies bias and/or activation on the output */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+            csl::WorkspaceAllocator allocator(workspace);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            if (!transformed_shape.empty())
+            {
+                auto& shape = transformed_shape;
+                auto transformed_input = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
+                inputTransformer.transform(input, transformed_input);
+                input = transformed_input;
+            }
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            convoluter.convolve(output, input, filtersTensor, allocator.get_instance());
+            if (!biasTensor.empty())
+            {
+                std::size_t inner_size = output.size_range(2, output.rank());
+                switch(activation)
+                {
+                    case ConvolutionConfiguration::ActivationType::IDENTITY:
+                        kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::RELU:
+                        kernels::biasN_relu_inplace<T>(stream, output, inner_size, biasTensor, relu_negative_slope);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
+                        kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::POWER:
+                        kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::TANH:
+                        kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::SIGMOID:
+                        kernels::biasN_sigmoid_inplace<T>(stream, output, inner_size, biasTensor);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::SWISH:
+                        kernels::biasN_swish_inplace<T>(stream, output, inner_size, biasTensor);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::MISH:
+                        kernels::biasN_mish_inplace<T>(stream, output, inner_size, biasTensor);
+                        break;
+                }
+            }
+            else
+            {
+                switch(activation)
+                {
+                    case ConvolutionConfiguration::ActivationType::IDENTITY:
+                        break;
+                    case ConvolutionConfiguration::ActivationType::RELU:
+                        kernels::relu<T>(stream, output, output, relu_negative_slope);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
+                        kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::POWER:
+                        kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::TANH:
+                        kernels::tanh<T>(stream, output, output);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::SIGMOID:
+                        kernels::sigmoid<T>(stream, output, output);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::SWISH:
+                        kernels::swish<T>(stream, output, output);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::MISH:
+                        kernels::mish<T>(stream, output, output);
+                        break;
+                }
+            }
+        }
+        /* returns the workspace size computed once in the constructor */
+        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+    private:
+        csl::Stream stream;
+        csl::cudnn::Handle cudnnHandle;
+        csl::Tensor<T> filtersTensor, biasTensor; /* `biasTensor` is left empty when the constructor receives an empty bias */
+        csl::Convolution<T> convoluter;
+
+        std::vector<std::size_t> transformed_shape; /* empty unless asymmetric padding requires a padded copy of the input */
+        csl::TensorTransform<T> inputTransformer; /* assigned only when `transformed_shape` is non-empty */
+
+        std::size_t scratch_mem_in_bytes;
+
+        ConvolutionConfiguration::ActivationType activation;
+        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CONVOLUTION_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/crop_and_resize.hpp
@@ -0,0 +1,51 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+
+#include "../kernels/crop_and_resize.hpp"
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    template <class T>
+    class CropAndResizeOp final : public CUDABackendNode { /* backend node that forwards its two inputs to kernels::crop_and_resize */
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+        /* stores the stream that forward() passes to the kernel */
+        CropAndResizeOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+        /* expects inputs[0] = source tensor, inputs[1] = boxes and a single output (`workspace` is not used) */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 2 && outputs.size() == 1);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            auto box_wrapper = inputs[1].dynamicCast<wrapper_type>();
+            auto boxes = box_wrapper->getView();
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+            /* NOTE(review): the box layout is defined by kernels::crop_and_resize and is not visible here */
+            kernels::crop_and_resize(stream, output, input, static_cast<csl::View<T>>(boxes));
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_CROP_AND_RESIZE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/eltwise.hpp
@@ -0,0 +1,118 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/eltwise_ops.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    enum class EltwiseOpType { /* selects the kernel family used by EltwiseOp::forward */
+        MAX, /* dispatched to kernels::eltwise_max_2 */
+        SUM, /* dispatched to kernels::eltwise_sum_2 / eltwise_sum_coeff_2; the only type that accepts coefficients */
+        PRODUCT, /* dispatched to kernels::eltwise_prod_2 */
+        DIV /* dispatched to kernels::eltwise_div_2 */
+    };
+
+    template <class T>
+    class EltwiseOp final : public CUDABackendNode { /* combines two or more inputs into one output with the configured element-wise operation */
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+        /* `coeffs_` may be empty; its elements are converted to T on construction */
+        template <class V>
+        EltwiseOp(csl::Stream stream_, EltwiseOpType op_, std::vector<V> coeffs_)
+            : stream(std::move(stream_)), op{ op_ }, coeffs(std::begin(coeffs_), std::end(coeffs_))
+        {
+        }
+        /* needs at least two inputs and one output; coefficients are allowed for SUM only and must match the input count */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() >= 2);
+            CV_Assert(outputs.size() == 1);
+
+            CV_Assert(coeffs.size() == 0 || op == EltwiseOpType::SUM);
+            CV_Assert(coeffs.size() == 0 || inputs.size() == coeffs.size());
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+            /* two inputs: a single kernel call writes the result straight into the output */
+            if (inputs.size() == 2)
+            {
+                auto input_wrapper_x = inputs[0].dynamicCast<wrapper_type>();
+                auto input_x = input_wrapper_x->getView();
+
+                auto input_wrapper_y = inputs[1].dynamicCast<wrapper_type>();
+                auto input_y = input_wrapper_y->getView();
+
+                switch (op)
+                {
+                case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, input_x, input_y); break;
+                case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, input_x, input_y); break;
+                case EltwiseOpType::DIV: kernels::eltwise_div_2<T>(stream, output, input_x, input_y); break;
+                case EltwiseOpType::SUM:
+                    if (coeffs.empty() || (coeffs[0] == 1 && coeffs[1] == 1))
+                        kernels::eltwise_sum_2<T>(stream, output, input_x, input_y);
+                    else
+                        kernels::eltwise_sum_coeff_2<T>(stream, output, coeffs[0], input_x, coeffs[1], input_y);
+                    break;
+                }
+            }
+            else
+            {
+                auto input_wrapper_0 = inputs[0].dynamicCast<wrapper_type>();
+                auto input_0 = input_wrapper_0->getView();
+
+                /* we first make a copy and then apply EltwiseOp cumulatively */
+                csl::tensor_ops::copy(stream, output, input_0);
+
+                for (std::size_t i = 1; i < inputs.size(); i++)
+                {
+                    auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+                    auto input = input_wrapper->getView();
+
+                    switch (op)
+                    {
+                    case EltwiseOpType::MAX: kernels::eltwise_max_2<T>(stream, output, output, input); break;
+                    case EltwiseOpType::PRODUCT: kernels::eltwise_prod_2<T>(stream, output, output, input); break;
+                    case EltwiseOpType::DIV: kernels::eltwise_div_2<T>(stream, output, output, input); break;
+                    case EltwiseOpType::SUM:
+                        if (coeffs.empty() || (coeffs[i] == 1 && (i != 1 || coeffs[0] == 1)))
+                            kernels::eltwise_sum_2<T>(stream, output, output, input);
+                        else
+                        {
+                            /* on the first step `output` still holds the unscaled first input, so coeffs[0] must be applied here even when coeffs[1] is 1 */
+                            auto coeff_x = (i == 1) ? coeffs[0] : static_cast<T>(1.0);
+                            kernels::eltwise_sum_coeff_2<T>(stream, output, coeff_x, output, coeffs[i], input);
+                        }
+                        break;
+                    }
+                }
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        EltwiseOpType op;
+        std::vector<T> coeffs; /* empty, or one weight per input (SUM only) */
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ELTWISE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/inner_product.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/inner_product.hpp
@@ -0,0 +1,92 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/cublas.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    template <class T>
+    class InnerProductOp final : public CUDABackendNode { /* multiplies each input, viewed as a matrix, by the transposed weights and optionally adds a bias */
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+        /* uploads `weights` (effective rank must be 2) and, if non-empty, `bias` (one value per weight row) */
+        InnerProductOp(csl::Stream stream_, csl::cublas::Handle handle, std::size_t axis, const Mat& weights, const Mat& bias)
+            : stream(std::move(stream_)), cublasHandle(std::move(handle)), axis{ axis }
+        {
+            weightsTensor = csl::makeTensorHeader<T>(weights);
+            CV_Assert(get_effective_rank(weightsTensor) == 2);
+            csl::copyMatToTensor<T>(weights, weightsTensor, stream);
+
+            if (!bias.empty())
+            {
+                biasTensor = csl::makeTensorHeader<T>(bias);
+                csl::copyMatToTensor<T>(bias, biasTensor, stream);
+                CV_Assert(weightsTensor.get_axis_size(-2) == biasTensor.size());
+            }
+        }
+        /* processes inputs[i] into outputs[i]; axes before `axis` are flattened into the batch dimension (`workspace` is not used) */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(outputs.size() >= inputs.size()); /* outputs[i] is accessed for every input below */
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+                auto input = input_wrapper->getView();
+                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+                auto output = output_wrapper->getSpan();
+
+                std::size_t batch_size = input.size_range(0, axis);
+
+                auto input_size = input.size() / batch_size;
+                CV_Assert(input_size == weightsTensor.get_axis_size(-1));
+
+                auto output_size = output.size() / batch_size;
+                CV_Assert(output_size == weightsTensor.get_axis_size(-2));
+
+                /* we treat the input and output as a matrix with dimensions (batch_size, input_size)
+                 * and (batch_size, output_size) respectively
+                 *
+                 * weight matrix dimensions: (output_size, input_size)
+                 *
+                 * I(W^T) = O
+                 * (batch_size, input_size) * (input_size, output_size) = (batch_size, output_size)
+                 */
+                input.reshape(batch_size, input_size);
+                output.reshape(batch_size, output_size);
+                csl::tensor_ops::gemm<T>(cublasHandle, 0.0, output, 1.0, false, input, true, weightsTensor);
+
+                if (!biasTensor.empty())
+                    kernels::biasN<T>(stream, output, output, 1, biasTensor);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        csl::cublas::Handle cublasHandle;
+        csl::Tensor<T> weightsTensor, biasTensor; /* `biasTensor` stays empty when no bias is supplied */
+        std::size_t axis; /* first input axis that is folded into the per-sample vector */
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_INNER_PRODUCT_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/lrn.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/lrn.hpp
@@ -0,0 +1,75 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <cstddef>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    enum class LRNType { /* normalization region; LRNOp maps each value onto the csl::LRN type of the same name */
+        ACROSS_CHANNELS,
+        WITHIN_CHANNEL /* LRNOp reserves extra workspace for this mode */
+    };
+
+    template <class T>
+    class LRNOp final : public CUDABackendNode { /* local response normalization node that delegates to csl::LRN */
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+        /* creates the csl::LRN object and, for WITHIN_CHANNEL, reserves two T buffers of `largestInputSize` elements */
+        LRNOp(csl::cudnn::Handle handle, LRNType type_, std::size_t local_size, T alpha, T beta, T bias, std::size_t largestInputSize)
+            : scratch_mem_in_bytes { 0 }
+        {
+            typename csl::LRN<T>::LRNType type{};
+            switch (type_) {
+            case LRNType::ACROSS_CHANNELS: type = csl::LRN<T>::LRNType::ACROSS_CHANNELS; break;
+            case LRNType::WITHIN_CHANNEL: type = csl::LRN<T>::LRNType::WITHIN_CHANNEL; break;
+            }
+            lrn = csl::LRN<T>(std::move(handle), local_size, alpha, beta, bias, type);
+
+            csl::WorkspaceBuilder builder;
+            if (type_ == LRNType::WITHIN_CHANNEL) {
+                /* this is not a bug; we require two of these */
+                builder.require<T>(largestInputSize);
+                builder.require<T>(largestInputSize);
+            }
+
+            scratch_mem_in_bytes = builder.required_workspace_size();
+        }
+        /* normalizes inputs[i] into outputs[i]; a new allocator over `workspace` is created for every input */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(outputs.size() >= inputs.size()); /* outputs[i] is accessed for every input below */
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+                auto input = input_wrapper->getView();
+                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+                auto output = output_wrapper->getSpan();
+
+                csl::WorkspaceAllocator allocator(workspace);
+                lrn.normalize(input, output, allocator.get_instance());
+            }
+        }
+        /* returns the workspace size computed in the constructor (zero for ACROSS_CHANNELS) */
+        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+    private:
+        csl::LRN<T> lrn;
+        std::size_t scratch_mem_in_bytes;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_LRN_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/max_unpooling.hpp
@@ -0,0 +1,182 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+
+#include "../kernels/max_unpooling.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* Parameters consumed by MaxPoolingOp (max-pooling that also produces the indices of the maxima). */
+    struct MaxPoolingConfiguration {
+        /* the size of the following vectors must be equal to the pooling order */
+        std::vector<std::size_t> window_size;
+        std::vector<std::size_t> strides;
+
+        enum class PaddingMode {
+            MANUAL, /* uses explicit padding values provided in `pads_begin` */
+            VALID, /* no padding is added */
+            SAME /* TensorFlow logic is used for same padding */
+        };
+
+        PaddingMode padMode;
+
+        /* explicit paddings are used if and only if padMode is set to manual
+         * (one entry per pooled axis; only the leading padding is stored in this configuration)
+         */
+        std::vector<std::size_t> pads_begin;
+
+        /* full shape inclusive of channel and batch axis; read only when padMode is SAME */
+        std::vector<std::size_t> input_shape;
+    };
+
+    /* Max-pooling primitive which produces two outputs: the pooled values and the indices of the
+     * selected maxima (the indices are what MaxUnpoolingOp consumes).
+     *
+     * Only 2D and 3D pooling are supported. Only the leading padding of each axis is stored;
+     * it is either taken from the configuration (MANUAL), zero (VALID) or derived from the
+     * input shape using the TensorFlow rule (SAME).
+     */
+    template <class T>
+    class MaxPoolingOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* Validates the configuration and precomputes the leading padding for every pooled axis.
+         *
+         * Raises through CV_Assert on inconsistent vector sizes and through CV_Error when the
+         * pooling order is neither 2 nor 3.
+         */
+        MaxPoolingOp(csl::Stream stream_, const MaxPoolingConfiguration& config)
+            : stream(std::move(stream_))
+        {
+            window_size = config.window_size;
+
+            const auto pooling_order = window_size.size();
+            CV_Assert(pooling_order >= 1);
+
+            strides = config.strides;
+            CV_Assert(pooling_order == strides.size());
+
+            if (pooling_order != 2 && pooling_order != 3)
+                CV_Error(Error::StsNotImplemented, "Only 2D/3D max-pooling are supported.");
+
+            /* value-initializes every entry to zero; VALID padding relies on this */
+            padding_left.resize(pooling_order);
+            if (config.padMode == MaxPoolingConfiguration::PaddingMode::MANUAL)
+            {
+                const auto& pads_begin = config.pads_begin;
+                CV_Assert(pooling_order == pads_begin.size());
+
+                padding_left.assign(std::begin(pads_begin), std::end(pads_begin));
+            }
+            else if (config.padMode == MaxPoolingConfiguration::PaddingMode::VALID)
+            {
+                /* nothing to do as the paddings are already preset to zero */
+            }
+            else if (config.padMode == MaxPoolingConfiguration::PaddingMode::SAME)
+            {
+                /* TensorFlow Logic:
+                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
+                 *
+                 * if total padding is odd, the extra is added towards the end
+                 */
+                const auto& input_shape = config.input_shape;
+                CV_Assert(input_shape.size() == pooling_order + 2);
+
+                for (std::size_t i = 0; i < pooling_order; i++)
+                {
+                    /* the total padding can be negative before clamping; the arithmetic is carried
+                     * out in a signed 64-bit type so that the clamp to zero does not depend on
+                     * unsigned wrap-around and the width of std::size_t
+                     */
+                    const auto input_dim = static_cast<std::int64_t>(input_shape[i + 2]);
+                    const auto stride = static_cast<std::int64_t>(strides[i]);
+                    const auto window = static_cast<std::int64_t>(window_size[i]);
+
+                    const auto output_dim = (input_dim - 1 + stride) / stride;
+                    const auto required_total_padding =
+                        std::max<std::int64_t>(0, (output_dim - 1) * stride + window - input_dim);
+
+                    padding_left[i] = required_total_padding / 2;
+                }
+            }
+        }
+
+        /* Expects exactly one input and two outputs: outputs[0] receives the pooled values and
+         * outputs[1] receives the indices. `workspace` is not used.
+         */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 2);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input_data = input_wrapper->getView();
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output_data = output_wrapper->getSpan();
+
+            auto indices_wrapper = outputs[1].dynamicCast<wrapper_type>();
+            auto output_indices = indices_wrapper->getSpan();
+
+            kernels::max_pooling_with_indices<T>(
+                stream, output_data, output_indices, input_data, window_size, strides, padding_left
+            );
+        }
+
+    private:
+        csl::Stream stream;
+
+        /* one entry per pooled axis */
+        std::vector<std::size_t> window_size, strides, padding_left;
+    };
+
+    /* Parameters consumed by MaxUnpoolingOp. */
+    struct MaxUnpoolingConfiguration {
+        /* the size of the following vectors must be equal to the unpooling order */
+        std::vector<std::size_t> window_size;
+        std::vector<std::size_t> strides;
+        std::vector<std::size_t> pads_begin; /* leading padding for each unpooled axis */
+    };
+
+    /* Max-unpooling primitive: scatters the values of inputs[0] into each output using the
+     * indices supplied in inputs[1]. Only 2D and 3D unpooling are supported.
+     */
+    template <class T>
+    class MaxUnpoolingOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* Copies the window, stride and leading padding vectors out of `config` and checks that
+         * they all describe the same number of axes.
+         */
+        MaxUnpoolingOp(csl::Stream stream_, const MaxUnpoolingConfiguration& config)
+            : stream(std::move(stream_))
+        {
+            window_size = config.window_size;
+            strides = config.strides;
+            padding_left = config.pads_begin;
+
+            const auto order = window_size.size();
+            CV_Assert(order >= 1);
+            CV_Assert(strides.size() == order);
+            CV_Assert(padding_left.size() == order);
+
+            const bool supported_order = (order == 2 || order == 3);
+            if (!supported_order)
+                CV_Error(Error::StsNotImplemented, "Only 2D/3D max-unpooling are supported.");
+        }
+
+        /* inputs[0] holds the pooled values and inputs[1] the indices; every output is filled
+         * from that same pair. `workspace` is not used.
+         */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            /* sometimes a third input is passed to provide the output shape; we won't need it */
+            CV_Assert(inputs.size() == 2 || inputs.size() == 3);
+            CV_Assert(outputs.size() >= 1);
+
+            for (std::size_t out_idx = 0; out_idx < outputs.size(); ++out_idx)
+            {
+                auto values_wrapper = inputs[0].dynamicCast<wrapper_type>();
+                auto values = values_wrapper->getView();
+
+                auto indices_wrapper = inputs[1].dynamicCast<wrapper_type>();
+                auto indices = indices_wrapper->getView();
+
+                auto result_wrapper = outputs[out_idx].dynamicCast<wrapper_type>();
+                auto result = result_wrapper->getSpan();
+
+                kernels::max_unpooling<T>(stream, result, values, indices, window_size, strides, padding_left);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+
+        /* one entry per unpooled axis */
+        std::vector<std::size_t> window_size, strides, padding_left;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_MAX_UNPOOLING_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/normalize_bbox.hpp
@@ -0,0 +1,142 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/workspace.hpp"
+
+#include "../kernels/scale_shift.hpp"
+#include "../kernels/normalize.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* Parameters consumed by NormalizeOp; T is the type used for the epsilon value. */
+    template <class T>
+    struct NormalizeConfiguration {
+        std::vector<std::size_t> input_shape;
+
+        /* axis range across which values are normalized
+         *
+         * [0, axis_start) = outer range
+         * [axis_start, axis_end) = mid range
+         * [axis_end, rank) = inner range
+         *
+         * for each location in the outer and inner range, all the values in the mid range are
+         * normalized together
+         */
+        std::size_t axis_start, axis_end;
+
+        /* 1 for L1 norm, 2 for L2 norm */
+        std::size_t norm;
+
+        /* epsilon to use to avoid division by zero */
+        T eps;
+    };
+
+    /* Normalizes the input over the axis range [axis_start, axis_end) and then optionally scales
+     * the normalized values, either by a single scalar weight or by a tensor of weights
+     * (one weight per position in the normalized axis range).
+     *
+     * One scratch element of type T is requested for every (outer, inner) location.
+     */
+    template <class T>
+    class NormalizeOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* `weights` may be empty (no scaling), hold exactly one CV_32F value (scalar scaling) or
+         * hold several values, in which case they are uploaded to `weightsTensor` using `stream`.
+         */
+        template <class V>
+        NormalizeOp(csl::Stream stream_, const Mat& weights, const NormalizeConfiguration<V>& config)
+            : stream(std::move(stream_)), weight{ 1.0 }
+        {
+            norm_order = config.norm;
+            epsilon = config.eps;
+            axis_start = config.axis_start;
+            axis_end = config.axis_end;
+
+            if (!weights.empty())
+            {
+                if (weights.total() == 1)
+                {
+                    CV_Assert(weights.type() == CV_32F);
+                    weight = weights.at<float>(0, 0);
+                }
+                else
+                {
+                    weightsTensor = csl::makeTensorHeader<T>(weights);
+                    csl::copyMatToTensor<T>(weights, weightsTensor, stream);
+                }
+            }
+
+            std::size_t outer_size = 1;
+            for (std::size_t i = 0; i < axis_start; i++)
+                outer_size *= config.input_shape[i];
+
+            std::size_t inner_size = 1;
+            for (std::size_t i = axis_end; i < config.input_shape.size(); i++)
+                inner_size *= config.input_shape[i];
+
+            /* one scratch element per (outer, inner) location */
+            csl::WorkspaceBuilder builder;
+            builder.require<T>(outer_size * inner_size);
+            scratch_mem_in_bytes = builder.required_workspace_size();
+        }
+
+        /* Expects exactly one input and one output. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            std::size_t outer_size = input.size_range(0, axis_start);
+            std::size_t mid_size = input.size_range(axis_start, axis_end);
+            std::size_t inner_size = input.size_range(axis_end, input.rank());
+
+            auto ws_allocator = csl::WorkspaceAllocator(workspace);
+            auto scratch = ws_allocator.get_span<T>();
+            kernels::normalize<T>(stream, output, input, outer_size, mid_size, inner_size, norm_order, epsilon, scratch);
+
+            /* The scaling step has to consume the normalized values, which now live in `output`.
+             * Reading from `input` here would overwrite the normalized result with a scaled copy
+             * of the raw input, so the scaling is applied to `output` in place.
+             *
+             * NOTE(review): this relies on the scale kernels following the same
+             * (stream, output, input, ...) argument convention as kernels::normalize and on
+             * them being element-wise (safe when output and input alias) - confirm.
+             */
+            const csl::TensorView<T> normalized(output);
+
+            /* there might be a single weight in which case `weight` will be not equal to 1.0
+             * or there might be several weights
+             * or we don't have to scale
+             */
+            if (weight != 1.0)
+            {
+                kernels::scale1<T>(stream, output, normalized, weight);
+            }
+            else if (!weightsTensor.empty())
+            {
+                CV_Assert(weightsTensor.size() != 1); /* constructor should have set up to use `weight` */
+                CV_Assert(weightsTensor.size() == mid_size);
+                kernels::scaleN<T>(stream, output, normalized, inner_size, weightsTensor);
+            }
+        }
+
+        /* Returns the scratch memory requirement (in bytes) computed by the constructor. */
+        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+    private:
+        csl::Stream stream;
+        csl::Tensor<T> weightsTensor;
+        T weight; /* if there is only one weight, we use this */
+
+        T epsilon;
+        std::size_t norm_order;
+        std::size_t axis_start, axis_end;
+
+        std::size_t scratch_mem_in_bytes;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_NORMALIZE_BBOX_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/padding.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/padding.hpp
@@ -0,0 +1,113 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/fill.hpp"
+#include "../kernels/concat.hpp"
+#include "../kernels/padding.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <algorithm>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* Selects how PaddingOp produces the values outside the copied input region. */
+    enum class PaddingType {
+        CONSTANT, /* the output is filled with a constant value before the input is copied in */
+        REFLECTION101 /* handled by kernels::copy_with_reflection101 (presumably mirror padding without repeating the border element - TODO confirm) */
+    };
+
+    /* Pads a tensor by copying the input into a sub-region of a larger output.
+     *
+     * With PaddingType::CONSTANT the output is first filled with `value`; with
+     * PaddingType::REFLECTION101 the surrounding values are produced by
+     * kernels::copy_with_reflection101.
+     */
+    template <class T>
+    class PaddingOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* `ranges` is indexed by axis and contains the range in the output where the input is copied to
+         *
+         * when fewer ranges than tensor axes are given, the ranges describe the trailing axes
+         */
+        PaddingOp(csl::Stream stream_, PaddingType type_, T value_, std::vector<cv::Range> ranges)
+            : stream(std::move(stream_)),  type{ type_ }, value{ value_ }, dstRanges(std::move(ranges))
+        {
+        }
+
+        /* Expects exactly one input and one output. `workspace` is not used. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            auto effective_rank = get_effective_rank(input);
+            CV_Assert(get_effective_rank(input) == get_effective_rank(output));
+
+            /* suppose we require padding for the first spatial axis (H in NCHW or D in NCDHW)
+             *
+             * there could be a case where the batch axis, channel axis, and the first spatial axis are all one
+             * this would result in effective rank being less than the number of axes requiring padding
+             */
+            effective_rank = std::max(effective_rank, dstRanges.size());
+
+            /* number of leading axes which have no entry in `dstRanges`;
+             * tensor axis `axis` corresponds to `dstRanges[axis - delta]`
+             */
+            const int delta = static_cast<int>(effective_rank - dstRanges.size());
+            const int last_axis = static_cast<int>(effective_rank);
+
+            for (int axis = delta; axis < last_axis; axis++)
+            {
+                /* `dstRanges` must be indexed relative to `delta`; indexing it with the tensor
+                 * axis would read past its end whenever delta is not zero
+                 */
+                const auto& range = dstRanges[axis - delta];
+                if (range == Range::all())
+                    CV_Assert(input.get_axis_size(axis) == output.get_axis_size(axis));
+                else
+                    CV_Assert(input.get_axis_size(axis) == range.size());
+            }
+
+            if (type == PaddingType::CONSTANT)
+            {
+                kernels::fill<T>(stream, output, value);
+
+                /* axes without an explicit range keep a zero offset */
+                std::vector<std::size_t> offsets(effective_rank, 0);
+                for (int i = 0; i < static_cast<int>(dstRanges.size()); i++)
+                {
+                    if (dstRanges[i] != Range::all())
+                        offsets[delta + i] = dstRanges[i].start;
+                }
+
+                kernels::concat_with_offsets<T>(stream, output, input, offsets);
+            }
+            else if (type == PaddingType::REFLECTION101)
+            {
+                std::vector<std::pair<std::size_t, std::size_t>> ranges(effective_rank);
+                for (int axis = 0; axis < last_axis; axis++)
+                {
+                    if (axis < delta || dstRanges[axis - delta] == Range::all())
+                        ranges[axis] = { 0, input.get_axis_size(axis) };
+                    else
+                        ranges[axis] = { dstRanges[axis - delta].start, dstRanges[axis - delta].end };
+                }
+
+                kernels::copy_with_reflection101<T>(stream, output, input, ranges);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        PaddingType type;
+        T value; /* fill value; used only for PaddingType::CONSTANT */
+
+        std::vector<cv::Range> dstRanges;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PADDING_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/permute.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/permute.hpp
@@ -0,0 +1,70 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/permute.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* Reorders the axes of every input according to `order` and writes the result to the output
+     * with the same index. An identity order degenerates into a plain copy, which is skipped
+     * when the input and the output already refer to the same memory.
+     */
+    template <class T>
+    class PermuteOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        PermuteOp(csl::Stream stream_, std::vector<std::size_t> order_)
+            : stream(std::move(stream_)), order(std::move(order_)) { }
+
+        /* `workspace` is not used. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            /* `order` does not change between inputs, so the identity check is done once */
+            bool is_identity = true;
+            for (std::size_t axis = 0; axis < order.size(); ++axis)
+            {
+                if (order[axis] != axis)
+                {
+                    is_identity = false;
+                    break;
+                }
+            }
+
+            for (std::size_t idx = 0; idx < inputs.size(); ++idx)
+            {
+                auto src_wrapper = inputs[idx].dynamicCast<wrapper_type>();
+                auto src = src_wrapper->getView();
+
+                auto dst_wrapper = outputs[idx].dynamicCast<wrapper_type>();
+                auto dst = dst_wrapper->getSpan();
+
+                if (!is_identity)
+                {
+                    kernels::permute(stream, dst, src, order);
+                    continue;
+                }
+
+                /* nothing to move when both refer to the same memory */
+                if (src.get() != dst.get())
+                    csl::tensor_ops::copy(stream, dst, src);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        std::vector<std::size_t> order;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PERMUTE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/pooling.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/pooling.hpp
@@ -0,0 +1,258 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* Parameters consumed by PoolingOp. */
+    struct PoolingConfiguration {
+        enum class PoolingMode {
+            MAX,
+            AVERAGE_INCLUDE_PADDING, /* include padding while calculating average */
+            AVERAGE_EXCLUDE_PADDING /* exclude padding while calculating average */
+        };
+
+        PoolingMode poolMode;
+
+        /* the size of the following vectors must be equal to the pooling order
+         * (i.e. one entry per pooled axis; `strides` must have as many entries as `window_size`)
+         */
+        std::vector<std::size_t> window_size;
+        std::vector<std::size_t> strides;
+
+        enum class PaddingMode {
+            MANUAL, /* uses explicit padding values provided in `pads_begin` and `pads_end` */
+            VALID, /* no padding is added */
+            SAME /* TensorFlow logic is used for same padding */
+        };
+
+        PaddingMode padMode;
+
+        /* explicit paddings are used if and only if padMode is set to manual */
+        std::vector<std::size_t> pads_begin, pads_end;
+
+        /* the output shape is calculated using the following formula:
+         * output_dim = func[(input_dim + padding_left + padding_right - kernel_dim)/stride] + 1
+         *
+         * rounding mode decides what is used as `func`
+         */
+        enum class RoundingMode {
+            CEIL, /* uses ceil */
+            FLOOR /* uses floor */
+        };
+
+        RoundingMode roundMode;
+
+        /* full shape inclusive of channel and batch axis */
+        std::vector<std::size_t> input_shape;
+    };
+
+    /* Pooling primitive built on csl::Pooling.
+     *
+     * The constructor converts the requested padding into a symmetric part (handed to csl::Pooling)
+     * and an asymmetric remainder. A remainder at the beginning of an axis is realized by copying
+     * the input into a larger tensor (`transformedInput`) before pooling; a remainder at the end
+     * only is expressed through the output shape handed to csl::Pooling.
+     */
+    template <class T>
+    class PoolingOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* Validates `config`, derives the paddings and the output shape, and creates the pooler.
+         *
+         * Raises through CV_Assert on inconsistent vector sizes and through CV_Error when the
+         * pooling order is larger than 3.
+         */
+        PoolingOp(csl::cudnn::Handle handle, const PoolingConfiguration& config)
+            : cudnnHandle(std::move(handle))
+        {
+            const auto& window_size = config.window_size;
+
+            const auto pooling_order = window_size.size();
+            CV_Assert(pooling_order >= 1);
+
+            const auto& strides = config.strides;
+            CV_Assert(pooling_order == strides.size());
+
+            const auto& input_shape = config.input_shape;
+            CV_Assert(input_shape.size() == pooling_order + 2);
+
+            if (pooling_order > 3)
+                CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D pooling are supported.");
+
+            const auto rank = input_shape.size();
+
+            /* left and right are misleading as the padding is applicable for any number of dimensions
+             * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
+             *
+             * `common_padding` contains the amount of padding that has to be added to both sides
+             * `padding_left` and `padding_right` contains the amount of padding that needs to be added
+             * to a particular side in addition to the common padding
+             *
+             * all three vectors are indexed by tensor axis (entries 0 and 1 stay zero), whereas
+             * `window_size`, `strides` and the configured pads are indexed by pooled axis (axis - 2)
+             */
+            std::vector<std::size_t> common_padding(rank, 0);
+            std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
+            if (config.padMode == PoolingConfiguration::PaddingMode::MANUAL)
+            {
+                const auto& pads_begin = config.pads_begin;
+                const auto& pads_end = config.pads_end;
+
+                CV_Assert(pooling_order == pads_begin.size());
+                CV_Assert(pooling_order == pads_end.size());
+
+                /* cuDNN rounds down by default; hence, if ceilMode is false, we do nothing
+                 * otherwise, we add extra padding towards the end so that the convolution arithmetic yields
+                 * the correct output size without having to deal with fancy fractional sizes
+                 */
+                auto pads_end_modified = pads_end;
+                if (config.roundMode == PoolingConfiguration::RoundingMode::CEIL)
+                {
+                    for (int i = 0; i < window_size.size(); i++) {
+                        auto rem = (input_shape[i + 2] + pads_begin[i] + pads_end[i] - window_size[i]) % strides[i];
+                        if (rem)
+                            pads_end_modified[i] += strides[i] - rem;
+                    }
+                }
+
+                /* split the padding of each pooled axis into the symmetric part and the leftovers */
+                for (int i = 2; i < common_padding.size(); i++)
+                {
+                    common_padding[i] = std::min(pads_begin[i - 2], pads_end_modified[i - 2]);
+                    padding_left[i] = pads_begin[i - 2] - common_padding[i];
+                    padding_right[i] = pads_end_modified[i - 2] - common_padding[i];
+                }
+            }
+            else if (config.padMode == PoolingConfiguration::PaddingMode::VALID)
+            {
+                /* nothing to do as the paddings are already preset to zero */
+            }
+            else if (config.padMode == PoolingConfiguration::PaddingMode::SAME)
+            {
+                /* TensorFlow Logic:
+                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
+                 *
+                 * if total padding is odd, the extra is added towards the end
+                 */
+                for (int i = 2; i < rank; i++)
+                {
+                    const auto j = i - 2; /* filter index */
+                    const auto output_dim = (input_shape[i] - 1 + strides[j]) / strides[j];
+                    /* NOTE(review): the argument is evaluated in std::size_t before the conversion to
+                     * std::int64_t, so the clamp to zero relies on unsigned wrap-around - confirm this
+                     * is acceptable on targets where std::size_t is narrower than 64 bits
+                     */
+                    const auto required_total_padding =
+                        std::max<std::int64_t>(0, (output_dim - 1) * strides[j] + window_size[j] - input_shape[i]);
+
+                    common_padding[i] = required_total_padding / 2;
+                    padding_left[i] = 0;
+                    padding_right[i] = required_total_padding % 2;
+                }
+            }
+
+            /* in some scenarios, the extra padding at the end may not change the output at all */
+            for (int i = 2; i < rank; i++) {
+                const auto j = i - 2; /* filter idx */
+                const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
+                std::int64_t rem = (input_shape[i] + total_padding - window_size[j]) % strides[j];
+
+                /* the output shape doesn't change if we decrease the total padding by at most `rem`
+                 * provided that we decrease from the right
+                 */
+                if (rem && padding_right[i] > 0)
+                    padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
+            }
+
+            auto is_not_zero = [](std::size_t i) { return i != 0; };
+            if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
+                std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
+            {
+                /* csl::Pooling does not fully support asymmetric padding; hence, we deal with asymmetric padding by
+                 * copying the input to a bigger tensor and padding the ends manually
+                 *
+                 * But we first try to avoid the transformation using cuDNN's flexibility. cuDNN can accept a smaller or
+                 * a bigger output shape. This effectively allows us to have arbitrary padding at the right.
+                 */
+                if (std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero))
+                {
+                    /* there is padding on the left and we are forced to transform */
+                    auto transformed_input_shape = input_shape;
+                    for (int i = 0; i < rank; i++)
+                        transformed_input_shape[i] += padding_left[i] + padding_right[i];
+
+                    transformedInput.resize(std::begin(transformed_input_shape), std::end(transformed_input_shape));
+                    inputTransformer = csl::TensorTransform<T>(cudnnHandle, padding_left, padding_right);
+                }
+            }
+
+            typename csl::Pooling<T>::params_type params;
+            if (transformedInput.empty())
+            {
+                /* no transform => use original input shape */
+                params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+            }
+            else
+            {
+                /* the pooling operation will be seeing the transformed input */
+                auto transformed_input_shape = transformedInput.shape_as_vector();
+                params.input_shape.assign(std::begin(transformed_input_shape), std::end(transformed_input_shape));
+            }
+
+            /* NOTE(review): when the input is transformed, `params.input_shape` already contains
+             * `padding_left + padding_right`, and `total_padding` below adds them once more - confirm
+             * that counting the asymmetric padding twice in the output shape is intended
+             */
+            auto output_shape = input_shape;
+            for (int i = 2; i < rank; i++)
+            {
+                auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
+                output_shape[i] = (params.input_shape[i] + total_padding - window_size[i - 2]) / strides[i - 2] + 1;
+            }
+
+            params.output_shape.assign(std::begin(output_shape), std::end(output_shape));
+            params.window_size = window_size;
+            /* only the symmetric padding of the pooled axes is handed to csl::Pooling */
+            params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
+            params.stride = strides;
+
+            if (config.poolMode == PoolingConfiguration::PoolingMode::MAX)
+            {
+                params.type = csl::Pooling<T>::PoolingType::MAX;
+            }
+            else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_INCLUDE_PADDING)
+            {
+                params.type = csl::Pooling<T>::PoolingType::AVERAGE_INCLUDE_PADDING;
+            }
+            else if (config.poolMode == PoolingConfiguration::PoolingMode::AVERAGE_EXCLUDE_PADDING)
+            {
+                params.type = csl::Pooling<T>::PoolingType::AVERAGE_EXCLUDE_PADDING;
+            }
+
+            pooler = csl::Pooling<T>(cudnnHandle, params);
+        }
+
+        /* Expects exactly one input and one output. If the constructor set up a transformed input,
+         * the input is first copied into it and the pooling runs on that copy. `workspace` is not used.
+         */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            if (!transformedInput.empty())
+            {
+                inputTransformer.transform(input, transformedInput);
+                input = csl::TensorView<T>(transformedInput);
+            }
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            pooler.pool(input, output);
+        }
+
+    private:
+        csl::cudnn::Handle cudnnHandle;
+        csl::Pooling<T> pooler;
+
+        /* left empty unless padding at the beginning of an axis forces a manual transform */
+        csl::Tensor<T> transformedInput;
+        csl::TensorTransform<T> inputTransformer;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_POOLING_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/prior_box.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/prior_box.hpp
@@ -0,0 +1,136 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/span.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/prior_box.hpp"
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* Settings read by the PriorBoxOp constructor; every field is copied into the operation. */
+    struct PriorBoxConfiguration {
+        /* dimensions of the feature map and of the image; passed on to the prior box kernel */
+        std::size_t feature_map_width, feature_map_height;
+        std::size_t image_width, image_height;
+
+        /* parameters for prior boxes for each feature point */
+        /* box_widths and box_heights must have the same length, and so must offsets_x and
+         * offsets_y (PriorBoxOp asserts both)
+         */
+        std::vector<float> box_widths, box_heights;
+        std::vector<float> offsets_x, offsets_y;
+        float stepX, stepY;
+
+        /* passed on to the prior box kernel as-is */
+        std::vector<float> variance;
+
+        /* number of priors per feature point */
+        std::size_t num_priors;
+
+        /* clamps the box coordinates to [0, 1] range */
+        bool clip;
+
+        /* normalizes the box coordinates using the image dimensions */
+        bool normalize;
+    };
+
+    /* CUDA backend node which fills its single output with prior boxes.
+     *
+     * The box widths, box heights and x/y offsets are merged into one device tensor when the
+     * operation is constructed; forward() only creates views into that tensor and launches
+     * the generation kernel.
+     */
+    template <class T>
+    class PriorBoxOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* stream_  stream used for the parameter upload and for the kernel launch
+         * config   layer settings; `box_widths`/`box_heights` must have equal lengths and so
+         *          must `offsets_x`/`offsets_y` (checked with CV_Assert)
+         */
+        PriorBoxOp(csl::Stream stream_, const PriorBoxConfiguration& config)
+            : stream(std::move(stream_))
+        {
+            feature_map_width = config.feature_map_width;
+            feature_map_height = config.feature_map_height;
+
+            image_width = config.image_width;
+            image_height = config.image_height;
+
+            const auto& box_widths = config.box_widths;
+            const auto& box_heights = config.box_heights;
+            CV_Assert(box_widths.size() == box_heights.size());
+
+            box_size = box_widths.size();
+
+            const auto& offsets_x = config.offsets_x;
+            const auto& offsets_y = config.offsets_y;
+            CV_Assert(offsets_x.size() == offsets_y.size());
+
+            offset_size = offsets_x.size();
+
+            /* for better memory utilization and presumably better cache performance, we merge
+             * the four vectors and put them in a single tensor
+             *
+             * layout: [widths | heights | offsetsX | offsetsY]; forward() relies on this order
+             */
+            auto total = box_widths.size() * 2 + offsets_x.size() * 2;
+            std::vector<float> merged_params;
+            merged_params.reserve(total); /* final size is known; avoids regrowth during the appends */
+            merged_params.insert(std::end(merged_params), std::begin(box_widths), std::end(box_widths));
+            merged_params.insert(std::end(merged_params), std::begin(box_heights), std::end(box_heights));
+            merged_params.insert(std::end(merged_params), std::begin(offsets_x), std::end(offsets_x));
+            merged_params.insert(std::end(merged_params), std::begin(offsets_y), std::end(offsets_y));
+            CV_Assert(merged_params.size() == total);
+
+            paramsTensor.resize(total);
+            csl::memcpy(paramsTensor.get(), merged_params.data(), total, stream); /* synchronous copy */
+
+            const auto& variance_ = config.variance;
+            variance.assign(std::begin(variance_), std::end(variance_));
+
+            num_priors = config.num_priors;
+            stepX = config.stepX;
+            stepY = config.stepY;
+            clip = config.clip;
+            normalize = config.normalize;
+        }
+
+        /* Generates the prior boxes into outputs[0].
+         *
+         * Exactly two inputs and one output are expected; the inputs are never read.
+         */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 2); /* we don't need the inputs but we are given */
+            CV_Assert(outputs.size() == 1);
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            /* we had stored all the parameters in a single tensor; now we create appropriate views
+             * for each of the parameter arrays from the single tensor
+             */
+            auto boxWidths  = csl::View<float>(paramsTensor.get(), box_size);
+            auto boxHeights = csl::View<float>(paramsTensor.get() + box_size, box_size);
+            auto offsetsX   = csl::View<float>(paramsTensor.get() + 2 * box_size, offset_size);
+            auto offsetsY   = csl::View<float>(paramsTensor.get() + 2 * box_size + offset_size, offset_size);
+
+            kernels::generate_prior_boxes<T>(stream, output,
+                boxWidths, boxHeights, offsetsX, offsetsY, stepX, stepY,
+                variance, num_priors, feature_map_width, feature_map_height, image_width, image_height, normalize, clip);
+        }
+
+    private:
+        csl::Stream stream;
+        csl::Tensor<float> paramsTensor; /* widths, heights, offsetsX, offsetsY */
+
+        std::size_t feature_map_width, feature_map_height;
+        std::size_t image_width, image_height;
+
+        /* number of box widths (== heights) and of x offsets (== y offsets) in paramsTensor */
+        std::size_t box_size, offset_size;
+        float stepX, stepY;
+
+        std::vector<float> variance;
+
+        std::size_t num_priors;
+        bool clip, normalize;
+    };
+
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_PRIOR_BOX_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/region.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/region.hpp
@@ -0,0 +1,181 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/cudnn.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/region.hpp"
+
+#include "../../nms.inl.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* Method used by the region operation to reduce class scores to probabilities. */
+    enum class SquashMethod {
+        SOFTMAX,
+        SIGMOID
+    };
+
+    /* Settings read by the RegionOp constructor.
+     *
+     * T is the type used for the probability cutoffs and the NMS threshold.
+     */
+    template <class T>
+    struct RegionConfiguration {
+        /* The image is divided into (H, W) cells.
+         *
+         * Each cell is interested in exactly one object and predicts `boxes_per_cell` bounding boxes
+         * for that object.
+         *
+         * Each bounding box contains:
+         * - 4 box coordinates
+         * - objectness confidence score
+         * - `classes` number of class scores
+         *
+         * The object score is reduced to a probability using sigmoid and the class scores are reduced to
+         * probabilities by either applying sigmoid or softmax (which is a configuration option).
+         *
+         * object_prob = sigmoid(object_score)
+         * conditional_class_prob = sigmoid(class_score), or softmax across all classes
+         *
+         * actual class probability = conditional_class_prob * object_prob
+         */
+
+        /* method for reducing class scores to probabilities */
+        SquashMethod squash_method;
+
+        /* number of class scores per box and number of boxes predicted per cell */
+        std::size_t classes, boxes_per_cell;
+
+        /* passed on to the region kernel; presumably the dimensions used to normalize the
+         * box sizes -- confirm against kernels/region.hpp
+         */
+        std::size_t width_norm, height_norm;
+
+        /* prob cutoffs below which the prediction is nulled */
+        T object_prob_cutoff;
+        T class_prob_cutoff;
+
+        /* IoU threshold for non-maximum suppression; RegionOp runs NMS only when this is > 0 */
+        T nms_iou_threshold;
+    };
+
+    /* CUDA backend node for the region layer.
+     *
+     * The region kernel runs on the device. If a positive NMS IoU threshold was configured,
+     * the output is then post-processed with per-class non-maximum suppression on a host
+     * matrix obtained from the output wrapper.
+     */
+    template <class T>
+    class RegionOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* stream_  stream used for the bias upload and for the kernel launch
+         * bias     bias values; copied into a device tensor
+         * config   layer settings (see RegionConfiguration)
+         */
+        template <class V>
+        RegionOp(csl::Stream stream_, const cv::Mat& bias, const RegionConfiguration<V>& config)
+            : stream(std::move(stream_))
+        {
+            squash_type = config.squash_method;
+
+            classes = config.classes;
+            boxes_per_cell = config.boxes_per_cell;
+
+            height_norm = config.height_norm;
+            width_norm = config.width_norm;
+
+            class_prob_cutoff = config.class_prob_cutoff;
+            object_prob_cutoff = config.object_prob_cutoff;
+            nms_iou_threshold = config.nms_iou_threshold;
+
+            biasTensor = csl::makeTensorHeader<T>(bias);
+            csl::copyMatToTensor<T>(bias, biasTensor, stream);
+        }
+
+        /* Runs the region kernel from inputs[0] into outputs[0], then optionally applies NMS. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(outputs.size() == 1);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            auto grid_rows = input.get_axis_size(1);
+            auto grid_cols = input.get_axis_size(2);
+
+            /* every box holds 4 coordinates, 1 objectness score and `classes` class scores */
+            auto cell_box_size = classes + 4 + 1;
+
+            /* the last argument selects sigmoid (true) or softmax (false) for the class scores */
+            kernels::region<T>(stream, output, input, biasTensor,
+                object_prob_cutoff, class_prob_cutoff,
+                boxes_per_cell, cell_box_size,
+                grid_rows, grid_cols,
+                height_norm, width_norm,
+                squash_type == SquashMethod::SIGMOID
+            );
+
+            /* NMS is requested by a positive IoU threshold */
+            if (!(nms_iou_threshold > 0))
+                return;
+
+            auto output_mat = output_wrapper->getMutableHostMat();
+            CV_Assert(output_mat.type() == CV_32F);
+
+            auto boxes_per_sample = grid_rows * grid_cols * boxes_per_cell;
+            auto sample_size = boxes_per_sample * cell_box_size;
+            float* host_data = reinterpret_cast<float*>(output_mat.data);
+
+            /* axis 0 of the input indexes the samples; each sample is suppressed on its own */
+            const auto num_samples = input.get_axis_size(0);
+            for (int sample = 0; sample < num_samples; sample++)
+                do_nms_sort(host_data + sample * sample_size, boxes_per_sample, class_prob_cutoff, nms_iou_threshold);
+        }
+
+    private:
+        /* Per-class non-maximum suppression over `total` consecutive box entries.
+         *
+         * Every entry is laid out as [x_center, y_center, width, height, objectness, class scores...].
+         * For each class, the scores of all boxes are zeroed and only the scores of the boxes
+         * retained by NMSBoxes are written back.
+         */
+        void do_nms_sort(float *detections, int total, float score_thresh, float nms_thresh)
+        {
+            const std::size_t entry_len = classes + 4 + 1;
+
+            std::vector<Rect2d> boxes(total);
+            std::vector<float> scores(total);
+
+            /* convert the center based boxes to corner based rectangles */
+            for (int idx = 0; idx < total; ++idx)
+            {
+                const int base = idx * entry_len;
+                const double box_width = detections[base + 2];
+                const double box_height = detections[base + 3];
+                boxes[idx] = Rect2d(detections[base + 0] - box_width / 2,
+                                    detections[base + 1] - box_height / 2,
+                                    box_width, box_height);
+            }
+
+            std::vector<int> kept;
+            for (int cls = 0; cls < classes; ++cls)
+            {
+                /* gather the scores of this class and clear them in the detections */
+                for (int idx = 0; idx < total; ++idx)
+                {
+                    const int base = idx * entry_len;
+                    float& score_slot = detections[base + 5 + cls];
+                    scores[idx] = score_slot;
+                    score_slot = 0;
+                }
+
+                NMSBoxes(boxes, scores, score_thresh, nms_thresh, kept);
+
+                /* restore the scores of the boxes which survived the suppression */
+                for (const int survivor : kept)
+                {
+                    const int base = survivor * entry_len;
+                    detections[base + 5 + cls] = scores[survivor];
+                }
+            }
+        }
+
+        csl::Stream stream;
+
+        csl::Tensor<T> biasTensor;
+        std::size_t classes, boxes_per_cell;
+        std::size_t width_norm, height_norm;
+        SquashMethod squash_type;
+
+        T object_prob_cutoff, class_prob_cutoff;
+        T nms_iou_threshold;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REGION_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/reorg.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/reorg.hpp
@@ -0,0 +1,75 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../kernels/permute.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* CUDA backend node for the reorg layer.
+     *
+     * forward() reinterprets the input and output as 5D tensors and carries out the
+     * reorganization as a single permutation with axis order { 0, 2, 4, 1, 3 }.
+     */
+    template <class T>
+    class ReorgOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* stream_  stream on which the permutation kernel is launched
+         * stride_  reorg stride; used to split the input axes in forward()
+         */
+        ReorgOp(csl::Stream stream_, std::size_t stride_)
+            : stream(std::move(stream_)), stride{ stride_ } { }
+
+        /* Permutes inputs[0] into outputs[0]; exactly one input and one output are expected. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+            auto input = inputs[0].dynamicCast<wrapper_type>()->getView();
+            auto output = outputs[0].dynamicCast<wrapper_type>()->getSpan();
+
+            /* axes 1 and 2 of the input are merged and divided by stride^2 */
+            const std::size_t merged_axis =
+                input.get_axis_size(1) * input.get_axis_size(2) / (stride * stride);
+
+            const std::size_t permute_input_shape[] = {
+                input.get_axis_size(0), merged_axis, stride, input.get_axis_size(3), stride
+            };
+
+            constexpr std::size_t order[] = { 0, 2, 4, 1, 3 };
+
+            /* shape of the permuted tensor: axis i of the output is axis order[i] of the input */
+            const std::size_t permute_output_shape[] = {
+                permute_input_shape[order[0]],
+                permute_input_shape[order[1]],
+                permute_input_shape[order[2]],
+                permute_input_shape[order[3]],
+                permute_input_shape[order[4]]
+            };
+
+            /* the 5D shapes need one more axis than the tensors have before the reshape
+             * (presumably 4D tensors -- confirm against the callers)
+             */
+            input.unsqueeze();
+            output.unsqueeze();
+
+            input.reshape(std::begin(permute_input_shape), std::end(permute_input_shape));
+            output.reshape(std::begin(permute_output_shape), std::end(permute_output_shape));
+
+            kernels::permute(stream, output, input, { std::begin(order), std::end(order) });
+        }
+
+    private:
+        csl::Stream stream;
+        std::size_t stride;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_REORG_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/reshape.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/reshape.hpp
@@ -0,0 +1,61 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* CUDA backend node for reshape.
+     *
+     * For every output, the matching input is copied into it unless both already refer to
+     * the same memory.
+     */
+    template <class T>
+    class ReshapeOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* stream_: stream on which the copies are issued */
+        ReshapeOp(csl::Stream stream_) : stream(std::move(stream_)) { }
+
+        /* Copies inputs[i] into outputs[i] for every output index i. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            /* sometimes the output shape is passed as extra inputs; hence, >= instead of == */
+            CV_Assert(inputs.size() >= outputs.size());
+
+            for (std::size_t idx = 0; idx < outputs.size(); ++idx)
+            {
+                auto source = inputs[idx].dynamicCast<wrapper_type>()->getView();
+                auto destination = outputs[idx].dynamicCast<wrapper_type>()->getSpan();
+
+                /* nothing to do when the output aliases the input */
+                if (source.get() != destination.get())
+                {
+                    /* bring both tensors to the same rank before matching the shapes */
+                    while (source.rank() < destination.rank())
+                        source.unsqueeze();
+
+                    while (destination.rank() < source.rank())
+                        destination.unsqueeze();
+
+                    source.reshape_as(destination);
+                    csl::tensor_ops::copy(stream, destination, source);
+                }
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESHAPE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/resize.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/resize.hpp
@@ -0,0 +1,60 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+
+#include "../kernels/resize.hpp"
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* Interpolation method used by ResizeOp to pick the resize kernel. */
+    enum class InterpolationType {
+        NEAREST_NEIGHBOUR,
+        BILINEAR
+    };
+
+    /* CUDA backend node which resizes its input into its output using either nearest
+     * neighbour or bilinear interpolation.
+     */
+    template <class T>
+    class ResizeOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* stream_       stream on which the resize kernel is launched
+         * type_         interpolation method
+         * scaleHeight_  passed to the bilinear kernel only
+         * scaleWidth_   passed to the bilinear kernel only
+         */
+        ResizeOp(csl::Stream stream_, InterpolationType type_, float scaleHeight_, float scaleWidth_)
+            : stream(std::move(stream_)), type{ type_ }, scaleHeight{ scaleHeight_ }, scaleWidth{ scaleWidth_ }
+        {
+        }
+
+        /* Resizes inputs[0] into outputs[0]; exactly one input and one output are expected. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+            auto source = inputs[0].dynamicCast<wrapper_type>()->getView();
+            auto destination = outputs[0].dynamicCast<wrapper_type>()->getSpan();
+
+            switch (type)
+            {
+            case InterpolationType::NEAREST_NEIGHBOUR:
+                kernels::resize_nn<T>(stream, destination, source);
+                break;
+            case InterpolationType::BILINEAR:
+                kernels::resize_bilinear<T>(stream, destination, source, scaleHeight, scaleWidth);
+                break;
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        InterpolationType type;
+        float scaleHeight, scaleWidth; /* for bilinear interpolation */
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_RESIZE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/roi_pooling.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/roi_pooling.hpp
@@ -0,0 +1,52 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ROI_POOLING_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ROI_POOLING_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+
+#include "../kernels/roi_pooling.hpp"
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* CUDA backend node for ROI pooling: a thin wrapper around kernels::roi_pooling. */
+    template <class T>
+    class ROIPoolingOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* stream_        stream on which the kernel is launched
+         * spatial_scale  passed to the ROI pooling kernel unchanged
+         */
+        ROIPoolingOp(csl::Stream stream_, float spatial_scale)
+            : stream(std::move(stream_)), spatial_scale{spatial_scale} { }
+
+        /* inputs[0] is the tensor to pool from and inputs[1] holds the regions of interest;
+         * the result is written to outputs[0].
+         */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 2 && outputs.size() == 1);
+
+            auto features = inputs[0].dynamicCast<wrapper_type>()->getView();
+            auto regions = inputs[1].dynamicCast<wrapper_type>()->getView();
+            auto pooled = outputs[0].dynamicCast<wrapper_type>()->getSpan();
+
+            kernels::roi_pooling<T>(stream, pooled, features, regions, spatial_scale);
+        }
+
+    private:
+        csl::Stream stream;
+        float spatial_scale;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_ROI_POOLING_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/scale_shift.hpp
@@ -0,0 +1,110 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+
+#include "../kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* CUDA backend node which scales and/or shifts its input.
+     *
+     * The weights (scale) and bias (shift) are either given at construction or, when neither
+     * was given, the weights are taken from the second input at forward time.
+     */
+    template <class T>
+    class ScaleShiftOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* stream_  stream used for the parameter uploads and for the kernel launch
+         * axis     first input axis the parameters apply to
+         * weights  scale values; may be empty
+         * bias     shift values; may be empty
+         */
+        ScaleShiftOp(csl::Stream stream_, std::size_t axis, const cv::Mat& weights, const cv::Mat& bias)
+            : stream(std::move(stream_)), axis{ axis }
+        {
+            if (!weights.empty())
+            {
+                weightsTensor = csl::makeTensorHeader<T>(weights);
+                csl::copyMatToTensor<T>(weights, weightsTensor, stream);
+            }
+
+            if (!bias.empty())
+            {
+                biasTensor = csl::makeTensorHeader<T>(bias);
+                csl::copyMatToTensor<T>(bias, biasTensor, stream);
+            }
+        }
+
+        /* Applies the scale and/or shift to inputs[0] and writes the result to outputs[0]. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(outputs.size() == 1);
+
+            auto input = inputs[0].dynamicCast<wrapper_type>()->getView();
+            auto output = outputs[0].dynamicCast<wrapper_type>()->getSpan();
+
+            const bool has_stored_weights = !weightsTensor.empty();
+            const bool has_stored_bias = !biasTensor.empty();
+
+            csl::TensorView<T> weights;
+            if (has_stored_weights)
+            {
+                weights = csl::TensorSpan<T>(weightsTensor);
+            }
+            else if (!has_stored_bias)
+            {
+                CV_Assert(inputs.size() == 2);
+
+                /* no explicit scale/shift values provided; use the second input as weights */
+                weights = inputs[1].dynamicCast<wrapper_type>()->getView();
+            }
+
+            csl::TensorView<T> bias;
+            if (has_stored_bias)
+                bias = csl::TensorSpan<T>(biasTensor);
+
+            const auto numParams = !weights.empty() ? weights.size() : bias.size();
+            CV_Assert(numParams != 0);
+            if (has_stored_weights && has_stored_bias)
+            {
+                CV_CheckEQ(weights.size(), bias.size(), "weights and bias size are not equal");
+            }
+
+            /* the weights/bias might require broadcasting to scale/shift
+             *
+             * search for the first axis range starting at `axis` whose size equals the number
+             * of parameters; the assertion fires if no such range exists
+             */
+            const int end_axis = [&] {
+                for (int candidate = axis + 1; candidate <= input.rank(); candidate++)
+                {
+                    std::size_t covered = input.size_range(axis, candidate);
+                    if (covered == numParams)
+                        return candidate;
+                }
+                CV_Assert(0 /* invalid weights matrix */);
+            }();
+
+            /* size of the remaining axes; passed to the kernels along with the parameters */
+            std::size_t inner_size = input.size_range(end_axis, input.rank());
+
+            if (weights.empty())
+                kernels::biasN<T>(stream, output, input, inner_size, bias);
+            else if (bias.empty())
+                kernels::scaleN<T>(stream, output, input, inner_size, weights);
+            else
+                kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weights, bias);
+        }
+
+    private:
+        csl::Stream stream;
+        csl::Tensor<T> weightsTensor, biasTensor;
+        std::size_t axis;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/shuffle_channel.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/shuffle_channel.hpp
@@ -0,0 +1,79 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/permute.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* CUDA backend node for channel shuffle.
+     *
+     * The shuffle is expressed as a permutation: axis 1 is split into (group, axis1 / group)
+     * and these two axes are swapped.
+     */
+    template <class T>
+    class ShuffleChannelOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* stream_  stream on which the copy/permutation is issued
+         * group_   number of groups axis 1 is split into
+         */
+        ShuffleChannelOp(csl::Stream stream_, std::size_t group_)
+            : stream(std::move(stream_)), group{ group_ } { }
+
+        /* Shuffles inputs[0] into outputs[0]; exactly one input and one output are expected. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+            auto source = inputs[0].dynamicCast<wrapper_type>()->getView();
+            auto destination = outputs[0].dynamicCast<wrapper_type>()->getSpan();
+
+            if (group == 1) {
+                /* with a single group the permutation below only swaps an axis of size one,
+                 * i.e. it is an identity; a plain copy (if any) is enough
+                 */
+                if (source.get() != destination.get()) {
+                    source.reshape_as(destination);
+                    csl::tensor_ops::copy(stream, destination, source);
+                }
+                return;
+            }
+
+            const std::size_t permute_input_shape[] = {
+                source.get_axis_size(0),
+                group,
+                source.get_axis_size(1) / group,
+                source.get_axis_size(2) * source.get_axis_size(3)
+            };
+
+            /* swap the group axis with the per-group axis */
+            constexpr std::size_t order[] = { 0, 2, 1, 3 };
+
+            const std::size_t permute_output_shape[] = {
+                permute_input_shape[order[0]],
+                permute_input_shape[order[1]],
+                permute_input_shape[order[2]],
+                permute_input_shape[order[3]],
+            };
+
+            source.reshape(std::begin(permute_input_shape), std::end(permute_input_shape));
+            destination.reshape(std::begin(permute_output_shape), std::end(permute_output_shape));
+            kernels::permute(stream, destination, source, { std::begin(order), std::end(order) });
+        }
+
+    private:
+        csl::Stream stream;
+        std::size_t group;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SHUFFLE_CHANNEL_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/slice.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/slice.hpp
@@ -0,0 +1,62 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+
+#include "../kernels/slice.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <vector>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* CUDA backend node which extracts one slice of the input per output.
+     *
+     * The start position of every slice is fixed at construction.
+     */
+    template <class T>
+    class SliceOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* offsets is indexed by output number and each subvector is indexed by axis number */
+        SliceOp(csl::Stream stream_, std::vector<std::vector<std::size_t>> offsets)
+            : stream(std::move(stream_)), offsets(std::move(offsets))
+        {
+        }
+
+        /* Writes the slice of inputs[0] starting at offsets[i] into outputs[i] for every output.
+         *
+         * An assertion fails if there are more outputs than offset vectors.
+         */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            /* sometimes the output shape is passed in the form of a second input tensor
+             * it's only required for initialization and not here
+             */
+            CV_Assert(inputs.size() == 1 || inputs.size() == 2);
+
+            /* every output needs its own offset vector; guards the offsets[i] access below */
+            CV_Assert(outputs.size() <= offsets.size());
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            for (std::size_t i = 0; i < outputs.size(); ++i)
+            {
+                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+                auto output = output_wrapper->getSpan();
+
+                kernels::slice<T>(stream, output, input, offsets[i]);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+        std::vector<std::vector<std::size_t>> offsets;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SLICE_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/softmax.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/softmax.hpp
@@ -0,0 +1,53 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <cstddef>
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* CUDA backend node which applies softmax (or log-softmax) along a fixed axis using cuDNN. */
+    template <class T>
+    class SoftmaxOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /* handle  cuDNN handle used for the softmax computation
+         * axis_   axis along which the softmax is computed
+         * log_    passed to the softmax routine; selects the log variant
+         */
+        SoftmaxOp(csl::cudnn::Handle handle, std::size_t axis_, bool log_)
+            : cudnnHandle(std::move(handle)), channel_axis{ axis_ }, log{ log_ }
+        {
+        }
+
+        /* Computes the softmax of inputs[i] into outputs[i] for every input.
+         *
+         * An assertion fails if there are fewer outputs than inputs.
+         */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            /* the loop is driven by the inputs; guards the outputs[i] access below */
+            CV_Assert(inputs.size() <= outputs.size());
+
+            for (std::size_t i = 0; i < inputs.size(); i++)
+            {
+                auto input_wrapper = inputs[i].dynamicCast<wrapper_type>();
+                auto input = input_wrapper->getView();
+
+                auto output_wrapper = outputs[i].dynamicCast<wrapper_type>();
+                auto output = output_wrapper->getSpan();
+
+                csl::tensor_ops::softmax<T>(cudnnHandle, output, input, channel_axis, log);
+            }
+        }
+
+    private:
+        csl::cudnn::Handle cudnnHandle;
+        std::size_t channel_axis;
+        bool log;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SOFTMAX_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/split.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/split.hpp
@@ -0,0 +1,54 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/stream.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <utility>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /** CUDA backend node that copies its single input tensor into every output tensor. */
+    template <class T>
+    class SplitOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /** @param stream_ stream that is handed to every copy operation */
+        SplitOp(csl::Stream stream_)
+            : stream(std::move(stream_))
+        {
+        }
+
+        /** Requires exactly one input; `workspace` is not used by this node. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            /* iterate the outputs directly; this avoids comparing a signed index with size() */
+            for (const auto& destination : outputs)
+            {
+                auto output_wrapper = destination.dynamicCast<wrapper_type>();
+                auto output = output_wrapper->getSpan();
+
+                csl::tensor_ops::copy<T>(stream, output, input);
+            }
+        }
+
+    private:
+        csl::Stream stream;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SPLIT_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/transpose_convolution.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/cuda4dnn/primitives/transpose_convolution.hpp
@@ -0,0 +1,230 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP
+
+#include "../../op_cuda.hpp"
+
+#include "../csl/cudnn.hpp"
+#include "../csl/stream.hpp"
+#include "../csl/tensor.hpp"
+#include "../csl/tensor_ops.hpp"
+
+#include "../kernels/scale_shift.hpp"
+
+#include <opencv2/core.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include <utility>
+#include <algorithm>
+
+namespace cv { namespace dnn { namespace cuda4dnn {
+
+    /* Parameter bundle consumed by TransposeConvolutionOp.
+     *
+     * Apart from `input_shape` and `output_shape`, every value describes the corresponding
+     * convolution operation (not the transpose convolution itself).
+     */
+    struct TransposeConvolutionConfiguration {
+        /* per-axis filter extent; its length defines the order of the convolution */
+        std::vector<std::size_t> kernel_size;
+
+        /* per-axis dilation; must have as many entries as `kernel_size` */
+        std::vector<std::size_t> dilations;
+
+        /* per-axis stride; must have as many entries as `kernel_size` */
+        std::vector<std::size_t> strides;
+
+        enum class PaddingMode {
+            /* take the explicit values stored in `pads_begin` and `pads_end` */
+            MANUAL,
+            /* add no padding at all */
+            VALID,
+            /* derive the padding with the TensorFlow rule for same padding */
+            SAME
+        };
+
+        /* selects how padding is determined; explicit pads are read only for MANUAL */
+        PaddingMode padMode;
+
+        /* explicit padding at the beginning of every spatial axis */
+        std::vector<std::size_t> pads_begin;
+
+        /* explicit padding at the end of every spatial axis */
+        std::vector<std::size_t> pads_end;
+
+        /* complete input shape, batch and channel axes included */
+        std::vector<std::size_t> input_shape;
+
+        /* complete output shape, batch and channel axes included */
+        std::vector<std::size_t> output_shape;
+
+        /* number of groups for grouped convolution */
+        std::size_t groups;
+    };
+
+    /** CUDA backend node for transpose convolution.
+     *
+     * The constructor validates the configuration, copies the filters (and the optional bias)
+     * into tensors, derives the padding, and builds a `csl::TransposeConvolution` object.
+     * `forward` then runs that object on one input/output pair and adds the bias if one was given.
+     */
+    template <class T>
+    class TransposeConvolutionOp final : public CUDABackendNode {
+    public:
+        using wrapper_type = GetCUDABackendWrapperType<T>;
+
+        /** @param stream_ stream used for the filter/bias copies and for the bias kernel
+         *  @param handle  cuDNN handle passed to `csl::TransposeConvolution`
+         *  @param config  shapes, kernel geometry, padding mode and group count
+         *  @param filters filter weights; copied into `filtersTensor`
+         *  @param bias    optional bias; when non-empty it must hold one value per output feature map
+         *
+         *  Raises `Error::StsNotImplemented` for more than three spatial dimensions and for
+         *  configurations that would need asymmetric padding.
+         */
+        TransposeConvolutionOp(csl::Stream stream_, csl::cudnn::Handle handle, const TransposeConvolutionConfiguration& config, const Mat& filters, const Mat& bias)
+            : stream(std::move(stream_)), cudnnHandle(std::move(handle))
+        {
+            /* we make use of backward pass of convolution to perform forward pass of transpose convolution
+             * hence, we must setup configuration for the convolution operation and perform backward pass
+             */
+            const auto& kernel_size = config.kernel_size;
+            const auto& dilations = config.dilations;
+            const auto& strides = config.strides;
+
+            /* the number of spatial axes is taken from the kernel; dilations and strides must match it */
+            const auto convolution_order = kernel_size.size();
+            CV_Assert(convolution_order >= 1);
+
+            CV_Assert(convolution_order == dilations.size());
+            CV_Assert(convolution_order == strides.size());
+
+            /* both shapes carry two extra leading axes (batch and channel) in addition to the spatial ones */
+            const auto& input_shape = config.input_shape;
+            const auto& output_shape = config.output_shape;
+            CV_Assert(input_shape.size() == output_shape.size());
+            CV_Assert(input_shape.size() == convolution_order + 2);
+
+            const auto groups = config.groups;
+
+            if (convolution_order > 3)
+                CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D transpose convolution is supported.");
+
+            const auto rank = input_shape.size();
+            const auto input_feature_maps = input_shape[1];
+            const auto output_feature_maps = output_shape[1];
+            const auto output_feature_maps_per_group = output_feature_maps / groups;
+            CV_Assert(output_feature_maps % groups == 0);
+
+            /* copy the filter weights into a tensor using `stream` */
+            filtersTensor = csl::makeTensorHeader<T>(filters);
+            csl::copyMatToTensor<T>(filters, filtersTensor, stream);
+
+            /* the bias is optional; `biasTensor` stays empty when none is supplied */
+            if (!bias.empty())
+            {
+                CV_Assert(bias.total() == output_feature_maps);
+                biasTensor = csl::makeTensorHeader<T>(bias);
+                csl::copyMatToTensor<T>(bias, biasTensor, stream);
+            }
+
+            /* left and right are misleading as the padding is applicable for any number of dimensions
+             * but we use those identifiers to avoid confusion with `pads_begin` and `pads_end`
+             *
+             * `common_padding` contains the amount of padding that has to be added to both sides
+             * `padding_left` and `padding_right` contains the amount of padding that needs to be added
+             * to a particular side in addition to the common padding
+             *
+             * note that we compute the padding for the convolution operation
+             */
+            std::vector<std::size_t> common_padding(rank, 0);
+            std::vector<std::size_t> padding_left(rank, 0), padding_right(rank, 0);
+            if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::MANUAL)
+            {
+                const auto& pads_begin = config.pads_begin;
+                const auto& pads_end = config.pads_end;
+
+                CV_Assert(convolution_order == pads_begin.size());
+                CV_Assert(convolution_order == pads_end.size());
+
+                /* entries 0 and 1 (batch, channel) stay zero; spatial axis `i` maps to pad index `i - 2` */
+                for (int i = 2; i < common_padding.size(); i++)
+                {
+                    common_padding[i] = std::min(pads_begin[i - 2], pads_end[i - 2]);
+                    padding_left[i] = pads_begin[i - 2] - common_padding[i];
+                    padding_right[i] = pads_end[i - 2] - common_padding[i];
+                }
+            }
+            else if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::VALID)
+            {
+                /* nothing to do as the paddings are already preset to zero */
+            }
+            else if (config.padMode == TransposeConvolutionConfiguration::PaddingMode::SAME)
+            {
+                /* TensorFlow Logic:
+                 * total_padding[i] = (o[i] - 1) * s[i] + effective_k[i] - i[i]
+                 *
+                 * if total padding is odd, the extra is added towards the end
+                 */
+                for (int i = 2; i < rank; i++)
+                {
+                    const auto j = i - 2; /* filter index */
+                    const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
+                    /* the expression is evaluated in std::size_t and then clamped at zero as std::int64_t */
+                    const auto required_total_padding =
+                        std::max<std::int64_t>(0, (input_shape[i] - 1) * strides[j] + effective_kernel_size - output_shape[i]);
+
+                    common_padding[i] = required_total_padding / 2;
+                    padding_left[i] = 0;
+                    padding_right[i] = required_total_padding % 2;
+                }
+            }
+
+            /* in some scenarios, the extra padding at the end may not change the output at all */
+            /* NOTE(review): `rem` is derived from `input_shape`, whereas the SAME branch above treats
+             * `output_shape` as the input of the underlying convolution - confirm this is intended
+             */
+            for (int i = 2; i < rank; i++) {
+                const auto j = i - 2; /* filter idx */
+                const auto total_padding = common_padding[i] * 2 + padding_left[i] + padding_right[i];
+                const auto effective_kernel_size = dilations[j] * (kernel_size[j] - 1) + 1;
+                std::int64_t rem = (input_shape[i] + total_padding - effective_kernel_size) % strides[j];
+
+                /* the output shape doesn't change if we decrease the total padding by at most `rem`
+                 * provided that we decrease from the right
+                 */
+                if (rem && padding_right[i] > 0)
+                    padding_right[i] = std::max<std::int64_t>(0, padding_right[i] - rem);
+            }
+
+            /* only `common_padding` is handed to the convolution below, so any one-sided remainder is rejected */
+            auto is_not_zero = [](std::size_t i) { return i != 0; };
+            if(std::any_of(std::begin(padding_left), std::end(padding_left), is_not_zero) ||
+               std::any_of(std::begin(padding_right), std::end(padding_right), is_not_zero))
+            {
+                CV_Error(Error::StsNotImplemented, "Padding configuration requires asymmetric padding and hence is not supported.");
+            }
+
+            typename csl::TransposeConvolution<T>::params_type params;
+            params.input_shape.assign(std::begin(input_shape), std::end(input_shape));
+            params.output_shape.assign(std::begin(output_shape), std::end(output_shape));
+
+            /* filter shape: [input feature maps, output feature maps per group, kernel extents...] */
+            auto& fshape = params.filter_shape;
+            fshape.resize(rank);
+            fshape[0] = input_feature_maps;
+            fshape[1] = output_feature_maps_per_group;
+            std::copy(std::begin(kernel_size), std::end(kernel_size), std::begin(fshape) + 2);
+            CV_Assert(fshape.size() == kernel_size.size() + 2);
+
+            /* drop the two leading (batch, channel) entries so that only spatial padding is passed on */
+            params.padding.assign(std::begin(common_padding) + 2, std::end(common_padding));
+            params.stride = strides;
+            params.dilation = dilations;
+            params.groups = config.groups;
+
+            convoluter = csl::TransposeConvolution<T>(cudnnHandle, params);
+
+            /* remember how much scratch memory the convolution object asks for */
+            csl::WorkspaceBuilder builder;
+            builder.require(convoluter.get_workspace_size());
+            scratch_mem_in_bytes = builder.required_workspace_size();
+        }
+
+        /** Runs the transpose convolution for exactly one input and one output, then adds the bias if present. */
+        void forward(
+            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+            csl::Workspace& workspace) override
+        {
+            CV_Assert(inputs.size() == 1 && outputs.size() == 1);
+
+            auto input_wrapper = inputs[0].dynamicCast<wrapper_type>();
+            auto input = input_wrapper->getView();
+
+            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+            auto output = output_wrapper->getSpan();
+
+            /* scratch memory for the convolution is taken from the caller-provided workspace */
+            csl::WorkspaceAllocator allocator(workspace);
+            convoluter.transpose_convolve(output, input, filtersTensor, allocator.get_instance());
+            if (!biasTensor.empty())
+            {
+                /* product of the output dimensions from axis 2 onwards */
+                std::size_t inner_size = total(output_wrapper->getShape(), 2, -1);
+                /* the output is passed as both destination and source, i.e. the bias is applied in place */
+                kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
+            }
+        }
+
+        /** Returns the workspace size computed in the constructor. */
+        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }
+
+    private:
+        csl::Stream stream;
+        csl::cudnn::Handle cudnnHandle;
+        csl::Tensor<T> filtersTensor, biasTensor;
+        csl::TransposeConvolution<T> convoluter;
+
+        std::size_t scratch_mem_in_bytes;
+    };
+
+}}} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_TRANSPOSE_CONVOLUTION_HPP */
--- a/Lib/opencv/sources/modules/dnn/src/darknet/darknet_importer.cpp
+++ b/Lib/opencv/sources/modules/dnn/src/darknet/darknet_importer.cpp
@@ -0,0 +1,255 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//                        (3-clause BSD License)
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * Neither the names of the copyright holders nor the names of the contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "../precomp.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <vector>
+#include <map>
+
+#include "darknet_io.hpp"
+
+
+namespace cv {
+namespace dnn {
+CV__DNN_INLINE_NS_BEGIN
+
+namespace
+{
+
+// Converts a parsed Darknet description (darknet::NetParameter) into layers and
+// connections of a cv::dnn::Net.
+class DarknetImporter
+{
+    darknet::NetParameter net;
+
+public:
+
+    DarknetImporter() {}
+
+    // Reads the network description from cfgStream, then the binary model data
+    // from darknetModelStream, both into the same NetParameter.
+    DarknetImporter(std::istream &cfgStream, std::istream &darknetModelStream)
+    {
+        CV_TRACE_FUNCTION();
+
+        ReadNetParamsFromCfgStreamOrDie(cfgStream, &net);
+        ReadNetParamsFromBinaryStreamOrDie(darknetModelStream, &net);
+    }
+
+    // Reads only the network description; no model data is loaded.
+    DarknetImporter(std::istream &cfgStream)
+    {
+        CV_TRACE_FUNCTION();
+
+        ReadNetParamsFromCfgStreamOrDie(cfgStream, &net);
+    }
+
+    // Records which layer and which of its outputs produced a named blob.
+    struct BlobNote
+    {
+        BlobNote(const std::string &blobName, int producerId, int producerOutput) :
+            name(blobName), layerId(producerId), outNum(producerOutput) {}
+
+        std::string name;
+        int layerId, outNum;
+    };
+
+    std::vector<BlobNote> addedBlobs;
+    std::map<String, int> layerCounter;
+
+    // Adds every parsed layer to dstNet and wires each bottom to the most recent
+    // producer of the blob with the same name.
+    void populateNet(Net dstNet)
+    {
+        CV_TRACE_FUNCTION();
+
+        const int layerCount = net.layer_size();
+        layerCounter.clear();
+        addedBlobs.clear();
+        addedBlobs.reserve(layerCount + 1);
+
+        // network inputs are registered as outputs of layer 0
+        {
+            const int inputCount = net.input_size();
+            std::vector<String> inputNames(inputCount);
+            for (int k = 0; k < inputCount; ++k)
+            {
+                addedBlobs.push_back(BlobNote(net.input(k), 0, k));
+                inputNames[k] = net.input(k);
+            }
+            dstNet.setInputsNames(inputNames);
+        }
+
+        for (int layerIdx = 0; layerIdx < layerCount; ++layerIdx)
+        {
+            const darknet::LayerParameter &current = net.layer(layerIdx);
+            String uniqueName = current.name();
+            String layerType = current.type();
+            LayerParams params = current.getLayerParams();
+
+            // a repeated layer name gets a numeric suffix starting with "_1"
+            const int seenBefore = layerCounter[uniqueName]++;
+            if (seenBefore != 0)
+                uniqueName += cv::format("_%d", seenBefore);
+
+            const int newLayerId = dstNet.addLayer(uniqueName, layerType, params);
+
+            // a layer may have several bottoms (for example for: route -1, -4)
+            const int bottomCount = current.bottom_size();
+            for (int b = 0; b < bottomCount; ++b)
+                addInput(current.bottom(b), newLayerId, b, dstNet, current.name());
+
+            const int topCount = current.top_size();
+            for (int t = 0; t < topCount; ++t)
+                addOutput(current, newLayerId, t);
+        }
+
+        addedBlobs.clear();
+    }
+
+    // Registers output outNum of the layer as a blob producer. A blob name that
+    // already exists is accepted only when the same layer also consumes it at
+    // the same index (in-place layer); otherwise an error is raised.
+    void addOutput(const darknet::LayerParameter &layer, int layerId, int outNum)
+    {
+        const std::string &blobName = layer.top(outNum);
+
+        bool alreadyProduced = false;
+        for (std::vector<BlobNote>::const_reverse_iterator it = addedBlobs.rbegin();
+             it != addedBlobs.rend() && !alreadyProduced; ++it)
+        {
+            alreadyProduced = (it->name == blobName);
+        }
+
+        if (alreadyProduced)
+        {
+            const bool inPlace = layer.bottom_size() > outNum && layer.bottom(outNum) == blobName;
+            if (!inPlace)
+                CV_Error(Error::StsBadArg, "Duplicate blobs produced by multiple sources");
+        }
+
+        addedBlobs.push_back(BlobNote(blobName, layerId, outNum));
+    }
+
+    // Connects input inNum of layer layerId to the latest producer of the blob
+    // called name; raises an error when no such blob was registered.
+    // The last parameter is not used.
+    void addInput(const std::string &name, int layerId, int inNum, Net &dstNet, std::string nn)
+    {
+        // search from the newest entry backwards so that the latest producer wins
+        int found = (int)addedBlobs.size() - 1;
+        while (found >= 0 && !(addedBlobs[found].name == name))
+            --found;
+
+        if (found < 0)
+        {
+            CV_Error(Error::StsObjectNotFound, "Can't find output blob \"" + name + "\"");
+            return;
+        }
+
+        const BlobNote &producer = addedBlobs[found];
+        dstNet.connect(producer.layerId, producer.outNum, layerId, inNum);
+    }
+};
+
+// Builds a Net from a configuration stream and a binary model stream.
+static Net readNetFromDarknet(std::istream &cfgFile, std::istream &darknetModel)
+{
+    DarknetImporter importer(cfgFile, darknetModel);
+
+    Net result;
+    importer.populateNet(result);
+    return result;
+}
+
+// Builds a Net from a configuration stream only; no model data is read.
+static Net readNetFromDarknet(std::istream &cfgFile)
+{
+    DarknetImporter importer(cfgFile);
+
+    Net result;
+    importer.populateNet(result);
+    return result;
+}
+
+}
+
+// Reads a Darknet network from files on disk.
+//   cfgFile      - path of the text configuration file
+//   darknetModel - path of the binary model file; when empty, only the configuration is read
+// Raises cv::Error::StsParseError when one of the files cannot be opened.
+Net readNetFromDarknet(const String &cfgFile, const String &darknetModel /*= String()*/)
+{
+    std::ifstream cfgStream(cfgFile.c_str());
+    if (!cfgStream.is_open())
+    {
+        // nothing has been parsed at this point: the failure is that the file could not be opened
+        CV_Error(cv::Error::StsParseError, "Failed to open NetParameter file: " + std::string(cfgFile));
+    }
+    if (darknetModel != String())
+    {
+        // the model file is binary, so it is opened without text translation
+        std::ifstream darknetModelStream(darknetModel.c_str(), std::ios::binary);
+        if (!darknetModelStream.is_open())
+        {
+            CV_Error(cv::Error::StsParseError, "Failed to open NetParameter file: " + std::string(darknetModel));
+        }
+        return readNetFromDarknet(cfgStream, darknetModelStream);
+    }
+    else
+        return readNetFromDarknet(cfgStream);
+}
+
+// Stream buffer whose get area is the caller's memory range [s, s + n).
+// The bytes are not copied, so the range must stay valid while the buffer is in use.
+struct BufferStream : public std::streambuf
+{
+    BufferStream(const char* s, std::size_t n)
+    {
+        // setg() expects non-const pointers; only the get area is configured here
+        char* const first = const_cast<char*>(s);
+        char* const last = first + n;
+        setg(first, first, last);
+    }
+};
+
+// Reads a Darknet network from in-memory buffers.
+// The model buffer is used only when lenModel is non-zero.
+Net readNetFromDarknet(const char *bufferCfg, size_t lenCfg, const char *bufferModel, size_t lenModel)
+{
+    BufferStream cfgBufferStream(bufferCfg, lenCfg);
+    std::istream cfgStream(&cfgBufferStream);
+
+    if (lenModel == 0)
+        return readNetFromDarknet(cfgStream);
+
+    BufferStream modelBufferStream(bufferModel, lenModel);
+    std::istream modelStream(&modelBufferStream);
+    return readNetFromDarknet(cfgStream, modelStream);
+}
+
+// Reads a Darknet network from byte vectors by forwarding to the pointer/length overload.
+// An empty vector is forwarded as a NULL pointer with length 0.
+Net readNetFromDarknet(const std::vector<uchar>& bufferCfg, const std::vector<uchar>& bufferModel)
+{
+    // taking &v[0] of an empty vector is undefined behaviour, so both buffers are guarded
+    const char* bufferCfgPtr = bufferCfg.empty() ? NULL :
+                               reinterpret_cast<const char*>(&bufferCfg[0]);
+    const char* bufferModelPtr = bufferModel.empty() ? NULL :
+                                 reinterpret_cast<const char*>(&bufferModel[0]);
+    return readNetFromDarknet(bufferCfgPtr, bufferCfg.size(),
+                              bufferModelPtr, bufferModel.size());
+}
+
+CV__DNN_INLINE_NS_END
+}} // namespace
--- a/Lib/opencv/sources/modules/dnn/src/darknet/darknet_io.cpp
+++ b/Lib/opencv/sources/modules/dnn/src/darknet/darknet_io.cpp
@@ -0,0 +1,806 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//                        (3-clause BSD License)
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * Neither the names of the copyright holders nor the names of the contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//MIT License
+//
+//Copyright (c) 2017 Joseph Redmon
+//
+//Permission is hereby granted, free of charge, to any person obtaining a copy
+//of this software and associated documentation files (the "Software"), to deal
+//in the Software without restriction, including without limitation the rights
+//to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+//copies of the Software, and to permit persons to whom the Software is
+//furnished to do so, subject to the following conditions:
+//
+//The above copyright notice and this permission notice shall be included in all
+//copies or substantial portions of the Software.
+//
+//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+//OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+//SOFTWARE.
+//
+//M*/
+
+#include "../precomp.hpp"
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+#include "darknet_io.hpp"
+
+namespace cv {
+    namespace dnn {
+        namespace darknet {
+
+            // Looks up `param_name` in `params` and converts its text to T with stream extraction.
+            // Returns `init_val` when the key is absent or when the text cannot be converted.
+            template<typename T>
+            T getParam(const std::map<std::string, std::string> &params, const std::string param_name, T init_val)
+            {
+                std::map<std::string, std::string>::const_iterator it = params.find(param_name);
+                if (it != params.end()) {
+                    std::stringstream ss(it->second);
+                    // Extract into a temporary: since C++11 a failed numeric extraction stores 0
+                    // in its target, which would silently overwrite the caller's default.
+                    T parsed = init_val;
+                    if (ss >> parsed)
+                        init_val = parsed;
+                }
+                return init_val;
+            }
+
+            // Initial value of setLayersParams::last_layer, i.e. the bottom name given to the first generated layer.
+            static const std::string kFirstLayerName = "data";
+
+            class setLayersParams {
+
+                NetParameter *net;
+                int layer_id;
+                std::string last_layer;
+                std::vector<std::string> fused_layer_names;
+
+            public:
+                // Starts at layer index 0 with kFirstLayerName as the previous layer;
+                // generated layers are appended to *_net.
+                setLayersParams(NetParameter *_net)
+                    : net(_net)
+                    , layer_id(0)
+                    , last_layer(kFirstLayerName)
+                {
+                }
+
+                // Replaces the blobs stored in the layer parameters of layer `i`.
+                void setLayerBlobs(int i, std::vector<cv::Mat> blobs)
+                {
+                    net->layers[i].layerParams.blobs = blobs;
+                }
+
+                // Builds the parameter set of a "Convolution" layer.
+                // The bias term is switched off here; setConvolution turns it on when no BatchNorm follows.
+                cv::dnn::LayerParams getParamConvolution(int kernel, int pad,
+                    int stride, int filters_num)
+                {
+                    cv::dnn::LayerParams conv;
+                    conv.type = "Convolution";
+                    conv.name = "Convolution-name";
+
+                    // geometry of the filter
+                    conv.set<int>("kernel_size", kernel);
+                    conv.set<int>("pad", pad);
+                    conv.set<int>("stride", stride);
+
+                    // number of filters and bias usage
+                    conv.set<int>("num_output", filters_num);
+                    conv.set<bool>("bias_term", false);
+
+                    return conv;
+                }
+
+
+                // Appends a "Convolution" layer and, when use_batch_normalize is non-zero,
+                // a "BatchNorm" layer fed by it. Both use the current layer_id in their names.
+                // channels_num is not used.
+                void setConvolution(int kernel, int pad, int stride,
+                    int filters_num, int channels_num, int use_batch_normalize)
+                {
+                    const bool with_batch_norm = (use_batch_normalize != 0);
+
+                    cv::dnn::LayerParams conv_param =
+                        getParamConvolution(kernel, pad, stride, filters_num);
+
+                    // the convolution carries its own bias only when no BatchNorm layer follows
+                    if (!with_batch_norm)
+                        conv_param.set<bool>("bias_term", true);
+
+                    const std::string conv_name = cv::format("conv_%d", layer_id);
+
+                    darknet::LayerParameter conv_lp;
+                    conv_lp.layer_name = conv_name;
+                    conv_lp.layer_type = conv_param.type;
+                    conv_lp.layerParams = conv_param;
+                    conv_lp.bottom_indexes.push_back(last_layer);
+                    net->layers.push_back(conv_lp);
+                    last_layer = conv_name;
+
+                    if (with_batch_norm)
+                    {
+                        cv::dnn::LayerParams bn_param;
+                        bn_param.type = "BatchNorm";
+                        bn_param.name = "BatchNorm-name";
+                        bn_param.set<bool>("has_weight", true);
+                        bn_param.set<bool>("has_bias", true);
+                        // .000001f in Darknet Yolo
+                        bn_param.set<float>("eps", 1E-6);
+
+                        const std::string bn_name = cv::format("bn_%d", layer_id);
+
+                        darknet::LayerParameter bn_lp;
+                        bn_lp.layer_name = bn_name;
+                        bn_lp.layer_type = bn_param.type;
+                        bn_lp.layerParams = bn_param;
+                        bn_lp.bottom_indexes.push_back(last_layer);
+                        net->layers.push_back(bn_lp);
+                        last_layer = bn_name;
+                    }
+
+                    // one entry per Darknet layer: the name of the last layer generated for it
+                    ++layer_id;
+                    fused_layer_names.push_back(last_layer);
+                }
+
+                // Appends a "ReLU" layer with negative slope 0.1 after the previous layer.
+                // layer_id is not advanced; instead the newest fused_layer_names entry is
+                // redirected to this activation.
+                void setReLU()
+                {
+                    const std::string relu_name = cv::format("relu_%d", layer_id);
+
+                    cv::dnn::LayerParams relu_param;
+                    relu_param.type = "ReLU";
+                    relu_param.name = "ReLU-name";
+                    relu_param.set<float>("negative_slope", 0.1f);
+
+                    darknet::LayerParameter relu_lp;
+                    relu_lp.layer_name = relu_name;
+                    relu_lp.layer_type = relu_param.type;
+                    relu_lp.layerParams = relu_param;
+                    relu_lp.bottom_indexes.push_back(last_layer);
+                    net->layers.push_back(relu_lp);
+
+                    last_layer = relu_name;
+                    fused_layer_names.back() = last_layer;
+                }
+
+                // Appends a max "Pooling" layer with pad_mode "SAME" after the previous layer.
+                void setMaxpool(size_t kernel, size_t pad, size_t stride)
+                {
+                    const std::string pool_name = cv::format("pool_%d", layer_id);
+
+                    cv::dnn::LayerParams pool_param;
+                    pool_param.type = "Pooling";
+                    pool_param.name = "Pooling-name";
+                    pool_param.set<cv::String>("pool", "max");
+                    pool_param.set<cv::String>("pad_mode", "SAME");
+                    pool_param.set<int>("kernel_size", kernel);
+                    pool_param.set<int>("pad", pad);
+                    pool_param.set<int>("stride", stride);
+
+                    darknet::LayerParameter pool_lp;
+                    pool_lp.layer_name = pool_name;
+                    pool_lp.layer_type = pool_param.type;
+                    pool_lp.layerParams = pool_param;
+                    pool_lp.bottom_indexes.push_back(last_layer);
+                    net->layers.push_back(pool_lp);
+
+                    last_layer = pool_name;
+                    ++layer_id;
+                    fused_layer_names.push_back(last_layer);
+                }
+
+                // Appends a global average "Pooling" layer after the previous layer.
+                void setAvgpool()
+                {
+                    const std::string avgpool_name = cv::format("avgpool_%d", layer_id);
+
+                    cv::dnn::LayerParams pool_param;
+                    pool_param.type = "Pooling";
+                    pool_param.name = "Pooling-name";
+                    pool_param.set<cv::String>("pool", "ave");
+                    pool_param.set<bool>("global_pooling", true);
+
+                    darknet::LayerParameter pool_lp;
+                    pool_lp.layer_name = avgpool_name;
+                    pool_lp.layer_type = pool_param.type;
+                    pool_lp.layerParams = pool_param;
+                    pool_lp.bottom_indexes.push_back(last_layer);
+                    net->layers.push_back(pool_lp);
+
+                    last_layer = avgpool_name;
+                    ++layer_id;
+                    fused_layer_names.push_back(last_layer);
+                }
+
+                // Appends a "Softmax" layer after the previous layer.
+                void setSoftmax()
+                {
+                    const std::string softmax_name = cv::format("softmax_%d", layer_id);
+
+                    cv::dnn::LayerParams sm_param;
+                    sm_param.type = "Softmax";
+                    sm_param.name = "Softmax-name";
+
+                    darknet::LayerParameter sm_lp;
+                    sm_lp.layer_name = softmax_name;
+                    sm_lp.layer_type = sm_param.type;
+                    sm_lp.layerParams = sm_param;
+                    sm_lp.bottom_indexes.push_back(last_layer);
+                    net->layers.push_back(sm_lp);
+
+                    last_layer = softmax_name;
+                    ++layer_id;
+                    fused_layer_names.push_back(last_layer);
+                }
+
+                // Appends a Concat layer named "concat_<layer_id>" that joins several earlier
+                // layers along axis 1.
+                //   number_of_inputs - how many entries of input_indexes to use
+                //   input_indexes    - indices into fused_layer_names; an index outside that
+                //                      vector makes at() throw std::out_of_range
+                void setConcat(int number_of_inputs, int *input_indexes)
+                {
+                    const std::string concatName = cv::format("concat_%d", layer_id);
+
+                    cv::dnn::LayerParams concat;
+                    concat.name = "Concat-name";
+                    concat.type = "Concat";
+                    concat.set<int>("axis", 1);	// channels are in axis = 1
+
+                    darknet::LayerParameter entry;
+                    entry.layer_name = concatName;
+                    entry.layer_type = concat.type;
+                    entry.layerParams = concat;
+                    for (int k = 0; k < number_of_inputs; ++k)
+                    {
+                        const int sourceIdx = input_indexes[k];
+                        entry.bottom_indexes.push_back(fused_layer_names.at(sourceIdx));
+                    }
+                    net->layers.push_back(entry);
+
+                    last_layer = concatName;
+                    fused_layer_names.push_back(concatName);
+                    ++layer_id;
+                }
+
+                // Appends an Identity layer named "identity_<layer_id>" that forwards the
+                // output of fused_layer_names[bottom_index]; at() throws std::out_of_range
+                // for an index outside that vector.
+                void setIdentity(int bottom_index)
+                {
+                    const std::string identityName = cv::format("identity_%d", layer_id);
+
+                    cv::dnn::LayerParams identity;
+                    identity.name = "Identity-name";
+                    identity.type = "Identity";
+
+                    darknet::LayerParameter entry;
+                    entry.layer_name = identityName;
+                    entry.layer_type = identity.type;
+                    entry.layerParams = identity;
+                    entry.bottom_indexes.push_back(fused_layer_names.at(bottom_index));
+                    net->layers.push_back(entry);
+
+                    last_layer = identityName;
+                    fused_layer_names.push_back(identityName);
+                    ++layer_id;
+                }
+
+                // Appends a Reorg layer named "reorg_<layer_id>" (parameter "reorg_stride" =
+                // stride) on top of the current last layer.
+                void setReorg(int stride)
+                {
+                    const std::string reorgName = cv::format("reorg_%d", layer_id);
+
+                    cv::dnn::LayerParams reorg;
+                    reorg.name = "Reorg-name";
+                    reorg.type = "Reorg";
+                    reorg.set<int>("reorg_stride", stride);
+
+                    darknet::LayerParameter entry;
+                    entry.layer_name = reorgName;
+                    entry.layer_type = reorg.type;
+                    entry.layerParams = reorg;
+                    entry.bottom_indexes.push_back(last_layer);
+                    net->layers.push_back(entry);
+
+                    last_layer = reorgName;
+                    fused_layer_names.push_back(reorgName);
+                    ++layer_id;
+                }
+
+                // Appends a Permute layer named "permute_<layer_id>" that reorders the axes of
+                // the current last layer to {0, 2, 3, 1}.
+                //   isDarknetLayer - when true the layer counts as a Darknet layer: layer_id is
+                //                    advanced and the name is recorded in fused_layer_names.
+                //                    When false only last_layer / net->layers are updated.
+                void setPermute(bool isDarknetLayer = true)
+                {
+                    const std::string permuteName = cv::format("permute_%d", layer_id);
+
+                    int axisOrder[] = { 0, 2, 3, 1 };
+                    cv::dnn::DictValue orderValue = cv::dnn::DictValue::arrayInt(axisOrder, 4);
+
+                    cv::dnn::LayerParams permuteCfg;
+                    permuteCfg.name = "Permute-name";
+                    permuteCfg.type = "Permute";
+                    permuteCfg.set("order", orderValue);
+
+                    darknet::LayerParameter entry;
+                    entry.layer_name = permuteName;
+                    entry.layer_type = permuteCfg.type;
+                    entry.layerParams = permuteCfg;
+                    entry.bottom_indexes.push_back(last_layer);
+                    net->layers.push_back(entry);
+                    last_layer = permuteName;
+
+                    // Helper permutes inserted by the importer do not consume a Darknet index.
+                    if (!isDarknetLayer)
+                        return;
+
+                    fused_layer_names.push_back(permuteName);
+                    ++layer_id;
+                }
+
+                // Appends the Region layer, always named "detection_out", on top of the
+                // current last layer.
+                //   thresh, coords, classes, anchors, classfix - stored under the same keys
+                //   softmax, softmax_tree - stored as booleans
+                //   biasData - anchors * 2 floats; copied into the layer's first blob
+                void setRegion(float thresh, int coords, int classes, int anchors, int classfix, int softmax, int softmax_tree, float *biasData)
+                {
+                    const std::string regionName = "detection_out";
+
+                    cv::dnn::LayerParams region;
+                    region.name = "Region-name";
+                    region.type = "Region";
+                    region.set<float>("thresh", thresh);
+                    region.set<int>("coords", coords);
+                    region.set<int>("classes", classes);
+                    region.set<int>("anchors", anchors);
+                    region.set<int>("classfix", classfix);
+                    region.set<bool>("softmax_tree", softmax_tree);
+                    region.set<bool>("softmax", softmax);
+
+                    // clone() detaches the blob from the caller-owned buffer.
+                    cv::Mat anchorBlob = cv::Mat(1, anchors * 2, CV_32F, biasData).clone();
+                    region.blobs.push_back(anchorBlob);
+
+                    darknet::LayerParameter entry;
+                    entry.layer_name = regionName;
+                    entry.layer_type = region.type;
+                    entry.layerParams = region;
+                    entry.bottom_indexes.push_back(last_layer);
+                    net->layers.push_back(entry);
+
+                    last_layer = regionName;
+                    fused_layer_names.push_back(regionName);
+                    ++layer_id;
+                }
+
+                // Appends a Region layer configured for a "yolo" section, named
+                // "yolo_<layer_id>". Its inputs are the current last layer and kFirstLayerName.
+                //   classes       - stored as "classes"
+                //   mask          - indices of the anchors this layer uses; must be non-empty
+                //   anchors       - flat list of anchor values, two floats per anchor
+                //   thresh        - stored as "thresh"
+                //   nms_threshold - stored as "nms_threshold"
+                // Raises (CV_Assert) when mask is empty or a mask entry does not address a
+                // complete pair inside anchors; both come straight from the cfg file.
+                void setYolo(int classes, const std::vector<int>& mask, const std::vector<float>& anchors, float thresh, float nms_threshold)
+                {
+                    cv::dnn::LayerParams region_param;
+                    region_param.name = "Region-name";
+                    region_param.type = "Region";
+
+                    // An empty mask would make &usedAnchors[0] below undefined.
+                    CV_Assert(!mask.empty());
+                    const int numAnchors = static_cast<int>(mask.size());
+
+                    region_param.set<int>("classes", classes);
+                    region_param.set<int>("anchors", numAnchors);
+                    region_param.set<bool>("logistic", true);
+                    region_param.set<float>("thresh", thresh);
+                    region_param.set<float>("nms_threshold", nms_threshold);
+
+                    // Gather the pairs selected by the mask.
+                    std::vector<float> usedAnchors(numAnchors * 2);
+                    for (int i = 0; i < numAnchors; ++i)
+                    {
+                        // Reject mask entries that would read outside the anchors list.
+                        CV_Assert(mask[i] >= 0 && static_cast<size_t>(mask[i]) * 2 + 1 < anchors.size());
+                        usedAnchors[i * 2] = anchors[mask[i] * 2];
+                        usedAnchors[i * 2 + 1] = anchors[mask[i] * 2 + 1];
+                    }
+
+                    // clone() so the blob owns its data after usedAnchors goes out of scope.
+                    cv::Mat biasData_mat = cv::Mat(1, numAnchors * 2, CV_32F, &usedAnchors[0]).clone();
+                    region_param.blobs.push_back(biasData_mat);
+
+                    darknet::LayerParameter lp;
+                    std::string layer_name = cv::format("yolo_%d", layer_id);
+                    lp.layer_name = layer_name;
+                    lp.layer_type = region_param.type;
+                    lp.layerParams = region_param;
+                    lp.bottom_indexes.push_back(last_layer);
+                    lp.bottom_indexes.push_back(kFirstLayerName);
+                    last_layer = layer_name;
+                    net->layers.push_back(lp);
+
+                    layer_id++;
+                    fused_layer_names.push_back(last_layer);
+                }
+
+                // Appends an Eltwise "sum" layer named "shortcut_<layer_id>" that adds the
+                // current last layer and fused_layer_names[from].
+                //   from  - index into fused_layer_names (at() throws if out of range)
+                //   alpha - when different from 1, "coeff" is set to {alpha, 1}
+                void setShortcut(int from, float alpha)
+                {
+                    const std::string shortcutName = cv::format("shortcut_%d", layer_id);
+
+                    cv::dnn::LayerParams eltwise;
+                    eltwise.name = "Shortcut-name";
+                    eltwise.type = "Eltwise";
+
+                    if (alpha != 1)
+                    {
+                        float weights[2] = { alpha, 1 };
+                        eltwise.set("coeff", DictValue::arrayReal<float*>(weights, 2));
+                    }
+
+                    eltwise.set<std::string>("op", "sum");
+                    eltwise.set<std::string>("output_channels_mode", "input_0_truncate");
+
+                    darknet::LayerParameter entry;
+                    entry.layer_name = shortcutName;
+                    entry.layer_type = eltwise.type;
+                    entry.layerParams = eltwise;
+                    entry.bottom_indexes.push_back(last_layer);                  // first addend
+                    entry.bottom_indexes.push_back(fused_layer_names.at(from));  // second addend
+                    net->layers.push_back(entry);
+
+                    last_layer = shortcutName;
+                    fused_layer_names.push_back(shortcutName);
+                    ++layer_id;
+                }
+
+                // Appends a Resize layer named "upsample_<layer_id>" using nearest-neighbour
+                // interpolation with "zoom_factor" = scaleFactor, on top of the last layer.
+                void setUpsample(int scaleFactor)
+                {
+                    const std::string upsampleName = cv::format("upsample_%d", layer_id);
+
+                    cv::dnn::LayerParams resize;
+                    resize.name = "Upsample-name";
+                    resize.type = "Resize";
+                    resize.set<int>("zoom_factor", scaleFactor);
+                    resize.set<String>("interpolation", "nearest");
+
+                    darknet::LayerParameter entry;
+                    entry.layer_name = upsampleName;
+                    entry.layer_type = resize.type;
+                    entry.layerParams = resize;
+                    entry.bottom_indexes.push_back(last_layer);
+                    net->layers.push_back(entry);
+
+                    last_layer = upsampleName;
+                    fused_layer_names.push_back(upsampleName);
+                    ++layer_id;
+                }
+            };
+
+            // Returns a copy of src that keeps only characters c with ' ' < c <= 'z'.
+            // Spaces and control characters are dropped, as is everything after 'z'
+            // ('{', '|', '}', '~', ...).
+            std::string escapeString(const std::string &src)
+            {
+                std::string kept;
+                for (std::string::const_iterator it = src.begin(); it != src.end(); ++it)
+                {
+                    const char ch = *it;
+                    const bool rejected = (ch <= ' ') || (ch > 'z');
+                    if (rejected)
+                        continue;
+                    kept.push_back(ch);
+                }
+                return kept;
+            }
+
+            // Splits src at ',' and extracts one T from each piece with operator>>.
+            // Exactly one element is appended per piece, whether or not the extraction
+            // succeeded. An empty src yields an empty vector.
+            template<typename T>
+            std::vector<T> getNumbers(const std::string &src)
+            {
+                std::vector<T> parsed;
+                std::istringstream pieces(src);
+                std::string piece;
+
+                while (std::getline(pieces, piece, ','))
+                {
+                    std::istringstream reader(piece);
+                    T value;
+                    reader >> value;
+                    parsed.push_back(value);
+                }
+                return parsed;
+            }
+
+            // Parses a Darknet .cfg text stream and fills `net`.
+            //
+            // Pass 1 reads the stream line by line. Each line is filtered through
+            // escapeString(); lines starting with '#' or ';' are ignored. "[net]" switches
+            // to the global section, whose "name=value" entries go to net->net_cfg. Any
+            // other "[xxx]" starts a new layer whose entries go to net->layers_cfg[index],
+            // with the section name stored under "type".
+            //
+            // Pass 2 reads width/height/channels (defaults 416/416/3), then walks the
+            // layers in index order, appends the matching layer(s) through setLayersParams
+            // and records each layer's output channel count in net->out_channels_vec.
+            //
+            // Returns true on success. Malformed input raises through CV_Assert / CV_Error
+            // (unknown layer type, unsupported activation, missing mandatory values, a
+            // route that references a non-preceding layer, ...).
+            bool ReadDarknetFromCfgStream(std::istream &ifile, NetParameter *net)
+            {
+                bool read_net = false;
+                int layers_counter = -1;
+                for (std::string line; std::getline(ifile, line);) {
+                    line = escapeString(line);
+                    if (line.empty()) continue;
+                    switch (line[0]) {
+                    case '\0': break;
+                    case '#': break;
+                    case ';': break;
+                    case '[':
+                        if (line == "[net]") {
+                            read_net = true;
+                        }
+                        else {
+                            // read section
+                            read_net = false;
+                            ++layers_counter;
+                            // If ']' is missing, find() returns npos and the assert below fires.
+                            const size_t layer_type_size = line.find("]") - 1;
+                            CV_Assert(layer_type_size < line.size());
+                            std::string layer_type = line.substr(1, layer_type_size);
+                            net->layers_cfg[layers_counter]["type"] = layer_type;
+                        }
+                        break;
+                    default:
+                        // read entry
+                        const size_t separator_index = line.find('=');
+                        CV_Assert(separator_index < line.size());
+                        if (separator_index != std::string::npos) {
+                            std::string name = line.substr(0, separator_index);
+                            std::string value = line.substr(separator_index + 1, line.size() - (separator_index + 1));
+                            name = escapeString(name);
+                            value = escapeString(value);
+                            if (name.empty() || value.empty()) continue;
+                            if (read_net)
+                                net->net_cfg[name] = value;
+                            else
+                                net->layers_cfg[layers_counter][name] = value;
+                        }
+                    }
+                }
+
+                // NOTE(review): `anchors` / `vec` are never used afterwards, but the
+                // operator[] lookups insert entries into layers_cfg as a side effect, so the
+                // statements are kept as-is to preserve behaviour. Confirm before removing.
+                std::string anchors = net->layers_cfg[net->layers_cfg.size() - 1]["anchors"];
+                std::vector<float> vec = getNumbers<float>(anchors);
+                std::map<std::string, std::string> &net_params = net->net_cfg;
+                net->width = getParam(net_params, "width", 416);
+                net->height = getParam(net_params, "height", 416);
+                net->channels = getParam(net_params, "channels", 3);
+                CV_Assert(net->width > 0 && net->height > 0 && net->channels > 0);
+
+                int current_channels = net->channels;
+                net->out_channels_vec.resize(net->layers_cfg.size());
+
+                layers_counter = -1;
+
+                setLayersParams setParams(net);
+
+                typedef std::map<int, std::map<std::string, std::string> >::iterator it_type;
+                for (it_type i = net->layers_cfg.begin(); i != net->layers_cfg.end(); ++i) {
+                    ++layers_counter;
+                    std::map<std::string, std::string> &layer_params = i->second;
+                    std::string layer_type = layer_params["type"];
+
+                    if (layer_type == "convolutional")
+                    {
+                        int kernel_size = getParam<int>(layer_params, "size", -1);
+                        int pad = getParam<int>(layer_params, "pad", 0);
+                        int stride = getParam<int>(layer_params, "stride", 1);
+                        int filters = getParam<int>(layer_params, "filters", -1);
+                        bool batch_normalize = getParam<int>(layer_params, "batch_normalize", 0) == 1;
+                        int flipped = getParam<int>(layer_params, "flipped", 0);
+                        if (flipped == 1)
+                            CV_Error(cv::Error::StsNotImplemented, "Transpose the convolutional weights is not implemented");
+
+                        // correct the strange value of pad=1 for kernel_size=1 in the Darknet cfg-file
+                        if (kernel_size < 3) pad = 0;
+
+                        CV_Assert(kernel_size > 0 && filters > 0);
+                        CV_Assert(current_channels > 0);
+
+                        setParams.setConvolution(kernel_size, pad, stride, filters, current_channels,
+                            batch_normalize);
+
+                        current_channels = filters;
+                    }
+                    else if (layer_type == "maxpool")
+                    {
+                        int kernel_size = getParam<int>(layer_params, "size", 2);
+                        int stride = getParam<int>(layer_params, "stride", 2);
+                        int pad = getParam<int>(layer_params, "pad", 0);
+                        setParams.setMaxpool(kernel_size, pad, stride);
+                    }
+                    else if (layer_type == "avgpool")
+                    {
+                        setParams.setAvgpool();
+                    }
+                    else if (layer_type == "softmax")
+                    {
+                        int groups = getParam<int>(layer_params, "groups", 1);
+                        if (groups != 1)
+                            CV_Error(Error::StsNotImplemented, "Softmax from Darknet with groups != 1");
+                        setParams.setSoftmax();
+                    }
+                    else if (layer_type == "route")
+                    {
+                        std::string bottom_layers = getParam<std::string>(layer_params, "layers", "");
+                        CV_Assert(!bottom_layers.empty());
+                        std::vector<int> layers_vec = getNumbers<int>(bottom_layers);
+
+                        current_channels = 0;
+                        for (size_t k = 0; k < layers_vec.size(); ++k) {
+                            // Negative values are relative to the current layer.
+                            layers_vec[k] = layers_vec[k] >= 0 ? layers_vec[k] : (layers_vec[k] + layers_counter);
+                            // A route may only reference a layer that precedes it. Anything
+                            // else would index out_channels_vec out of bounds or read an
+                            // entry that has not been computed yet.
+                            CV_Assert(layers_vec[k] >= 0 && layers_vec[k] < layers_counter);
+                            current_channels += net->out_channels_vec[layers_vec[k]];
+                        }
+
+                        if (layers_vec.size() == 1)
+                            setParams.setIdentity(layers_vec.at(0));
+                        else
+                            setParams.setConcat(layers_vec.size(), layers_vec.data());
+                    }
+                    else if (layer_type == "reorg")
+                    {
+                        int stride = getParam<int>(layer_params, "stride", 2);
+                        current_channels = current_channels * (stride*stride);
+
+                        setParams.setReorg(stride);
+                    }
+                    else if (layer_type == "region")
+                    {
+                        float thresh = getParam<float>(layer_params, "thresh", 0.001);
+                        int coords = getParam<int>(layer_params, "coords", 4);
+                        int classes = getParam<int>(layer_params, "classes", -1);
+                        int num_of_anchors = getParam<int>(layer_params, "num", -1);
+                        int classfix = getParam<int>(layer_params, "classfix", 0);
+                        bool softmax = (getParam<int>(layer_params, "softmax", 0) == 1);
+                        bool softmax_tree = (getParam<std::string>(layer_params, "tree", "").size() > 0);
+
+                        std::string anchors_values = getParam<std::string>(layer_params, "anchors", std::string());
+                        CV_Assert(!anchors_values.empty());
+                        std::vector<float> anchors_vec = getNumbers<float>(anchors_values);
+
+                        CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size());
+
+                        // The helper permute is not a Darknet layer, hence `false`.
+                        setParams.setPermute(false);
+                        setParams.setRegion(thresh, coords, classes, num_of_anchors, classfix, softmax, softmax_tree, anchors_vec.data());
+                    }
+                    else if (layer_type == "shortcut")
+                    {
+                        std::string bottom_layer = getParam<std::string>(layer_params, "from", "");
+                        float alpha = getParam<float>(layer_params, "alpha", 1);
+                        float beta = getParam<float>(layer_params, "beta", 0);
+                        if (beta != 0)
+                            CV_Error(Error::StsNotImplemented, "Non-zero beta");
+                        CV_Assert(!bottom_layer.empty());
+                        int from = std::atoi(bottom_layer.c_str());
+
+                        // Negative values are relative to the current layer.
+                        from = from < 0 ? from + layers_counter : from;
+                        setParams.setShortcut(from, alpha);
+                    }
+                    else if (layer_type == "upsample")
+                    {
+                        int scaleFactor = getParam<int>(layer_params, "stride", 1);
+                        setParams.setUpsample(scaleFactor);
+                    }
+                    else if (layer_type == "yolo")
+                    {
+                        int classes = getParam<int>(layer_params, "classes", -1);
+                        int num_of_anchors = getParam<int>(layer_params, "num", -1);
+                        float thresh = getParam<float>(layer_params, "thresh", 0.2);
+                        float nms_threshold = getParam<float>(layer_params, "nms_threshold", 0.4);
+
+                        std::string anchors_values = getParam<std::string>(layer_params, "anchors", std::string());
+                        CV_Assert(!anchors_values.empty());
+                        std::vector<float> anchors_vec = getNumbers<float>(anchors_values);
+
+                        std::string mask_values = getParam<std::string>(layer_params, "mask", std::string());
+                        CV_Assert(!mask_values.empty());
+                        std::vector<int> mask_vec = getNumbers<int>(mask_values);
+
+                        CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size());
+
+                        // The helper permute is not a Darknet layer, hence `false`.
+                        setParams.setPermute(false);
+                        setParams.setYolo(classes, mask_vec, anchors_vec, thresh, nms_threshold);
+                    }
+                    else {
+                        CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type);
+                    }
+
+                    // Only "leaky" and "linear" (the default) activations are accepted.
+                    std::string activation = getParam<std::string>(layer_params, "activation", "linear");
+                    if (activation == "leaky")
+                    {
+                        setParams.setReLU();
+                    }
+                    else if (activation != "linear")
+                        CV_Error(cv::Error::StsParseError, "Unsupported activation: " + activation);
+
+                    // Remembered for later route layers and for the weights reader.
+                    net->out_channels_vec[layers_counter] = current_channels;
+                }
+
+                return true;
+            }
+
+            // Reads a Darknet .weights binary stream and attaches the blobs to the layers
+            // already created in `net` by ReadDarknetFromCfgStream.
+            //
+            // Layout consumed here: three int32 values (major, minor, revision), then a
+            // "seen" counter stored as uint64 when major * 10 + minor >= 2 and as int32
+            // otherwise. For every "convolutional" section follow `filters` bias floats,
+            // then (only with batch_normalize=1) `filters` scale, mean and variance floats,
+            // then filters * channels * size * size weight floats.
+            //
+            // Returns false when the stream ends before all expected values were read,
+            // true otherwise. Raises through CV_Error / CV_Assert for the unsupported
+            // "transpose" format and for inconsistent layer parameters.
+            bool ReadDarknetFromWeightsStream(std::istream &ifile, NetParameter *net)
+            {
+                int32_t major_ver = 0, minor_ver = 0, revision = 0;
+                ifile.read(reinterpret_cast<char *>(&major_ver), sizeof(int32_t));
+                ifile.read(reinterpret_cast<char *>(&minor_ver), sizeof(int32_t));
+                ifile.read(reinterpret_cast<char *>(&revision), sizeof(int32_t));
+
+                uint64_t seen = 0;
+                // Widened so that a garbage major version cannot overflow the arithmetic.
+                if ((static_cast<int64_t>(major_ver) * 10 + minor_ver) >= 2) {
+                    ifile.read(reinterpret_cast<char *>(&seen), sizeof(uint64_t));
+                }
+                else {
+                    int32_t iseen = 0;
+                    ifile.read(reinterpret_cast<char *>(&iseen), sizeof(int32_t));
+                    seen = iseen;
+                }
+                // Both values are read only to advance the stream.
+                (void)revision;
+                (void)seen;
+
+                // A short header means the file is not a usable weights file.
+                if (!ifile)
+                    return false;
+
+                bool transpose = (major_ver > 1000) || (minor_ver > 1000);
+                if(transpose)
+                    CV_Error(cv::Error::StsNotImplemented, "Transpose the weights (except for convolutional) is not implemented");
+
+                // out_channels_vec is indexed per cfg layer below; it must match layers_cfg.
+                CV_Assert(net->out_channels_vec.size() == net->layers_cfg.size());
+
+                int current_channels = net->channels;
+                int cv_layers_counter = -1;       // index into the generated OpenCV layers
+                int darknet_layers_counter = -1;  // index of the cfg section
+
+                setLayersParams setParams(net);
+
+                typedef std::map<int, std::map<std::string, std::string> >::iterator it_type;
+                for (it_type i = net->layers_cfg.begin(); i != net->layers_cfg.end(); ++i) {
+                    ++darknet_layers_counter;
+                    ++cv_layers_counter;
+                    std::map<std::string, std::string> &layer_params = i->second;
+                    std::string layer_type = layer_params["type"];
+
+                    if (layer_type == "convolutional")
+                    {
+                        int kernel_size = getParam<int>(layer_params, "size", -1);
+                        int filters = getParam<int>(layer_params, "filters", -1);
+                        bool use_batch_normalize = getParam<int>(layer_params, "batch_normalize", 0) == 1;
+
+                        CV_Assert(kernel_size > 0 && filters > 0);
+                        CV_Assert(current_channels > 0);
+
+                        // Multiply in size_t so large layers cannot overflow int.
+                        size_t const weights_size = static_cast<size_t>(filters) * current_channels * kernel_size * kernel_size;
+                        int sizes_weights[] = { filters, current_channels, kernel_size, kernel_size };
+                        cv::Mat weightsBlob;
+                        weightsBlob.create(4, sizes_weights, CV_32F);
+                        CV_Assert(weightsBlob.isContinuous());
+
+                        cv::Mat meanData_mat(1, filters, CV_32F);	// mean
+                        cv::Mat stdData_mat(1, filters, CV_32F);	// variance
+                        cv::Mat weightsData_mat(1, filters, CV_32F);// scale
+                        cv::Mat biasData_mat(1, filters, CV_32F);	// bias
+
+                        ifile.read(reinterpret_cast<char *>(biasData_mat.ptr<float>()), sizeof(float)*filters);
+                        if (use_batch_normalize) {
+                            ifile.read(reinterpret_cast<char *>(weightsData_mat.ptr<float>()), sizeof(float)*filters);
+                            ifile.read(reinterpret_cast<char *>(meanData_mat.ptr<float>()), sizeof(float)*filters);
+                            ifile.read(reinterpret_cast<char *>(stdData_mat.ptr<float>()), sizeof(float)*filters);
+                        }
+                        ifile.read(reinterpret_cast<char *>(weightsBlob.ptr<float>()), sizeof(float)*weights_size);
+
+                        // A short read leaves the blobs partly uninitialized; report the
+                        // truncated file instead of silently using garbage.
+                        if (!ifile)
+                            return false;
+
+                        // set convolutional weights
+                        std::vector<cv::Mat> conv_blobs;
+                        conv_blobs.push_back(weightsBlob);
+                        if (!use_batch_normalize) {
+                            // use BIAS in any case
+                            conv_blobs.push_back(biasData_mat);
+                        }
+                        setParams.setLayerBlobs(cv_layers_counter, conv_blobs);
+
+                        // set batch normalize (mean, variance, scale, bias)
+                        if (use_batch_normalize) {
+                            ++cv_layers_counter;
+                            std::vector<cv::Mat> bn_blobs;
+                            bn_blobs.push_back(meanData_mat);
+                            bn_blobs.push_back(stdData_mat);
+                            bn_blobs.push_back(weightsData_mat);
+                            bn_blobs.push_back(biasData_mat);
+                            setParams.setLayerBlobs(cv_layers_counter, bn_blobs);
+                        }
+                    }
+                    if (layer_type == "region" || layer_type == "yolo")
+                    {
+                        ++cv_layers_counter;  // For permute.
+                    }
+
+                    std::string activation = getParam<std::string>(layer_params, "activation", "linear");
+                    if(activation == "leaky")
+                        ++cv_layers_counter;  // For ReLU
+
+                    current_channels = net->out_channels_vec[darknet_layers_counter];
+                }
+                return true;
+            }
+
+        }
+
+
+        // Parses a Darknet cfg stream into `net`; raises StsParseError when
+        // darknet::ReadDarknetFromCfgStream reports failure.
+        void ReadNetParamsFromCfgStreamOrDie(std::istream &ifile, darknet::NetParameter *net)
+        {
+            const bool parsed = darknet::ReadDarknetFromCfgStream(ifile, net);
+            if (parsed)
+                return;
+
+            CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter stream");
+        }
+
+        // Loads Darknet weights from a binary stream into `net`; raises StsParseError
+        // when darknet::ReadDarknetFromWeightsStream reports failure.
+        void ReadNetParamsFromBinaryStreamOrDie(std::istream &ifile, darknet::NetParameter *net)
+        {
+            const bool loaded = darknet::ReadDarknetFromWeightsStream(ifile, net);
+            if (loaded)
+                return;
+
+            CV_Error(cv::Error::StsParseError, "Failed to parse NetParameter stream");
+        }
+    }
+}
--- a/Lib/opencv/sources/modules/dnn/src/darknet/darknet_io.hpp
+++ b/Lib/opencv/sources/modules/dnn/src/darknet/darknet_io.hpp
@@ -0,0 +1,117 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//                        (3-clause BSD License)
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the following disclaimer in the documentation
+// and/or other materials provided with the distribution.
+//
+// * Neither the names of the copyright holders nor the names of the contributors
+// may be used to endorse or promote products derived from this software
+// without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//MIT License
+//
+//Copyright (c) 2017 Joseph Redmon
+//
+//Permission is hereby granted, free of charge, to any person obtaining a copy
+//of this software and associated documentation files (the "Software"), to deal
+//in the Software without restriction, including without limitation the rights
+//to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+//copies of the Software, and to permit persons to whom the Software is
+//furnished to do so, subject to the following conditions:
+//
+//The above copyright notice and this permission notice shall be included in all
+//copies or substantial portions of the Software.
+//
+//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+//IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+//FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+//AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+//LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+//OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+//SOFTWARE.
+//
+//M*/
+
+#ifndef __OPENCV_DNN_DARKNET_IO_HPP__
+#define __OPENCV_DNN_DARKNET_IO_HPP__
+
+#include <opencv2/dnn/dnn.hpp>
+
+namespace cv {
+    namespace dnn {
+        namespace darknet {
+
+            // Description of one generated layer: its name, its type, the names of the
+            // layers feeding it, and the cv::dnn::LayerParams to create it with.
+            // The fields are private; setLayersParams is a friend and accesses them directly.
+            class LayerParameter {
+                std::string layer_name;
+                std::string layer_type;
+                std::vector<std::string> bottom_indexes;   // names of the input layers
+                cv::dnn::LayerParams layerParams;
+            public:
+                friend class setLayersParams;
+
+                // Identity of the layer.
+                std::string name() const { return layer_name; }
+                std::string type() const { return layer_type; }
+                cv::dnn::LayerParams getLayerParams() const { return layerParams; }
+
+                // Inputs ("bottoms"); bottom() throws std::out_of_range for a bad index.
+                int bottom_size() const { return static_cast<int>(bottom_indexes.size()); }
+                std::string bottom(const int index) const { return bottom_indexes.at(index); }
+
+                // Outputs ("tops"): always exactly one, named after the layer itself.
+                int top_size() const { return 1; }
+                std::string top(const int /*index*/) const { return layer_name; }
+            };
+
+            // Holds the parsed cfg text (net_cfg / layers_cfg) together with the generated
+            // layer list. Exposes a single input that is always named "data".
+            class NetParameter {
+            public:
+                int width;
+                int height;
+                int channels;
+                std::vector<LayerParameter> layers;   // generated layers, in creation order
+                std::vector<int> out_channels_vec;    // output channel count per cfg layer
+
+                // Raw "name -> value" text of every cfg section, keyed by layer index.
+                std::map<int, std::map<std::string, std::string> > layers_cfg;
+                // Raw "name -> value" text of the [net] section.
+                std::map<std::string, std::string> net_cfg;
+
+                NetParameter()
+                    : width(0), height(0), channels(0)
+                {}
+
+                int layer_size() const { return static_cast<int>(layers.size()); }
+                // layer() throws std::out_of_range for a bad index.
+                LayerParameter layer(const int index) const { return layers.at(index); }
+
+                int input_size() const { return 1; }
+                std::string input(const int /*index*/) const { return "data"; }
+            };
+        }
+
+        // Read parameters from a stream into a NetParameter message.
+        // Both functions raise an error (cv::Error::StsParseError) when the underlying
+        // darknet reader reports failure.
+        //   ifile - stream to read from (cfg text / weights binary respectively)
+        //   net   - structure that receives the parsed data
+        void ReadNetParamsFromCfgStreamOrDie(std::istream &ifile, darknet::NetParameter *net);
+        void ReadNetParamsFromBinaryStreamOrDie(std::istream &ifile, darknet::NetParameter *net);
+    }
+}
+#endif
--- a/Lib/opencv/sources/modules/dnn/src/dnn.cpp
+++ b/Lib/opencv/sources/modules/dnn/src/dnn.cpp
--- a/Lib/opencv/sources/modules/dnn/src/halide_scheduler.cpp
+++ b/Lib/opencv/sources/modules/dnn/src/halide_scheduler.cpp
@@ -0,0 +1,285 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+
+#include "precomp.hpp"
+#include "halide_scheduler.hpp"
+#include "op_halide.hpp"
+
+namespace cv
+{
+namespace dnn
+{
+
+#ifdef HAVE_HALIDE
+// Applies a "split" directive to func.
+// Every child of `directive` is named after the variable to split; its value is
+// either an integer factor or a string naming an entry of `params` that holds
+// the factor. The variable <v> is split into an outer <v>o and an inner <v>i.
+static void applySplit(const FileNode& directive, Halide::Func& func,
+                       const FileNode& params)
+{
+    for (const auto& varNode : directive)
+    {
+        const std::string baseName = varNode.name();
+        const std::string factorName = (std::string)varNode;
+        const Halide::Var original(baseName);
+        const Halide::Var outer(baseName + "o");
+        const Halide::Var inner(baseName + "i");
+
+        // The factor must be given directly as an integer, or as the name of a
+        // value that is present in the parameters map.
+        CV_Assert(varNode.isString() && !params[factorName].empty() ||
+                  varNode.isInt());
+
+        int factor;
+        if (varNode.isInt())
+            factor = (int)varNode;
+        else
+            factor = (int)params[factorName];
+
+        func.split(original, outer, inner, factor);
+    }
+}
+
+// Applies a "reorder" directive: reads the variable names stored at indices
+// 0..size-1 of `directive` and passes them, in that order, to func.reorder().
+static void applyReorder(const FileNode& directive, Halide::Func& func)
+{
+    const int count = static_cast<int>(directive.size());
+
+    std::vector<Halide::VarOrRVar> order;
+    order.reserve(count);
+
+    int idx = 0;
+    while (idx < count)
+    {
+        std::string name;
+        directive[idx] >> name;
+        order.push_back(Halide::Var(name));
+        ++idx;
+    }
+
+    func.reorder(order);
+}
+
+// Applies a "fuse" directive.
+// `directive["src"]` must hold at least two variable names and `directive["dst"]`
+// exactly one. The first two sources are fused into dst; every further source is
+// then fused with the running dst variable, again producing dst.
+static void applyFuse(const FileNode& directive, Halide::Func& func)
+{
+    CV_Assert(directive["src"].size() >= 2);
+    CV_Assert(directive["dst"].size() == 1);
+
+    const FileNode sources = directive["src"];
+
+    std::string firstName;
+    std::string secondName;
+    std::string fusedName;
+    sources[0] >> firstName;
+    sources[1] >> secondName;
+    directive["dst"] >> fusedName;
+
+    const Halide::Var fused(fusedName);
+    func.fuse(Halide::Var(firstName), Halide::Var(secondName), fused);
+
+    // Fold the remaining source variables into the already fused one.
+    const int total = static_cast<int>(sources.size());
+    for (int idx = 2; idx < total; ++idx)
+    {
+        std::string extraName;
+        sources[idx] >> extraName;
+        func.fuse(Halide::Var(extraName), fused, fused);
+    }
+}
+
+// Applies a "parallel" directive: calls func.parallel() for every variable
+// name stored at indices 0..size-1 of `directive`.
+static void applyParallel(const FileNode& directive, Halide::Func& func)
+{
+    const int count = static_cast<int>(directive.size());
+    int idx = 0;
+    while (idx < count)
+    {
+        std::string name;
+        directive[idx] >> name;
+        func.parallel(Halide::Var(name));
+        ++idx;
+    }
+}
+
+// Applies an "unroll" directive: calls func.unroll() for every variable
+// name stored at indices 0..size-1 of `directive`.
+static void applyUnroll(const FileNode& directive, Halide::Func& func)
+{
+    const int count = static_cast<int>(directive.size());
+    int idx = 0;
+    while (idx < count)
+    {
+        std::string name;
+        directive[idx] >> name;
+        func.unroll(Halide::Var(name));
+        ++idx;
+    }
+}
+
+// Applies a "vectorize" directive to func.
+// Every child of `directive` is named after the variable to vectorize; its value
+// is either an integer factor or a string naming an entry of `params` holding
+// the factor. The variable <v> is split by that factor into <v> (outer, same
+// name) and <v>v (inner), and the inner variable is vectorized.
+static void applyVectorize(const FileNode& directive, Halide::Func& func,
+                           const FileNode& params)
+{
+    for (const auto& varNode : directive)
+    {
+        const std::string baseName = varNode.name();
+        const std::string factorName = (std::string)varNode;
+
+        // The factor must be given directly as an integer, or as the name of a
+        // value that is present in the parameters map.
+        CV_Assert(varNode.isString() && !params[factorName].empty() ||
+                  varNode.isInt());
+
+        int factor;
+        if (varNode.isInt())
+            factor = (int)varNode;
+        else
+            factor = (int)params[factorName];
+
+        const Halide::Var outer(baseName);
+        const Halide::Var lanes(baseName + "v");
+        func.split(outer, outer, lanes, factor);
+        func.vectorize(lanes);
+    }
+}
+
+// Applies a "store_at" directive.
+// Only the first child of `directive` is used: its name selects an already
+// scheduled function from `funcsMap` and its string value names the loop
+// variable passed to func.store_at(). Raises StsParseError when the named
+// function is not in the map.
+static void applyStoreAt(const FileNode& directive, Halide::Func& func,
+                         std::map<std::string, Halide::Func>& funcsMap)
+{
+    for (const auto& funcNode : directive)
+    {
+        const std::string anchorName = funcNode.name();
+        const auto known = funcsMap.find(anchorName);
+        if (known == funcsMap.end())
+            CV_Error(cv::Error::StsParseError, "Function " + anchorName +
+                     " is not represented in Halide pipeline");
+
+        Halide::Func anchor = known->second;
+        const std::string loopVar = (std::string)funcNode;
+        func.store_at(anchor, loopVar);
+        break;  // Any further entries are ignored.
+    }
+}
+
+// Applies a "compute_at" directive.
+// Only the first child of `directive` is used: its name selects an already
+// scheduled function from `funcsMap` and its string value names the loop
+// variable passed to func.compute_at(). Raises StsParseError when the named
+// function is not in the map.
+static void applyComputeAt(const FileNode& directive, Halide::Func& func,
+                           std::map<std::string, Halide::Func>& funcsMap)
+{
+    for (const auto& funcNode : directive)
+    {
+        const std::string anchorName = funcNode.name();
+        const auto known = funcsMap.find(anchorName);
+        if (known == funcsMap.end())
+            CV_Error(cv::Error::StsParseError, "Function " + anchorName +
+                     " is not represented in Halide pipeline");
+
+        Halide::Func anchor = known->second;
+        const std::string loopVar = (std::string)funcNode;
+        func.compute_at(anchor, loopVar);
+        break;  // Any further entries are ignored.
+    }
+}
+
+// Applies a "compute_root" directive: reads a boolean from `directive` and
+// calls func.compute_root() only when it is true.
+static void applyComputeRoot(const FileNode& directive, Halide::Func& func)
+{
+    bool enabled = false;
+    directive >> enabled;
+    if (!enabled)
+        return;
+    func.compute_root();
+}
+
+// Applies a "gpu_blocks" directive: calls func.gpu_blocks() for every variable
+// name stored at indices 0..size-1 of `directive`.
+static void applyGpuBlocks(const FileNode& directive, Halide::Func& func)
+{
+    const int count = static_cast<int>(directive.size());
+    int idx = 0;
+    while (idx < count)
+    {
+        std::string name;
+        directive[idx] >> name;
+        func.gpu_blocks(Halide::Var(name));
+        ++idx;
+    }
+}
+
+// Applies a "gpu_threads" directive: calls func.gpu_threads() for every
+// variable name stored at indices 0..size-1 of `directive`.
+static void applyGpuThreads(const FileNode& directive, Halide::Func& func)
+{
+    const int count = static_cast<int>(directive.size());
+    int idx = 0;
+    while (idx < count)
+    {
+        std::string name;
+        directive[idx] >> name;
+        func.gpu_threads(Halide::Var(name));
+        ++idx;
+    }
+}
+
+// Dispatches every child of `directives` to the matching apply* helper, in
+// file order, based on the child's name.
+//   directives - node whose children are scheduling directives;
+//   func       - function being scheduled;
+//   funcsMap   - already scheduled functions, used by store_at / compute_at;
+//   params     - named values that split / vectorize factors may refer to.
+// A child with any other name raises StsNotImplemented.
+static void apply(const FileNode& directives, Halide::Func& func,
+                  std::map<std::string, Halide::Func>& funcsMap,
+                  const FileNode& params)
+{
+    for (const auto& directive : directives)
+    {
+        const std::string kind = directive.name();
+
+        if (kind == "split")
+        {
+            applySplit(directive, func, params);
+        }
+        else if (kind == "reorder")
+        {
+            applyReorder(directive, func);
+        }
+        else if (kind == "fuse")
+        {
+            applyFuse(directive, func);
+        }
+        else if (kind == "parallel")
+        {
+            applyParallel(directive, func);
+        }
+        else if (kind == "unroll")
+        {
+            applyUnroll(directive, func);
+        }
+        else if (kind == "vectorize")
+        {
+            applyVectorize(directive, func, params);
+        }
+        else if (kind == "store_at")
+        {
+            applyStoreAt(directive, func, funcsMap);
+        }
+        else if (kind == "compute_at")
+        {
+            applyComputeAt(directive, func, funcsMap);
+        }
+        else if (kind == "compute_root")
+        {
+            applyComputeRoot(directive, func);
+        }
+        else if (kind == "gpu_blocks")
+        {
+            applyGpuBlocks(directive, func);
+        }
+        else if (kind == "gpu_threads")
+        {
+            applyGpuThreads(directive, func);
+        }
+        else
+        {
+            CV_Error(Error::StsNotImplemented, "Scheduling directive " +
+                     kind + " is not implemented.");
+        }
+    }
+}
+
+// Strips every '$' together with the run of decimal digits that directly
+// follows it, e.g. "func$12" -> "func" and "a$1b" -> "ab". A '$' that is not
+// followed by digits is removed on its own.
+//   str - name to clean up (taken by value and edited in place);
+//   returns the cleaned name.
+static std::string Deunique(std::string str)
+{
+    // Work in size_type throughout: npos does not fit in an int and a length
+    // derived from it must not go through a signed intermediate.
+    std::string::size_type pos = str.find('$');
+    while (pos != std::string::npos)
+    {
+        const std::string::size_type digitsEnd =
+            str.find_first_not_of("0123456789", pos + 1);
+        // When the digits run to the end of the string, erase everything from '$'.
+        const std::string::size_type count =
+            (digitsEnd == std::string::npos) ? std::string::npos : digitsEnd - pos;
+        str.erase(pos, count);
+        // Everything before pos is already free of '$', so resume from there.
+        pos = str.find('$', pos);
+    }
+    return str;
+}
+#endif  // HAVE_HALIDE
+
+// Opens `configFile` for reading as the scheduling configuration.
+// An empty path leaves the file storage member untouched.
+HalideScheduler::HalideScheduler(const std::string& configFile)
+{
+    if (configFile.empty())
+        return;
+    fs = FileStorage(configFile, FileStorage::READ);
+}
+
+// Releases the configuration file storage if it is still opened.
+HalideScheduler::~HalideScheduler()
+{
+    if (!fs.isOpened())
+        return;
+    fs.release();
+}
+
+// Schedules the Halide functions of `node` according to the configuration file.
+// Returns false when built without HAVE_HALIDE, when no configuration file is
+// opened, or when the first function visited (the last element of the node's
+// function list) has no entry under "scheduling"; returns true otherwise.
+// Raises StsParseError when the file has no "scheduling" node or when a
+// referenced pattern is missing from "patterns".
+bool HalideScheduler::process(Ptr<BackendNode>& node)
+{
+#ifdef HAVE_HALIDE
+    if (!fs.isOpened())
+        return false;
+
+    const FileNode schedulingRoot = fs["scheduling"];
+    if (schedulingRoot.empty())
+        CV_Error(cv::Error::StsParseError, "Scheduling file should has scheduling node");
+
+    CV_Assert(!node.empty());
+    std::vector<Halide::Func>& pipeline = node.dynamicCast<HalideBackendNode>()->funcs;
+
+    // Functions visited so far, keyed by their de-uniqued names.
+    std::map<std::string, Halide::Func> visited;
+
+    // Walk the functions from the last one to the first one.
+    for (int idx = static_cast<int>(pipeline.size()); idx-- > 0; )
+    {
+        Halide::Func& current = pipeline[idx];
+        // Halide makes repeated names unique by appending '$' and a number
+        // (func, func$1, func$2, ...); the configuration uses the plain name.
+        const std::string baseName = Deunique(current.name());
+
+        const FileNode entry = schedulingRoot[baseName];
+        if (entry.empty())
+        {
+            // A missing entry is only fatal while nothing has been visited yet.
+            if (visited.empty())
+                return false;
+        }
+        else if (entry["pattern"].empty())
+        {
+            // Directives are written inline in the function's own node.
+            // NOTE(review): the whole node is handed to apply(), which rejects
+            // every child name it does not know - verify whether an inline
+            // "params" child is expected to be supported here.
+            apply(entry, current, visited, entry["params"]);
+        }
+        else
+        {
+            // Directives come from a shared, named pattern.
+            std::string patternName;
+            entry["pattern"] >> patternName;
+            if (fs["patterns"][patternName].empty())
+                CV_Error(cv::Error::StsParseError, "Scheduling pattern " + patternName +
+                                                   " is not defined");
+            apply(fs["patterns"][patternName], current, visited, entry["params"]);
+        }
+        visited[baseName] = current;
+    }
+    return true;
+#endif  // HAVE_HALIDE
+    return false;
+}
+
+}  // namespace dnn
+}  // namespace cv
--- a/Show More
+++ b/Show More