add part of opencv

2020-01-27 20:20:56 +08:00
parent 0c4ac1d8bb
commit a71fa47620
6518 changed files with 3122580 additions and 0 deletions
--- a/Lib/opencv/sources/modules/dnn/src/opencl/activations.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/activations.cl
@@ -0,0 +1,142 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define KERNEL_ARG_DTYPE float
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+__kernel void ReLUForward(const int count, __global const T* in, __global T* out
+#ifndef RELU_NO_SLOPE
+, KERNEL_ARG_DTYPE negative_slope
+#endif
+) {
+  int index = get_global_id(0);
+  if(index < count)
+#ifndef RELU_NO_SLOPE
+  out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
+#else
+  out[index] = in[index] > 0 ? in[index] : 0;
+#endif
+}
+
+__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
+                           const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)
+{
+  int index = get_global_id(0);
+  if(index < count)
+  {
+    T x = in[index];
+    out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));
+  }
+}
+
+__kernel void PReLUForward(const int count, const int channels, const int plane_size,
+                           __global const T* in, __global T* out,
+                           __global const KERNEL_ARG_DTYPE* slope_data)
+{
+  int index = get_global_id(0);
+  int c = (index / plane_size) % channels;
+  if(index < count)
+  out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
+}
+
+__kernel void TanHForward(const int count, __global T* in, __global T* out) {
+  int index = get_global_id(0);
+  if(index < count)
+  out[index] = tanh(in[index]);
+}
+
+__kernel void SigmoidForward(const int count, __global const T* in, __global T* out) {
+  int index = get_global_id(0);
+  if(index < count)
+  out[index] = 1.0f / (1.0f + exp(-in[index]));
+}
+
+__kernel void SwishForward(const int count, __global const T* in, __global T* out) {
+  int index = get_global_id(0);
+  if(index < count)
+  out[index] = in[index] / (1.0f + exp(-in[index]));
+}
+
+__kernel void MishForward(const int count, __global const T* in, __global T* out) {
+  int index = get_global_id(0);
+  if(index < count)
+  out[index] = in[index] * tanh(log(1.0f + exp(in[index])));
+}
+
+__kernel void BNLLForward(const int n, __global const T* in, __global T* out) {
+  int index = get_global_id(0);
+  if (index < n) {
+    T x = in[index];
+    out[index] = x > 0 ? x + log(1.0f + exp(-x)) : log(1.0f + exp(x));
+  }
+}
+
+__kernel void AbsValForward(const int n, __global const T* in, __global T* out) {
+  int index = get_global_id(0);
+  if (index < n)
+    out[index] = fabs(in[index]);
+}
+
+__kernel void PowForward(const int n, __global const T* in, __global T* out,
+                         const KERNEL_ARG_DTYPE power,
+                         const KERNEL_ARG_DTYPE scale,
+                         const KERNEL_ARG_DTYPE shift)
+{
+  int index = get_global_id(0);
+  if (index < n)
+    out[index] = pow(shift + scale * in[index], power);
+}
+
+__kernel void ELUForward(const int n, __global const T* in, __global T* out)
+{
+  int index = get_global_id(0);
+  if (index < n)
+  {
+    T src = in[index];
+    out[index] = (src >= 0.f) ? src : exp(src) - 1;
+  }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/batchnorm.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/batchnorm.cl
@@ -0,0 +1,87 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+#if NUM == 8
+    #define load(src, index) vload8(0, src + index)
+    #define store(vec, dst, index) vstore8(vec, 0, dst + index)
+    #define float_type float8
+    #define convert_f convert_float8
+    #define BATCH_NORM batch_norm8
+#elif NUM == 4
+    #define load(src, index) vload4(0, src + index)
+    #define store(vec, dst, index) vstore4(vec, 0, dst + index)
+    #define float_type float4
+    #define convert_f convert_float4
+    #define BATCH_NORM batch_norm4
+#elif NUM == 1
+    #define load(src, index) src[index]
+    #define store(vec, dst, index) dst[index] = vec
+    #define float_type float
+    #define convert_f convert_float
+    #define BATCH_NORM batch_norm1
+#endif
+
+__kernel void BATCH_NORM(__global const Dtype* src,
+                         const int rows,
+                         const int cols,
+                         const int channels,
+                         __global const float* weight,
+                         __global const float* bias,
+                         __global Dtype* dst)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * NUM;
+    int index = x * cols + y;
+
+    if (x >= rows || y >= cols)
+        return;
+
+    float w = weight[x % channels];
+    float b = bias[x % channels];
+    float_type src_vec = convert_f(load(src, index));
+    float_type dst_vec = src_vec * w + (float_type)b;
+    store(convert_T(dst_vec), dst, index);
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/col2im.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/col2im.cl
@@ -0,0 +1,79 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void col2im(const int n, __global const T* data_col,
+                     const int data_col_offset,
+                     const int channels,
+                     const int height, const int width,
+                     const int height_col, const int width_col,
+                     const int coeff_h, const int coeff_w,
+                     __global const T* biasvec,
+                     const int bias_offset,
+                     __global T* data_im,
+                     const int data_im_offset)
+{
+    data_col = data_col + data_col_offset;
+    biasvec = biasvec + bias_offset;
+    data_im = data_im + data_im_offset;
+    int index = get_global_id(0);
+
+    if(index < n)
+    {
+        T val = 0.f;
+        int w = index % width + PAD_W;
+        int h = (index / width) % height + PAD_H;
+        int c = index / (width * height);
+        int h_col_start = (h < KERNEL_H) ? 0 : (h - KERNEL_H) / STRIDE_H + 1;
+        int h_col_end = min(h / STRIDE_H + 1, height_col);
+        int plane_size_col = height_col * width_col;
+        int offset = (c * KERNEL_H * KERNEL_W + h * KERNEL_W + w) * plane_size_col;
+
+        int w_col_start = (w < KERNEL_W) ? 0 : (w - KERNEL_W) / STRIDE_W + 1;
+        int w_col_end = min(w / STRIDE_W + 1, width_col);
+
+        for (int h_col = h_col_start; h_col < h_col_end; ++h_col)
+            for (int w_col = w_col_start; w_col < w_col_end; ++w_col)
+                val += data_col[offset + h_col * coeff_h + w_col * coeff_w];
+
+        data_im[index] = val + biasvec[c];
+    }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/concat.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/concat.cl
@@ -0,0 +1,67 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+
+__kernel void TEMPLATE(concat, Dtype)(const int nthreads,
+                                      __global const Dtype* in_data,
+                                      const int num_concats,
+                                      const int concat_size,
+                                      const int top_concat_axis,
+                                      const int bottom_concat_axis,
+                                      const int offset_concat_axis,
+                                      __global Dtype* out_data)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        const int total_concat_size = concat_size * bottom_concat_axis;
+        const int concat_num = index / total_concat_size;
+        const int concat_index = index % total_concat_size;
+        const int top_index = concat_index +
+                              (concat_num * top_concat_axis + offset_concat_axis) * concat_size;
+        out_data[top_index] = in_data[index];
+    }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/conv_layer_spatial.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/conv_layer_spatial.cl
--- a/Lib/opencv/sources/modules/dnn/src/opencl/conv_spatial_helper.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/conv_spatial_helper.cl
@@ -0,0 +1,73 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define Dtype float
+
+__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
+    (__global Dtype* weightIn,
+     __global Dtype* weightOut,
+     const int kernel_w,
+     const int kernel_h,
+     const int channels,
+     const int outputs,
+     const int swizzleFactor) {
+
+  unsigned int sX = get_global_id(0);
+
+  //Original location
+
+  //Output location
+  int outputSublayer = channels / swizzleFactor;
+  int outputSublayerIndex = channels % swizzleFactor;
+
+  int filter = sX / (kernel_w*kernel_h*channels);
+  int kernel_X = sX % kernel_w;
+  int kernel_Y = (sX / kernel_w) % kernel_h;
+  int kernel_C = (sX / (kernel_w * kernel_h)) % channels;
+
+  int FP = filter / swizzleFactor;
+  int F1 = filter % swizzleFactor;
+
+  weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1]
+  = weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X];
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/detection_output.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/detection_output.cl
@@ -0,0 +1,199 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define Dtype float
+#define Dtype4 float4
+
+__kernel void DecodeBBoxesCORNER(const int nthreads,
+                                 __global const Dtype* loc_data,
+                                 __global const Dtype* prior_data,
+                                 const int variance_encoded_in_target,
+                                 const int num_priors,
+                                 const int share_location,
+                                 const int num_loc_classes,
+                                 const int background_label_id,
+                                 const int clip_bbox,
+                                 const int locPredTransposed,
+                                 __global Dtype* bbox_data)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;
+        const int i = index % 4;
+        const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;
+
+        const int c = (index / 4) % num_loc_classes;
+        int label = share_location ? -1 : c;
+        if (label == background_label_id)
+            return; // Ignore background class.
+
+        Dtype4 loc_vec = vload4(0, loc_data + index - i);
+        Dtype4 bbox_vec, prior_variance;
+        if (variance_encoded_in_target)
+        {
+            bbox_vec = loc_vec;
+        } else {
+            const int start_index = num_priors * 4 + p;
+            prior_variance = vload4(0, prior_data + start_index);
+            bbox_vec = loc_vec * prior_variance;
+        }
+
+        if (locPredTransposed)
+        {
+            bbox_ymin = bbox_vec.x;
+            bbox_xmin = bbox_vec.y;
+            bbox_ymax = bbox_vec.z;
+            bbox_xmax = bbox_vec.w;
+        } else {
+            bbox_xmin = bbox_vec.x;
+            bbox_ymin = bbox_vec.y;
+            bbox_xmax = bbox_vec.z;
+            bbox_ymax = bbox_vec.w;
+        }
+
+        Dtype4 prior_vec = vload4(0, prior_data + p);
+        Dtype val;
+        switch (i)
+        {
+            case 0:
+                val = prior_vec.x + bbox_xmin;
+                break;
+            case 1:
+                val = prior_vec.y + bbox_ymin;
+                break;
+            case 2:
+                val = prior_vec.z + bbox_xmax;
+                break;
+            case 3:
+                val = prior_vec.w + bbox_ymax;
+                break;
+        }
+
+        if (clip_bbox)
+            val = max(min(val, (Dtype)1.), (Dtype)0.);
+
+        bbox_data[index] = val;
+    }
+}
+
+__kernel void DecodeBBoxesCENTER_SIZE(const int nthreads,
+                                      __global const Dtype* loc_data,
+                                      __global const Dtype* prior_data,
+                                      const int variance_encoded_in_target,
+                                      const int num_priors,
+                                      const int share_location,
+                                      const int num_loc_classes,
+                                      const int background_label_id,
+                                      const int clip_bbox,
+                                      const int locPredTransposed,
+                                      __global Dtype* bbox_data)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;
+        const int i = index % 4;
+        const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;
+
+        const int c = (index / 4) % num_loc_classes;
+        int label = share_location ? -1 : c;
+        if (label == background_label_id)
+            return; // Ignore background class.
+
+        Dtype4 loc_vec = vload4(0, loc_data + index - i);
+        Dtype4 bbox_vec, prior_variance;
+        if (variance_encoded_in_target)
+        {
+            bbox_vec = loc_vec;
+        } else {
+            const int start_index = num_priors * 4 + p;
+            prior_variance = vload4(0, prior_data + start_index);
+            bbox_vec = loc_vec * prior_variance;
+        }
+
+        if (locPredTransposed)
+        {
+            bbox_ymin = bbox_vec.x;
+            bbox_xmin = bbox_vec.y;
+            bbox_ymax = bbox_vec.z;
+            bbox_xmax = bbox_vec.w;
+        } else {
+            bbox_xmin = bbox_vec.x;
+            bbox_ymin = bbox_vec.y;
+            bbox_xmax = bbox_vec.z;
+            bbox_ymax = bbox_vec.w;
+        }
+
+        Dtype4 prior_vec = vload4(0, prior_data + p);
+        Dtype prior_width = prior_vec.z - prior_vec.x;
+        Dtype prior_height = prior_vec.w - prior_vec.y;
+        Dtype prior_center_x = (prior_vec.x + prior_vec.z) * .5;
+        Dtype prior_center_y = (prior_vec.y + prior_vec.w) * .5;
+
+        Dtype decode_bbox_center_x, decode_bbox_center_y;
+        Dtype decode_bbox_width, decode_bbox_height;
+        decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
+        decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
+        decode_bbox_width = exp(bbox_xmax) * prior_width;
+        decode_bbox_height = exp(bbox_ymax) * prior_height;
+
+        Dtype val;
+        switch (i)
+        {
+            case 0:
+                val = decode_bbox_center_x - decode_bbox_width * .5;
+                break;
+            case 1:
+                val = decode_bbox_center_y - decode_bbox_height * .5;
+                break;
+            case 2:
+                val = decode_bbox_center_x + decode_bbox_width * .5;
+                break;
+            case 3:
+                val = decode_bbox_center_y + decode_bbox_height * .5;
+                break;
+        }
+
+        if (clip_bbox)
+            val = max(min(val, (Dtype)1.), (Dtype)0.);
+
+        bbox_data[index] = val;
+    }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/dummy.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/dummy.cl
@@ -0,0 +1,43 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void dummy_kernel()
+{
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/eltwise.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/eltwise.cl
@@ -0,0 +1,98 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+__kernel void op_sum4(__global const Dtype * A,
+                      __global const Dtype * B,
+                      unsigned int A_col_size,
+                      const float coeff1,
+                      const float coeff2,
+                      __global Dtype * C)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;
+    const __global Dtype *src1_read = B + row_gid * 4 * A_col_size;
+    __global Dtype *dst0_read = C + row_gid * 4 * A_col_size;
+
+    Dtype4 a0, a1, a2, a3;
+    Dtype4 dot0, dot1, dot2, dot3;
+    unsigned int i = lid;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 b0 = vload4(i, src1_read);
+        const Dtype4 b1 = vload4(i, src1_read + A_col_size);
+        const Dtype4 b2 = vload4(i, src1_read + 2 * A_col_size);
+        const Dtype4 b3 = vload4(i, src1_read + 3 * A_col_size);
+
+#if LOOP == 0
+        a0 = vload4(i, src0_read);
+        a1 = vload4(i, src0_read + A_col_size);
+        a2 = vload4(i, src0_read + 2 * A_col_size);
+        a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;
+        dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2;
+        dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;
+        dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;
+#else
+        a0 = vload4(i, dst0_read);
+        a1 = vload4(i, dst0_read + A_col_size);
+        a2 = vload4(i, dst0_read + 2 * A_col_size);
+        a3 = vload4(i, dst0_read + 3 * A_col_size);
+
+        dot0 = a0 + b0 * (Dtype4)coeff2;
+        dot1 = a1 + b1 * (Dtype4)coeff2;
+        dot2 = a2 + b2 * (Dtype4)coeff2;
+        dot3 = a3 + b3 * (Dtype4)coeff2;
+#endif
+        vstore4(dot0, i, dst0_read);
+        vstore4(dot1, i, dst0_read + A_col_size);
+        vstore4(dot2, i, dst0_read + 2 * A_col_size);
+        vstore4(dot3, i, dst0_read + 3 * A_col_size);
+
+        i += get_local_size(0);
+    }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/gemm_buffer.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/gemm_buffer.cl
--- a/Lib/opencv/sources/modules/dnn/src/opencl/gemm_image.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/gemm_image.cl
@@ -0,0 +1,994 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+
+#define KERNEL_ARG_DTYPE float
+#define TYPE_FLOAT  1
+#define TYPE_HALF   2
+
+#if TYPE == TYPE_HALF
+#define Dtype  half
+#define Dtype2 half2
+#define Dtype4 half4
+#define Dtype8 half8
+#define Dtype16 half16
+
+#define as_Dtype  as_half
+#define as_Dtype2 as_half2
+#define as_Dtype4 as_half4
+#define as_Dtype8 as_half8
+#define as_Dtype16 as_half16
+#else
+#define Dtype  float
+#define Dtype2 float2
+#define Dtype4 float4
+#define Dtype8 float8
+#define Dtype16 float16
+
+#define as_Dtype  as_float
+#define as_Dtype2 as_float2
+#define as_Dtype4 as_float4
+#define as_Dtype8 as_float8
+#define as_Dtype16 as_float16
+#endif
+
+#if defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION  cl_intel_subgroups : enable
+#endif
+
+#define TILE_M          32
+#define TILE_K          8
+
+// common block to calculate (alpha * AxB + beta * C) and output to destination image.
+
+#if TYPE == TYPE_HALF
+#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read_us8( __image, __coord )
+#define SHUFFLE_TYPE2(val) as_ushort2(val)
+#define SHUFFLE_TYPE8(val) as_ushort8(val)
+#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord)
+#define SIZE_OF_ELEMENT sizeof(ushort)
+#define SIMD_SIZE_GEMM 16
+#define TILE_N 16
+#else
+#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord )
+#define SHUFFLE_TYPE2(val) val
+#define SHUFFLE_TYPE8(val) val
+#define READ_IMAGE(__image, __coord) read_imagef(__image, sampler, __coord)
+#define SIZE_OF_ELEMENT sizeof(uint)
+#define SIMD_SIZE_GEMM 8
+#define TILE_N 8
+#endif
+
+//#define USE_IMAGE_C
+#ifdef USE_IMAGE_C
+#if TYPE == TYPE_HALF
+#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read_us8( _C, _coordC ) )
+#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write_us8( _C, _coordC, as_ushort8( _val ) )
+#else
+#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) )
+#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) )
+#endif
+#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst
+#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))
+#else
+#define BLOCKC_READ8( _C, _coordC ) \
+          (Dtype8) ( (_coordC.x + get_local_id(0) < N && _coordC.y < M) ? _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] : 0, \
+                     (_coordC.x + get_local_id(0) < N && _coordC.y + 1 < M) ? _C[ ( _coordC.y + 1 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+                     (_coordC.x + get_local_id(0) < N && _coordC.y + 2 < M) ? _C[ ( _coordC.y + 2 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+                     (_coordC.x + get_local_id(0) < N && _coordC.y + 3 < M) ? _C[ ( _coordC.y + 3 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+                     (_coordC.x + get_local_id(0) < N && _coordC.y + 4 < M) ? _C[ ( _coordC.y + 4 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+                     (_coordC.x + get_local_id(0) < N && _coordC.y + 5 < M) ? _C[ ( _coordC.y + 5 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+                     (_coordC.x + get_local_id(0) < N && _coordC.y + 6 < M) ? _C[ ( _coordC.y + 6 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
+                     (_coordC.x + get_local_id(0) < N && _coordC.y + 7 < M) ? _C[ ( _coordC.y + 7 ) * ldc + _coordC.x + get_local_id(0) ] : 0)
+
+#define BLOCKC_WRITE8( _C, _coordC, _val) do {\
+                     if (_coordC.x + get_local_id(0) < N) { \
+                       if (_coordC.y < M) \
+                         _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] = _val.s0; \
+                       if (_coordC.y + 1 < M) \
+                         _C[ ( _coordC.y + 1 )* ldc + _coordC.x + get_local_id(0) ] = _val.s1; \
+                       if (_coordC.y + 2 < M) \
+                         _C[ ( _coordC.y + 2 )* ldc + _coordC.x + get_local_id(0) ] = _val.s2; \
+                       if (_coordC.y + 3 < M) \
+                         _C[ ( _coordC.y + 3 )* ldc + _coordC.x + get_local_id(0) ] = _val.s3; \
+                       if (_coordC.y + 4 < M) \
+                         _C[ ( _coordC.y + 4 )* ldc + _coordC.x + get_local_id(0) ] = _val.s4; \
+                       if (_coordC.y + 5 < M) \
+                         _C[ ( _coordC.y + 5 )* ldc + _coordC.x + get_local_id(0) ] = _val.s5; \
+                       if (_coordC.y + 6 < M) \
+                         _C[ ( _coordC.y + 6 )* ldc + _coordC.x + get_local_id(0) ] = _val.s6; \
+                       if (_coordC.y + 7 < M) \
+                         _C[ ( _coordC.y + 7 )* ldc + _coordC.x + get_local_id(0) ] = _val.s7; \
+                     }} while(0)
+#define MATC_PARAMETER __global Dtype * C, const int offC, const int M, const int N, const int ldc
+#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, (C + offC), (C + offC), 1)
+#endif
+
+#define GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, _C, _dst, _C_step) \
+    int2    coordDst = (int2)( ( group_x * TILE_N ) * _C_step, ( group_y * TILE_M ) ); \
+    int2    coordC = coordDst; \
+    Dtype8 blockC00; \
+    Dtype8 blockC01; \
+    Dtype8 blockC02; \
+    Dtype8 blockC03; \
+    if (BETA_NOT0) { \
+        blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \
+        blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \
+        blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \
+        blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \
+        if (!ALPHA1) { \
+            blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
+            blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
+            blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \
+            blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \
+        } else { \
+            blockC00 += blockAxB00; \
+            blockC01 += blockAxB01; \
+            blockC02 += blockAxB02; \
+            blockC03 += blockAxB03; \
+        } \
+    } else { \
+        blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \
+        blockC01 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \
+        blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC );    coordC.y += 8; \
+        blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); \
+        if (!ALPHA1) { \
+          blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
+          blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
+          blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \
+          blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \
+        } else { \
+          blockC00 += blockAxB00; \
+          blockC01 += blockAxB01; \
+          blockC02 += blockAxB02; \
+          blockC03 += blockAxB03; \
+        } \
+    } \
+    BLOCKC_WRITE8( _dst, coordDst, blockC00 );    coordDst.y += 8; \
+    BLOCKC_WRITE8( _dst, coordDst, blockC01 );    coordDst.y += 8; \
+    BLOCKC_WRITE8( _dst, coordDst, blockC02 );    coordDst.y += 8; \
+    BLOCKC_WRITE8( _dst, coordDst, blockC03 );
+
+// Get the specified column of the block of the block
+#define TRANSPOSE_BLOCK_8( _block, _col )   \
+        (Dtype8)( intel_sub_group_shuffle( _block.s0, _col ),   \
+                  intel_sub_group_shuffle( _block.s1, _col ),   \
+                  intel_sub_group_shuffle( _block.s2, _col ),   \
+                  intel_sub_group_shuffle( _block.s3, _col ),   \
+                  intel_sub_group_shuffle( _block.s4, _col ),   \
+                  intel_sub_group_shuffle( _block.s5, _col ),   \
+                  intel_sub_group_shuffle( _block.s6, _col ),   \
+                  intel_sub_group_shuffle( _block.s7, _col ) );
+
+// A's column block multiply B 's row block.
+#if TYPE == TYPE_HALF
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB00, _blockB01 )    \
+        {   \
+            const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 );    \
+            const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 );    \
+            const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 );    \
+            const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 );    \
+            const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 );    \
+            const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 );    \
+            const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 );    \
+            const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 );    \
+            const Dtype8    acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 );    \
+            const Dtype8    acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 );    \
+            const Dtype8    acola = TRANSPOSE_BLOCK_8( _blockA, 10 );    \
+            const Dtype8    acolb = TRANSPOSE_BLOCK_8( _blockA, 11 );    \
+            const Dtype8    acolc = TRANSPOSE_BLOCK_8( _blockA, 12 );    \
+            const Dtype8    acold = TRANSPOSE_BLOCK_8( _blockA, 13 );    \
+            const Dtype8    acole = TRANSPOSE_BLOCK_8( _blockA, 14 );    \
+            const Dtype8    acolf = TRANSPOSE_BLOCK_8( _blockA, 15 );    \
+            _result = mad( (Dtype8)(_blockB00.s0), acol0, _result );      \
+            _result = mad( (Dtype8)(_blockB00.s1), acol1, _result );      \
+            _result = mad( (Dtype8)(_blockB00.s2), acol2, _result );      \
+            _result = mad( (Dtype8)(_blockB00.s3), acol3, _result );      \
+            _result = mad( (Dtype8)(_blockB00.s4), acol4, _result );      \
+            _result = mad( (Dtype8)(_blockB00.s5), acol5, _result );      \
+            _result = mad( (Dtype8)(_blockB00.s6), acol6, _result );      \
+            _result = mad( (Dtype8)(_blockB00.s7), acol7, _result );      \
+            _result = mad( (Dtype8)(_blockB01.s0), acol8, _result );      \
+            _result = mad( (Dtype8)(_blockB01.s1), acol9, _result );      \
+            _result = mad( (Dtype8)(_blockB01.s2), acola, _result );      \
+            _result = mad( (Dtype8)(_blockB01.s3), acolb, _result );      \
+            _result = mad( (Dtype8)(_blockB01.s4), acolc, _result );      \
+            _result = mad( (Dtype8)(_blockB01.s5), acold, _result );      \
+            _result = mad( (Dtype8)(_blockB01.s6), acole, _result );      \
+            _result = mad( (Dtype8)(_blockB01.s7), acolf, _result );      \
+        }
+#else
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB )    \
+        {   \
+            const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 );    \
+            const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 );    \
+            const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 );    \
+            const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 );    \
+            const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 );    \
+            const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 );    \
+            const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 );    \
+            const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 );    \
+            _result = mad( (Dtype8)(_blockB.s0), acol0, _result );      \
+            _result = mad( (Dtype8)(_blockB.s1), acol1, _result );      \
+            _result = mad( (Dtype8)(_blockB.s2), acol2, _result );      \
+            _result = mad( (Dtype8)(_blockB.s3), acol3, _result );      \
+            _result = mad( (Dtype8)(_blockB.s4), acol4, _result );      \
+            _result = mad( (Dtype8)(_blockB.s5), acol5, _result );      \
+            _result = mad( (Dtype8)(_blockB.s6), acol6, _result );      \
+            _result = mad( (Dtype8)(_blockB.s7), acol7, _result );      \
+        }
+#endif
+
+#if TYPE == TYPE_HALF
+#define GEMM_NN(ALPHA1, BETA_NOT0) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
+    __read_only image2d_t A, \
+    __read_only image2d_t B, \
+    MATC_PARAMETER, \
+    KERNEL_ARG_DTYPE alpha_in, \
+    KERNEL_ARG_DTYPE beta_in, \
+    int width0, \
+    int isFirstColBlock) \
+{ \
+    const Dtype alpha = (Dtype)alpha_in; \
+    const Dtype beta = (Dtype)beta_in; \
+    const int group_x = get_group_id(0); \
+    const int group_y = get_group_id(1); \
+    Dtype8 blockAxB00 = 0; \
+    Dtype8 blockAxB01 = 0; \
+    Dtype8 blockAxB02 = 0; \
+    Dtype8 blockAxB03 = 0; \
+    int2    coordA = (int2)( 0, group_y * TILE_M ); \
+    int2    coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \
+    do \
+    {  \
+        int2    coordBTemp = coordB; \
+        Dtype8  blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K; \
+        Dtype8  blockB01 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K; \
+        int2    coordATemp = coordA; \
+        Dtype8  blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8  blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8  blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8  blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
+        MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, blockB01 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, blockB01 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, blockB01 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, blockB01 ); \
+    } \
+    while( coordB.y < width0 ); \
+    GEMM_OUTPUT(ALPHA1, BETA_NOT0);  \
+}
+#else
+#define GEMM_NN(ALPHA1, BETA_NOT0) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
+    __read_only image2d_t A, \
+    __read_only image2d_t B, \
+    MATC_PARAMETER, \
+    KERNEL_ARG_DTYPE alpha_in, \
+    KERNEL_ARG_DTYPE beta_in, \
+    int width0, \
+    int isFirstColBlock) \
+{ \
+    const Dtype alpha = (Dtype)alpha_in; \
+    const Dtype beta = (Dtype)beta_in; \
+    const int group_x = get_group_id(0); \
+    const int group_y = get_group_id(1); \
+    Dtype8 blockAxB00 = 0.0f; \
+    Dtype8 blockAxB01 = 0.0f; \
+    Dtype8 blockAxB02 = 0.0f; \
+    Dtype8 blockAxB03 = 0.0f; \
+    int2    coordA = (int2)( 0, group_y * TILE_M ); \
+    int2    coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \
+    do \
+    {  \
+        int2    coordBTemp = coordB; \
+        Dtype8  blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K; \
+        int2    coordATemp = coordA; \
+        Dtype8  blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8  blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8  blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8  blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.x += TILE_K * SIZE_OF_ELEMENT; \
+        MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
+    } \
+    while( coordB.y < width0 ); \
+    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
+}
+#endif
+
+GEMM_NN(1, 0) // ALPHA == 1, BETA == 0
+GEMM_NN(1, 1) // ALPHA == 1, BETA != 0
+GEMM_NN(0, 0) // ALPHA != 1, BETA == 0
+GEMM_NN(0, 1) // ALPHA != 1, BETA != 0
+
+#undef TRANSPOSE_BLOCK_8
+#undef MULTIPLY_BLOCKS_8x8
+#undef GEMM_NN
+
+// replicate the first row to column block.
+#define TRANSPOSE_BLOCK_8(_vec, _col) \
+        (Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \
+                  intel_sub_group_shuffle(_vec, _col + 1), \
+                  intel_sub_group_shuffle(_vec, _col + 2), \
+                  intel_sub_group_shuffle(_vec, _col + 3), \
+                  intel_sub_group_shuffle(_vec, _col + 4), \
+                  intel_sub_group_shuffle(_vec, _col + 5), \
+                  intel_sub_group_shuffle(_vec, _col + 6), \
+                  intel_sub_group_shuffle(_vec, _col + 7) )
+
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col )    \
+        {   \
+            _result = mad( (Dtype8)(_blockB.s0), TRANSPOSE_BLOCK_8(_blockA.s0, _col), _result );      \
+            _result = mad( (Dtype8)(_blockB.s1), TRANSPOSE_BLOCK_8(_blockA.s1, _col), _result );      \
+            _result = mad( (Dtype8)(_blockB.s2), TRANSPOSE_BLOCK_8(_blockA.s2, _col), _result );      \
+            _result = mad( (Dtype8)(_blockB.s3), TRANSPOSE_BLOCK_8(_blockA.s3, _col), _result );      \
+            _result = mad( (Dtype8)(_blockB.s4), TRANSPOSE_BLOCK_8(_blockA.s4, _col), _result );      \
+            _result = mad( (Dtype8)(_blockB.s5), TRANSPOSE_BLOCK_8(_blockA.s5, _col), _result );      \
+            _result = mad( (Dtype8)(_blockB.s6), TRANSPOSE_BLOCK_8(_blockA.s6, _col), _result );      \
+            _result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result );      \
+        }
+
+#if TYPE == TYPE_HALF
+#define GEMM_TN(ALPHA1, BETA_NOT0) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
+    __read_only image2d_t A, \
+    __read_only image2d_t B, \
+    MATC_PARAMETER, \
+    KERNEL_ARG_DTYPE alpha_in, \
+    KERNEL_ARG_DTYPE beta_in, \
+    int width0, \
+    int isFirstColBlock) \
+{ \
+    const Dtype alpha = (Dtype)alpha_in; \
+    const Dtype beta = (Dtype)beta_in; \
+    const int group_x = get_group_id(0);\
+    const int group_y = get_group_id(1);\
+    Dtype8 blockAxB00 = 0;\
+    Dtype8 blockAxB01 = 0;\
+    Dtype8 blockAxB02 = 0;\
+    Dtype8 blockAxB03 = 0;\
+    int2    coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\
+    int2    coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\
+    do\
+    {\
+        int2    coordBTemp = coordB;\
+        Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K;\
+        int2    coordATemp = coordA;\
+        Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 16 * SIZE_OF_ELEMENT;\
+        Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.y += TILE_K;\
+        MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
+        MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
+        MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
+        MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
+    } \
+    while( coordB.y < width0 ); \
+    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
+}
+#else
+#define GEMM_TN(ALPHA1, BETA_NOT0) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
+    __read_only image2d_t A, \
+    __read_only image2d_t B, \
+    MATC_PARAMETER, \
+    KERNEL_ARG_DTYPE alpha_in, \
+    KERNEL_ARG_DTYPE beta_in, \
+    int width0, \
+    int isFirstColBlock) \
+{ \
+    const Dtype alpha = (Dtype)alpha_in; \
+    const Dtype beta = (Dtype)beta_in; \
+    const int group_x = get_group_id(0);\
+    const int group_y = get_group_id(1);\
+    Dtype8 blockAxB00 = 0.0f;\
+    Dtype8 blockAxB01 = 0.0f;\
+    Dtype8 blockAxB02 = 0.0f;\
+    Dtype8 blockAxB03 = 0.0f;\
+    int2    coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\
+    int2    coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\
+    do\
+    {\
+        int2    coordBTemp = coordB;\
+        Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) );    coordB.y += TILE_K;\
+        int2    coordATemp = coordA;\
+        Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT;\
+        Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT;\
+        Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT;\
+        Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.y += TILE_K;\
+        MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, 0 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, 0 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, 0 ); \
+    } \
+    while( coordB.y < width0 ); \
+    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
+}
+#endif
+
+GEMM_TN(1, 0) // ALPHA == 1, BETA == 0
+GEMM_TN(1, 1) // ALPHA == 1, BETA != 0
+GEMM_TN(0, 0) // ALPHA != 1, BETA == 0
+GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
+
+#undef MULTIPLY_BLOCKS_8x8
+#undef TRANSPOSE_BLOCK_8
+#undef GEMM_TN
+
+// The same as GEMM_NN
+#define TRANSPOSE_BLOCK_8( _block, _col )   \
+        (Dtype8)( intel_sub_group_shuffle( _block.s0, _col),   \
+                  intel_sub_group_shuffle( _block.s1, _col),   \
+                  intel_sub_group_shuffle( _block.s2, _col),   \
+                  intel_sub_group_shuffle( _block.s3, _col),   \
+                  intel_sub_group_shuffle( _block.s4, _col),   \
+                  intel_sub_group_shuffle( _block.s5, _col),   \
+                  intel_sub_group_shuffle( _block.s6, _col),   \
+                  intel_sub_group_shuffle( _block.s7, _col) )
+
+#if TYPE == TYPE_HALF
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB )    \
+        {   \
+            const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 );    \
+            const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 );    \
+            const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 );    \
+            const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 );    \
+            const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 );    \
+            const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 );    \
+            const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 );    \
+            const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 );    \
+            const Dtype8    acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 );    \
+            const Dtype8    acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 );    \
+            const Dtype8    acola = TRANSPOSE_BLOCK_8( _blockA, 10 );    \
+            const Dtype8    acolb = TRANSPOSE_BLOCK_8( _blockA, 11 );    \
+            const Dtype8    acolc = TRANSPOSE_BLOCK_8( _blockA, 12 );    \
+            const Dtype8    acold = TRANSPOSE_BLOCK_8( _blockA, 13 );    \
+            const Dtype8    acole = TRANSPOSE_BLOCK_8( _blockA, 14 );    \
+            const Dtype8    acolf = TRANSPOSE_BLOCK_8( _blockA, 15 );    \
+            _result = mad( (Dtype8)_blockB.s0, acol0, _result );      \
+            _result = mad( (Dtype8)_blockB.s1, acol1, _result );      \
+            _result = mad( (Dtype8)_blockB.s2, acol2, _result );      \
+            _result = mad( (Dtype8)_blockB.s3, acol3, _result );      \
+            _result = mad( (Dtype8)_blockB.s4, acol4, _result );      \
+            _result = mad( (Dtype8)_blockB.s5, acol5, _result );      \
+            _result = mad( (Dtype8)_blockB.s6, acol6, _result );      \
+            _result = mad( (Dtype8)_blockB.s7, acol7, _result );      \
+            _result = mad( (Dtype8)_blockB.s8, acol8, _result );      \
+            _result = mad( (Dtype8)_blockB.s9, acol9, _result );      \
+            _result = mad( (Dtype8)_blockB.sa, acola, _result );      \
+            _result = mad( (Dtype8)_blockB.sb, acolb, _result );      \
+            _result = mad( (Dtype8)_blockB.sc, acolc, _result );      \
+            _result = mad( (Dtype8)_blockB.sd, acold, _result );      \
+            _result = mad( (Dtype8)_blockB.se, acole, _result );      \
+            _result = mad( (Dtype8)_blockB.sf, acolf, _result );      \
+        }
+#else
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB )    \
+        {   \
+            const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 );    \
+            const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 );    \
+            const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 );    \
+            const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 );    \
+            const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 );    \
+            const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 );    \
+            const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 );    \
+            const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 );    \
+            _result = mad( (Dtype8)_blockB.s0, acol0, _result );      \
+            _result = mad( (Dtype8)_blockB.s1, acol1, _result );      \
+            _result = mad( (Dtype8)_blockB.s2, acol2, _result );      \
+            _result = mad( (Dtype8)_blockB.s3, acol3, _result );      \
+            _result = mad( (Dtype8)_blockB.s4, acol4, _result );      \
+            _result = mad( (Dtype8)_blockB.s5, acol5, _result );      \
+            _result = mad( (Dtype8)_blockB.s6, acol6, _result );      \
+            _result = mad( (Dtype8)_blockB.s7, acol7, _result );      \
+        }
+#endif
+
+#if TYPE == TYPE_HALF
+#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
+    __read_only image2d_t A, \
+    MATB_PARAMETER, \
+    MATC_PARAMETER, \
+    KERNEL_ARG_DTYPE alpha_in, \
+    KERNEL_ARG_DTYPE beta_in, \
+    int padded_k, \
+    int k, \
+    int isFirstColBlock) \
+{ \
+    const Dtype alpha = (Dtype)alpha_in; \
+    const Dtype beta = (Dtype)beta_in; \
+    const int group_x = get_group_id(0); \
+    const int group_y = get_group_id(1); \
+    Dtype8 blockAxB00 = 0; \
+    Dtype8 blockAxB01 = 0; \
+    Dtype8 blockAxB02 = 0; \
+    Dtype8 blockAxB03 = 0; \
+    int2    coordA = (int2)( 0, group_y * TILE_M ); \
+    int2    coordB = (int2)( 0, ( group_x * TILE_N )); \
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
+    do \
+    { \
+        Dtype16 blockB00; \
+        BLOCKB_READ8(blockB00, B, coordB); \
+        int2    coordATemp = coordA; \
+        Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
+        MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
+    } \
+    while( coordB.x < padded_k / VECSIZE ); \
+    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
+}
+#else
+#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
+    __read_only image2d_t A, \
+    MATB_PARAMETER, \
+    MATC_PARAMETER, \
+    KERNEL_ARG_DTYPE alpha_in, \
+    KERNEL_ARG_DTYPE beta_in, \
+    int padded_k, \
+    int k, \
+    int isFirstColBlock) \
+{ \
+    const Dtype alpha = (Dtype)alpha_in; \
+    const Dtype beta = (Dtype)beta_in; \
+    const int group_x = get_group_id(0); \
+    const int group_y = get_group_id(1); \
+    Dtype8 blockAxB00 = 0.0f; \
+    Dtype8 blockAxB01 = 0.0f; \
+    Dtype8 blockAxB02 = 0.0f; \
+    Dtype8 blockAxB03 = 0.0f; \
+    int2    coordA = (int2)( 0, group_y * TILE_M ); \
+    int2    coordB = (int2)( 0, ( group_x * TILE_N )); \
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
+    do \
+    { \
+        Dtype8 blockB00;  \
+        BLOCKB_READ8(blockB00, B, coordB); \
+        int2    coordATemp = coordA; \
+        Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.y += 8; \
+        Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.x += TILE_K * SIZE_OF_ELEMENT; \
+        MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
+    } \
+    while( coordB.x < padded_k / VECSIZE ); \
+    GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
+}
+#endif
+
+#if TYPE == TYPE_HALF
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4;
+#else
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
+#endif
+
+#define MATB_PARAMETER __read_only image2d_t B
+
+GEMM_NT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0
+GEMM_NT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0
+GEMM_NT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0
+GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#if TYPE == TYPE_HALF
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
+        _blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \
+        _coordB.x += TILE_K * 2;
+#else
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
+        _blockb = vload8(0, B_read); \
+        _coordB.x += TILE_K;
+#endif
+
+#define MATB_PARAMETER __global Dtype *B, int offB, int ldb
+
+GEMM_NT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0
+GEMM_NT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0
+GEMM_NT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0
+GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#if TYPE == TYPE_HALF
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        Dtype4 temp; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s0 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s1 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s2 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s3 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s4 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s5 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s6 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s7 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s8 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s9 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.sa = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.sb = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+         _blockb.sc = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.sd = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.se = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.sf = temp.s0; \
+        _coordB.x += 16;
+#else
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        Dtype4 temp; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s0 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s1 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s2 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s3 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s4 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s5 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s6 = temp.s0; \
+        temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s7 = temp.s0; \
+        _coordB.x += 8;
+#endif
+
+#define MATB_PARAMETER __read_only image2d_t B
+
+GEMM_NT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0
+GEMM_NT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0
+GEMM_NT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0
+GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#undef MULTIPLY_BLOCKS_8x8
+#undef TRANSPOSE_BLOCK_8
+#undef GEMM_NT
+
+//The same as GEMM_TN.
+#define TRANSPOSE_BLOCK_8(_vec, _col) \
+        (Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \
+                  intel_sub_group_shuffle(_vec, _col + 1), \
+                  intel_sub_group_shuffle(_vec, _col + 2), \
+                  intel_sub_group_shuffle(_vec, _col + 3), \
+                  intel_sub_group_shuffle(_vec, _col + 4), \
+                  intel_sub_group_shuffle(_vec, _col + 5), \
+                  intel_sub_group_shuffle(_vec, _col + 6), \
+                  intel_sub_group_shuffle(_vec, _col + 7) );
+
+#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col )    \
+        {   \
+            const Dtype8    acol0 = TRANSPOSE_BLOCK_8( _blockA.s0, _col );    \
+            const Dtype8    acol1 = TRANSPOSE_BLOCK_8( _blockA.s1, _col );    \
+            const Dtype8    acol2 = TRANSPOSE_BLOCK_8( _blockA.s2, _col );    \
+            const Dtype8    acol3 = TRANSPOSE_BLOCK_8( _blockA.s3, _col );    \
+            const Dtype8    acol4 = TRANSPOSE_BLOCK_8( _blockA.s4, _col );    \
+            const Dtype8    acol5 = TRANSPOSE_BLOCK_8( _blockA.s5, _col );    \
+            const Dtype8    acol6 = TRANSPOSE_BLOCK_8( _blockA.s6, _col );    \
+            const Dtype8    acol7 = TRANSPOSE_BLOCK_8( _blockA.s7, _col );    \
+            _result = mad( (Dtype8)_blockB.s0, acol0, _result );      \
+            _result = mad( (Dtype8)_blockB.s1, acol1, _result );      \
+            _result = mad( (Dtype8)_blockB.s2, acol2, _result );      \
+            _result = mad( (Dtype8)_blockB.s3, acol3, _result );      \
+            _result = mad( (Dtype8)_blockB.s4, acol4, _result );      \
+            _result = mad( (Dtype8)_blockB.s5, acol5, _result );      \
+            _result = mad( (Dtype8)_blockB.s6, acol6, _result );      \
+            _result = mad( (Dtype8)_blockB.s7, acol7, _result );      \
+        }
+
+#if TYPE == TYPE_HALF
+#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
+    __read_only image2d_t A, \
+    MATB_PARAMETER, \
+    MATC_PARAMETER, \
+    KERNEL_ARG_DTYPE alpha_in, \
+    KERNEL_ARG_DTYPE beta_in, \
+    int padded_k, \
+    int k, \
+    int isFirstColBlock) \
+{ \
+    const Dtype alpha = (Dtype)alpha_in; \
+    const Dtype beta = (Dtype)beta_in; \
+    const int group_x = get_group_id(0); \
+    const int group_y = get_group_id(1); \
+    Dtype8 blockAxB00 = 0; \
+    Dtype8 blockAxB01 = 0; \
+    Dtype8 blockAxB02 = 0; \
+    Dtype8 blockAxB03 = 0; \
+    int2    coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \
+    int2    coordB = (int2)( 0, ( group_x * TILE_N )); \
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
+    do \
+    { \
+        Dtype8 blockB00;             \
+        BLOCKB_READ8(blockB00, B, coordB); \
+        int2    coordATemp = coordA; \
+        Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 16 * SIZE_OF_ELEMENT;\
+        Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.y += TILE_K;\
+        MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
+        MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
+        MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
+        MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
+    } \
+    while( coordB.x < padded_k / VECSIZE ); \
+    GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
+}
+#else
+#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
+__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
+__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
+__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
+    __read_only image2d_t A, \
+    MATB_PARAMETER, \
+    MATC_PARAMETER, \
+    KERNEL_ARG_DTYPE alpha_in, \
+    KERNEL_ARG_DTYPE beta_in, \
+    int padded_k, \
+    int k, \
+    int isFirstColBlock) \
+{ \
+    const Dtype alpha = (Dtype)alpha_in; \
+    const Dtype beta = (Dtype)beta_in; \
+    const int group_x = get_group_id(0); \
+    const int group_y = get_group_id(1); \
+    Dtype8 blockAxB00 = 0.0f; \
+    Dtype8 blockAxB01 = 0.0f; \
+    Dtype8 blockAxB02 = 0.0f; \
+    Dtype8 blockAxB03 = 0.0f; \
+    int2    coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \
+    int2    coordB = (int2)( 0, ( group_x * TILE_N )); \
+    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
+    do \
+    { \
+        Dtype8 blockB00;             \
+        BLOCKB_READ8(blockB00, B, coordB); \
+        int2    coordATemp = coordA; \
+        Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT; \
+        Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT; \
+        Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordATemp.x += 8 * SIZE_OF_ELEMENT; \
+        Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) );    coordA.y += TILE_K; \
+        MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00 , blockB00, 0 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01 , blockB00, 0 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02 , blockB00, 0 ); \
+        MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03 , blockB00, 0 ); \
+    } \
+    while( coordB.x < padded_k / VECSIZE ); \
+    GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
+}
+#endif
+
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        _blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
+
+#define MATB_PARAMETER __read_only image2d_t B
+
+GEMM_TT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0
+GEMM_TT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0
+GEMM_TT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0
+GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#if TYPE == TYPE_HALF
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
+        _blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \
+        _coordB.x += TILE_K;
+#else
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
+        _blockb = vload8(0, B_read); \
+        _coordB.x += TILE_K;
+#endif
+
+#define MATB_PARAMETER __global Dtype *B, int offB, int ldb
+
+GEMM_TT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0
+GEMM_TT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0
+GEMM_TT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0
+GEMM_TT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#define BLOCKB_READ8(_blockb, _B, _coordB) \
+        int2 _coordBTemp = _coordB; \
+        _coordBTemp.y += get_local_id(0); \
+        Dtype4 temp; \
+        temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s0 = temp.s0; \
+        temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s1 = temp.s0; \
+        temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s2 = temp.s0; \
+        temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s3 = temp.s0; \
+        temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s4 = temp.s0; \
+        temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s5 = temp.s0; \
+        temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s6 = temp.s0; \
+        temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
+        _blockb.s7 = temp.s0; \
+        _coordB.x += 8;
+
+#define MATB_PARAMETER __read_only image2d_t B
+
+GEMM_TT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0
+GEMM_TT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0
+GEMM_TT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0
+GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
+#undef BLOCKB_READ8
+#undef MATB_PARAMETER
+
+#undef MULTIPLY_BLOCKS_8x8
+#undef TRANSPOSE_BLOCK_8
+#undef GEMM_TT
+
+#undef TILE_M
+#undef TILE_K
+#undef TILE_N
+#undef SUBGROUP_BLOCK_READ8
+#undef READ_IMAGE
+#undef SIZE_OF_ELEMENT
+
+__kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
+    __global Dtype* A,
+    __write_only image2d_t ImA,
+    int offA,
+    int width,
+    int height,
+    int ldA)
+{
+    const int gidx = get_global_id(0);
+    const int gidy = get_global_id(1);
+    int2 coord_dst = (int2)(gidx, gidy);
+    __global Dtype* A_off = A + offA;
+    Dtype srcA = A_off[gidy * ldA + gidx];
+#if TYPE == TYPE_HALF
+    write_imageh(ImA, coord_dst, (Dtype4)srcA);
+#else
+    write_imagef(ImA, coord_dst, (Dtype4)srcA);
+#endif
+}
+
+__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
+    __global Dtype* A,
+    __write_only image2d_t ImA,
+    int offA,
+    int width,
+    int height,
+    int ldA)
+{
+    const int gidx = get_global_id(0);
+    const int gidy = get_global_id(1);
+    int2 coord_dst = (int2)(gidx, gidy);
+#if TYPE == TYPE_HALF
+    if (gidx >= width || gidy >= height) {
+      write_imageh(ImA, coord_dst, 0);
+      return;
+    }
+    __global Dtype* A_off = A + offA;
+    write_imageh(ImA, coord_dst, A_off[gidy * ldA + gidx]);
+#else
+    if (gidx >= width || gidy >= height) {
+      write_imageui(ImA, coord_dst, (uint4)0);
+      return;
+    }
+    __global Dtype* A_off = A + offA;
+    uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));
+    write_imageui(ImA, coord_dst, srcA);
+#endif
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/im2col.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/im2col.cl
@@ -0,0 +1,71 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+__kernel void im2col(__global const T *im_src, int im_src_offset,
+                     int channels, int height_inp, int width_inp,
+                     int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w,
+                     int height_out, int width_out,
+                     __global T *im_col, int im_col_offset
+                    )
+{
+    int index = get_global_id(0);
+    if (index >= height_out * width_out * channels)
+        return;
+    int j_out = index % width_out;
+    int i_out = (index / width_out) % height_out;
+    int c_inp = (index / width_out) / height_out;
+
+    int c_out = c_inp * kernel_h * kernel_w;
+    int i_inp = i_out * stride_h - pad_h;
+    int j_inp = j_out * stride_w - pad_w;
+
+    im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset;
+    im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset;
+
+    for (int ki = 0; ki < kernel_h; ++ki)
+        for (int kj = 0; kj < kernel_w; ++kj) {
+            int i = i_inp + ki;
+            int j = j_inp + kj;
+            *im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ?
+                im_src[ki * width_inp + kj] : 0;
+            im_col += height_out * width_out;
+      }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/lrn.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/lrn.cl
@@ -0,0 +1,76 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
+__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp)
+  out[index] = in[index] * pow(scale[index], negative_beta);
+}
+
+__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) {
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) {
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
+    in = in + offset;
+    scale = scale + offset;
+    int head = 0;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+    T accum_scale = 0;
+    // fill the scale at [n, :, h, w]
+    // accumulate values
+    while (head < post_pad && head < channels) {
+      accum_scale += in[head * step] * in[head * step];
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_scale += in[head * step] * in[head * step];
+      if (head - size >= 0) {
+        accum_scale -= in[(head - size) * step]
+        * in[(head - size) * step];
+      }
+      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      if (head - size >= 0) {
+        accum_scale -= in[(head - size) * step]
+        * in[(head - size) * step];
+      }
+      scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
+      ++head;
+    }
+  }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/math.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/math.cl
@@ -0,0 +1,59 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define KERNEL_ARG_DTYPE float
+
+__kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,
+                                   const int offx, __global Dtype* y,
+                                   const int offy) {
+  for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
+    Dtype src = x[offx + index];
+    Dtype dst = y[offy + index];
+    y[offy + index] = convert_Dtype(alpha) * src + dst;
+  }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/matvec_mul.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/matvec_mul.cl
@@ -0,0 +1,195 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define KERNEL_ARG_DTYPE float
+
+__kernel void TEMPLATE(matvec_mul4,Dtype)(
+          __global const Dtype * A,
+          int offA,
+          unsigned int A_col_size,
+          unsigned int trail_item,
+          __global const Dtype * v,
+          int offv,
+          KERNEL_ARG_DTYPE alpha,
+          KERNEL_ARG_DTYPE beta,
+          __global Dtype4* result,
+          int offr,
+          __local Dtype4* work)
+{
+  unsigned int row_gid = get_group_id(0);
+  unsigned int lid = get_local_id(0);
+  const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;
+  const __global Dtype *src1_read = v + offv;
+  result = (__global Dtype4*)((__global Dtype*)result + offr);
+  Dtype4 dot0 = (Dtype4)(0.f);
+  Dtype4 dot1 = (Dtype4)(0.f);
+  Dtype4 dot2 = (Dtype4)(0.f);
+  Dtype4 dot3 = (Dtype4)(0.f);
+
+  unsigned int i = lid;
+  while( i < A_col_size / 4) {
+    const Dtype4 a0 = vload4(i, src0_read);
+    const Dtype4 a1 = vload4(i, src0_read + A_col_size);
+    const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
+    const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+    const Dtype4 b0 = vload4(i, src1_read);
+
+    dot0 += a0 * b0;
+    dot1 += a1 * b0;
+    dot2 += a2 * b0;
+    dot3 += a3 * b0;
+
+    i += get_local_size(0);
+  }
+
+  work[lid].s0 = dot0.x + dot0.y + dot0.z + dot0.w;
+  work[lid].s1 = dot1.x + dot1.y + dot1.z + dot1.w;
+  work[lid].s2 = dot2.x + dot2.y + dot2.z + dot2.w;
+  work[lid].s3 = dot3.x + dot3.y + dot3.z + dot3.w;
+
+  if(i == A_col_size / 4)
+  {
+    if(trail_item != 0)
+    {
+      const __global Dtype *src0_trail = src0_read + i * 4;
+      const __global Dtype *src1_trail = src1_read + i * 4;
+      for(unsigned int i = 0; i < trail_item; ++i) {
+        const Dtype at0 = src0_trail[i];
+        const Dtype at1 = src0_trail[i + A_col_size];
+        const Dtype at2 = src0_trail[i + 2 * A_col_size];
+        const Dtype at3 = src0_trail[i + 3 * A_col_size];
+
+        const Dtype bt = src1_trail[i];
+
+        work[lid].s0 += at0 * bt;
+        work[lid].s1 += at1 * bt;
+        work[lid].s2 += at2 * bt;
+        work[lid].s3 += at3 * bt;
+      }
+    }
+
+  }
+
+  for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {
+      barrier(CLK_LOCAL_MEM_FENCE);
+      if(lid < stride)
+        work[lid] += work[lid+stride];
+  }
+  if(lid == 0) {
+    if(beta == (Dtype)0)
+      result[row_gid] = convert_Dtype(alpha) * work[0];
+    else
+      result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];
+  }
+}
+
+/* This kernel used for the trailing rows when row_of_A %4 !=0 */
+__kernel void TEMPLATE(matvec_mul1,Dtype)(
+          __global const Dtype * A,
+          int offA,
+          unsigned int A_col_size,
+          unsigned int row_offset,
+          unsigned int trail_item,
+          __global const Dtype * v,
+          int offv,
+          KERNEL_ARG_DTYPE alpha,
+          KERNEL_ARG_DTYPE beta,
+          __global Dtype * result,
+          int offr,
+          __local Dtype * work)
+{
+  unsigned int row_gid = get_group_id(0);
+  unsigned int lid = get_local_id(0);
+
+  const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
+  const __global Dtype *src1_read = v + + offv;
+  result = result + offr;
+  Dtype4 dot0 = (Dtype4)(0.f);
+
+  unsigned int i = lid;
+  while( i < A_col_size / 4)
+  {
+    const Dtype4 a0 = vload4(i, src0_read);
+    const Dtype4 b0 = vload4(i, src1_read);
+
+    dot0 += a0 * b0;
+    i += get_local_size(0);
+  }
+
+  work[lid] = dot0.x + dot0.y + dot0.z + dot0.w;
+
+  if(i == A_col_size / 4)
+  {
+    if(trail_item != 0)
+    {
+      const __global Dtype *src0_trail = src0_read + i * 4;
+      const __global Dtype *src1_trail = src1_read + i * 4;
+      for(unsigned int i = 0; i < trail_item; ++i) {
+        const Dtype at0 = src0_trail[i];
+        const Dtype bt = src1_trail[i];
+
+        work[lid] += at0 * bt;
+      }
+    }
+
+  }
+  for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {
+      barrier(CLK_LOCAL_MEM_FENCE);
+      if(lid < stride)
+        work[lid] += work[lid+stride];
+  }
+
+  if(lid == 0) {
+    if(beta == (Dtype)0) {
+      result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];
+    } else {
+      result[row_gid+row_offset] *= convert_Dtype(beta);
+      result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];
+    }
+  }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/mvn.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/mvn.cl
@@ -0,0 +1,331 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+#define Dtype  float
+#define Dtype4 float4
+#define Dtype8 float8
+
+#if NUM == 8
+    #define load(src, index) vload8(0, src + index)
+    #define store(vec, dst, index) vstore8(vec, 0, dst + index)
+    #define vec_type Dtype8
+    #define CALC_MEAN calc_mean8
+    #define MVN mvn8
+    #define MEAN_FUSE mean_fuse8
+    #define MVN_FUSE mvn_fuse8
+#elif NUM == 4
+    #define load(src, index) vload4(0, src + index)
+    #define store(vec, dst, index) vstore4(vec, 0, dst + index)
+    #define vec_type Dtype4
+    #define CALC_MEAN calc_mean4
+    #define MVN mvn4
+    #define MEAN_FUSE mean_fuse4
+    #define MVN_FUSE mvn_fuse4
+#elif NUM == 1
+    #define load(src, index) src[index]
+    #define store(vec, dst, index) dst[index] = vec
+    #define vec_type Dtype
+    #define CALC_MEAN calc_mean1
+    #define MVN mvn1
+    #define MEAN_FUSE mean_fuse1
+    #define MVN_FUSE mvn_fuse1
+#endif
+
+#ifdef KERNEL_MEAN
+
+__kernel void CALC_MEAN(__global const Dtype* src,
+                        const int rows,
+                        const int cols,
+                        __global Dtype* mean,
+                        __global Dtype* dst)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * NUM;
+    int index = x * cols + y;
+
+    if (x >= rows || y >= cols)
+        return;
+
+    Dtype mean_val = mean[x];
+    vec_type src_vec = load(src, index);
+    vec_type dst_vec = src_vec - (vec_type)mean_val;
+    dst_vec = dst_vec * dst_vec;
+    store(dst_vec, dst, index);
+}
+
+#elif defined KERNEL_MVN
+
+__kernel void MVN(__global const Dtype* src,
+                  const int rows,
+                  const int cols,
+                  const Dtype eps,
+                  __global const Dtype* mean,
+                  __global const Dtype* dev,
+                  __global const Dtype* bnorm_weight,
+                  __global const Dtype* bnorm_bias,
+                  const int channels,
+                  const float relu_slope,
+                  __global Dtype* dst)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1) * NUM;
+    int index = x * cols + y;
+
+    if (x >= rows || y >= cols)
+        return;
+
+    Dtype mean_val = mean[x];
+    Dtype dev_val = dev[x];
+    Dtype alpha;
+#ifdef NORM_VARIANCE
+    alpha = 1 / sqrt(eps + dev_val);
+#else
+    alpha = 1;
+#endif
+
+    Dtype w = 1.f, b = 0.f;
+#ifdef FUSE_BATCH_NORM
+    w = bnorm_weight[x % channels];
+    b = bnorm_bias[x % channels];
+#endif
+
+    vec_type src_vec = load(src, index) - (vec_type)mean_val;
+    vec_type dst_vec = src_vec * alpha;
+    dst_vec = dst_vec * w + (vec_type)b;
+
+#ifdef FUSE_RELU
+    vec_type new_val = dst_vec * relu_slope;
+    dst_vec = select(new_val, dst_vec, dst_vec > (vec_type)0.f);
+#endif
+
+    store(dst_vec, dst, index);
+}
+
+#elif defined KERNEL_MEAN_FUSE
+
+__kernel void MEAN_FUSE(__global const T * A,
+                        unsigned int A_col_size,
+                        float alpha,
+                        __global T4 * mean,
+                        __global Dtype * tmp)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global T *src0_read = A + row_gid * 4 * A_col_size;
+    __global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;
+    Dtype4 dot0, dot1, dot2, dot3;
+    dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
+
+    unsigned int i = lid;
+    const Dtype4 b0 = (Dtype4)1.f;
+    while( i < A_col_size / 4)
+    {
+        const T4 a0 = vload4(i, src0_read);
+        const T4 a1 = vload4(i, src0_read + A_col_size);
+        const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
+        const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 += convert_float4(a0);
+        dot1 += convert_float4(a1);
+        dot2 += convert_float4(a2);
+        dot3 += convert_float4(a3);
+
+        i += LOCAL_SIZE;
+    }
+
+    __local Dtype4 work[LOCAL_SIZE];
+    work[lid].s0 = dot(dot0, b0);
+    work[lid].s1 = dot(dot1, b0);
+    work[lid].s2 = dot(dot2, b0);
+    work[lid].s3 = dot(dot3, b0);
+
+    for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < stride)
+            work[lid] += work[lid+stride];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    if(lid == 0)
+    {
+        mean[row_gid] = convert_T(alpha * work[0]);
+    }
+
+    Dtype4 sum = work[0] * alpha;
+    i = lid;
+    while( i < A_col_size / 4)
+    {
+        const T4 a0 = vload4(i, src0_read);
+        const T4 a1 = vload4(i, src0_read + A_col_size);
+        const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
+        const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 = convert_float4(a0) - (Dtype4)sum.x;
+        dot1 = convert_float4(a1) - (Dtype4)sum.y;
+        dot2 = convert_float4(a2) - (Dtype4)sum.z;
+        dot3 = convert_float4(a3) - (Dtype4)sum.w;
+        dot0 = dot0 * dot0;
+        dot1 = dot1 * dot1;
+        dot2 = dot2 * dot2;
+        dot3 = dot3 * dot3;
+
+        vstore4(dot0, i, dst0_read);
+        vstore4(dot1, i, dst0_read + A_col_size);
+        vstore4(dot2, i, dst0_read + 2 * A_col_size);
+        vstore4(dot3, i, dst0_read + 3 * A_col_size);
+
+        i += LOCAL_SIZE;
+    }
+}
+
+#elif defined KERNEL_MVN_FUSE
+
+__kernel void MVN_FUSE(__global const Dtype * tmp,
+                       __global const T * A,
+                       __global const T4 * mean,
+                       unsigned int A_col_size,
+                       const float alpha_val,
+                       const float eps,
+                       const float relu_slope,
+                       __global const Dtype4 * bnorm_weight,
+                       __global const Dtype4 * bnorm_bias,
+                       __global T * B)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
+    const __global T *src1_read = A + row_gid * 4 * A_col_size;
+    __global T *dst0_read = B + row_gid * 4 * A_col_size;
+    Dtype4 dot0, dot1, dot2, dot3;
+    dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
+
+    unsigned int i = lid;
+    const Dtype4 b0 = (Dtype4)1.f;
+    while( i < A_col_size / 4)
+    {
+        const Dtype4 a0 = vload4(i, src0_read);
+        const Dtype4 a1 = vload4(i, src0_read + A_col_size);
+        const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
+        const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
+
+        dot0 += a0;
+        dot1 += a1;
+        dot2 += a2;
+        dot3 += a3;
+
+        i += LOCAL_SIZE;
+    }
+
+    __local Dtype4 work[LOCAL_SIZE];
+    work[lid].s0 = dot(dot0, b0);
+    work[lid].s1 = dot(dot1, b0);
+    work[lid].s2 = dot(dot2, b0);
+    work[lid].s3 = dot(dot3, b0);
+
+    for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)
+    {
+        barrier(CLK_LOCAL_MEM_FENCE);
+        if(lid < stride)
+            work[lid] += work[lid+stride];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    Dtype4 mean_val = convert_float4(mean[row_gid]);
+    Dtype4 dev_val = sqrt(work[0] * alpha_val + (Dtype4)eps);
+    Dtype4 alpha = (Dtype4)1.f / dev_val;
+
+    Dtype4 w = (Dtype4)1.f;
+    Dtype4 b = (Dtype4)0.f;
+#ifdef FUSE_BATCH_NORM
+    w = bnorm_weight[row_gid];
+    b = bnorm_bias[row_gid];
+#endif
+
+    i = lid;
+    while( i < A_col_size / 4)
+    {
+        const T4 a0 = vload4(i, src1_read);
+        const T4 a1 = vload4(i, src1_read + A_col_size);
+        const T4 a2 = vload4(i, src1_read + 2 * A_col_size);
+        const T4 a3 = vload4(i, src1_read + 3 * A_col_size);
+
+        dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;
+        dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;
+        dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;
+        dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;
+
+        dot0 = dot0 * w.x + (Dtype4)b.x;
+        dot1 = dot1 * w.y + (Dtype4)b.y;
+        dot2 = dot2 * w.z + (Dtype4)b.z;
+        dot3 = dot3 * w.w + (Dtype4)b.w;
+
+#ifdef FUSE_RELU
+        Dtype4 new0 = dot0 * relu_slope;
+        dot0 = select(new0, dot0, dot0 > (Dtype4)0.f);
+
+        Dtype4 new1 = dot1 * relu_slope;
+        dot1 = select(new1, dot1, dot1 > (Dtype4)0.f);
+
+        Dtype4 new2 = dot2 * relu_slope;
+        dot2 = select(new2, dot2, dot2 > (Dtype4)0.f);
+
+        Dtype4 new3 = dot3 * relu_slope;
+        dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
+#endif
+
+        vstore4(convert_T(dot0), i, dst0_read);
+        vstore4(convert_T(dot1), i, dst0_read + A_col_size);
+        vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);
+        vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);
+
+        i += LOCAL_SIZE;
+    }
+}
+
+#else
+#error "Configuration error!"
+#endif
--- a/Lib/opencv/sources/modules/dnn/src/opencl/ocl4dnn_lrn.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/ocl4dnn_lrn.cl
@@ -0,0 +1,100 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+#define KERNEL_ARG_DTYPE float
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,
+                             const int num, const int channels,
+                             const int height, const int width, const int size,
+                             const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,
+                             __global Dtype* const out,
+                             const KERNEL_ARG_DTYPE negative_beta) {
+  for (int index = get_global_id(0); index < nthreads;
+      index += get_global_size(0)) {
+    // find out the local offset
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
+    __global const Dtype* in_off = in + offset;
+    __global Dtype* out_off = out + offset;
+    KERNEL_ARG_DTYPE scale_val;
+    int head = 0;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+    KERNEL_ARG_DTYPE accum_scale = 0;
+    // fill the scale at [n, :, h, w]
+    // accumulate values
+    while (head < post_pad && head < channels) {
+      accum_scale += in_off[head * step] * in_off[head * step];
+      ++head;
+    }
+    // both add and subtract
+    while (head < channels) {
+      accum_scale += in_off[head * step] * in_off[head * step];
+      if (head - size >= 0) {
+        accum_scale -= in_off[(head - size) * step]
+            * in_off[(head - size) * step];
+      }
+      scale_val = k + accum_scale * alpha_over_size;
+      out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
+      ++head;
+    }
+    // subtract only
+    while (head < channels + post_pad) {
+      if (head - size >= 0) {
+        accum_scale -= in_off[(head - size) * step]
+            * in_off[(head - size) * step];
+      }
+      scale_val = k + accum_scale * alpha_over_size;
+      out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
+      ++head;
+    }
+  }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/ocl4dnn_pooling.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/ocl4dnn_pooling.cl
@@ -0,0 +1,186 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+#if defined KERNEL_MAX_POOL
+
+__kernel void
+#ifdef HAVE_MASK
+    TEMPLATE(max_pool_forward_mask, Dtype)
+#else
+    TEMPLATE(max_pool_forward, Dtype)
+#endif
+(
+    const int nthreads, __global const Dtype* bottom_data,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    __global Dtype* top_data
+#ifdef HAVE_MASK
+    , __global Dtype* mask
+#endif
+)
+{
+  int index = get_global_id(0);
+  if (index >= nthreads)
+    return;
+
+  const int pw = index % pooled_width;
+  const int xx = index / pooled_width;
+  const int ph = xx % pooled_height;
+  const int ch = xx / pooled_height;
+  int hstart = ph * STRIDE_H - PAD_T;
+  int wstart = pw * STRIDE_W - PAD_L;
+  Dtype maxval = -FLT_MAX;
+  int maxidx = -1;
+  int in_offset = ch * height * width;
+  for (int h = 0; h < KERNEL_H; ++h)
+  {
+    int off_y = hstart + h;
+    if (off_y >= 0 && off_y < height)
+    {
+      for (int w = 0; w < KERNEL_W; ++w)
+      {
+        int off_x = wstart + w;
+        if (off_x >= 0 && off_x < width)
+        {
+          Dtype val = bottom_data[in_offset + off_y * width + off_x];
+          maxidx = (val > maxval) ? (off_y * width + off_x) : maxidx;
+          maxval = fmax(val, maxval);
+        }
+      }
+    }
+  }
+  top_data[index] = maxval;
+#ifdef HAVE_MASK
+  mask[index] = maxidx;
+#endif
+}
+
+#elif defined KERNEL_AVE_POOL
+
+__kernel void TEMPLATE(ave_pool_forward, Dtype)(
+    const int nthreads, __global const Dtype* bottom_data,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    __global Dtype* top_data)
+{
+  int index = get_global_id(0);
+  if (index >= nthreads)
+    return;
+
+  const int pw = index % pooled_width;
+  const int xx = index / pooled_width;
+  const int ph = xx % pooled_height;
+  const int ch = xx / pooled_height;
+  int hstart = ph * STRIDE_H - PAD_T;
+  int wstart = pw * STRIDE_W - PAD_L;
+  int hend = min(hstart + KERNEL_H, height + PAD_B);
+  int wend = min(wstart + KERNEL_W, width + PAD_R);
+  int pool_size;
+#ifdef AVE_POOL_PADDING_AREA
+  pool_size = (hend - hstart) * (wend - wstart);
+  hstart = max(hstart, (int)0);
+  wstart = max(wstart, (int)0);
+  hend = min(hend, height);
+  wend = min(wend, width);
+#else
+  hstart = max(hstart, (int)0);
+  wstart = max(wstart, (int)0);
+  hend = min(hend, height);
+  wend = min(wend, width);
+  pool_size = (hend - hstart) * (wend - wstart);
+#endif
+  Dtype aveval = 0;
+  int in_offset = ch * height * width;
+  for (int h = hstart; h < hend; ++h)
+  {
+    for (int w = wstart; w < wend; ++w)
+    {
+      aveval += bottom_data[in_offset + h * width + w];
+    }
+  }
+  top_data[index] = aveval / pool_size;
+}
+
+#elif defined KERNEL_STO_POOL
+
+__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
+    const int nthreads, __global const Dtype* bottom_data,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    __global Dtype* top_data)
+{
+  for (int index = get_global_id(0); index < nthreads;
+      index += get_global_size(0))
+  {
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;
+    const int hstart = ph * STRIDE_H;
+    const int hend = min(hstart + KERNEL_H, height);
+    const int wstart = pw * STRIDE_W;
+    const int wend = min(wstart + KERNEL_W, width);
+    // We set cumsum to be 0 to avoid divide-by-zero problems
+    Dtype cumsum = FLT_MIN;
+    Dtype cumvalues = 0.;
+    __global const Dtype* bottom_slice = bottom_data
+        + (n * channels + c) * height * width;
+    // First pass: get sum
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        Dtype v = bottom_slice[h * width + w];
+        cumsum += v;
+        cumvalues += v * v;
+      }
+    }
+    top_data[index] = cumvalues / cumsum;
+  }
+}
+
+#endif // KERNEL_*
--- a/Lib/opencv/sources/modules/dnn/src/opencl/permute.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/permute.cl
@@ -0,0 +1,69 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+__kernel void permute(const int nthreads,
+                      __global Dtype* bottom_data,
+                      global int* permute_order,
+                      global int* oldStride,
+                      global int* newStride,
+                      const int num_axes,
+                      __global Dtype* top_data)
+{
+    for (int i = get_global_id(0); i < nthreads; i += get_global_size(0))
+    {
+        int oldPosition = 0;
+        int newPosition = i;
+
+        for (int j = 0; j < num_axes; ++j)
+        {
+            int order = permute_order[j];
+            oldPosition += (newPosition / newStride[j]) * oldStride[order];
+            newPosition %= newStride[j];
+        }
+
+        top_data[i] = bottom_data[oldPosition];
+    }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/pooling.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/pooling.cl
@@ -0,0 +1,106 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
+__kernel void MaxPoolForward(const int nthreads,
+    __global T* bottom_data, const int num, const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+    __global T* top_data
+#ifdef MASK
+    , __global float* mask
+#endif
+    )
+{
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index += tmp) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+    int hstart = ph * stride_h - pad_t;
+    int wstart = pw * stride_w - pad_l;
+    const int hend = min(hstart + kernel_h, height);
+    const int wend = min(wstart + kernel_w, width);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T maxval = -FLT_MAX;
+    int maxidx = -1;
+    bottom_data =
+    bottom_data + (n * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        if (bottom_data[h * width + w] > maxval) {
+          maxidx = h * width + w;
+          maxval = bottom_data[maxidx];
+        }
+      }
+    }
+
+    top_data[index] = maxval;
+
+#ifdef MASK
+    mask[index] = maxidx;
+#endif
+  }
+}
+
+__kernel void AvePoolForward(const int nthreads,
+    __global T* bottom_data, const int num, const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w, const int pad_t, const int pad_l, const int pad_b, const int pad_r,
+    __global T* top_data
+#ifdef MASK
+    , __global float* mask // NOT USED
+#endif
+    )
+{
+  int index = get_global_id(0);
+  int tmp = get_global_size(0);
+  for(index; index < nthreads; index+=tmp) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_t; int wstart = pw * stride_w - pad_l;
+    int hend = min(hstart + kernel_h, height + pad_b);
+    int wend = min(wstart + kernel_w, width + pad_r);
+    const int pool_size = (hend - hstart) * (wend - wstart);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    hend = min(hend, height);
+    wend = min(wend, width);
+    T aveval = 0;
+    bottom_data =
+    bottom_data + (n * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        aveval += bottom_data[h * width + w];
+      }
+    }
+    top_data[index] = aveval / pool_size;
+  }
+
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/prior_box.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/prior_box.cl
@@ -0,0 +1,119 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+__kernel void prior_box(const int nthreads,
+                        const float stepX,
+                        const float stepY,
+                        __global const float* _offsetsX,
+                        __global const float* _offsetsY,
+                        const int offsetsX_size,
+                        __global const float* _widths,
+                        __global const float* _heights,
+                        const int widths_size,
+                        __global Dtype* dst,
+                        const int _layerHeight,
+                        const int _layerWidth,
+                        const int imgHeight,
+                        const int imgWidth)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        int w = index % _layerWidth;
+        int h = index / _layerWidth;
+        __global Dtype* outputPtr;
+
+        outputPtr = dst + index * 4 * offsetsX_size * widths_size;
+
+        float _boxWidth, _boxHeight;
+        Dtype4 vec;
+        for (int i = 0; i < widths_size; ++i)
+        {
+            _boxWidth = _widths[i];
+            _boxHeight = _heights[i];
+            for (int j = 0; j < offsetsX_size; ++j)
+            {
+                Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;
+                Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;
+
+                vec.x = (center_x - _boxWidth * 0.5f) / imgWidth;    // xmin
+                vec.y = (center_y - _boxHeight * 0.5f) / imgHeight;  // ymin
+                vec.z = (center_x + _boxWidth * 0.5f) / imgWidth;    // xmax
+                vec.w = (center_y + _boxHeight * 0.5f) / imgHeight;  // ymax
+                vstore4(vec, 0, outputPtr);
+
+                outputPtr += 4;
+            }
+        }
+    }
+}
+
+__kernel void set_variance(const int nthreads,
+                           const int offset,
+                           const int variance_size,
+                           __global const float* variance,
+                           __global Dtype* dst)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        Dtype4 var_vec;
+
+        if (variance_size == 1)
+            var_vec = (Dtype4)(variance[0]);
+        else
+            var_vec = convert_T(vload4(0, variance));
+
+        vstore4(var_vec, 0, dst + offset + index * 4);
+    }
+}
+
+__kernel void clip(const int nthreads,
+                   __global Dtype* dst)
+{
+    for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
+    {
+        Dtype4 vec = vload4(index, dst);
+        vstore4(clamp(vec, 0.0f, 1.0f), index, dst);
+    }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/region.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/region.cl
@@ -0,0 +1,109 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define Dtype float
+
+__kernel void logistic_activ(const int count,
+                             __global const Dtype* src,
+                             const int cell_size,
+                             __global Dtype* dst)
+{
+    for (int i = get_global_id(0); i < count; i += get_global_size(0))
+    {
+        int index = cell_size * i;
+        Dtype x = src[index + 4];
+        dst[index + 4] = 1.f / (1.f + exp(-x));
+    }
+}
+
+__kernel void softmax_activ(const int count,
+                            __global const Dtype* src,
+                            __global const Dtype* biasData,
+                            const int cell_size,
+                            const int classes,
+                            const int classfix,
+                            const int rows,
+                            const int cols,
+                            const int anchors,
+                            const float thresh,
+                            __global Dtype* dst)
+{
+    for (int index = get_global_id(0); index < count; index += get_global_size(0))
+    {
+        int box_index = index * cell_size;
+        float largest = -FLT_MAX;
+        __global const Dtype *input = src + box_index + 5;
+        __global Dtype *output = dst + box_index + 5;
+
+        for (int i = 0; i < classes; ++i)
+            largest = fmax(largest, input[i]);
+
+        float sum = 0;
+        for (int i = 0; i < classes; ++i)
+        {
+            float e = exp((input[i] - largest));
+            sum += e;
+            output[i] = e;
+        }
+
+        int y = (index / (anchors * cols)) % rows;
+        int x = (index / anchors) % cols;
+        int a = index % anchors;
+        float scale = dst[box_index + 4];
+        if (classfix == -1 && scale < .5) scale = 0;
+
+        float v1 = src[box_index + 0];
+        float v2 = src[box_index + 1];
+        float l1 = 1.f / (1.f + exp(-v1));
+        float l2 = 1.f / (1.f + exp(-v2));
+
+        dst[box_index + 0] = (x + l1) / cols;
+        dst[box_index + 1] = (y + l2) / rows;
+        dst[box_index + 2] = exp(src[box_index + 2]) * biasData[2 * a] / cols;
+        dst[box_index + 3] = exp(src[box_index + 3]) * biasData[2 * a + 1] / rows;
+
+        for (int i = 0; i < classes; ++i)
+        {
+            float prob = scale * output[i] / sum;
+            output[i] = (prob > thresh) ? prob : 0;
+        }
+    }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/slice.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/slice.cl
@@ -0,0 +1,81 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+__kernel void slice(__global const Dtype* src,
+                    const int src_plane_size,
+                    const int dst_plane_size,
+                    const int src_cols,
+                    const int dst_cols,
+                    const int row_offset,
+                    const int col_offset,
+                    __global Dtype* dst)
+{
+    unsigned int row_gid = get_group_id(0);
+    unsigned int lid = get_local_id(0);
+    const __global Dtype *src_read = src + row_gid * 4 * src_plane_size;
+    __global Dtype *dst_read = dst + row_gid * 4 * dst_plane_size;
+    Dtype4 a0, a1, a2, a3;
+
+    int i = lid;
+    while( i < dst_plane_size / 4)
+    {
+        int row = (4 * i) / dst_cols + row_offset;
+        int col = (4 * i) % dst_cols + col_offset;
+        int src_index = row * src_cols + col;
+
+        a0 = vload4(0, src_read + src_index);
+        a1 = vload4(0, src_read + src_index + src_plane_size);
+        a2 = vload4(0, src_read + src_index + 2 * src_plane_size);
+        a3 = vload4(0, src_read + src_index + 3 * src_plane_size);
+
+        vstore4(a0, i, dst_read);
+        vstore4(a1, i, dst_read + dst_plane_size);
+        vstore4(a2, i, dst_read + 2 * dst_plane_size);
+        vstore4(a3, i, dst_read + 3 * dst_plane_size);
+
+        i += get_local_size(0);
+    }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/softmax.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/softmax.cl
@@ -0,0 +1,83 @@
+/*************************************************************************************
+ * Copyright (c) 2015, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation and/or
+ *  other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ **************************************************************************************/
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+__kernel void kernel_channel_max(const int num, const int channels,
+    const int spatial_dim, __global const T* data, __global T* out) {
+  int index = get_global_id(0);
+  if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T maxval = -FLT_MAX;
+    for (int c = 0; c < channels; ++c) {
+      maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
+    }
+    out[index] = maxval;
+  }
+}
+
+__kernel void kernel_channel_subtract(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {
+  int index = get_global_id(0);
+  if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);
+  }
+}
+
+__kernel void kernel_channel_sum(const int num, const int channels,
+    const int spatial_dim, __global const T* data, __global T* channel_sum) {
+  int index = get_global_id(0);
+  if(index < num * spatial_dim) {
+    int n = index / spatial_dim;
+    int s = index % spatial_dim;
+    T sum = 0;
+    for (int c = 0; c < channels; ++c) {
+      sum += data[(n * channels + c) * spatial_dim + s];
+    }
+    channel_sum[index] = sum;
+  }
+}
+
+__kernel void kernel_channel_div(const int count,
+    const int num, const int channels,
+    const int spatial_dim, __global const T* channel_sum, __global T* data) {
+  int index = get_global_id(0);
+  if(index < count) {
+    int n = index / channels / spatial_dim;
+    int s = index % spatial_dim;
+    T v = data[index] / channel_sum[n * spatial_dim + s];
+#ifdef LOG_SOFTMAX
+    v = log(v);
+#endif
+    data[index] = v;
+  }
+}
--- a/Lib/opencv/sources/modules/dnn/src/opencl/softmax_loss.cl
+++ b/Lib/opencv/sources/modules/dnn/src/opencl/softmax_loss.cl
@@ -0,0 +1,193 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2017, Intel Corporation, all rights reserved.
+// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#define CONCAT(A,B) A##_##B
+#define TEMPLATE(name,type) CONCAT(name,type)
+
+#if defined(cl_intel_subgroups)
+#pragma OPENCL EXTENSION  cl_intel_subgroups : enable
+#endif
+
+#if defined(cl_khr_fp16)
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+#endif
+
+__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
+                                   const int spatial_dim,
+                                   __global Dtype* scale,
+                                   __global const Dtype* data,
+                                   __global Dtype* out,
+                                   __local Dtype *out_tmp,
+                                   __local Dtype *scale_tmp,
+                                   __local Dtype *group_tmp) {
+
+  int n = get_global_id(1);
+  for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
+      get_global_size(0), ++s) {
+    Dtype maxval = -DTYPE_MAX;
+    for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
+      Dtype tmp = data[(n * channels + c) * spatial_dim + s];
+      maxval = max((Dtype)tmp, (Dtype)maxval);
+    }
+    maxval = sub_group_reduce_max(maxval);
+    //if (get_sub_group_local_id() == 0)
+    group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
+      get_global_size(0)) {
+    int s = index / get_max_sub_group_size();
+    Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
+    //if (get_sub_group_local_id() == 0)
+    scale_tmp[s] = maxval;
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int index = get_global_id(0); index < channels * spatial_dim;
+      index += get_global_size(0)) {
+    int s = index % spatial_dim;
+    out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]);
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
+      get_global_size(0), ++s) {
+    Dtype sum = 0;
+    for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
+      sum += out_tmp[c * spatial_dim + s];
+    }
+    sum = sub_group_reduce_add(sum);
+    group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
+      get_global_size(0)) {
+    int s = index / get_max_sub_group_size();
+    Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
+    //if (get_sub_group_local_id() == 0)
+    scale_tmp[s] = sum;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int index = get_global_id(0); index < channels * spatial_dim;
+      index += get_global_size(0)) {
+    int s = index % spatial_dim;
+    Dtype v = out_tmp[index] / scale_tmp[s];
+#ifdef LOG_SOFTMAX
+    v = log(v);
+#endif
+    out[n * channels * spatial_dim + index] = v;
+  }
+}
+
+__kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
+                                   const int spatial_dim,
+                                   __global Dtype* scale,
+                                   __global const Dtype* data,
+                                   __global Dtype* out) {
+
+  int n = get_global_id(1);
+  __global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
+  for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
+      get_global_size(0), ++s) {
+    Dtype maxval = -DTYPE_MAX;
+    for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
+      Dtype tmp = data[(n * channels + c) * spatial_dim + s];
+      maxval = max((Dtype)tmp, (Dtype)maxval);
+    }
+    maxval = sub_group_reduce_max(maxval);
+    //if (get_sub_group_local_id() == 0)
+    group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
+  }
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
+      get_global_size(0)) {
+    int s = index / get_max_sub_group_size();
+    Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
+    //if (get_sub_group_local_id() == 0)
+    scale[n * spatial_dim + s] = maxval;
+  }
+
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  for (int index = get_global_id(0); index < channels * spatial_dim;
+      index += get_global_size(0)) {
+    int s = index % spatial_dim;
+    out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]);
+  }
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
+      get_global_size(0), ++s) {
+    Dtype sum = 0;
+    for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
+      sum += out[n * channels * spatial_dim + c * spatial_dim + s];
+    }
+    sum = sub_group_reduce_add(sum);
+    group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
+  }
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
+      get_global_size(0)) {
+    int s = index / get_max_sub_group_size();
+    Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
+    //if (get_sub_group_local_id() == 0)
+    scale[n * spatial_dim + s] = sum;
+  }
+  barrier(CLK_GLOBAL_MEM_FENCE);
+
+  for (int index = get_global_id(0); index < channels * spatial_dim;
+      index += get_global_size(0)) {
+    int s = index % spatial_dim;
+    Dtype v = out[n * channels * spatial_dim + index] / scale[n * spatial_dim + s];
+#ifdef LOG_SOFTMAX
+    v = log(v);
+#endif
+    out[n * channels * spatial_dim + index] = v;
+  }
+}