add part of opencv
This commit is contained in:
142
Lib/opencv/sources/modules/dnn/src/opencl/activations.cl
Normal file
142
Lib/opencv/sources/modules/dnn/src/opencl/activations.cl
Normal file
@@ -0,0 +1,142 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define CONCAT(A,B) A##_##B
|
||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||
#define KERNEL_ARG_DTYPE float
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void ReLUForward(const int count, __global const T* in, __global T* out
|
||||
#ifndef RELU_NO_SLOPE
|
||||
, KERNEL_ARG_DTYPE negative_slope
|
||||
#endif
|
||||
) {
|
||||
int index = get_global_id(0);
|
||||
if(index < count)
|
||||
#ifndef RELU_NO_SLOPE
|
||||
out[index] = in[index] > 0 ? in[index] : in[index] * negative_slope;
|
||||
#else
|
||||
out[index] = in[index] > 0 ? in[index] : 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
__kernel void ReLU6Forward(const int count, __global const T* in, __global T* out,
|
||||
const KERNEL_ARG_DTYPE minValue, const KERNEL_ARG_DTYPE maxValue)
|
||||
{
|
||||
int index = get_global_id(0);
|
||||
if(index < count)
|
||||
{
|
||||
T x = in[index];
|
||||
out[index] = clamp(x, convert_T(minValue), convert_T(maxValue));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void PReLUForward(const int count, const int channels, const int plane_size,
|
||||
__global const T* in, __global T* out,
|
||||
__global const KERNEL_ARG_DTYPE* slope_data)
|
||||
{
|
||||
int index = get_global_id(0);
|
||||
int c = (index / plane_size) % channels;
|
||||
if(index < count)
|
||||
out[index] = in[index] > 0 ? in[index] : in[index] * slope_data[c];
|
||||
}
|
||||
|
||||
__kernel void TanHForward(const int count, __global T* in, __global T* out) {
|
||||
int index = get_global_id(0);
|
||||
if(index < count)
|
||||
out[index] = tanh(in[index]);
|
||||
}
|
||||
|
||||
__kernel void SigmoidForward(const int count, __global const T* in, __global T* out) {
|
||||
int index = get_global_id(0);
|
||||
if(index < count)
|
||||
out[index] = 1.0f / (1.0f + exp(-in[index]));
|
||||
}
|
||||
|
||||
__kernel void SwishForward(const int count, __global const T* in, __global T* out) {
|
||||
int index = get_global_id(0);
|
||||
if(index < count)
|
||||
out[index] = in[index] / (1.0f + exp(-in[index]));
|
||||
}
|
||||
|
||||
__kernel void MishForward(const int count, __global const T* in, __global T* out) {
|
||||
int index = get_global_id(0);
|
||||
if(index < count)
|
||||
out[index] = in[index] * tanh(log(1.0f + exp(in[index])));
|
||||
}
|
||||
|
||||
__kernel void BNLLForward(const int n, __global const T* in, __global T* out) {
|
||||
int index = get_global_id(0);
|
||||
if (index < n) {
|
||||
T x = in[index];
|
||||
out[index] = x > 0 ? x + log(1.0f + exp(-x)) : log(1.0f + exp(x));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void AbsValForward(const int n, __global const T* in, __global T* out) {
|
||||
int index = get_global_id(0);
|
||||
if (index < n)
|
||||
out[index] = fabs(in[index]);
|
||||
}
|
||||
|
||||
__kernel void PowForward(const int n, __global const T* in, __global T* out,
|
||||
const KERNEL_ARG_DTYPE power,
|
||||
const KERNEL_ARG_DTYPE scale,
|
||||
const KERNEL_ARG_DTYPE shift)
|
||||
{
|
||||
int index = get_global_id(0);
|
||||
if (index < n)
|
||||
out[index] = pow(shift + scale * in[index], power);
|
||||
}
|
||||
|
||||
__kernel void ELUForward(const int n, __global const T* in, __global T* out)
|
||||
{
|
||||
int index = get_global_id(0);
|
||||
if (index < n)
|
||||
{
|
||||
T src = in[index];
|
||||
out[index] = (src >= 0.f) ? src : exp(src) - 1;
|
||||
}
|
||||
}
|
||||
87
Lib/opencv/sources/modules/dnn/src/opencl/batchnorm.cl
Normal file
87
Lib/opencv/sources/modules/dnn/src/opencl/batchnorm.cl
Normal file
@@ -0,0 +1,87 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#if NUM == 8
|
||||
#define load(src, index) vload8(0, src + index)
|
||||
#define store(vec, dst, index) vstore8(vec, 0, dst + index)
|
||||
#define float_type float8
|
||||
#define convert_f convert_float8
|
||||
#define BATCH_NORM batch_norm8
|
||||
#elif NUM == 4
|
||||
#define load(src, index) vload4(0, src + index)
|
||||
#define store(vec, dst, index) vstore4(vec, 0, dst + index)
|
||||
#define float_type float4
|
||||
#define convert_f convert_float4
|
||||
#define BATCH_NORM batch_norm4
|
||||
#elif NUM == 1
|
||||
#define load(src, index) src[index]
|
||||
#define store(vec, dst, index) dst[index] = vec
|
||||
#define float_type float
|
||||
#define convert_f convert_float
|
||||
#define BATCH_NORM batch_norm1
|
||||
#endif
|
||||
|
||||
__kernel void BATCH_NORM(__global const Dtype* src,
|
||||
const int rows,
|
||||
const int cols,
|
||||
const int channels,
|
||||
__global const float* weight,
|
||||
__global const float* bias,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1) * NUM;
|
||||
int index = x * cols + y;
|
||||
|
||||
if (x >= rows || y >= cols)
|
||||
return;
|
||||
|
||||
float w = weight[x % channels];
|
||||
float b = bias[x % channels];
|
||||
float_type src_vec = convert_f(load(src, index));
|
||||
float_type dst_vec = src_vec * w + (float_type)b;
|
||||
store(convert_T(dst_vec), dst, index);
|
||||
}
|
||||
79
Lib/opencv/sources/modules/dnn/src/opencl/col2im.cl
Normal file
79
Lib/opencv/sources/modules/dnn/src/opencl/col2im.cl
Normal file
@@ -0,0 +1,79 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
__kernel void col2im(const int n, __global const T* data_col,
|
||||
const int data_col_offset,
|
||||
const int channels,
|
||||
const int height, const int width,
|
||||
const int height_col, const int width_col,
|
||||
const int coeff_h, const int coeff_w,
|
||||
__global const T* biasvec,
|
||||
const int bias_offset,
|
||||
__global T* data_im,
|
||||
const int data_im_offset)
|
||||
{
|
||||
data_col = data_col + data_col_offset;
|
||||
biasvec = biasvec + bias_offset;
|
||||
data_im = data_im + data_im_offset;
|
||||
int index = get_global_id(0);
|
||||
|
||||
if(index < n)
|
||||
{
|
||||
T val = 0.f;
|
||||
int w = index % width + PAD_W;
|
||||
int h = (index / width) % height + PAD_H;
|
||||
int c = index / (width * height);
|
||||
int h_col_start = (h < KERNEL_H) ? 0 : (h - KERNEL_H) / STRIDE_H + 1;
|
||||
int h_col_end = min(h / STRIDE_H + 1, height_col);
|
||||
int plane_size_col = height_col * width_col;
|
||||
int offset = (c * KERNEL_H * KERNEL_W + h * KERNEL_W + w) * plane_size_col;
|
||||
|
||||
int w_col_start = (w < KERNEL_W) ? 0 : (w - KERNEL_W) / STRIDE_W + 1;
|
||||
int w_col_end = min(w / STRIDE_W + 1, width_col);
|
||||
|
||||
for (int h_col = h_col_start; h_col < h_col_end; ++h_col)
|
||||
for (int w_col = w_col_start; w_col < w_col_end; ++w_col)
|
||||
val += data_col[offset + h_col * coeff_h + w_col * coeff_w];
|
||||
|
||||
data_im[index] = val + biasvec[c];
|
||||
}
|
||||
}
|
||||
67
Lib/opencv/sources/modules/dnn/src/opencl/concat.cl
Normal file
67
Lib/opencv/sources/modules/dnn/src/opencl/concat.cl
Normal file
@@ -0,0 +1,67 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define CONCAT(A,B) A##_##B
|
||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||
|
||||
__kernel void TEMPLATE(concat, Dtype)(const int nthreads,
|
||||
__global const Dtype* in_data,
|
||||
const int num_concats,
|
||||
const int concat_size,
|
||||
const int top_concat_axis,
|
||||
const int bottom_concat_axis,
|
||||
const int offset_concat_axis,
|
||||
__global Dtype* out_data)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
|
||||
{
|
||||
const int total_concat_size = concat_size * bottom_concat_axis;
|
||||
const int concat_num = index / total_concat_size;
|
||||
const int concat_index = index % total_concat_size;
|
||||
const int top_index = concat_index +
|
||||
(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
|
||||
out_data[top_index] = in_data[index];
|
||||
}
|
||||
}
|
||||
1823
Lib/opencv/sources/modules/dnn/src/opencl/conv_layer_spatial.cl
Normal file
1823
Lib/opencv/sources/modules/dnn/src/opencl/conv_layer_spatial.cl
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,73 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define CONCAT(A,B) A##_##B
|
||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||
#define Dtype float
|
||||
|
||||
__kernel void TEMPLATE(copyWeightsSwizzled, Dtype)
|
||||
(__global Dtype* weightIn,
|
||||
__global Dtype* weightOut,
|
||||
const int kernel_w,
|
||||
const int kernel_h,
|
||||
const int channels,
|
||||
const int outputs,
|
||||
const int swizzleFactor) {
|
||||
|
||||
unsigned int sX = get_global_id(0);
|
||||
|
||||
//Original location
|
||||
|
||||
//Output location
|
||||
int outputSublayer = channels / swizzleFactor;
|
||||
int outputSublayerIndex = channels % swizzleFactor;
|
||||
|
||||
int filter = sX / (kernel_w*kernel_h*channels);
|
||||
int kernel_X = sX % kernel_w;
|
||||
int kernel_Y = (sX / kernel_w) % kernel_h;
|
||||
int kernel_C = (sX / (kernel_w * kernel_h)) % channels;
|
||||
|
||||
int FP = filter / swizzleFactor;
|
||||
int F1 = filter % swizzleFactor;
|
||||
|
||||
weightOut[FP*(kernel_w*kernel_h*channels*swizzleFactor) + kernel_C*(kernel_w*kernel_h*swizzleFactor) + kernel_Y*(kernel_w*swizzleFactor) + kernel_X*swizzleFactor + F1]
|
||||
= weightIn[filter*(kernel_w*kernel_h*channels) + kernel_C*(kernel_w*kernel_h) + kernel_Y*kernel_w + kernel_X];
|
||||
}
|
||||
199
Lib/opencv/sources/modules/dnn/src/opencl/detection_output.cl
Normal file
199
Lib/opencv/sources/modules/dnn/src/opencl/detection_output.cl
Normal file
@@ -0,0 +1,199 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define Dtype float
|
||||
#define Dtype4 float4
|
||||
|
||||
__kernel void DecodeBBoxesCORNER(const int nthreads,
|
||||
__global const Dtype* loc_data,
|
||||
__global const Dtype* prior_data,
|
||||
const int variance_encoded_in_target,
|
||||
const int num_priors,
|
||||
const int share_location,
|
||||
const int num_loc_classes,
|
||||
const int background_label_id,
|
||||
const int clip_bbox,
|
||||
const int locPredTransposed,
|
||||
__global Dtype* bbox_data)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
|
||||
{
|
||||
Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;
|
||||
const int i = index % 4;
|
||||
const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;
|
||||
|
||||
const int c = (index / 4) % num_loc_classes;
|
||||
int label = share_location ? -1 : c;
|
||||
if (label == background_label_id)
|
||||
return; // Ignore background class.
|
||||
|
||||
Dtype4 loc_vec = vload4(0, loc_data + index - i);
|
||||
Dtype4 bbox_vec, prior_variance;
|
||||
if (variance_encoded_in_target)
|
||||
{
|
||||
bbox_vec = loc_vec;
|
||||
} else {
|
||||
const int start_index = num_priors * 4 + p;
|
||||
prior_variance = vload4(0, prior_data + start_index);
|
||||
bbox_vec = loc_vec * prior_variance;
|
||||
}
|
||||
|
||||
if (locPredTransposed)
|
||||
{
|
||||
bbox_ymin = bbox_vec.x;
|
||||
bbox_xmin = bbox_vec.y;
|
||||
bbox_ymax = bbox_vec.z;
|
||||
bbox_xmax = bbox_vec.w;
|
||||
} else {
|
||||
bbox_xmin = bbox_vec.x;
|
||||
bbox_ymin = bbox_vec.y;
|
||||
bbox_xmax = bbox_vec.z;
|
||||
bbox_ymax = bbox_vec.w;
|
||||
}
|
||||
|
||||
Dtype4 prior_vec = vload4(0, prior_data + p);
|
||||
Dtype val;
|
||||
switch (i)
|
||||
{
|
||||
case 0:
|
||||
val = prior_vec.x + bbox_xmin;
|
||||
break;
|
||||
case 1:
|
||||
val = prior_vec.y + bbox_ymin;
|
||||
break;
|
||||
case 2:
|
||||
val = prior_vec.z + bbox_xmax;
|
||||
break;
|
||||
case 3:
|
||||
val = prior_vec.w + bbox_ymax;
|
||||
break;
|
||||
}
|
||||
|
||||
if (clip_bbox)
|
||||
val = max(min(val, (Dtype)1.), (Dtype)0.);
|
||||
|
||||
bbox_data[index] = val;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void DecodeBBoxesCENTER_SIZE(const int nthreads,
|
||||
__global const Dtype* loc_data,
|
||||
__global const Dtype* prior_data,
|
||||
const int variance_encoded_in_target,
|
||||
const int num_priors,
|
||||
const int share_location,
|
||||
const int num_loc_classes,
|
||||
const int background_label_id,
|
||||
const int clip_bbox,
|
||||
const int locPredTransposed,
|
||||
__global Dtype* bbox_data)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
|
||||
{
|
||||
Dtype bbox_xmin, bbox_ymin, bbox_xmax, bbox_ymax;
|
||||
const int i = index % 4;
|
||||
const int p = ((index / 4 / num_loc_classes) % num_priors) * 4;
|
||||
|
||||
const int c = (index / 4) % num_loc_classes;
|
||||
int label = share_location ? -1 : c;
|
||||
if (label == background_label_id)
|
||||
return; // Ignore background class.
|
||||
|
||||
Dtype4 loc_vec = vload4(0, loc_data + index - i);
|
||||
Dtype4 bbox_vec, prior_variance;
|
||||
if (variance_encoded_in_target)
|
||||
{
|
||||
bbox_vec = loc_vec;
|
||||
} else {
|
||||
const int start_index = num_priors * 4 + p;
|
||||
prior_variance = vload4(0, prior_data + start_index);
|
||||
bbox_vec = loc_vec * prior_variance;
|
||||
}
|
||||
|
||||
if (locPredTransposed)
|
||||
{
|
||||
bbox_ymin = bbox_vec.x;
|
||||
bbox_xmin = bbox_vec.y;
|
||||
bbox_ymax = bbox_vec.z;
|
||||
bbox_xmax = bbox_vec.w;
|
||||
} else {
|
||||
bbox_xmin = bbox_vec.x;
|
||||
bbox_ymin = bbox_vec.y;
|
||||
bbox_xmax = bbox_vec.z;
|
||||
bbox_ymax = bbox_vec.w;
|
||||
}
|
||||
|
||||
Dtype4 prior_vec = vload4(0, prior_data + p);
|
||||
Dtype prior_width = prior_vec.z - prior_vec.x;
|
||||
Dtype prior_height = prior_vec.w - prior_vec.y;
|
||||
Dtype prior_center_x = (prior_vec.x + prior_vec.z) * .5;
|
||||
Dtype prior_center_y = (prior_vec.y + prior_vec.w) * .5;
|
||||
|
||||
Dtype decode_bbox_center_x, decode_bbox_center_y;
|
||||
Dtype decode_bbox_width, decode_bbox_height;
|
||||
decode_bbox_center_x = bbox_xmin * prior_width + prior_center_x;
|
||||
decode_bbox_center_y = bbox_ymin * prior_height + prior_center_y;
|
||||
decode_bbox_width = exp(bbox_xmax) * prior_width;
|
||||
decode_bbox_height = exp(bbox_ymax) * prior_height;
|
||||
|
||||
Dtype val;
|
||||
switch (i)
|
||||
{
|
||||
case 0:
|
||||
val = decode_bbox_center_x - decode_bbox_width * .5;
|
||||
break;
|
||||
case 1:
|
||||
val = decode_bbox_center_y - decode_bbox_height * .5;
|
||||
break;
|
||||
case 2:
|
||||
val = decode_bbox_center_x + decode_bbox_width * .5;
|
||||
break;
|
||||
case 3:
|
||||
val = decode_bbox_center_y + decode_bbox_height * .5;
|
||||
break;
|
||||
}
|
||||
|
||||
if (clip_bbox)
|
||||
val = max(min(val, (Dtype)1.), (Dtype)0.);
|
||||
|
||||
bbox_data[index] = val;
|
||||
}
|
||||
}
|
||||
43
Lib/opencv/sources/modules/dnn/src/opencl/dummy.cl
Normal file
43
Lib/opencv/sources/modules/dnn/src/opencl/dummy.cl
Normal file
@@ -0,0 +1,43 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
__kernel void dummy_kernel()
|
||||
{
|
||||
}
|
||||
98
Lib/opencv/sources/modules/dnn/src/opencl/eltwise.cl
Normal file
98
Lib/opencv/sources/modules/dnn/src/opencl/eltwise.cl
Normal file
@@ -0,0 +1,98 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void op_sum4(__global const Dtype * A,
|
||||
__global const Dtype * B,
|
||||
unsigned int A_col_size,
|
||||
const float coeff1,
|
||||
const float coeff2,
|
||||
__global Dtype * C)
|
||||
{
|
||||
unsigned int row_gid = get_group_id(0);
|
||||
unsigned int lid = get_local_id(0);
|
||||
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size;
|
||||
const __global Dtype *src1_read = B + row_gid * 4 * A_col_size;
|
||||
__global Dtype *dst0_read = C + row_gid * 4 * A_col_size;
|
||||
|
||||
Dtype4 a0, a1, a2, a3;
|
||||
Dtype4 dot0, dot1, dot2, dot3;
|
||||
unsigned int i = lid;
|
||||
while( i < A_col_size / 4)
|
||||
{
|
||||
const Dtype4 b0 = vload4(i, src1_read);
|
||||
const Dtype4 b1 = vload4(i, src1_read + A_col_size);
|
||||
const Dtype4 b2 = vload4(i, src1_read + 2 * A_col_size);
|
||||
const Dtype4 b3 = vload4(i, src1_read + 3 * A_col_size);
|
||||
|
||||
#if LOOP == 0
|
||||
a0 = vload4(i, src0_read);
|
||||
a1 = vload4(i, src0_read + A_col_size);
|
||||
a2 = vload4(i, src0_read + 2 * A_col_size);
|
||||
a3 = vload4(i, src0_read + 3 * A_col_size);
|
||||
|
||||
dot0 = a0 * (Dtype4)coeff1 + b0 * (Dtype4)coeff2;
|
||||
dot1 = a1 * (Dtype4)coeff1 + b1 * (Dtype4)coeff2;
|
||||
dot2 = a2 * (Dtype4)coeff1 + b2 * (Dtype4)coeff2;
|
||||
dot3 = a3 * (Dtype4)coeff1 + b3 * (Dtype4)coeff2;
|
||||
#else
|
||||
a0 = vload4(i, dst0_read);
|
||||
a1 = vload4(i, dst0_read + A_col_size);
|
||||
a2 = vload4(i, dst0_read + 2 * A_col_size);
|
||||
a3 = vload4(i, dst0_read + 3 * A_col_size);
|
||||
|
||||
dot0 = a0 + b0 * (Dtype4)coeff2;
|
||||
dot1 = a1 + b1 * (Dtype4)coeff2;
|
||||
dot2 = a2 + b2 * (Dtype4)coeff2;
|
||||
dot3 = a3 + b3 * (Dtype4)coeff2;
|
||||
#endif
|
||||
vstore4(dot0, i, dst0_read);
|
||||
vstore4(dot1, i, dst0_read + A_col_size);
|
||||
vstore4(dot2, i, dst0_read + 2 * A_col_size);
|
||||
vstore4(dot3, i, dst0_read + 3 * A_col_size);
|
||||
|
||||
i += get_local_size(0);
|
||||
}
|
||||
}
|
||||
1342
Lib/opencv/sources/modules/dnn/src/opencl/gemm_buffer.cl
Normal file
1342
Lib/opencv/sources/modules/dnn/src/opencl/gemm_buffer.cl
Normal file
File diff suppressed because it is too large
Load Diff
994
Lib/opencv/sources/modules/dnn/src/opencl/gemm_image.cl
Normal file
994
Lib/opencv/sources/modules/dnn/src/opencl/gemm_image.cl
Normal file
@@ -0,0 +1,994 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define CONCAT(A,B) A##_##B
|
||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||
|
||||
#define KERNEL_ARG_DTYPE float
|
||||
#define TYPE_FLOAT 1
|
||||
#define TYPE_HALF 2
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define Dtype half
|
||||
#define Dtype2 half2
|
||||
#define Dtype4 half4
|
||||
#define Dtype8 half8
|
||||
#define Dtype16 half16
|
||||
|
||||
#define as_Dtype as_half
|
||||
#define as_Dtype2 as_half2
|
||||
#define as_Dtype4 as_half4
|
||||
#define as_Dtype8 as_half8
|
||||
#define as_Dtype16 as_half16
|
||||
#else
|
||||
#define Dtype float
|
||||
#define Dtype2 float2
|
||||
#define Dtype4 float4
|
||||
#define Dtype8 float8
|
||||
#define Dtype16 float16
|
||||
|
||||
#define as_Dtype as_float
|
||||
#define as_Dtype2 as_float2
|
||||
#define as_Dtype4 as_float4
|
||||
#define as_Dtype8 as_float8
|
||||
#define as_Dtype16 as_float16
|
||||
#endif
|
||||
|
||||
#if defined(cl_intel_subgroups)
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#endif
|
||||
|
||||
#define TILE_M 32
|
||||
#define TILE_K 8
|
||||
|
||||
// common block to calculate (alpha * AxB + beta * C) and output to destination image.
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read_us8( __image, __coord )
|
||||
#define SHUFFLE_TYPE2(val) as_ushort2(val)
|
||||
#define SHUFFLE_TYPE8(val) as_ushort8(val)
|
||||
#define READ_IMAGE(__image, __coord) read_imageh(__image, sampler, __coord)
|
||||
#define SIZE_OF_ELEMENT sizeof(ushort)
|
||||
#define SIMD_SIZE_GEMM 16
|
||||
#define TILE_N 16
|
||||
#else
|
||||
#define SUBGROUP_BLOCK_READ8( __image, __coord ) intel_sub_group_block_read8( __image, __coord )
|
||||
#define SHUFFLE_TYPE2(val) val
|
||||
#define SHUFFLE_TYPE8(val) val
|
||||
#define READ_IMAGE(__image, __coord) read_imagef(__image, sampler, __coord)
|
||||
#define SIZE_OF_ELEMENT sizeof(uint)
|
||||
#define SIMD_SIZE_GEMM 8
|
||||
#define TILE_N 8
|
||||
#endif
|
||||
|
||||
//#define USE_IMAGE_C
|
||||
#ifdef USE_IMAGE_C
|
||||
#if TYPE == TYPE_HALF
|
||||
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read_us8( _C, _coordC ) )
|
||||
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write_us8( _C, _coordC, as_ushort8( _val ) )
|
||||
#else
|
||||
#define BLOCKC_READ8( _C, _coordC ) as_Dtype8( intel_sub_group_block_read8( _C, _coordC ) )
|
||||
#define BLOCKC_WRITE8( _C, _coordC, _val ) intel_sub_group_block_write8( _C, _coordC, as_uint8( _val ) )
|
||||
#endif
|
||||
#define MATC_PARAMETER __read_only image2d_t C, __write_only image2d_t dst
|
||||
#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, C, dst, sizeof(uint))
|
||||
#else
|
||||
#define BLOCKC_READ8( _C, _coordC ) \
|
||||
(Dtype8) ( (_coordC.x + get_local_id(0) < N && _coordC.y < M) ? _C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] : 0, \
|
||||
(_coordC.x + get_local_id(0) < N && _coordC.y + 1 < M) ? _C[ ( _coordC.y + 1 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
|
||||
(_coordC.x + get_local_id(0) < N && _coordC.y + 2 < M) ? _C[ ( _coordC.y + 2 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
|
||||
(_coordC.x + get_local_id(0) < N && _coordC.y + 3 < M) ? _C[ ( _coordC.y + 3 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
|
||||
(_coordC.x + get_local_id(0) < N && _coordC.y + 4 < M) ? _C[ ( _coordC.y + 4 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
|
||||
(_coordC.x + get_local_id(0) < N && _coordC.y + 5 < M) ? _C[ ( _coordC.y + 5 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
|
||||
(_coordC.x + get_local_id(0) < N && _coordC.y + 6 < M) ? _C[ ( _coordC.y + 6 ) * ldc + _coordC.x + get_local_id(0) ] : 0, \
|
||||
(_coordC.x + get_local_id(0) < N && _coordC.y + 7 < M) ? _C[ ( _coordC.y + 7 ) * ldc + _coordC.x + get_local_id(0) ] : 0)
|
||||
|
||||
#define BLOCKC_WRITE8( _C, _coordC, _val) do {\
|
||||
if (_coordC.x + get_local_id(0) < N) { \
|
||||
if (_coordC.y < M) \
|
||||
_C[ _coordC.y * ldc + _coordC.x + get_local_id(0) ] = _val.s0; \
|
||||
if (_coordC.y + 1 < M) \
|
||||
_C[ ( _coordC.y + 1 )* ldc + _coordC.x + get_local_id(0) ] = _val.s1; \
|
||||
if (_coordC.y + 2 < M) \
|
||||
_C[ ( _coordC.y + 2 )* ldc + _coordC.x + get_local_id(0) ] = _val.s2; \
|
||||
if (_coordC.y + 3 < M) \
|
||||
_C[ ( _coordC.y + 3 )* ldc + _coordC.x + get_local_id(0) ] = _val.s3; \
|
||||
if (_coordC.y + 4 < M) \
|
||||
_C[ ( _coordC.y + 4 )* ldc + _coordC.x + get_local_id(0) ] = _val.s4; \
|
||||
if (_coordC.y + 5 < M) \
|
||||
_C[ ( _coordC.y + 5 )* ldc + _coordC.x + get_local_id(0) ] = _val.s5; \
|
||||
if (_coordC.y + 6 < M) \
|
||||
_C[ ( _coordC.y + 6 )* ldc + _coordC.x + get_local_id(0) ] = _val.s6; \
|
||||
if (_coordC.y + 7 < M) \
|
||||
_C[ ( _coordC.y + 7 )* ldc + _coordC.x + get_local_id(0) ] = _val.s7; \
|
||||
}} while(0)
|
||||
#define MATC_PARAMETER __global Dtype * C, const int offC, const int M, const int N, const int ldc
|
||||
#define GEMM_OUTPUT(ALPHA1, BETA_NOT0) GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, (C + offC), (C + offC), 1)
|
||||
#endif
|
||||
|
||||
#define GEMM_OUTPUT_EXT(ALPHA1, BETA_NOT0, _C, _dst, _C_step) \
|
||||
int2 coordDst = (int2)( ( group_x * TILE_N ) * _C_step, ( group_y * TILE_M ) ); \
|
||||
int2 coordC = coordDst; \
|
||||
Dtype8 blockC00; \
|
||||
Dtype8 blockC01; \
|
||||
Dtype8 blockC02; \
|
||||
Dtype8 blockC03; \
|
||||
if (BETA_NOT0) { \
|
||||
blockC00 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
|
||||
blockC01 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
|
||||
blockC02 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
|
||||
blockC03 = isFirstColBlock ? BLOCKC_READ8( _C, coordC ) * beta : BLOCKC_READ8( _C, coordC ); \
|
||||
if (!ALPHA1) { \
|
||||
blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
|
||||
blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
|
||||
blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \
|
||||
blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \
|
||||
} else { \
|
||||
blockC00 += blockAxB00; \
|
||||
blockC01 += blockAxB01; \
|
||||
blockC02 += blockAxB02; \
|
||||
blockC03 += blockAxB03; \
|
||||
} \
|
||||
} else { \
|
||||
blockC00 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
|
||||
blockC01 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
|
||||
blockC02 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); coordC.y += 8; \
|
||||
blockC03 = isFirstColBlock ? (Dtype)0. : BLOCKC_READ8( _C, coordC ); \
|
||||
if (!ALPHA1) { \
|
||||
blockC00 = mad(blockAxB00, (Dtype8)alpha, blockC00); \
|
||||
blockC01 = mad(blockAxB01, (Dtype8)alpha, blockC01); \
|
||||
blockC02 = mad(blockAxB02, (Dtype8)alpha, blockC02); \
|
||||
blockC03 = mad(blockAxB03, (Dtype8)alpha, blockC03); \
|
||||
} else { \
|
||||
blockC00 += blockAxB00; \
|
||||
blockC01 += blockAxB01; \
|
||||
blockC02 += blockAxB02; \
|
||||
blockC03 += blockAxB03; \
|
||||
} \
|
||||
} \
|
||||
BLOCKC_WRITE8( _dst, coordDst, blockC00 ); coordDst.y += 8; \
|
||||
BLOCKC_WRITE8( _dst, coordDst, blockC01 ); coordDst.y += 8; \
|
||||
BLOCKC_WRITE8( _dst, coordDst, blockC02 ); coordDst.y += 8; \
|
||||
BLOCKC_WRITE8( _dst, coordDst, blockC03 );
|
||||
|
||||
// Get the specified column of the block of the block
|
||||
#define TRANSPOSE_BLOCK_8( _block, _col ) \
|
||||
(Dtype8)( intel_sub_group_shuffle( _block.s0, _col ), \
|
||||
intel_sub_group_shuffle( _block.s1, _col ), \
|
||||
intel_sub_group_shuffle( _block.s2, _col ), \
|
||||
intel_sub_group_shuffle( _block.s3, _col ), \
|
||||
intel_sub_group_shuffle( _block.s4, _col ), \
|
||||
intel_sub_group_shuffle( _block.s5, _col ), \
|
||||
intel_sub_group_shuffle( _block.s6, _col ), \
|
||||
intel_sub_group_shuffle( _block.s7, _col ) );
|
||||
|
||||
// A's column block multiply B 's row block.
|
||||
#if TYPE == TYPE_HALF
|
||||
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB00, _blockB01 ) \
|
||||
{ \
|
||||
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
|
||||
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
|
||||
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
|
||||
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
|
||||
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
|
||||
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
|
||||
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
|
||||
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
|
||||
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
|
||||
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
|
||||
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
|
||||
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
|
||||
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
|
||||
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
|
||||
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
|
||||
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
|
||||
_result = mad( (Dtype8)(_blockB00.s0), acol0, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB00.s1), acol1, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB00.s2), acol2, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB00.s3), acol3, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB00.s4), acol4, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB00.s5), acol5, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB00.s6), acol6, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB00.s7), acol7, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB01.s0), acol8, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB01.s1), acol9, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB01.s2), acola, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB01.s3), acolb, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB01.s4), acolc, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB01.s5), acold, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB01.s6), acole, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB01.s7), acolf, _result ); \
|
||||
}
|
||||
#else
|
||||
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
|
||||
{ \
|
||||
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
|
||||
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
|
||||
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
|
||||
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
|
||||
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
|
||||
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
|
||||
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
|
||||
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
|
||||
_result = mad( (Dtype8)(_blockB.s0), acol0, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s1), acol1, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s2), acol2, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s3), acol3, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s4), acol4, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s5), acol5, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s6), acol6, _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s7), acol7, _result ); \
|
||||
}
|
||||
#endif
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define GEMM_NN(ALPHA1, BETA_NOT0) \
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
|
||||
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
|
||||
__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
|
||||
__read_only image2d_t A, \
|
||||
__read_only image2d_t B, \
|
||||
MATC_PARAMETER, \
|
||||
KERNEL_ARG_DTYPE alpha_in, \
|
||||
KERNEL_ARG_DTYPE beta_in, \
|
||||
int width0, \
|
||||
int isFirstColBlock) \
|
||||
{ \
|
||||
const Dtype alpha = (Dtype)alpha_in; \
|
||||
const Dtype beta = (Dtype)beta_in; \
|
||||
const int group_x = get_group_id(0); \
|
||||
const int group_y = get_group_id(1); \
|
||||
Dtype8 blockAxB00 = 0; \
|
||||
Dtype8 blockAxB01 = 0; \
|
||||
Dtype8 blockAxB02 = 0; \
|
||||
Dtype8 blockAxB03 = 0; \
|
||||
int2 coordA = (int2)( 0, group_y * TILE_M ); \
|
||||
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \
|
||||
do \
|
||||
{ \
|
||||
int2 coordBTemp = coordB; \
|
||||
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
|
||||
Dtype8 blockB01 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
|
||||
int2 coordATemp = coordA; \
|
||||
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, blockB01 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, blockB01 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, blockB01 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, blockB01 ); \
|
||||
} \
|
||||
while( coordB.y < width0 ); \
|
||||
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
|
||||
}
|
||||
#else
|
||||
#define GEMM_NN(ALPHA1, BETA_NOT0) \
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
|
||||
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
|
||||
__kernel void TEMPLATE(gemm_32_1_NN_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
|
||||
__read_only image2d_t A, \
|
||||
__read_only image2d_t B, \
|
||||
MATC_PARAMETER, \
|
||||
KERNEL_ARG_DTYPE alpha_in, \
|
||||
KERNEL_ARG_DTYPE beta_in, \
|
||||
int width0, \
|
||||
int isFirstColBlock) \
|
||||
{ \
|
||||
const Dtype alpha = (Dtype)alpha_in; \
|
||||
const Dtype beta = (Dtype)beta_in; \
|
||||
const int group_x = get_group_id(0); \
|
||||
const int group_y = get_group_id(1); \
|
||||
Dtype8 blockAxB00 = 0.0f; \
|
||||
Dtype8 blockAxB01 = 0.0f; \
|
||||
Dtype8 blockAxB02 = 0.0f; \
|
||||
Dtype8 blockAxB03 = 0.0f; \
|
||||
int2 coordA = (int2)( 0, group_y * TILE_M ); \
|
||||
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 ); \
|
||||
do \
|
||||
{ \
|
||||
int2 coordBTemp = coordB; \
|
||||
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K; \
|
||||
int2 coordATemp = coordA; \
|
||||
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
|
||||
} \
|
||||
while( coordB.y < width0 ); \
|
||||
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
|
||||
}
|
||||
#endif
|
||||
|
||||
GEMM_NN(1, 0) // ALPHA == 1, BETA == 0
|
||||
GEMM_NN(1, 1) // ALPHA == 1, BETA != 0
|
||||
GEMM_NN(0, 0) // ALPHA != 1, BETA == 0
|
||||
GEMM_NN(0, 1) // ALPHA != 1, BETA != 0
|
||||
|
||||
#undef TRANSPOSE_BLOCK_8
|
||||
#undef MULTIPLY_BLOCKS_8x8
|
||||
#undef GEMM_NN
|
||||
|
||||
// replicate the first row to column block.
|
||||
#define TRANSPOSE_BLOCK_8(_vec, _col) \
|
||||
(Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \
|
||||
intel_sub_group_shuffle(_vec, _col + 1), \
|
||||
intel_sub_group_shuffle(_vec, _col + 2), \
|
||||
intel_sub_group_shuffle(_vec, _col + 3), \
|
||||
intel_sub_group_shuffle(_vec, _col + 4), \
|
||||
intel_sub_group_shuffle(_vec, _col + 5), \
|
||||
intel_sub_group_shuffle(_vec, _col + 6), \
|
||||
intel_sub_group_shuffle(_vec, _col + 7) )
|
||||
|
||||
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \
|
||||
{ \
|
||||
_result = mad( (Dtype8)(_blockB.s0), TRANSPOSE_BLOCK_8(_blockA.s0, _col), _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s1), TRANSPOSE_BLOCK_8(_blockA.s1, _col), _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s2), TRANSPOSE_BLOCK_8(_blockA.s2, _col), _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s3), TRANSPOSE_BLOCK_8(_blockA.s3, _col), _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s4), TRANSPOSE_BLOCK_8(_blockA.s4, _col), _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s5), TRANSPOSE_BLOCK_8(_blockA.s5, _col), _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s6), TRANSPOSE_BLOCK_8(_blockA.s6, _col), _result ); \
|
||||
_result = mad( (Dtype8)(_blockB.s7), TRANSPOSE_BLOCK_8(_blockA.s7, _col), _result ); \
|
||||
}
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define GEMM_TN(ALPHA1, BETA_NOT0) \
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
|
||||
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
|
||||
__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
|
||||
__read_only image2d_t A, \
|
||||
__read_only image2d_t B, \
|
||||
MATC_PARAMETER, \
|
||||
KERNEL_ARG_DTYPE alpha_in, \
|
||||
KERNEL_ARG_DTYPE beta_in, \
|
||||
int width0, \
|
||||
int isFirstColBlock) \
|
||||
{ \
|
||||
const Dtype alpha = (Dtype)alpha_in; \
|
||||
const Dtype beta = (Dtype)beta_in; \
|
||||
const int group_x = get_group_id(0);\
|
||||
const int group_y = get_group_id(1);\
|
||||
Dtype8 blockAxB00 = 0;\
|
||||
Dtype8 blockAxB01 = 0;\
|
||||
Dtype8 blockAxB02 = 0;\
|
||||
Dtype8 blockAxB03 = 0;\
|
||||
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\
|
||||
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\
|
||||
do\
|
||||
{\
|
||||
int2 coordBTemp = coordB;\
|
||||
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\
|
||||
int2 coordATemp = coordA;\
|
||||
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
|
||||
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
|
||||
} \
|
||||
while( coordB.y < width0 ); \
|
||||
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
|
||||
}
|
||||
#else
|
||||
#define GEMM_TN(ALPHA1, BETA_NOT0) \
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
|
||||
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
|
||||
__kernel void TEMPLATE(gemm_32_1_TN_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
|
||||
__read_only image2d_t A, \
|
||||
__read_only image2d_t B, \
|
||||
MATC_PARAMETER, \
|
||||
KERNEL_ARG_DTYPE alpha_in, \
|
||||
KERNEL_ARG_DTYPE beta_in, \
|
||||
int width0, \
|
||||
int isFirstColBlock) \
|
||||
{ \
|
||||
const Dtype alpha = (Dtype)alpha_in; \
|
||||
const Dtype beta = (Dtype)beta_in; \
|
||||
const int group_x = get_group_id(0);\
|
||||
const int group_y = get_group_id(1);\
|
||||
Dtype8 blockAxB00 = 0.0f;\
|
||||
Dtype8 blockAxB01 = 0.0f;\
|
||||
Dtype8 blockAxB02 = 0.0f;\
|
||||
Dtype8 blockAxB03 = 0.0f;\
|
||||
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 );\
|
||||
int2 coordB = (int2)( ( group_x * TILE_N ) * SIZE_OF_ELEMENT, 0 );\
|
||||
do\
|
||||
{\
|
||||
int2 coordBTemp = coordB;\
|
||||
Dtype8 blockB00 = as_Dtype8( SUBGROUP_BLOCK_READ8( B, coordBTemp ) ); coordB.y += TILE_K;\
|
||||
int2 coordATemp = coordA;\
|
||||
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\
|
||||
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\
|
||||
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT;\
|
||||
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00, 0 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00, 0 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00, 0 ); \
|
||||
} \
|
||||
while( coordB.y < width0 ); \
|
||||
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
|
||||
}
|
||||
#endif
|
||||
|
||||
GEMM_TN(1, 0) // ALPHA == 1, BETA == 0
|
||||
GEMM_TN(1, 1) // ALPHA == 1, BETA != 0
|
||||
GEMM_TN(0, 0) // ALPHA != 1, BETA == 0
|
||||
GEMM_TN(0, 1) // ALPHA != 1, BETA != 0
|
||||
|
||||
#undef MULTIPLY_BLOCKS_8x8
|
||||
#undef TRANSPOSE_BLOCK_8
|
||||
#undef GEMM_TN
|
||||
|
||||
// The same as GEMM_NN
|
||||
#define TRANSPOSE_BLOCK_8( _block, _col ) \
|
||||
(Dtype8)( intel_sub_group_shuffle( _block.s0, _col), \
|
||||
intel_sub_group_shuffle( _block.s1, _col), \
|
||||
intel_sub_group_shuffle( _block.s2, _col), \
|
||||
intel_sub_group_shuffle( _block.s3, _col), \
|
||||
intel_sub_group_shuffle( _block.s4, _col), \
|
||||
intel_sub_group_shuffle( _block.s5, _col), \
|
||||
intel_sub_group_shuffle( _block.s6, _col), \
|
||||
intel_sub_group_shuffle( _block.s7, _col) )
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
|
||||
{ \
|
||||
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
|
||||
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
|
||||
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
|
||||
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
|
||||
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
|
||||
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
|
||||
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
|
||||
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
|
||||
const Dtype8 acol8 = TRANSPOSE_BLOCK_8( _blockA, 8 ); \
|
||||
const Dtype8 acol9 = TRANSPOSE_BLOCK_8( _blockA, 9 ); \
|
||||
const Dtype8 acola = TRANSPOSE_BLOCK_8( _blockA, 10 ); \
|
||||
const Dtype8 acolb = TRANSPOSE_BLOCK_8( _blockA, 11 ); \
|
||||
const Dtype8 acolc = TRANSPOSE_BLOCK_8( _blockA, 12 ); \
|
||||
const Dtype8 acold = TRANSPOSE_BLOCK_8( _blockA, 13 ); \
|
||||
const Dtype8 acole = TRANSPOSE_BLOCK_8( _blockA, 14 ); \
|
||||
const Dtype8 acolf = TRANSPOSE_BLOCK_8( _blockA, 15 ); \
|
||||
_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s8, acol8, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s9, acol9, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.sa, acola, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.sb, acolb, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.sc, acolc, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.sd, acold, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.se, acole, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.sf, acolf, _result ); \
|
||||
}
|
||||
#else
|
||||
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB ) \
|
||||
{ \
|
||||
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA, 0 ); \
|
||||
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA, 1 ); \
|
||||
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA, 2 ); \
|
||||
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA, 3 ); \
|
||||
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA, 4 ); \
|
||||
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA, 5 ); \
|
||||
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA, 6 ); \
|
||||
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA, 7 ); \
|
||||
_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
|
||||
}
|
||||
#endif
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
|
||||
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
|
||||
__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
|
||||
__read_only image2d_t A, \
|
||||
MATB_PARAMETER, \
|
||||
MATC_PARAMETER, \
|
||||
KERNEL_ARG_DTYPE alpha_in, \
|
||||
KERNEL_ARG_DTYPE beta_in, \
|
||||
int padded_k, \
|
||||
int k, \
|
||||
int isFirstColBlock) \
|
||||
{ \
|
||||
const Dtype alpha = (Dtype)alpha_in; \
|
||||
const Dtype beta = (Dtype)beta_in; \
|
||||
const int group_x = get_group_id(0); \
|
||||
const int group_y = get_group_id(1); \
|
||||
Dtype8 blockAxB00 = 0; \
|
||||
Dtype8 blockAxB01 = 0; \
|
||||
Dtype8 blockAxB02 = 0; \
|
||||
Dtype8 blockAxB03 = 0; \
|
||||
int2 coordA = (int2)( 0, group_y * TILE_M ); \
|
||||
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
|
||||
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
|
||||
do \
|
||||
{ \
|
||||
Dtype16 blockB00; \
|
||||
BLOCKB_READ8(blockB00, B, coordB); \
|
||||
int2 coordATemp = coordA; \
|
||||
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT * 2; \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
|
||||
} \
|
||||
while( coordB.x < padded_k / VECSIZE ); \
|
||||
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
|
||||
}
|
||||
#else
|
||||
#define GEMM_NT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
|
||||
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
|
||||
__kernel void TEMPLATE(gemm_32_1_NT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0,Dtype)( \
|
||||
__read_only image2d_t A, \
|
||||
MATB_PARAMETER, \
|
||||
MATC_PARAMETER, \
|
||||
KERNEL_ARG_DTYPE alpha_in, \
|
||||
KERNEL_ARG_DTYPE beta_in, \
|
||||
int padded_k, \
|
||||
int k, \
|
||||
int isFirstColBlock) \
|
||||
{ \
|
||||
const Dtype alpha = (Dtype)alpha_in; \
|
||||
const Dtype beta = (Dtype)beta_in; \
|
||||
const int group_x = get_group_id(0); \
|
||||
const int group_y = get_group_id(1); \
|
||||
Dtype8 blockAxB00 = 0.0f; \
|
||||
Dtype8 blockAxB01 = 0.0f; \
|
||||
Dtype8 blockAxB02 = 0.0f; \
|
||||
Dtype8 blockAxB03 = 0.0f; \
|
||||
int2 coordA = (int2)( 0, group_y * TILE_M ); \
|
||||
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
|
||||
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
|
||||
do \
|
||||
{ \
|
||||
Dtype8 blockB00; \
|
||||
BLOCKB_READ8(blockB00, B, coordB); \
|
||||
int2 coordATemp = coordA; \
|
||||
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.y += 8; \
|
||||
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.x += TILE_K * SIZE_OF_ELEMENT; \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01, blockB00 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02, blockB00 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03, blockB00 ); \
|
||||
} \
|
||||
while( coordB.x < padded_k / VECSIZE ); \
|
||||
GEMM_OUTPUT(ALPHA1, BETA_NOT0); \
|
||||
}
|
||||
#endif
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s89ab = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.scdef = READ_IMAGE(_B, _coordBTemp); _coordB.x += 4;
|
||||
#else
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
|
||||
#endif
|
||||
|
||||
#define MATB_PARAMETER __read_only image2d_t B
|
||||
|
||||
GEMM_NT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0
|
||||
GEMM_NT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0
|
||||
GEMM_NT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0
|
||||
GEMM_NT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
|
||||
#undef BLOCKB_READ8
|
||||
#undef MATB_PARAMETER
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
|
||||
_blockb = as_Dtype16(as_ushort16(vload8(0, B_read))); \
|
||||
_coordB.x += TILE_K * 2;
|
||||
#else
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * ldb) + _coordBTemp.x + offB); \
|
||||
_blockb = vload8(0, B_read); \
|
||||
_coordB.x += TILE_K;
|
||||
#endif
|
||||
|
||||
#define MATB_PARAMETER __global Dtype *B, int offB, int ldb
|
||||
|
||||
GEMM_NT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0
|
||||
GEMM_NT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0
|
||||
GEMM_NT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0
|
||||
GEMM_NT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
|
||||
#undef BLOCKB_READ8
|
||||
#undef MATB_PARAMETER
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
Dtype4 temp; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s0 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s1 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s2 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s3 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s4 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s5 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s6 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s7 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s8 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s9 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.sa = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.sb = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.sc = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.sd = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.se = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.sf = temp.s0; \
|
||||
_coordB.x += 16;
|
||||
#else
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
Dtype4 temp; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s0 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s1 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s2 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s3 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s4 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s5 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s6 = temp.s0; \
|
||||
temp = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s7 = temp.s0; \
|
||||
_coordB.x += 8;
|
||||
#endif
|
||||
|
||||
#define MATB_PARAMETER __read_only image2d_t B
|
||||
|
||||
GEMM_NT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0
|
||||
GEMM_NT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0
|
||||
GEMM_NT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0
|
||||
GEMM_NT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
|
||||
#undef BLOCKB_READ8
|
||||
#undef MATB_PARAMETER
|
||||
|
||||
#undef MULTIPLY_BLOCKS_8x8
|
||||
#undef TRANSPOSE_BLOCK_8
|
||||
#undef GEMM_NT
|
||||
|
||||
//The same as GEMM_TN.
|
||||
#define TRANSPOSE_BLOCK_8(_vec, _col) \
|
||||
(Dtype8)( intel_sub_group_shuffle(_vec, _col + 0), \
|
||||
intel_sub_group_shuffle(_vec, _col + 1), \
|
||||
intel_sub_group_shuffle(_vec, _col + 2), \
|
||||
intel_sub_group_shuffle(_vec, _col + 3), \
|
||||
intel_sub_group_shuffle(_vec, _col + 4), \
|
||||
intel_sub_group_shuffle(_vec, _col + 5), \
|
||||
intel_sub_group_shuffle(_vec, _col + 6), \
|
||||
intel_sub_group_shuffle(_vec, _col + 7) );
|
||||
|
||||
#define MULTIPLY_BLOCKS_8x8( _result, _blockA, _blockB, _col ) \
|
||||
{ \
|
||||
const Dtype8 acol0 = TRANSPOSE_BLOCK_8( _blockA.s0, _col ); \
|
||||
const Dtype8 acol1 = TRANSPOSE_BLOCK_8( _blockA.s1, _col ); \
|
||||
const Dtype8 acol2 = TRANSPOSE_BLOCK_8( _blockA.s2, _col ); \
|
||||
const Dtype8 acol3 = TRANSPOSE_BLOCK_8( _blockA.s3, _col ); \
|
||||
const Dtype8 acol4 = TRANSPOSE_BLOCK_8( _blockA.s4, _col ); \
|
||||
const Dtype8 acol5 = TRANSPOSE_BLOCK_8( _blockA.s5, _col ); \
|
||||
const Dtype8 acol6 = TRANSPOSE_BLOCK_8( _blockA.s6, _col ); \
|
||||
const Dtype8 acol7 = TRANSPOSE_BLOCK_8( _blockA.s7, _col ); \
|
||||
_result = mad( (Dtype8)_blockB.s0, acol0, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s1, acol1, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s2, acol2, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s3, acol3, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s4, acol4, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s5, acol5, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s6, acol6, _result ); \
|
||||
_result = mad( (Dtype8)_blockB.s7, acol7, _result ); \
|
||||
}
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
|
||||
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
|
||||
__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
|
||||
__read_only image2d_t A, \
|
||||
MATB_PARAMETER, \
|
||||
MATC_PARAMETER, \
|
||||
KERNEL_ARG_DTYPE alpha_in, \
|
||||
KERNEL_ARG_DTYPE beta_in, \
|
||||
int padded_k, \
|
||||
int k, \
|
||||
int isFirstColBlock) \
|
||||
{ \
|
||||
const Dtype alpha = (Dtype)alpha_in; \
|
||||
const Dtype beta = (Dtype)beta_in; \
|
||||
const int group_x = get_group_id(0); \
|
||||
const int group_y = get_group_id(1); \
|
||||
Dtype8 blockAxB00 = 0; \
|
||||
Dtype8 blockAxB01 = 0; \
|
||||
Dtype8 blockAxB02 = 0; \
|
||||
Dtype8 blockAxB03 = 0; \
|
||||
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \
|
||||
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
|
||||
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
|
||||
do \
|
||||
{ \
|
||||
Dtype8 blockB00; \
|
||||
BLOCKB_READ8(blockB00, B, coordB); \
|
||||
int2 coordATemp = coordA; \
|
||||
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 16 * SIZE_OF_ELEMENT;\
|
||||
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K;\
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00, blockB00, 0); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA00, blockB00, 8); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA01, blockB00, 0); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA01, blockB00, 8); \
|
||||
} \
|
||||
while( coordB.x < padded_k / VECSIZE ); \
|
||||
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
|
||||
}
|
||||
#else
|
||||
#define GEMM_TT(ALPHA1, BETA_NOT0, VECSCALAR, VECSIZE) \
|
||||
__attribute__((intel_reqd_sub_group_size(SIMD_SIZE_GEMM))) \
|
||||
__attribute__((reqd_work_group_size(SIMD_SIZE_GEMM, 1, 1))) \
|
||||
__kernel void TEMPLATE(gemm_32_1_TT_ ##VECSCALAR ##_ ##ALPHA1 ##_ ##BETA_NOT0, Dtype)( \
|
||||
__read_only image2d_t A, \
|
||||
MATB_PARAMETER, \
|
||||
MATC_PARAMETER, \
|
||||
KERNEL_ARG_DTYPE alpha_in, \
|
||||
KERNEL_ARG_DTYPE beta_in, \
|
||||
int padded_k, \
|
||||
int k, \
|
||||
int isFirstColBlock) \
|
||||
{ \
|
||||
const Dtype alpha = (Dtype)alpha_in; \
|
||||
const Dtype beta = (Dtype)beta_in; \
|
||||
const int group_x = get_group_id(0); \
|
||||
const int group_y = get_group_id(1); \
|
||||
Dtype8 blockAxB00 = 0.0f; \
|
||||
Dtype8 blockAxB01 = 0.0f; \
|
||||
Dtype8 blockAxB02 = 0.0f; \
|
||||
Dtype8 blockAxB03 = 0.0f; \
|
||||
int2 coordA = (int2)( group_y * TILE_M * SIZE_OF_ELEMENT, 0 ); \
|
||||
int2 coordB = (int2)( 0, ( group_x * TILE_N )); \
|
||||
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; \
|
||||
do \
|
||||
{ \
|
||||
Dtype8 blockB00; \
|
||||
BLOCKB_READ8(blockB00, B, coordB); \
|
||||
int2 coordATemp = coordA; \
|
||||
Dtype8 blockA00 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \
|
||||
Dtype8 blockA01 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \
|
||||
Dtype8 blockA02 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordATemp.x += 8 * SIZE_OF_ELEMENT; \
|
||||
Dtype8 blockA03 = as_Dtype8( SUBGROUP_BLOCK_READ8( A, coordATemp ) ); coordA.y += TILE_K; \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB00, blockA00 , blockB00, 0 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB01, blockA01 , blockB00, 0 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB02, blockA02 , blockB00, 0 ); \
|
||||
MULTIPLY_BLOCKS_8x8( blockAxB03, blockA03 , blockB00, 0 ); \
|
||||
} \
|
||||
while( coordB.x < padded_k / VECSIZE ); \
|
||||
GEMM_OUTPUT(ALPHA1, BETA_NOT0);\
|
||||
}
|
||||
#endif
|
||||
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
_blockb.s0123 = READ_IMAGE(_B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s4567 = READ_IMAGE(_B, _coordBTemp); _coordB.x += 2;
|
||||
|
||||
#define MATB_PARAMETER __read_only image2d_t B
|
||||
|
||||
GEMM_TT(1, 0, VEC4, 4) // ALPHA == 1, BETA == 0
|
||||
GEMM_TT(1, 1, VEC4, 4) // ALPHA == 1, BETA != 0
|
||||
GEMM_TT(0, 0, VEC4, 4) // ALPHA != 1, BETA == 0
|
||||
GEMM_TT(0, 1, VEC4, 4) // ALPHA != 1, BETA != 0
|
||||
#undef BLOCKB_READ8
|
||||
#undef MATB_PARAMETER
|
||||
|
||||
#if TYPE == TYPE_HALF
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
const __global float *B_read = (__global float *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
|
||||
_blockb = as_Dtype8(as_ushort8(vload4(0, B_read))); \
|
||||
_coordB.x += TILE_K;
|
||||
#else
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
const __global Dtype *B_read = (__global Dtype *)(_B + (_coordBTemp.y * k) + _coordBTemp.x + offB); \
|
||||
_blockb = vload8(0, B_read); \
|
||||
_coordB.x += TILE_K;
|
||||
#endif
|
||||
|
||||
#define MATB_PARAMETER __global Dtype *B, int offB, int ldb
|
||||
|
||||
GEMM_TT(1, 0, BUFFER, 1) // ALPHA == 1, BETA == 0
|
||||
GEMM_TT(1, 1, BUFFER, 1) // ALPHA == 1, BETA != 0
|
||||
GEMM_TT(0, 0, BUFFER, 1) // ALPHA != 1, BETA == 0
|
||||
GEMM_TT(0, 1, BUFFER, 1) // ALPHA != 1, BETA != 0
|
||||
#undef BLOCKB_READ8
|
||||
#undef MATB_PARAMETER
|
||||
|
||||
#define BLOCKB_READ8(_blockb, _B, _coordB) \
|
||||
int2 _coordBTemp = _coordB; \
|
||||
_coordBTemp.y += get_local_id(0); \
|
||||
Dtype4 temp; \
|
||||
temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s0 = temp.s0; \
|
||||
temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s1 = temp.s0; \
|
||||
temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s2 = temp.s0; \
|
||||
temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s3 = temp.s0; \
|
||||
temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s4 = temp.s0; \
|
||||
temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s5 = temp.s0; \
|
||||
temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s6 = temp.s0; \
|
||||
temp = READ_IMAGE(B, _coordBTemp); _coordBTemp.x += 1; \
|
||||
_blockb.s7 = temp.s0; \
|
||||
_coordB.x += 8;
|
||||
|
||||
#define MATB_PARAMETER __read_only image2d_t B
|
||||
|
||||
GEMM_TT(1, 0, SCALAR, 1) // ALPHA == 1, BETA == 0
|
||||
GEMM_TT(1, 1, SCALAR, 1) // ALPHA == 1, BETA != 0
|
||||
GEMM_TT(0, 0, SCALAR, 1) // ALPHA != 1, BETA == 0
|
||||
GEMM_TT(0, 1, SCALAR, 1) // ALPHA != 1, BETA != 0
|
||||
#undef BLOCKB_READ8
|
||||
#undef MATB_PARAMETER
|
||||
|
||||
#undef MULTIPLY_BLOCKS_8x8
|
||||
#undef TRANSPOSE_BLOCK_8
|
||||
#undef GEMM_TT
|
||||
|
||||
#undef TILE_M
|
||||
#undef TILE_K
|
||||
#undef TILE_N
|
||||
#undef SUBGROUP_BLOCK_READ8
|
||||
#undef READ_IMAGE
|
||||
#undef SIZE_OF_ELEMENT
|
||||
|
||||
__kernel void TEMPLATE(gemm_buffer_copy_image_transpose, Dtype)(
|
||||
__global Dtype* A,
|
||||
__write_only image2d_t ImA,
|
||||
int offA,
|
||||
int width,
|
||||
int height,
|
||||
int ldA)
|
||||
{
|
||||
const int gidx = get_global_id(0);
|
||||
const int gidy = get_global_id(1);
|
||||
int2 coord_dst = (int2)(gidx, gidy);
|
||||
__global Dtype* A_off = A + offA;
|
||||
Dtype srcA = A_off[gidy * ldA + gidx];
|
||||
#if TYPE == TYPE_HALF
|
||||
write_imageh(ImA, coord_dst, (Dtype4)srcA);
|
||||
#else
|
||||
write_imagef(ImA, coord_dst, (Dtype4)srcA);
|
||||
#endif
|
||||
}
|
||||
|
||||
__kernel void TEMPLATE(gemm_buffer_copy_image_no_transpose, Dtype)(
|
||||
__global Dtype* A,
|
||||
__write_only image2d_t ImA,
|
||||
int offA,
|
||||
int width,
|
||||
int height,
|
||||
int ldA)
|
||||
{
|
||||
const int gidx = get_global_id(0);
|
||||
const int gidy = get_global_id(1);
|
||||
int2 coord_dst = (int2)(gidx, gidy);
|
||||
#if TYPE == TYPE_HALF
|
||||
if (gidx >= width || gidy >= height) {
|
||||
write_imageh(ImA, coord_dst, 0);
|
||||
return;
|
||||
}
|
||||
__global Dtype* A_off = A + offA;
|
||||
write_imageh(ImA, coord_dst, A_off[gidy * ldA + gidx]);
|
||||
#else
|
||||
if (gidx >= width || gidy >= height) {
|
||||
write_imageui(ImA, coord_dst, (uint4)0);
|
||||
return;
|
||||
}
|
||||
__global Dtype* A_off = A + offA;
|
||||
uint4 srcA = convert_uint4(as_uchar4(A_off[gidy * ldA + gidx]));
|
||||
write_imageui(ImA, coord_dst, srcA);
|
||||
#endif
|
||||
}
|
||||
71
Lib/opencv/sources/modules/dnn/src/opencl/im2col.cl
Normal file
71
Lib/opencv/sources/modules/dnn/src/opencl/im2col.cl
Normal file
@@ -0,0 +1,71 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
__kernel void im2col(__global const T *im_src, int im_src_offset,
|
||||
int channels, int height_inp, int width_inp,
|
||||
int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w,
|
||||
int height_out, int width_out,
|
||||
__global T *im_col, int im_col_offset
|
||||
)
|
||||
{
|
||||
int index = get_global_id(0);
|
||||
if (index >= height_out * width_out * channels)
|
||||
return;
|
||||
int j_out = index % width_out;
|
||||
int i_out = (index / width_out) % height_out;
|
||||
int c_inp = (index / width_out) / height_out;
|
||||
|
||||
int c_out = c_inp * kernel_h * kernel_w;
|
||||
int i_inp = i_out * stride_h - pad_h;
|
||||
int j_inp = j_out * stride_w - pad_w;
|
||||
|
||||
im_src += (c_inp * height_inp + i_inp) * width_inp + j_inp + im_src_offset;
|
||||
im_col += (c_out * height_out + i_out) * width_out + j_out + im_col_offset;
|
||||
|
||||
for (int ki = 0; ki < kernel_h; ++ki)
|
||||
for (int kj = 0; kj < kernel_w; ++kj) {
|
||||
int i = i_inp + ki;
|
||||
int j = j_inp + kj;
|
||||
*im_col = (i >= 0 && j >= 0 && i < height_inp && j < width_inp) ?
|
||||
im_src[ki * width_inp + kj] : 0;
|
||||
im_col += height_out * width_out;
|
||||
}
|
||||
}
|
||||
76
Lib/opencv/sources/modules/dnn/src/opencl/lrn.cl
Normal file
76
Lib/opencv/sources/modules/dnn/src/opencl/lrn.cl
Normal file
@@ -0,0 +1,76 @@
|
||||
/*************************************************************************************
|
||||
* Copyright (c) 2015, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation and/or
|
||||
* other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
**************************************************************************************/
|
||||
|
||||
__kernel void LRNComputeOutput(const int nthreads, __global T* in, __global T* scale, const T negative_beta, __global T* out) {
|
||||
int index = get_global_id(0);
|
||||
int tmp = get_global_size(0);
|
||||
for(index; index < nthreads; index += tmp)
|
||||
out[index] = in[index] * pow(scale[index], negative_beta);
|
||||
}
|
||||
|
||||
__kernel void LRNFillScale(const int nthreads, __global T* in, const int num, const int channels, const int height, const int width, const int size, const T alpha_over_size, const T k, __global T* scale) {
|
||||
int index = get_global_id(0);
|
||||
int tmp = get_global_size(0);
|
||||
for(index; index < nthreads; index += tmp) {
|
||||
// find out the local offset
|
||||
const int w = index % width;
|
||||
const int h = (index / width) % height;
|
||||
const int n = index / width / height;
|
||||
const int offset = (n * channels * height + h) * width + w;
|
||||
const int step = height * width;
|
||||
in = in + offset;
|
||||
scale = scale + offset;
|
||||
int head = 0;
|
||||
const int pre_pad = (size - 1) / 2;
|
||||
const int post_pad = size - pre_pad - 1;
|
||||
T accum_scale = 0;
|
||||
// fill the scale at [n, :, h, w]
|
||||
// accumulate values
|
||||
while (head < post_pad && head < channels) {
|
||||
accum_scale += in[head * step] * in[head * step];
|
||||
++head;
|
||||
}
|
||||
// both add and subtract
|
||||
while (head < channels) {
|
||||
accum_scale += in[head * step] * in[head * step];
|
||||
if (head - size >= 0) {
|
||||
accum_scale -= in[(head - size) * step]
|
||||
* in[(head - size) * step];
|
||||
}
|
||||
scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
|
||||
++head;
|
||||
}
|
||||
// subtract only
|
||||
while (head < channels + post_pad) {
|
||||
if (head - size >= 0) {
|
||||
accum_scale -= in[(head - size) * step]
|
||||
* in[(head - size) * step];
|
||||
}
|
||||
scale[(head - post_pad) * step] = k + accum_scale * alpha_over_size;
|
||||
++head;
|
||||
}
|
||||
}
|
||||
}
|
||||
59
Lib/opencv/sources/modules/dnn/src/opencl/math.cl
Normal file
59
Lib/opencv/sources/modules/dnn/src/opencl/math.cl
Normal file
@@ -0,0 +1,59 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define CONCAT(A,B) A##_##B
|
||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||
#define KERNEL_ARG_DTYPE float
|
||||
|
||||
__kernel void TEMPLATE(axpy,Dtype)(const int n, const KERNEL_ARG_DTYPE alpha, __global const Dtype* x,
|
||||
const int offx, __global Dtype* y,
|
||||
const int offy) {
|
||||
for (int index = get_global_id(0); index < n; index += get_global_size(0)) {
|
||||
Dtype src = x[offx + index];
|
||||
Dtype dst = y[offy + index];
|
||||
y[offy + index] = convert_Dtype(alpha) * src + dst;
|
||||
}
|
||||
}
|
||||
195
Lib/opencv/sources/modules/dnn/src/opencl/matvec_mul.cl
Normal file
195
Lib/opencv/sources/modules/dnn/src/opencl/matvec_mul.cl
Normal file
@@ -0,0 +1,195 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define CONCAT(A,B) A##_##B
|
||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||
#define KERNEL_ARG_DTYPE float
|
||||
|
||||
__kernel void TEMPLATE(matvec_mul4,Dtype)(
|
||||
__global const Dtype * A,
|
||||
int offA,
|
||||
unsigned int A_col_size,
|
||||
unsigned int trail_item,
|
||||
__global const Dtype * v,
|
||||
int offv,
|
||||
KERNEL_ARG_DTYPE alpha,
|
||||
KERNEL_ARG_DTYPE beta,
|
||||
__global Dtype4* result,
|
||||
int offr,
|
||||
__local Dtype4* work)
|
||||
{
|
||||
unsigned int row_gid = get_group_id(0);
|
||||
unsigned int lid = get_local_id(0);
|
||||
const __global Dtype *src0_read = A + row_gid * 4 * A_col_size + offA;
|
||||
const __global Dtype *src1_read = v + offv;
|
||||
result = (__global Dtype4*)((__global Dtype*)result + offr);
|
||||
Dtype4 dot0 = (Dtype4)(0.f);
|
||||
Dtype4 dot1 = (Dtype4)(0.f);
|
||||
Dtype4 dot2 = (Dtype4)(0.f);
|
||||
Dtype4 dot3 = (Dtype4)(0.f);
|
||||
|
||||
unsigned int i = lid;
|
||||
while( i < A_col_size / 4) {
|
||||
const Dtype4 a0 = vload4(i, src0_read);
|
||||
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
|
||||
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
|
||||
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
|
||||
|
||||
const Dtype4 b0 = vload4(i, src1_read);
|
||||
|
||||
dot0 += a0 * b0;
|
||||
dot1 += a1 * b0;
|
||||
dot2 += a2 * b0;
|
||||
dot3 += a3 * b0;
|
||||
|
||||
i += get_local_size(0);
|
||||
}
|
||||
|
||||
work[lid].s0 = dot0.x + dot0.y + dot0.z + dot0.w;
|
||||
work[lid].s1 = dot1.x + dot1.y + dot1.z + dot1.w;
|
||||
work[lid].s2 = dot2.x + dot2.y + dot2.z + dot2.w;
|
||||
work[lid].s3 = dot3.x + dot3.y + dot3.z + dot3.w;
|
||||
|
||||
if(i == A_col_size / 4)
|
||||
{
|
||||
if(trail_item != 0)
|
||||
{
|
||||
const __global Dtype *src0_trail = src0_read + i * 4;
|
||||
const __global Dtype *src1_trail = src1_read + i * 4;
|
||||
for(unsigned int i = 0; i < trail_item; ++i) {
|
||||
const Dtype at0 = src0_trail[i];
|
||||
const Dtype at1 = src0_trail[i + A_col_size];
|
||||
const Dtype at2 = src0_trail[i + 2 * A_col_size];
|
||||
const Dtype at3 = src0_trail[i + 3 * A_col_size];
|
||||
|
||||
const Dtype bt = src1_trail[i];
|
||||
|
||||
work[lid].s0 += at0 * bt;
|
||||
work[lid].s1 += at1 * bt;
|
||||
work[lid].s2 += at2 * bt;
|
||||
work[lid].s3 += at3 * bt;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < stride)
|
||||
work[lid] += work[lid+stride];
|
||||
}
|
||||
if(lid == 0) {
|
||||
if(beta == (Dtype)0)
|
||||
result[row_gid] = convert_Dtype(alpha) * work[0];
|
||||
else
|
||||
result[row_gid] = convert_Dtype(alpha) * work[0] + convert_Dtype(beta) * result[row_gid];
|
||||
}
|
||||
}
|
||||
|
||||
/* This kernel used for the trailing rows when row_of_A %4 !=0 */
|
||||
__kernel void TEMPLATE(matvec_mul1,Dtype)(
|
||||
__global const Dtype * A,
|
||||
int offA,
|
||||
unsigned int A_col_size,
|
||||
unsigned int row_offset,
|
||||
unsigned int trail_item,
|
||||
__global const Dtype * v,
|
||||
int offv,
|
||||
KERNEL_ARG_DTYPE alpha,
|
||||
KERNEL_ARG_DTYPE beta,
|
||||
__global Dtype * result,
|
||||
int offr,
|
||||
__local Dtype * work)
|
||||
{
|
||||
unsigned int row_gid = get_group_id(0);
|
||||
unsigned int lid = get_local_id(0);
|
||||
|
||||
const __global Dtype *src0_read = A + (row_offset + row_gid) * A_col_size + offA;
|
||||
const __global Dtype *src1_read = v + + offv;
|
||||
result = result + offr;
|
||||
Dtype4 dot0 = (Dtype4)(0.f);
|
||||
|
||||
unsigned int i = lid;
|
||||
while( i < A_col_size / 4)
|
||||
{
|
||||
const Dtype4 a0 = vload4(i, src0_read);
|
||||
const Dtype4 b0 = vload4(i, src1_read);
|
||||
|
||||
dot0 += a0 * b0;
|
||||
i += get_local_size(0);
|
||||
}
|
||||
|
||||
work[lid] = dot0.x + dot0.y + dot0.z + dot0.w;
|
||||
|
||||
if(i == A_col_size / 4)
|
||||
{
|
||||
if(trail_item != 0)
|
||||
{
|
||||
const __global Dtype *src0_trail = src0_read + i * 4;
|
||||
const __global Dtype *src1_trail = src1_read + i * 4;
|
||||
for(unsigned int i = 0; i < trail_item; ++i) {
|
||||
const Dtype at0 = src0_trail[i];
|
||||
const Dtype bt = src1_trail[i];
|
||||
|
||||
work[lid] += at0 * bt;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1) {
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < stride)
|
||||
work[lid] += work[lid+stride];
|
||||
}
|
||||
|
||||
if(lid == 0) {
|
||||
if(beta == (Dtype)0) {
|
||||
result[row_gid+row_offset] = convert_Dtype(alpha) * work[0];
|
||||
} else {
|
||||
result[row_gid+row_offset] *= convert_Dtype(beta);
|
||||
result[row_gid+row_offset] += convert_Dtype(alpha) * work[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
331
Lib/opencv/sources/modules/dnn/src/opencl/mvn.cl
Normal file
331
Lib/opencv/sources/modules/dnn/src/opencl/mvn.cl
Normal file
@@ -0,0 +1,331 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#define Dtype float
|
||||
#define Dtype4 float4
|
||||
#define Dtype8 float8
|
||||
|
||||
#if NUM == 8
|
||||
#define load(src, index) vload8(0, src + index)
|
||||
#define store(vec, dst, index) vstore8(vec, 0, dst + index)
|
||||
#define vec_type Dtype8
|
||||
#define CALC_MEAN calc_mean8
|
||||
#define MVN mvn8
|
||||
#define MEAN_FUSE mean_fuse8
|
||||
#define MVN_FUSE mvn_fuse8
|
||||
#elif NUM == 4
|
||||
#define load(src, index) vload4(0, src + index)
|
||||
#define store(vec, dst, index) vstore4(vec, 0, dst + index)
|
||||
#define vec_type Dtype4
|
||||
#define CALC_MEAN calc_mean4
|
||||
#define MVN mvn4
|
||||
#define MEAN_FUSE mean_fuse4
|
||||
#define MVN_FUSE mvn_fuse4
|
||||
#elif NUM == 1
|
||||
#define load(src, index) src[index]
|
||||
#define store(vec, dst, index) dst[index] = vec
|
||||
#define vec_type Dtype
|
||||
#define CALC_MEAN calc_mean1
|
||||
#define MVN mvn1
|
||||
#define MEAN_FUSE mean_fuse1
|
||||
#define MVN_FUSE mvn_fuse1
|
||||
#endif
|
||||
|
||||
#ifdef KERNEL_MEAN
|
||||
|
||||
__kernel void CALC_MEAN(__global const Dtype* src,
|
||||
const int rows,
|
||||
const int cols,
|
||||
__global Dtype* mean,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1) * NUM;
|
||||
int index = x * cols + y;
|
||||
|
||||
if (x >= rows || y >= cols)
|
||||
return;
|
||||
|
||||
Dtype mean_val = mean[x];
|
||||
vec_type src_vec = load(src, index);
|
||||
vec_type dst_vec = src_vec - (vec_type)mean_val;
|
||||
dst_vec = dst_vec * dst_vec;
|
||||
store(dst_vec, dst, index);
|
||||
}
|
||||
|
||||
#elif defined KERNEL_MVN
|
||||
|
||||
__kernel void MVN(__global const Dtype* src,
|
||||
const int rows,
|
||||
const int cols,
|
||||
const Dtype eps,
|
||||
__global const Dtype* mean,
|
||||
__global const Dtype* dev,
|
||||
__global const Dtype* bnorm_weight,
|
||||
__global const Dtype* bnorm_bias,
|
||||
const int channels,
|
||||
const float relu_slope,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1) * NUM;
|
||||
int index = x * cols + y;
|
||||
|
||||
if (x >= rows || y >= cols)
|
||||
return;
|
||||
|
||||
Dtype mean_val = mean[x];
|
||||
Dtype dev_val = dev[x];
|
||||
Dtype alpha;
|
||||
#ifdef NORM_VARIANCE
|
||||
alpha = 1 / sqrt(eps + dev_val);
|
||||
#else
|
||||
alpha = 1;
|
||||
#endif
|
||||
|
||||
Dtype w = 1.f, b = 0.f;
|
||||
#ifdef FUSE_BATCH_NORM
|
||||
w = bnorm_weight[x % channels];
|
||||
b = bnorm_bias[x % channels];
|
||||
#endif
|
||||
|
||||
vec_type src_vec = load(src, index) - (vec_type)mean_val;
|
||||
vec_type dst_vec = src_vec * alpha;
|
||||
dst_vec = dst_vec * w + (vec_type)b;
|
||||
|
||||
#ifdef FUSE_RELU
|
||||
vec_type new_val = dst_vec * relu_slope;
|
||||
dst_vec = select(new_val, dst_vec, dst_vec > (vec_type)0.f);
|
||||
#endif
|
||||
|
||||
store(dst_vec, dst, index);
|
||||
}
|
||||
|
||||
#elif defined KERNEL_MEAN_FUSE
|
||||
|
||||
__kernel void MEAN_FUSE(__global const T * A,
|
||||
unsigned int A_col_size,
|
||||
float alpha,
|
||||
__global T4 * mean,
|
||||
__global Dtype * tmp)
|
||||
{
|
||||
unsigned int row_gid = get_group_id(0);
|
||||
unsigned int lid = get_local_id(0);
|
||||
const __global T *src0_read = A + row_gid * 4 * A_col_size;
|
||||
__global Dtype *dst0_read = tmp + row_gid * 4 * A_col_size;
|
||||
Dtype4 dot0, dot1, dot2, dot3;
|
||||
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
|
||||
|
||||
unsigned int i = lid;
|
||||
const Dtype4 b0 = (Dtype4)1.f;
|
||||
while( i < A_col_size / 4)
|
||||
{
|
||||
const T4 a0 = vload4(i, src0_read);
|
||||
const T4 a1 = vload4(i, src0_read + A_col_size);
|
||||
const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
|
||||
const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
|
||||
|
||||
dot0 += convert_float4(a0);
|
||||
dot1 += convert_float4(a1);
|
||||
dot2 += convert_float4(a2);
|
||||
dot3 += convert_float4(a3);
|
||||
|
||||
i += LOCAL_SIZE;
|
||||
}
|
||||
|
||||
__local Dtype4 work[LOCAL_SIZE];
|
||||
work[lid].s0 = dot(dot0, b0);
|
||||
work[lid].s1 = dot(dot1, b0);
|
||||
work[lid].s2 = dot(dot2, b0);
|
||||
work[lid].s3 = dot(dot3, b0);
|
||||
|
||||
for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < stride)
|
||||
work[lid] += work[lid+stride];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if(lid == 0)
|
||||
{
|
||||
mean[row_gid] = convert_T(alpha * work[0]);
|
||||
}
|
||||
|
||||
Dtype4 sum = work[0] * alpha;
|
||||
i = lid;
|
||||
while( i < A_col_size / 4)
|
||||
{
|
||||
const T4 a0 = vload4(i, src0_read);
|
||||
const T4 a1 = vload4(i, src0_read + A_col_size);
|
||||
const T4 a2 = vload4(i, src0_read + 2 * A_col_size);
|
||||
const T4 a3 = vload4(i, src0_read + 3 * A_col_size);
|
||||
|
||||
dot0 = convert_float4(a0) - (Dtype4)sum.x;
|
||||
dot1 = convert_float4(a1) - (Dtype4)sum.y;
|
||||
dot2 = convert_float4(a2) - (Dtype4)sum.z;
|
||||
dot3 = convert_float4(a3) - (Dtype4)sum.w;
|
||||
dot0 = dot0 * dot0;
|
||||
dot1 = dot1 * dot1;
|
||||
dot2 = dot2 * dot2;
|
||||
dot3 = dot3 * dot3;
|
||||
|
||||
vstore4(dot0, i, dst0_read);
|
||||
vstore4(dot1, i, dst0_read + A_col_size);
|
||||
vstore4(dot2, i, dst0_read + 2 * A_col_size);
|
||||
vstore4(dot3, i, dst0_read + 3 * A_col_size);
|
||||
|
||||
i += LOCAL_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
#elif defined KERNEL_MVN_FUSE
|
||||
|
||||
__kernel void MVN_FUSE(__global const Dtype * tmp,
|
||||
__global const T * A,
|
||||
__global const T4 * mean,
|
||||
unsigned int A_col_size,
|
||||
const float alpha_val,
|
||||
const float eps,
|
||||
const float relu_slope,
|
||||
__global const Dtype4 * bnorm_weight,
|
||||
__global const Dtype4 * bnorm_bias,
|
||||
__global T * B)
|
||||
{
|
||||
unsigned int row_gid = get_group_id(0);
|
||||
unsigned int lid = get_local_id(0);
|
||||
const __global Dtype *src0_read = tmp + row_gid * 4 * A_col_size;
|
||||
const __global T *src1_read = A + row_gid * 4 * A_col_size;
|
||||
__global T *dst0_read = B + row_gid * 4 * A_col_size;
|
||||
Dtype4 dot0, dot1, dot2, dot3;
|
||||
dot0 = dot1 = dot2 = dot3 = (Dtype4)(0.f);
|
||||
|
||||
unsigned int i = lid;
|
||||
const Dtype4 b0 = (Dtype4)1.f;
|
||||
while( i < A_col_size / 4)
|
||||
{
|
||||
const Dtype4 a0 = vload4(i, src0_read);
|
||||
const Dtype4 a1 = vload4(i, src0_read + A_col_size);
|
||||
const Dtype4 a2 = vload4(i, src0_read + 2 * A_col_size);
|
||||
const Dtype4 a3 = vload4(i, src0_read + 3 * A_col_size);
|
||||
|
||||
dot0 += a0;
|
||||
dot1 += a1;
|
||||
dot2 += a2;
|
||||
dot3 += a3;
|
||||
|
||||
i += LOCAL_SIZE;
|
||||
}
|
||||
|
||||
__local Dtype4 work[LOCAL_SIZE];
|
||||
work[lid].s0 = dot(dot0, b0);
|
||||
work[lid].s1 = dot(dot1, b0);
|
||||
work[lid].s2 = dot(dot2, b0);
|
||||
work[lid].s3 = dot(dot3, b0);
|
||||
|
||||
for(unsigned int stride=LOCAL_SIZE/2 ; stride>0 ; stride>>=1)
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
if(lid < stride)
|
||||
work[lid] += work[lid+stride];
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
Dtype4 mean_val = convert_float4(mean[row_gid]);
|
||||
Dtype4 dev_val = sqrt(work[0] * alpha_val + (Dtype4)eps);
|
||||
Dtype4 alpha = (Dtype4)1.f / dev_val;
|
||||
|
||||
Dtype4 w = (Dtype4)1.f;
|
||||
Dtype4 b = (Dtype4)0.f;
|
||||
#ifdef FUSE_BATCH_NORM
|
||||
w = bnorm_weight[row_gid];
|
||||
b = bnorm_bias[row_gid];
|
||||
#endif
|
||||
|
||||
i = lid;
|
||||
while( i < A_col_size / 4)
|
||||
{
|
||||
const T4 a0 = vload4(i, src1_read);
|
||||
const T4 a1 = vload4(i, src1_read + A_col_size);
|
||||
const T4 a2 = vload4(i, src1_read + 2 * A_col_size);
|
||||
const T4 a3 = vload4(i, src1_read + 3 * A_col_size);
|
||||
|
||||
dot0 = (convert_float4(a0) - (Dtype4)mean_val.x) * alpha.x;
|
||||
dot1 = (convert_float4(a1) - (Dtype4)mean_val.y) * alpha.y;
|
||||
dot2 = (convert_float4(a2) - (Dtype4)mean_val.z) * alpha.z;
|
||||
dot3 = (convert_float4(a3) - (Dtype4)mean_val.w) * alpha.w;
|
||||
|
||||
dot0 = dot0 * w.x + (Dtype4)b.x;
|
||||
dot1 = dot1 * w.y + (Dtype4)b.y;
|
||||
dot2 = dot2 * w.z + (Dtype4)b.z;
|
||||
dot3 = dot3 * w.w + (Dtype4)b.w;
|
||||
|
||||
#ifdef FUSE_RELU
|
||||
Dtype4 new0 = dot0 * relu_slope;
|
||||
dot0 = select(new0, dot0, dot0 > (Dtype4)0.f);
|
||||
|
||||
Dtype4 new1 = dot1 * relu_slope;
|
||||
dot1 = select(new1, dot1, dot1 > (Dtype4)0.f);
|
||||
|
||||
Dtype4 new2 = dot2 * relu_slope;
|
||||
dot2 = select(new2, dot2, dot2 > (Dtype4)0.f);
|
||||
|
||||
Dtype4 new3 = dot3 * relu_slope;
|
||||
dot3 = select(new3, dot3, dot3 > (Dtype4)0.f);
|
||||
#endif
|
||||
|
||||
vstore4(convert_T(dot0), i, dst0_read);
|
||||
vstore4(convert_T(dot1), i, dst0_read + A_col_size);
|
||||
vstore4(convert_T(dot2), i, dst0_read + 2 * A_col_size);
|
||||
vstore4(convert_T(dot3), i, dst0_read + 3 * A_col_size);
|
||||
|
||||
i += LOCAL_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
#error "Configuration error!"
|
||||
#endif
|
||||
100
Lib/opencv/sources/modules/dnn/src/opencl/ocl4dnn_lrn.cl
Normal file
100
Lib/opencv/sources/modules/dnn/src/opencl/ocl4dnn_lrn.cl
Normal file
@@ -0,0 +1,100 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define CONCAT(A,B) A##_##B
|
||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||
#define KERNEL_ARG_DTYPE float
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void TEMPLATE(lrn_full_no_scale,Dtype)(const int nthreads, __global const Dtype* in,
|
||||
const int num, const int channels,
|
||||
const int height, const int width, const int size,
|
||||
const KERNEL_ARG_DTYPE alpha_over_size, const KERNEL_ARG_DTYPE k,
|
||||
__global Dtype* const out,
|
||||
const KERNEL_ARG_DTYPE negative_beta) {
|
||||
for (int index = get_global_id(0); index < nthreads;
|
||||
index += get_global_size(0)) {
|
||||
// find out the local offset
|
||||
const int w = index % width;
|
||||
const int h = (index / width) % height;
|
||||
const int n = index / width / height;
|
||||
const int offset = (n * channels * height + h) * width + w;
|
||||
const int step = height * width;
|
||||
__global const Dtype* in_off = in + offset;
|
||||
__global Dtype* out_off = out + offset;
|
||||
KERNEL_ARG_DTYPE scale_val;
|
||||
int head = 0;
|
||||
const int pre_pad = (size - 1) / 2;
|
||||
const int post_pad = size - pre_pad - 1;
|
||||
KERNEL_ARG_DTYPE accum_scale = 0;
|
||||
// fill the scale at [n, :, h, w]
|
||||
// accumulate values
|
||||
while (head < post_pad && head < channels) {
|
||||
accum_scale += in_off[head * step] * in_off[head * step];
|
||||
++head;
|
||||
}
|
||||
// both add and subtract
|
||||
while (head < channels) {
|
||||
accum_scale += in_off[head * step] * in_off[head * step];
|
||||
if (head - size >= 0) {
|
||||
accum_scale -= in_off[(head - size) * step]
|
||||
* in_off[(head - size) * step];
|
||||
}
|
||||
scale_val = k + accum_scale * alpha_over_size;
|
||||
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
|
||||
++head;
|
||||
}
|
||||
// subtract only
|
||||
while (head < channels + post_pad) {
|
||||
if (head - size >= 0) {
|
||||
accum_scale -= in_off[(head - size) * step]
|
||||
* in_off[(head - size) * step];
|
||||
}
|
||||
scale_val = k + accum_scale * alpha_over_size;
|
||||
out_off[(head - post_pad) * step] = in_off[(head - post_pad) * step] * (Dtype)native_powr((Dtype)scale_val, (Dtype)negative_beta);
|
||||
++head;
|
||||
}
|
||||
}
|
||||
}
|
||||
186
Lib/opencv/sources/modules/dnn/src/opencl/ocl4dnn_pooling.cl
Normal file
186
Lib/opencv/sources/modules/dnn/src/opencl/ocl4dnn_pooling.cl
Normal file
@@ -0,0 +1,186 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define CONCAT(A,B) A##_##B
|
||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
#if defined KERNEL_MAX_POOL
|
||||
|
||||
__kernel void
|
||||
#ifdef HAVE_MASK
|
||||
TEMPLATE(max_pool_forward_mask, Dtype)
|
||||
#else
|
||||
TEMPLATE(max_pool_forward, Dtype)
|
||||
#endif
|
||||
(
|
||||
const int nthreads, __global const Dtype* bottom_data,
|
||||
const int channels, const int height, const int width,
|
||||
const int pooled_height, const int pooled_width,
|
||||
__global Dtype* top_data
|
||||
#ifdef HAVE_MASK
|
||||
, __global Dtype* mask
|
||||
#endif
|
||||
)
|
||||
{
|
||||
int index = get_global_id(0);
|
||||
if (index >= nthreads)
|
||||
return;
|
||||
|
||||
const int pw = index % pooled_width;
|
||||
const int xx = index / pooled_width;
|
||||
const int ph = xx % pooled_height;
|
||||
const int ch = xx / pooled_height;
|
||||
int hstart = ph * STRIDE_H - PAD_T;
|
||||
int wstart = pw * STRIDE_W - PAD_L;
|
||||
Dtype maxval = -FLT_MAX;
|
||||
int maxidx = -1;
|
||||
int in_offset = ch * height * width;
|
||||
for (int h = 0; h < KERNEL_H; ++h)
|
||||
{
|
||||
int off_y = hstart + h;
|
||||
if (off_y >= 0 && off_y < height)
|
||||
{
|
||||
for (int w = 0; w < KERNEL_W; ++w)
|
||||
{
|
||||
int off_x = wstart + w;
|
||||
if (off_x >= 0 && off_x < width)
|
||||
{
|
||||
Dtype val = bottom_data[in_offset + off_y * width + off_x];
|
||||
maxidx = (val > maxval) ? (off_y * width + off_x) : maxidx;
|
||||
maxval = fmax(val, maxval);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
top_data[index] = maxval;
|
||||
#ifdef HAVE_MASK
|
||||
mask[index] = maxidx;
|
||||
#endif
|
||||
}
|
||||
|
||||
#elif defined KERNEL_AVE_POOL
|
||||
|
||||
__kernel void TEMPLATE(ave_pool_forward, Dtype)(
|
||||
const int nthreads, __global const Dtype* bottom_data,
|
||||
const int channels, const int height, const int width,
|
||||
const int pooled_height, const int pooled_width,
|
||||
__global Dtype* top_data)
|
||||
{
|
||||
int index = get_global_id(0);
|
||||
if (index >= nthreads)
|
||||
return;
|
||||
|
||||
const int pw = index % pooled_width;
|
||||
const int xx = index / pooled_width;
|
||||
const int ph = xx % pooled_height;
|
||||
const int ch = xx / pooled_height;
|
||||
int hstart = ph * STRIDE_H - PAD_T;
|
||||
int wstart = pw * STRIDE_W - PAD_L;
|
||||
int hend = min(hstart + KERNEL_H, height + PAD_B);
|
||||
int wend = min(wstart + KERNEL_W, width + PAD_R);
|
||||
int pool_size;
|
||||
#ifdef AVE_POOL_PADDING_AREA
|
||||
pool_size = (hend - hstart) * (wend - wstart);
|
||||
hstart = max(hstart, (int)0);
|
||||
wstart = max(wstart, (int)0);
|
||||
hend = min(hend, height);
|
||||
wend = min(wend, width);
|
||||
#else
|
||||
hstart = max(hstart, (int)0);
|
||||
wstart = max(wstart, (int)0);
|
||||
hend = min(hend, height);
|
||||
wend = min(wend, width);
|
||||
pool_size = (hend - hstart) * (wend - wstart);
|
||||
#endif
|
||||
Dtype aveval = 0;
|
||||
int in_offset = ch * height * width;
|
||||
for (int h = hstart; h < hend; ++h)
|
||||
{
|
||||
for (int w = wstart; w < wend; ++w)
|
||||
{
|
||||
aveval += bottom_data[in_offset + h * width + w];
|
||||
}
|
||||
}
|
||||
top_data[index] = aveval / pool_size;
|
||||
}
|
||||
|
||||
#elif defined KERNEL_STO_POOL
|
||||
|
||||
__kernel void TEMPLATE(sto_pool_forward_test,Dtype)(
|
||||
const int nthreads, __global const Dtype* bottom_data,
|
||||
const int channels, const int height, const int width,
|
||||
const int pooled_height, const int pooled_width,
|
||||
__global Dtype* top_data)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads;
|
||||
index += get_global_size(0))
|
||||
{
|
||||
const int pw = index % pooled_width;
|
||||
const int ph = (index / pooled_width) % pooled_height;
|
||||
const int c = (index / pooled_width / pooled_height) % channels;
|
||||
const int n = index / pooled_width / pooled_height / channels;
|
||||
const int hstart = ph * STRIDE_H;
|
||||
const int hend = min(hstart + KERNEL_H, height);
|
||||
const int wstart = pw * STRIDE_W;
|
||||
const int wend = min(wstart + KERNEL_W, width);
|
||||
// We set cumsum to be 0 to avoid divide-by-zero problems
|
||||
Dtype cumsum = FLT_MIN;
|
||||
Dtype cumvalues = 0.;
|
||||
__global const Dtype* bottom_slice = bottom_data
|
||||
+ (n * channels + c) * height * width;
|
||||
// First pass: get sum
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
Dtype v = bottom_slice[h * width + w];
|
||||
cumsum += v;
|
||||
cumvalues += v * v;
|
||||
}
|
||||
}
|
||||
top_data[index] = cumvalues / cumsum;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // KERNEL_*
|
||||
69
Lib/opencv/sources/modules/dnn/src/opencl/permute.cl
Normal file
69
Lib/opencv/sources/modules/dnn/src/opencl/permute.cl
Normal file
@@ -0,0 +1,69 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void permute(const int nthreads,
|
||||
__global Dtype* bottom_data,
|
||||
global int* permute_order,
|
||||
global int* oldStride,
|
||||
global int* newStride,
|
||||
const int num_axes,
|
||||
__global Dtype* top_data)
|
||||
{
|
||||
for (int i = get_global_id(0); i < nthreads; i += get_global_size(0))
|
||||
{
|
||||
int oldPosition = 0;
|
||||
int newPosition = i;
|
||||
|
||||
for (int j = 0; j < num_axes; ++j)
|
||||
{
|
||||
int order = permute_order[j];
|
||||
oldPosition += (newPosition / newStride[j]) * oldStride[order];
|
||||
newPosition %= newStride[j];
|
||||
}
|
||||
|
||||
top_data[i] = bottom_data[oldPosition];
|
||||
}
|
||||
}
|
||||
106
Lib/opencv/sources/modules/dnn/src/opencl/pooling.cl
Normal file
106
Lib/opencv/sources/modules/dnn/src/opencl/pooling.cl
Normal file
@@ -0,0 +1,106 @@
|
||||
/*************************************************************************************
|
||||
* Copyright (c) 2015, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation and/or
|
||||
* other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
**************************************************************************************/
|
||||
|
||||
__kernel void MaxPoolForward(const int nthreads,
|
||||
__global T* bottom_data, const int num, const int channels, const int height, const int width,
|
||||
const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,
|
||||
const int stride_h, const int stride_w, const int pad_t, const int pad_l, const int pad_b, const int pad_r,
|
||||
__global T* top_data
|
||||
#ifdef MASK
|
||||
, __global float* mask
|
||||
#endif
|
||||
)
|
||||
{
|
||||
int index = get_global_id(0);
|
||||
int tmp = get_global_size(0);
|
||||
for(index; index < nthreads; index += tmp) {
|
||||
int pw = index % pooled_width;
|
||||
int ph = (index / pooled_width) % pooled_height;
|
||||
int c = (index / pooled_width / pooled_height) % channels;
|
||||
int n = index / pooled_width / pooled_height / channels;
|
||||
int hstart = ph * stride_h - pad_t;
|
||||
int wstart = pw * stride_w - pad_l;
|
||||
const int hend = min(hstart + kernel_h, height);
|
||||
const int wend = min(wstart + kernel_w, width);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
T maxval = -FLT_MAX;
|
||||
int maxidx = -1;
|
||||
bottom_data =
|
||||
bottom_data + (n * channels + c) * height * width;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
if (bottom_data[h * width + w] > maxval) {
|
||||
maxidx = h * width + w;
|
||||
maxval = bottom_data[maxidx];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
top_data[index] = maxval;
|
||||
|
||||
#ifdef MASK
|
||||
mask[index] = maxidx;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void AvePoolForward(const int nthreads,
|
||||
__global T* bottom_data, const int num, const int channels, const int height, const int width,
|
||||
const int pooled_height, const int pooled_width, const int kernel_h, const int kernel_w,
|
||||
const int stride_h, const int stride_w, const int pad_t, const int pad_l, const int pad_b, const int pad_r,
|
||||
__global T* top_data
|
||||
#ifdef MASK
|
||||
, __global float* mask // NOT USED
|
||||
#endif
|
||||
)
|
||||
{
|
||||
int index = get_global_id(0);
|
||||
int tmp = get_global_size(0);
|
||||
for(index; index < nthreads; index+=tmp) {
|
||||
int pw = index % pooled_width;
|
||||
int ph = (index / pooled_width) % pooled_height;
|
||||
int c = (index / pooled_width / pooled_height) % channels;
|
||||
int n = index / pooled_width / pooled_height / channels; int hstart = ph * stride_h - pad_t; int wstart = pw * stride_w - pad_l;
|
||||
int hend = min(hstart + kernel_h, height + pad_b);
|
||||
int wend = min(wstart + kernel_w, width + pad_r);
|
||||
const int pool_size = (hend - hstart) * (wend - wstart);
|
||||
hstart = max(hstart, 0);
|
||||
wstart = max(wstart, 0);
|
||||
hend = min(hend, height);
|
||||
wend = min(wend, width);
|
||||
T aveval = 0;
|
||||
bottom_data =
|
||||
bottom_data + (n * channels + c) * height * width;
|
||||
for (int h = hstart; h < hend; ++h) {
|
||||
for (int w = wstart; w < wend; ++w) {
|
||||
aveval += bottom_data[h * width + w];
|
||||
}
|
||||
}
|
||||
top_data[index] = aveval / pool_size;
|
||||
}
|
||||
|
||||
}
|
||||
119
Lib/opencv/sources/modules/dnn/src/opencl/prior_box.cl
Normal file
119
Lib/opencv/sources/modules/dnn/src/opencl/prior_box.cl
Normal file
@@ -0,0 +1,119 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void prior_box(const int nthreads,
|
||||
const float stepX,
|
||||
const float stepY,
|
||||
__global const float* _offsetsX,
|
||||
__global const float* _offsetsY,
|
||||
const int offsetsX_size,
|
||||
__global const float* _widths,
|
||||
__global const float* _heights,
|
||||
const int widths_size,
|
||||
__global Dtype* dst,
|
||||
const int _layerHeight,
|
||||
const int _layerWidth,
|
||||
const int imgHeight,
|
||||
const int imgWidth)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
|
||||
{
|
||||
int w = index % _layerWidth;
|
||||
int h = index / _layerWidth;
|
||||
__global Dtype* outputPtr;
|
||||
|
||||
outputPtr = dst + index * 4 * offsetsX_size * widths_size;
|
||||
|
||||
float _boxWidth, _boxHeight;
|
||||
Dtype4 vec;
|
||||
for (int i = 0; i < widths_size; ++i)
|
||||
{
|
||||
_boxWidth = _widths[i];
|
||||
_boxHeight = _heights[i];
|
||||
for (int j = 0; j < offsetsX_size; ++j)
|
||||
{
|
||||
Dtype center_x = (w + _offsetsX[j]) * (Dtype)stepX;
|
||||
Dtype center_y = (h + _offsetsY[j]) * (Dtype)stepY;
|
||||
|
||||
vec.x = (center_x - _boxWidth * 0.5f) / imgWidth; // xmin
|
||||
vec.y = (center_y - _boxHeight * 0.5f) / imgHeight; // ymin
|
||||
vec.z = (center_x + _boxWidth * 0.5f) / imgWidth; // xmax
|
||||
vec.w = (center_y + _boxHeight * 0.5f) / imgHeight; // ymax
|
||||
vstore4(vec, 0, outputPtr);
|
||||
|
||||
outputPtr += 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void set_variance(const int nthreads,
|
||||
const int offset,
|
||||
const int variance_size,
|
||||
__global const float* variance,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
|
||||
{
|
||||
Dtype4 var_vec;
|
||||
|
||||
if (variance_size == 1)
|
||||
var_vec = (Dtype4)(variance[0]);
|
||||
else
|
||||
var_vec = convert_T(vload4(0, variance));
|
||||
|
||||
vstore4(var_vec, 0, dst + offset + index * 4);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void clip(const int nthreads,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
for (int index = get_global_id(0); index < nthreads; index += get_global_size(0))
|
||||
{
|
||||
Dtype4 vec = vload4(index, dst);
|
||||
vstore4(clamp(vec, 0.0f, 1.0f), index, dst);
|
||||
}
|
||||
}
|
||||
109
Lib/opencv/sources/modules/dnn/src/opencl/region.cl
Normal file
109
Lib/opencv/sources/modules/dnn/src/opencl/region.cl
Normal file
@@ -0,0 +1,109 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define Dtype float
|
||||
|
||||
__kernel void logistic_activ(const int count,
|
||||
__global const Dtype* src,
|
||||
const int cell_size,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
for (int i = get_global_id(0); i < count; i += get_global_size(0))
|
||||
{
|
||||
int index = cell_size * i;
|
||||
Dtype x = src[index + 4];
|
||||
dst[index + 4] = 1.f / (1.f + exp(-x));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void softmax_activ(const int count,
|
||||
__global const Dtype* src,
|
||||
__global const Dtype* biasData,
|
||||
const int cell_size,
|
||||
const int classes,
|
||||
const int classfix,
|
||||
const int rows,
|
||||
const int cols,
|
||||
const int anchors,
|
||||
const float thresh,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
for (int index = get_global_id(0); index < count; index += get_global_size(0))
|
||||
{
|
||||
int box_index = index * cell_size;
|
||||
float largest = -FLT_MAX;
|
||||
__global const Dtype *input = src + box_index + 5;
|
||||
__global Dtype *output = dst + box_index + 5;
|
||||
|
||||
for (int i = 0; i < classes; ++i)
|
||||
largest = fmax(largest, input[i]);
|
||||
|
||||
float sum = 0;
|
||||
for (int i = 0; i < classes; ++i)
|
||||
{
|
||||
float e = exp((input[i] - largest));
|
||||
sum += e;
|
||||
output[i] = e;
|
||||
}
|
||||
|
||||
int y = (index / (anchors * cols)) % rows;
|
||||
int x = (index / anchors) % cols;
|
||||
int a = index % anchors;
|
||||
float scale = dst[box_index + 4];
|
||||
if (classfix == -1 && scale < .5) scale = 0;
|
||||
|
||||
float v1 = src[box_index + 0];
|
||||
float v2 = src[box_index + 1];
|
||||
float l1 = 1.f / (1.f + exp(-v1));
|
||||
float l2 = 1.f / (1.f + exp(-v2));
|
||||
|
||||
dst[box_index + 0] = (x + l1) / cols;
|
||||
dst[box_index + 1] = (y + l2) / rows;
|
||||
dst[box_index + 2] = exp(src[box_index + 2]) * biasData[2 * a] / cols;
|
||||
dst[box_index + 3] = exp(src[box_index + 3]) * biasData[2 * a + 1] / rows;
|
||||
|
||||
for (int i = 0; i < classes; ++i)
|
||||
{
|
||||
float prob = scale * output[i] / sum;
|
||||
output[i] = (prob > thresh) ? prob : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
81
Lib/opencv/sources/modules/dnn/src/opencl/slice.cl
Normal file
81
Lib/opencv/sources/modules/dnn/src/opencl/slice.cl
Normal file
@@ -0,0 +1,81 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void slice(__global const Dtype* src,
|
||||
const int src_plane_size,
|
||||
const int dst_plane_size,
|
||||
const int src_cols,
|
||||
const int dst_cols,
|
||||
const int row_offset,
|
||||
const int col_offset,
|
||||
__global Dtype* dst)
|
||||
{
|
||||
unsigned int row_gid = get_group_id(0);
|
||||
unsigned int lid = get_local_id(0);
|
||||
const __global Dtype *src_read = src + row_gid * 4 * src_plane_size;
|
||||
__global Dtype *dst_read = dst + row_gid * 4 * dst_plane_size;
|
||||
Dtype4 a0, a1, a2, a3;
|
||||
|
||||
int i = lid;
|
||||
while( i < dst_plane_size / 4)
|
||||
{
|
||||
int row = (4 * i) / dst_cols + row_offset;
|
||||
int col = (4 * i) % dst_cols + col_offset;
|
||||
int src_index = row * src_cols + col;
|
||||
|
||||
a0 = vload4(0, src_read + src_index);
|
||||
a1 = vload4(0, src_read + src_index + src_plane_size);
|
||||
a2 = vload4(0, src_read + src_index + 2 * src_plane_size);
|
||||
a3 = vload4(0, src_read + src_index + 3 * src_plane_size);
|
||||
|
||||
vstore4(a0, i, dst_read);
|
||||
vstore4(a1, i, dst_read + dst_plane_size);
|
||||
vstore4(a2, i, dst_read + 2 * dst_plane_size);
|
||||
vstore4(a3, i, dst_read + 3 * dst_plane_size);
|
||||
|
||||
i += get_local_size(0);
|
||||
}
|
||||
}
|
||||
83
Lib/opencv/sources/modules/dnn/src/opencl/softmax.cl
Normal file
83
Lib/opencv/sources/modules/dnn/src/opencl/softmax.cl
Normal file
@@ -0,0 +1,83 @@
|
||||
/*************************************************************************************
|
||||
* Copyright (c) 2015, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification,
|
||||
* are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice, this
|
||||
* list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation and/or
|
||||
* other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
|
||||
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
|
||||
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
**************************************************************************************/
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void kernel_channel_max(const int num, const int channels,
|
||||
const int spatial_dim, __global const T* data, __global T* out) {
|
||||
int index = get_global_id(0);
|
||||
if(index < num * spatial_dim) {
|
||||
int n = index / spatial_dim;
|
||||
int s = index % spatial_dim;
|
||||
T maxval = -FLT_MAX;
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
maxval = max(data[(n * channels + c) * spatial_dim + s], maxval);
|
||||
}
|
||||
out[index] = maxval;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_channel_subtract(const int count,
|
||||
const int num, const int channels,
|
||||
const int spatial_dim, __global const T* channel_max, __global const T* src, __global T* data) {
|
||||
int index = get_global_id(0);
|
||||
if(index < count) {
|
||||
int n = index / channels / spatial_dim;
|
||||
int s = index % spatial_dim;
|
||||
data[index] = exp(src[index] - channel_max[n * spatial_dim + s]);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_channel_sum(const int num, const int channels,
|
||||
const int spatial_dim, __global const T* data, __global T* channel_sum) {
|
||||
int index = get_global_id(0);
|
||||
if(index < num * spatial_dim) {
|
||||
int n = index / spatial_dim;
|
||||
int s = index % spatial_dim;
|
||||
T sum = 0;
|
||||
for (int c = 0; c < channels; ++c) {
|
||||
sum += data[(n * channels + c) * spatial_dim + s];
|
||||
}
|
||||
channel_sum[index] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_channel_div(const int count,
|
||||
const int num, const int channels,
|
||||
const int spatial_dim, __global const T* channel_sum, __global T* data) {
|
||||
int index = get_global_id(0);
|
||||
if(index < count) {
|
||||
int n = index / channels / spatial_dim;
|
||||
int s = index % spatial_dim;
|
||||
T v = data[index] / channel_sum[n * spatial_dim + s];
|
||||
#ifdef LOG_SOFTMAX
|
||||
v = log(v);
|
||||
#endif
|
||||
data[index] = v;
|
||||
}
|
||||
}
|
||||
193
Lib/opencv/sources/modules/dnn/src/opencl/softmax_loss.cl
Normal file
193
Lib/opencv/sources/modules/dnn/src/opencl/softmax_loss.cl
Normal file
@@ -0,0 +1,193 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2017, Intel Corporation, all rights reserved.
|
||||
// Copyright (c) 2016-2017 Fabian David Tschopp, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#define CONCAT(A,B) A##_##B
|
||||
#define TEMPLATE(name,type) CONCAT(name,type)
|
||||
|
||||
#if defined(cl_intel_subgroups)
|
||||
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
|
||||
#endif
|
||||
|
||||
#if defined(cl_khr_fp16)
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#endif
|
||||
|
||||
__kernel void TEMPLATE(softmax_forward_slm,Dtype)(const int num, const int channels,
|
||||
const int spatial_dim,
|
||||
__global Dtype* scale,
|
||||
__global const Dtype* data,
|
||||
__global Dtype* out,
|
||||
__local Dtype *out_tmp,
|
||||
__local Dtype *scale_tmp,
|
||||
__local Dtype *group_tmp) {
|
||||
|
||||
int n = get_global_id(1);
|
||||
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
|
||||
get_global_size(0), ++s) {
|
||||
Dtype maxval = -DTYPE_MAX;
|
||||
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
|
||||
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
|
||||
maxval = max((Dtype)tmp, (Dtype)maxval);
|
||||
}
|
||||
maxval = sub_group_reduce_max(maxval);
|
||||
//if (get_sub_group_local_id() == 0)
|
||||
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
|
||||
get_global_size(0)) {
|
||||
int s = index / get_max_sub_group_size();
|
||||
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
|
||||
//if (get_sub_group_local_id() == 0)
|
||||
scale_tmp[s] = maxval;
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0); index < channels * spatial_dim;
|
||||
index += get_global_size(0)) {
|
||||
int s = index % spatial_dim;
|
||||
out_tmp[index] = exp(data[n * channels * spatial_dim + index] - scale_tmp[s]);
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
|
||||
get_global_size(0), ++s) {
|
||||
Dtype sum = 0;
|
||||
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
|
||||
sum += out_tmp[c * spatial_dim + s];
|
||||
}
|
||||
sum = sub_group_reduce_add(sum);
|
||||
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
|
||||
get_global_size(0)) {
|
||||
int s = index / get_max_sub_group_size();
|
||||
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
|
||||
//if (get_sub_group_local_id() == 0)
|
||||
scale_tmp[s] = sum;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0); index < channels * spatial_dim;
|
||||
index += get_global_size(0)) {
|
||||
int s = index % spatial_dim;
|
||||
Dtype v = out_tmp[index] / scale_tmp[s];
|
||||
#ifdef LOG_SOFTMAX
|
||||
v = log(v);
|
||||
#endif
|
||||
out[n * channels * spatial_dim + index] = v;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void TEMPLATE(softmax_forward,Dtype)(const int num, const int channels,
|
||||
const int spatial_dim,
|
||||
__global Dtype* scale,
|
||||
__global const Dtype* data,
|
||||
__global Dtype* out) {
|
||||
|
||||
int n = get_global_id(1);
|
||||
__global Dtype *group_tmp = scale + spatial_dim * num + n * get_max_sub_group_size() * spatial_dim;
|
||||
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
|
||||
get_global_size(0), ++s) {
|
||||
Dtype maxval = -DTYPE_MAX;
|
||||
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
|
||||
Dtype tmp = data[(n * channels + c) * spatial_dim + s];
|
||||
maxval = max((Dtype)tmp, (Dtype)maxval);
|
||||
}
|
||||
maxval = sub_group_reduce_max(maxval);
|
||||
//if (get_sub_group_local_id() == 0)
|
||||
group_tmp[get_sub_group_id() * spatial_dim + s] = maxval;
|
||||
}
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
|
||||
get_global_size(0)) {
|
||||
int s = index / get_max_sub_group_size();
|
||||
Dtype maxval = sub_group_reduce_max(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
|
||||
//if (get_sub_group_local_id() == 0)
|
||||
scale[n * spatial_dim + s] = maxval;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0); index < channels * spatial_dim;
|
||||
index += get_global_size(0)) {
|
||||
int s = index % spatial_dim;
|
||||
out[n * channels * spatial_dim + index] = exp(data[n * channels * spatial_dim + index] - scale[n * spatial_dim + s]);
|
||||
}
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0), s = 0; index < spatial_dim * get_local_size(0); index +=
|
||||
get_global_size(0), ++s) {
|
||||
Dtype sum = 0;
|
||||
for (int c = get_global_id(0); c < channels; c += get_global_size(0)) {
|
||||
sum += out[n * channels * spatial_dim + c * spatial_dim + s];
|
||||
}
|
||||
sum = sub_group_reduce_add(sum);
|
||||
group_tmp[get_sub_group_id() * spatial_dim + s] = sum;
|
||||
}
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0); index < spatial_dim * get_max_sub_group_size(); index +=
|
||||
get_global_size(0)) {
|
||||
int s = index / get_max_sub_group_size();
|
||||
Dtype sum = sub_group_reduce_add(group_tmp[get_sub_group_local_id() * spatial_dim + s]);
|
||||
//if (get_sub_group_local_id() == 0)
|
||||
scale[n * spatial_dim + s] = sum;
|
||||
}
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
|
||||
for (int index = get_global_id(0); index < channels * spatial_dim;
|
||||
index += get_global_size(0)) {
|
||||
int s = index % spatial_dim;
|
||||
Dtype v = out[n * channels * spatial_dim + index] / scale[n * spatial_dim + s];
|
||||
#ifdef LOG_SOFTMAX
|
||||
v = log(v);
|
||||
#endif
|
||||
out[n * channels * spatial_dim + index] = v;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user