Unverified Commit ad6c0f90 authored by Steve Bronder's avatar Steve Bronder
Browse files

Merge branch 'opencl_context_rewrite' of https://github.com/rok-cesnovar/math...

Merge branch 'opencl_context_rewrite' of https://github.com/rok-cesnovar/math into opencl_context_rewrite
parents 596a5448 e36c820b
Showing with 711 additions and 3 deletions
+711 -3
......@@ -124,7 +124,7 @@ pipeline {
CppLint: { sh "make cpplint" },
Dependencies: { sh 'make test-math-dependencies' } ,
Documentation: { sh 'make doxygen' },
Headers: { sh "make -j${env.PARALLEL} test-headers" }
Headers: { sh "make -j${env.PARALLEL} test-headers STAN_OPENCL=true" }
)
}
}
......@@ -139,10 +139,11 @@ pipeline {
stage('Tests') {
parallel {
stage('Unit') {
agent any
agent { label "gelman-group-mac" }
steps {
unstash 'MathSetup'
sh setupCC()
sh "echo STAN_OPENCL=true>> make/local"
runTests("test/unit")
}
post { always { retry(3) { deleteDir() } } }
......
......@@ -29,10 +29,16 @@ ifeq (g++,$(CC_TYPE))
CXXFLAGS += -Wno-unused-but-set-variable
LDLIBS += -static-libgcc
LDLIBS += -static-libstdc++
ifdef STAN_OPENCL
LDFLAGS += -lOpenCL
endif
endif
ifeq (mingw32-g++,$(CC_TYPE))
LDLIBS += -static-libgcc
# LDLIBS += -static-libstdc++
ifdef STAN_OPENCL
LDFLAGS += -lOpenCL
endif
endif
ifdef STAN_OPENCL
......
......@@ -13,7 +13,7 @@ $(GTEST)/src/gtest-all.o: CXXFLAGS += $(GTEST_CXXFLAGS)
test/%$(EXE) : CXXFLAGS += $(GTEST_CXXFLAGS)
test/%$(EXE) : test/%.o $(GTEST_MAIN) $(GTEST)/src/gtest-all.o
$(LINK.cpp) -o $@ $^
$(LINK.cpp) -o $@ $^ $(LDLIBS_OPENCL)
##
......
R"=====(
#define A(i,j) A[j*rows+i]
#define AT(i,j) A[j*cols+i]
#define B(i,j) B[j*rows+i]
#define BT(i,j) B[j*cols+i]
#define C(i,j) C[j*rows+i]
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double not supported by OpenCL implementation."
#endif
__kernel void transpose(
__global double *B,
__global double *A,
int rows,
int cols ) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( i < rows && j < cols ) {
BT(j,i) = A(i,j);
}
}
__kernel void copy(
__global double *A,
__global double *B,
unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( i < rows && j < cols ) {
B(i,j) = A(i,j);
}
}
__kernel void zeros(
__global double *A,
unsigned int rows,
unsigned int cols,
unsigned int part) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( i < rows && j < cols ) {
if ( part == 0 && j < i ) {
A(i,j) = 0;
} else if ( part == 1 && j > i ) {
A(i,j) = 0;
} else if ( part == 2 ) {
A(i,j) = 0;
}
}
}
__kernel void identity(
__global double *A,
unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( i < rows && j < cols ) {
if ( i == j ) {
A(i,j) = 1.0;
} else {
A(i,j) = 0.0;
}
}
}
__kernel void copy_triangular(
__global double *A,
__global double *B,
unsigned int rows,
unsigned int cols,
unsigned int lower_upper) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( i < rows && j < cols ) {
if ( !lower_upper && j <= i ) {
A(i,j) = B(i,j);
} else if ( !lower_upper ) {
A(i,j) = 0;
} else if ( lower_upper && j >= i ) {
A(i,j) = B(i,j);
} else if ( lower_upper && j < i ) {
A(i,j) = 0;
}
}
}
__kernel void copy_triangular_transposed(
__global double *A,
unsigned int rows,
unsigned int cols,
unsigned int lower_to_upper) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( i < rows && j < cols ) {
if ( lower_to_upper && j > i ) {
AT(j,i) = A(i,j);
} else if ( !lower_to_upper && j > i ) {
A(i,j) = AT(j,i);
}
}
}
__kernel void add(
__global double *C,
__global double *A,
__global double *B,
unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( i < rows && j < cols ) {
C(i,j) = A(i,j) + B(i,j);
}
}
__kernel void subtract(
__global double *C,
__global double *A,
__global double *B,
unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( i < rows && j < cols ) {
C(i,j) = A(i,j)-B(i,j);
}
}
#define src(i,j) src[j*src_rows+i]
#define dst(i,j) dst[j*dst_rows+i]
__kernel void copy_submatrix(
__global double *src,
__global double *dst,
unsigned int src_offset_i,
unsigned int src_offset_j,
unsigned int dst_offset_i,
unsigned int dst_offset_j,
unsigned int size_i,
unsigned int size_j,
unsigned int src_rows,
unsigned int src_cols,
unsigned int dst_rows,
unsigned int dst_cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( (i+src_offset_i) < src_rows && (j+src_offset_j) < src_cols
&& (i+dst_offset_i) < dst_rows && (j+dst_offset_j) < dst_cols ) {
dst((dst_offset_i+i), (dst_offset_j+j)) =
src((src_offset_i+i),(src_offset_j+j));
}
}
)====="
R"=====(
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double not supported by OpenCL implementation."
#endif
#define A(i,j) A[j*rows+i]
__kernel void check_nan(
__global double *A,
int rows,
int cols,
__global int *flag) {
const int i = get_global_id(0);
const int j = get_global_id(1);
if( i < rows && j < cols ) {
if (isnan(A(i,j))) {
flag[0] = 1;
}
}
}
__kernel void check_diagonal_zeros(
__global double *A,
int rows,
int cols,
__global int *flag) {
const int i = get_global_id(0);
if( i < rows && i < cols ) {
if (A(i,i) == 0) {
flag[0] = 1;
}
}
}
__kernel void check_symmetric(
__global double *A,
int rows,
int cols,
__global int *flag,
double tolerance) {
const int i = get_global_id(0);
const int j = get_global_id(1);
if( i < rows && j < cols ) {
double diff = fabs(A(i,j)-A(j,i));
if ( diff > tolerance ) {
flag[0] = 1;
}
}
}
)====="
R"=====(
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double not supported by OpenCL implementation."
#endif
#define V(i,j) V[j*n+i]
#define d(i,j) d[j*n+i]
__kernel void cholesky_block(
__global double *V,
__global double *d,
int n) {
int f = get_local_id(0);
for (int j = 0; j <n; j++) {
double ss = 0;
for (int k = 0; k < j; k++) {
ss += V(j,k) * V(j,k);
}
V(j,j) = sqrt(d(j,j) - ss);
barrier(CLK_LOCAL_MEM_FENCE);
int i=f;
if ( i >= (j+1) && i < n ){
double s = 0;
for (int k = 0; k < j; k++) {
s += V(i,k) * V(j,k);
}
V(i,j) = (1.0 / V(j,j) * (d(i,j) - s));
}
}
}
)====="
R"=====(
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double not supported by OpenCL implementation."
#endif
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double precision not supported by OpenCL implementation."
#endif
#define A(i,j) A[j*rows+i]
#define V(i,j) V[offset+j*(part_size_fixed+1)+i]
__kernel void lower_tri_inv_step1(
__global double* A,
__global double* V,
int remainder,
int part_size_fixed,
int rows) {
int indeks = get_global_id(0);
int i = indeks*part_size_fixed;
int part_size;
double faktor;
if ( indeks < remainder ) {
i += indeks;
part_size = part_size_fixed+1;
} else {
i += remainder;
part_size = part_size_fixed;
}
int offset = indeks*(part_size_fixed+1)*(part_size_fixed+1);
for (int p = 0; p < part_size; p++) {
for (int r = 0; r < part_size; r++) {
if ( p == r )
V(p,r) = 1;
else
V(p,r) = 0;
}
}
for (unsigned int ii = 0; ii < part_size; ii++) {
if ( ii > 0 ) {
for (unsigned int j = ii; j < part_size; j++) {
faktor = A((j+i),(i+ii-1));
for (unsigned int k = 0; k < part_size; k++) {
V(j,k) -= faktor*V((ii-1),k);
}
}
}
faktor = A((ii+i),(ii+i));
for (unsigned int k = 0; k < part_size; k++) {
V(ii,k) /= faktor;
}
}
for (int p = 0; p < part_size; p++) {
for (int r=0; r < part_size; r++) {
A((p+i),(i+r)) = V(p,r);
}
}
}
#define temp(i,j) temp[(n/2)*(sizeM)*(sizeM)+j*part_size2+i]
#define WPT 4
#define RTS TS2/WPT
__kernel void lower_tri_inv_step2(
__global double* A,
__global int* sizes,
__global double* temp,
int repeat,
int remainder,
int part_size_fixed,
int rows) {
int n = get_global_id(2)*2;
double sum = 0;
int part_size1 = 0, part_size2 = 0;
int offset_i, offset_j;
for (int r = n*repeat; r < (n+1)*repeat; r++)
part_size1 += sizes[r];
for (int r = (n+1)*repeat; r < (n+2)*repeat; r++)
part_size2 += sizes[r];
int sizeM = repeat*(part_size_fixed+1);
offset_i = (n+1)*repeat*part_size_fixed;
offset_j = n*repeat*part_size_fixed;
if ( ((n+1)*repeat) <= remainder )
offset_i += (n+1)*repeat;
else
offset_i += remainder;
if ( (n*repeat) <= remainder )
offset_j += n*repeat;
else
offset_j += remainder;
const int row = get_local_id(0);
const int col = get_local_id(1);
const int i = TS2*get_group_id(0) + row;
const int j = TS2*get_group_id(1) + col;
__local double Asub[TS2][TS2];
__local double Bsub[TS2][TS2];
double acc[WPT];
for (int w = 0; w < WPT; w++) {
acc[w] = 0.0;
}
const int numTiles = (part_size2+TS2-1)/TS2;
sum = 0;
for (int t = 0; t < numTiles; t++) {
for (int w = 0; w < WPT; w++) {
const int tiledRow = TS2*t + row;
const int tiledCol = TS2*t + col;
if ( i < part_size2 && (tiledCol+w*RTS) < part_size1 &&
(tiledCol+offset_j+part_size1+w*RTS) < rows && (i+offset_i) < rows ) {
Asub[col+w*RTS][row] =
A((i+offset_i),(tiledCol+offset_j+part_size1+w*RTS));
} else {
Asub[col+w*RTS][row] = 0.0;
}
if ( (j+w*RTS) < part_size1 && tiledRow < part_size2 &&
(tiledRow+offset_i) < rows && (j+offset_j+w*RTS) < rows ) {
Bsub[col+w*RTS][row] = A((tiledRow+offset_i),(j+offset_j+w*RTS));
} else {
Bsub[col+w*RTS][row] = 0.0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 0; k < TS2; k++) {
for (int w = 0; w < WPT; w++) {
acc[w] += Asub[k][row]*Bsub[col+w*RTS][k];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
for (int w = 0; w < WPT; w++) {
if ( i < part_size2 && (j+w*RTS) < part_size1 &&
i < sizeM && (j+w*RTS) < sizeM ) {
temp(i,(j+w*RTS)) = acc[w];
}
}
}
__kernel void lower_tri_inv_step3(
__global double* A,
__global int* sizes,
__global double* temp,
int repeat,
int remainder,
int part_size_fixed,
int rows) {
int n = get_global_id(2)*2;
double sum = 0;
int part_size1 = 0, part_size2 = 0;
int offset_i, offset_j;
for (int r = n*repeat; r < (n+1)*repeat; r++)
part_size1 += sizes[r];
for (int r = (n+1)*repeat; r < (n+2)*repeat; r++)
part_size2 += sizes[r];
int sizeM = repeat*(part_size_fixed+1);
offset_i = (n+1)*repeat*part_size_fixed;
offset_j = n*repeat*part_size_fixed;
if ( ((n+1)*repeat) <= remainder )
offset_i += (n+1)*repeat;
else
offset_i += remainder;
if ( (n*repeat) <= remainder )
offset_j += n*repeat;
else
offset_j += remainder;
const int row = get_local_id(0);
const int col = get_local_id(1);
const int i = TS2*get_group_id(0) + row;
const int j = TS2*get_group_id(1) + col;
__local double Asub[TS2][TS2];
__local double Bsub[TS2][TS2];
double acc[WPT];
for (int w = 0; w < WPT; w++) {
acc[w] = 0.0f;
}
const int numTiles = (part_size1+TS2-1)/TS2;
sum = 0;
for (int t = 0; t < numTiles; t++) {
for (int w = 0; w < WPT; w++) {
const int tiledRow = TS2*t + row;
const int tiledCol = TS2*t + col;
if ( i < part_size2 && (tiledCol+w*RTS) < part_size1 &&
i < sizeM && (tiledCol+w*RTS) < sizeM ) {
Asub[col+w*RTS][row] =
temp(i,(tiledCol+w*RTS));
} else {
Asub[col+w*RTS][row] = 0.0;
}
if ( (j+w*RTS) < part_size1 && (j+offset_j+w*RTS) < rows &&
(tiledRow+offset_i-part_size1) < rows ) {
Bsub[col+w*RTS][row] =
A((tiledRow+offset_i-part_size1),(j+offset_j+w*RTS));
} else {
Bsub[col+w*RTS][row] = 0.0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k = 0; k < TS2; k++) {
for (int w = 0; w < WPT; w++) {
acc[w] += Asub[k][row]*Bsub[col+w*RTS][k];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
for (int w = 0; w < WPT; w++) {
if ( i < part_size2 && (j+w*RTS) < part_size1 &&
(i+offset_i) < rows && (j+offset_j+w*RTS) < rows ) {
A((i+offset_i),(j+offset_j+w*RTS)) = -acc[w];
}
}
}
)====="
R"=====(
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#error "Double not supported by OpenCL implementation."
#endif
#define a(i,j) a[j*rows+i]
#define b(i,j) b[j*rows+i]
#define c(i,j) c[j*rows+i]
__kernel void scalar_mul_diagonal(
__global double *a,
double scalar,
unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
if ( i < rows && i < cols ) {
a(i,i) *= scalar;
}
}
__kernel void scalar_mul(
__global double *a,
__global double *b,
double scalar,
unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ( i < rows && j < cols ) {
a(i,j) = b(i,j)*scalar;
}
}
#define A(i,j) A[j*M+i]
#define B(i,j) B[j*K+i]
#define C(i,j) C[j*M+i]
#define WPT 4
#define RTS TS/WPT
__kernel void basic_multiply(const int M, const int N, const int K,
const __global double* A,
const __global double* B,
__global double* C) {
const int row = get_local_id(0);
const int col = get_local_id(1);
const int i = TS*get_group_id(0) + row;
const int j = TS*get_group_id(1) + col;
__local double Asub[TS][TS];
__local double Bsub[TS][TS];
double acc[WPT];
for (int w=0; w<WPT; w++) {
acc[w] = 0.0;
}
const int numTiles = (K+TS-1)/TS;
for (int t=0; t<numTiles; t++) {
for (int w=0; w<WPT; w++) {
const int tiled_i = TS*t + row;
const int tiled_j = TS*t + col;
if ( i < M && (tiled_j + w*RTS) < K ) {
Asub[col + w*RTS][row] = A[(tiled_j + w*RTS)*M + i];
}else{
Asub[col + w*RTS][row] = 0.0;
}
if ( tiled_i < K && (j + w*RTS) < N ) {
Bsub[col + w*RTS][row] = B[(j + w*RTS)*K + tiled_i];
}else{
Bsub[col + w*RTS][row] = 0.0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k=0; k<TS; k++) {
for (int w=0; w<WPT; w++) {
acc[w] += Asub[k][row] * Bsub[col + w*RTS][k];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
for (int w=0; w<WPT; w++) {
if ( (j + w*RTS) < N && i < M ) {
C[(j + w*RTS)*M + i] = acc[w];
}
}
}
#define WPT1 4
#define RTS1 TS1/WPT1
__kernel void multiply_self_transposed(const int M, const int N, const int K,
const __global double* A,
const __global double* B,
__global double* C) {
const int row = get_local_id(0);
const int col = get_local_id(1);
const int i = TS1*get_group_id(0) + row;
const int j = TS1*get_group_id(1) + col;
const int jMin = TS1*get_group_id(1);
const int iMax = TS1*get_group_id(0) + get_local_size(0);
__local double Asub[TS1][TS1];
__local double Bsub[TS1][TS1];
double acc[WPT1];
for (int w=0; w<WPT1; w++) {
acc[w] = 0.0;
}
const int numTiles = (K+TS1-1)/TS1;
for (int t=0; t<numTiles; t++) {
if(jMin <= iMax){
for (int w=0; w<WPT1; w++) {
const int tiled_i = TS1*t + row;
const int tiled_j = TS1*t + col;
if ( i < M && (tiled_j + w*RTS1) < K ) {
Asub[col + w*RTS1][row] = A[(tiled_j + w*RTS1)*M + i];
}else{
Asub[col + w*RTS1][row] = 0.0;
}
if ( tiled_i < K && (j + w*RTS1) < N ) {
Bsub[col + w*RTS1][row] = B[(j + w*RTS1)*K + tiled_i];
}else{
Bsub[col + w*RTS1][row] = 0.0;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k=0; k<TS1; k++) {
for (int w=0; w<WPT1; w++) {
if((j + w*RTS1)<=i){
acc[w] += Asub[k][row] * Bsub[col + w*RTS1][k];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
for (int w=0; w<WPT1; w++) {
if ( (j + w*RTS1) < N && i < M && (j + w*RTS1)<=i ) {
C[(j + w*RTS1)*M + i] = acc[w];
C[i*M + (j + w*RTS1)] = acc[w];
}
}
}
)====="
#ifdef STAN_OPENCL
#include <stan/math/prim/mat.hpp>
#include <gtest/gtest.h>
#include <CL/cl.hpp>
TEST(MathMatrix, kernel_initialize) {
stan::math::get_context();
cl::Kernel kernel_transpose = stan::math::get_kernel("transpose");
EXPECT_NO_THROW(cl::Kernel kernel_transpose
= stan::math::get_kernel("transpose"));
EXPECT_NO_THROW(cl::Kernel kernel_copy = stan::math::get_kernel("copy"));
EXPECT_NO_THROW(cl::Kernel kernel_zeros = stan::math::get_kernel("zeros"));
EXPECT_NO_THROW(cl::Kernel kernel_identity
= stan::math::get_kernel("identity"));
EXPECT_NO_THROW(cl::Kernel kernel_copy_triangular
= stan::math::get_kernel("copy_triangular"));
EXPECT_NO_THROW(cl::Kernel kernel_scalar_copy_triangular_transposed
= stan::math::get_kernel("copy_triangular_transposed"));
EXPECT_NO_THROW(cl::Kernel kernel_add = stan::math::get_kernel("add"));
EXPECT_NO_THROW(cl::Kernel kernel_subtract
= stan::math::get_kernel("subtract"));
EXPECT_NO_THROW(cl::Kernel kernel_copy_submatrix
= stan::math::get_kernel("copy_submatrix"));
EXPECT_NO_THROW(cl::Kernel kernel_scalar_mul_diagonal
= stan::math::get_kernel("scalar_mul_diagonal"));
EXPECT_NO_THROW(cl::Kernel kernel_scalar_mul
= stan::math::get_kernel("scalar_mul"));
EXPECT_NO_THROW(cl::Kernel kernel_basic_multiply
= stan::math::get_kernel("basic_multiply"));
EXPECT_NO_THROW(cl::Kernel kernel_multiply_self_transposed
= stan::math::get_kernel("multiply_self_transposed"));
EXPECT_NO_THROW(cl::Kernel kernel_lower_tri_inv_step1
= stan::math::get_kernel("lower_tri_inv_step1"));
EXPECT_NO_THROW(cl::Kernel kernel_lower_tri_inv_step2
= stan::math::get_kernel("lower_tri_inv_step2"));
EXPECT_NO_THROW(cl::Kernel kernel_lower_tri_inv_step3
= stan::math::get_kernel("lower_tri_inv_step3"));
EXPECT_NO_THROW(cl::Kernel kernel_chol_block
= stan::math::get_kernel("cholesky_block"));
EXPECT_NO_THROW(cl::Kernel kernel_check_nan
= stan::math::get_kernel("check_nan"));
EXPECT_NO_THROW(cl::Kernel kernel_check_diagonal_zeros
= stan::math::get_kernel("check_diagonal_zeros"));
EXPECT_NO_THROW(cl::Kernel kernel_check_symmetric
= stan::math::get_kernel("check_symmetric"));
}
#else
#include <gtest/gtest.h>
TEST(MathMatrix, kernel_initializeDummy) { EXPECT_NO_THROW(); }
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment