Merge branch 'opencl_context_rewrite' of https://github.com/rok-cesnovar/math...

Merge branch 'opencl_context_rewrite' of https://github.com/rok-cesnovar/math into opencl_context_rewrite

Merge branch 'opencl_context_rewrite' of https://github.com/rok-cesnovar/math...
Merge branch 'opencl_context_rewrite' of https://github.com/rok-cesnovar/math into opencl_context_rewrite
ad6c0f90 · Steve Bronder · 596a5448 · e36c820b · ad6c0f90 · ad6c0f90
Unverified Commit ad6c0f90 authored 7 years ago by Steve Bronder
Hide whitespace changes
Inline Side-by-side

Showing

with 711 additions and 3 deletions
+711 -3
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -124,7 +124,7 @@ pipeline {
                        CppLint: { sh "make cpplint" },
                        Dependencies: { sh 'make test-math-dependencies' } ,
                        Documentation: { sh 'make doxygen' },
-                        Headers: { sh "make -j${env.PARALLEL} test-headers" }
+                        Headers: { sh "make -j${env.PARALLEL} test-headers STAN_OPENCL=true" }
                    )
                }
            }
@@ -139,10 +139,11 @@ pipeline {
        stage('Tests') {
            parallel {
                stage('Unit') {
-                    agent any
+                    agent { label "gelman-group-mac" }
                    steps {
                        unstash 'MathSetup'
                        sh setupCC()
+                        sh "echo STAN_OPENCL=true>> make/local"
                        runTests("test/unit")
                    }
                    post { always { retry(3) { deleteDir() } } }

--- a/make/os_win
+++ b/make/os_win
@@ -29,10 +29,16 @@ ifeq (g++,$(CC_TYPE))
  CXXFLAGS += -Wno-unused-but-set-variable
  LDLIBS += -static-libgcc
  LDLIBS += -static-libstdc++
+  ifdef STAN_OPENCL
+    LDFLAGS += -lOpenCL
+  endif
 endif
 ifeq (mingw32-g++,$(CC_TYPE))
  LDLIBS += -static-libgcc
 #  LDLIBS += -static-libstdc++
+  ifdef STAN_OPENCL
+    LDFLAGS += -lOpenCL
+  endif
 endif

 ifdef STAN_OPENCL

--- a/make/tests
+++ b/make/tests
@@ -13,7 +13,7 @@ $(GTEST)/src/gtest-all.o: CXXFLAGS += $(GTEST_CXXFLAGS)

 test/%$(EXE) : CXXFLAGS += $(GTEST_CXXFLAGS)
 test/%$(EXE) : test/%.o $(GTEST_MAIN) $(GTEST)/src/gtest-all.o
-	$(LINK.cpp) -o $@ $^
+	$(LINK.cpp) -o $@ $^ $(LDLIBS_OPENCL)


 ##

--- a/stan/math/prim/mat/fun/kern_gpu/basic_matrix.cl
+++ b/stan/math/prim/mat/fun/kern_gpu/basic_matrix.cl
+R"=====(
+
+#define A(i,j)  A[j*rows+i]
+#define AT(i,j)  A[j*cols+i]
+#define B(i,j)  B[j*rows+i]
+#define BT(i,j)  B[j*cols+i]
+#define C(i,j)  C[j*rows+i]
+
+#ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#else
+    #error "Double not supported by OpenCL implementation."
+#endif
+
+__kernel void transpose(
+          __global double *B,
+          __global double *A,
+          int rows,
+          int cols ) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( i < rows && j < cols ) {
+     BT(j,i) = A(i,j);     
+    }
+}
+
+__kernel void copy(
+          __global double *A,
+          __global double *B,
+          unsigned int rows,
+          unsigned int cols) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( i < rows && j < cols ) {
+     B(i,j) = A(i,j);
+    }
+}
+
+__kernel void zeros(
+          __global double *A,
+          unsigned int rows,
+          unsigned int cols,
+          unsigned int part) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( i < rows && j < cols ) {
+     if ( part == 0 && j < i ) {
+       A(i,j) = 0;
+     } else if ( part == 1 && j > i ) {
+       A(i,j) = 0;
+     } else if ( part == 2 ) {
+       A(i,j) = 0;
+     }
+    }
+}
+
+__kernel void identity(
+          __global double *A,
+          unsigned int rows,
+          unsigned int cols) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( i < rows && j < cols ) {
+       if ( i == j ) {
+         A(i,j) = 1.0;
+       } else {
+         A(i,j) = 0.0;
+       }
+    }
+}
+
+__kernel void copy_triangular(
+          __global double *A,
+          __global double *B,
+          unsigned int rows,
+          unsigned int cols,
+          unsigned int lower_upper) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( i < rows && j < cols ) {
+     if ( !lower_upper && j <= i ) {
+       A(i,j) = B(i,j);
+     } else if ( !lower_upper ) {
+       A(i,j) = 0;
+     } else if ( lower_upper && j >= i ) {
+       A(i,j) = B(i,j);
+     } else if ( lower_upper && j < i ) {
+       A(i,j) = 0;
+     }
+    }
+}
+
+__kernel void copy_triangular_transposed(
+          __global double *A,
+          unsigned int rows,
+          unsigned int cols,
+          unsigned int lower_to_upper) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( i < rows && j < cols ) {
+     if ( lower_to_upper && j > i ) {
+       AT(j,i) = A(i,j);
+     } else if ( !lower_to_upper && j > i ) {
+       A(i,j) = AT(j,i);
+     }
+    }
+}
+
+__kernel void add(
+          __global double *C,
+          __global double *A,
+          __global double *B,
+          unsigned int rows,
+          unsigned int cols) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( i < rows && j < cols ) {
+      C(i,j) = A(i,j) + B(i,j);
+    }
+}
+
+__kernel void subtract(
+          __global double *C,
+          __global double *A,
+          __global double *B,
+          unsigned int rows,
+          unsigned int cols) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( i < rows && j < cols ) {
+     C(i,j) = A(i,j)-B(i,j);
+    }
+}
+
+#define src(i,j) src[j*src_rows+i]
+#define dst(i,j) dst[j*dst_rows+i]
+
+__kernel void copy_submatrix(
+          __global double *src,
+          __global double *dst,
+          unsigned int src_offset_i,
+          unsigned int src_offset_j,
+          unsigned int dst_offset_i,
+          unsigned int dst_offset_j,
+          unsigned int size_i,
+          unsigned int size_j,
+          unsigned int src_rows,
+          unsigned int src_cols,
+          unsigned int dst_rows,
+          unsigned int dst_cols) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( (i+src_offset_i) < src_rows && (j+src_offset_j) < src_cols
+      && (i+dst_offset_i) < dst_rows && (j+dst_offset_j) < dst_cols ) {
+      dst((dst_offset_i+i), (dst_offset_j+j)) =
+          src((src_offset_i+i),(src_offset_j+j));
+    }
+}
+
+
+)====="
--- a/stan/math/prim/mat/fun/kern_gpu/check_gpu.cl
+++ b/stan/math/prim/mat/fun/kern_gpu/check_gpu.cl
+R"=====(
+#ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#else
+    #error "Double not supported by OpenCL implementation."
+#endif
+
+#define A(i,j)  A[j*rows+i]
+
+__kernel void check_nan(
+            __global double *A,
+            int rows,
+            int cols,
+            __global int *flag) {
+  const int i = get_global_id(0);
+  const int j = get_global_id(1);
+  if( i < rows && j < cols ) { 
+    if (isnan(A(i,j))) {
+      flag[0] = 1;
+    }
+  }
+}
+
+__kernel void check_diagonal_zeros(
+              __global double *A,
+              int rows,
+              int cols,
+              __global int *flag) {
+  const int i = get_global_id(0);
+  if( i < rows && i < cols ) { 
+    if (A(i,i) == 0) {
+      flag[0] = 1;
+    }
+  }
+}
+
+__kernel void check_symmetric(
+            __global double *A,
+            int rows,
+            int cols,
+            __global int *flag,
+            double tolerance) {
+  const int i = get_global_id(0);
+  const int j = get_global_id(1);
+  if( i < rows && j < cols ) { 
+    double diff = fabs(A(i,j)-A(j,i));
+    if ( diff > tolerance ) {
+      flag[0] = 1;
+    }
+  }
+}
+
+)====="
--- a/stan/math/prim/mat/fun/kern_gpu/cholesky_decomposition.cl
+++ b/stan/math/prim/mat/fun/kern_gpu/cholesky_decomposition.cl
+R"=====(
+#ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#else
+    #error "Double not supported by OpenCL implementation."
+#endif
+
+#define V(i,j) V[j*n+i]
+#define d(i,j) d[j*n+i]
+
+__kernel void cholesky_block(
+                __global double *V,
+                __global double *d,
+                int n) {
+  int f = get_local_id(0);
+  for (int j = 0; j <n; j++) { 
+      double ss = 0;
+      for (int k = 0; k < j; k++) {
+          ss += V(j,k) * V(j,k);
+      }
+      V(j,j) = sqrt(d(j,j) - ss);
+      barrier(CLK_LOCAL_MEM_FENCE);
+      int i=f;
+      if ( i >= (j+1) && i < n ){
+          double s = 0;
+          for (int k = 0; k < j; k++) {
+              s += V(i,k) * V(j,k);
+          }
+          V(i,j) = (1.0 / V(j,j) * (d(i,j) - s));
+      }
+  }
+}
+)====="
--- a/stan/math/prim/mat/fun/kern_gpu/matrix_inverse.cl
+++ b/stan/math/prim/mat/fun/kern_gpu/matrix_inverse.cl
+R"=====(
+#ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#else
+    #error "Double not supported by OpenCL implementation."
+#endif
+#ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#else
+    #error "Double precision not supported by OpenCL implementation."
+#endif
+
+#define A(i,j) A[j*rows+i]
+#define V(i,j) V[offset+j*(part_size_fixed+1)+i] 
+
+__kernel void lower_tri_inv_step1(
+                __global double* A,
+                __global double* V,
+                int remainder,
+                int part_size_fixed,
+                int rows) {
+  int indeks = get_global_id(0);
+  int i = indeks*part_size_fixed;
+  int part_size;
+  double faktor;
+  if ( indeks < remainder ) {
+    i += indeks;
+    part_size = part_size_fixed+1;
+  } else {
+    i += remainder;
+    part_size = part_size_fixed;
+  }
+  int offset = indeks*(part_size_fixed+1)*(part_size_fixed+1);
+
+  for (int p = 0; p < part_size; p++) {
+    for (int r = 0; r < part_size; r++) {
+      if ( p == r )
+        V(p,r) = 1;
+      else
+        V(p,r) = 0;
+    }
+  }
+
+  for (unsigned int ii = 0; ii < part_size; ii++) {
+    if ( ii > 0 ) {
+      for (unsigned int j = ii; j < part_size; j++) {
+        faktor = A((j+i),(i+ii-1));
+        for (unsigned int k = 0; k < part_size; k++) {
+          V(j,k) -= faktor*V((ii-1),k);
+        }
+      }
+    }
+    faktor = A((ii+i),(ii+i));
+    for (unsigned int k = 0; k < part_size; k++) {
+      V(ii,k) /= faktor;
+    }
+  }
+  for (int p = 0; p < part_size; p++) {
+    for (int r=0; r < part_size; r++) {
+      A((p+i),(i+r)) = V(p,r);
+    }
+  }
+}
+
+#define temp(i,j) temp[(n/2)*(sizeM)*(sizeM)+j*part_size2+i]
+
+#define WPT 4
+#define RTS TS2/WPT
+__kernel void lower_tri_inv_step2(
+                __global double* A,
+                __global int* sizes,
+                __global double* temp,
+                int repeat,
+                int remainder,
+                int part_size_fixed,
+                int rows) {
+ int n = get_global_id(2)*2;
+ double sum = 0;
+ int part_size1 = 0, part_size2 = 0;
+ int offset_i, offset_j;
+ for (int r = n*repeat; r < (n+1)*repeat; r++)
+  part_size1 += sizes[r];
+ for (int r = (n+1)*repeat; r < (n+2)*repeat; r++)
+  part_size2 += sizes[r];
+ int sizeM = repeat*(part_size_fixed+1);
+ offset_i = (n+1)*repeat*part_size_fixed;
+ offset_j = n*repeat*part_size_fixed;
+ if ( ((n+1)*repeat) <= remainder )
+  offset_i += (n+1)*repeat;
+ else
+  offset_i += remainder;
+ if ( (n*repeat) <= remainder )
+  offset_j += n*repeat;
+ else
+  offset_j += remainder;
+ const int row = get_local_id(0);
+ const int col = get_local_id(1);
+  const int i = TS2*get_group_id(0) + row;
+  const int j = TS2*get_group_id(1) + col;
+  __local double Asub[TS2][TS2];
+  __local double Bsub[TS2][TS2];
+  double acc[WPT];
+  for (int w = 0; w < WPT; w++) {
+      acc[w] = 0.0;
+  }
+
+ const int numTiles = (part_size2+TS2-1)/TS2;
+ sum = 0;
+ for (int t = 0; t < numTiles; t++) {
+  for (int w = 0; w < WPT; w++) {
+   const int tiledRow = TS2*t + row;
+   const int tiledCol = TS2*t + col;
+
+   if ( i < part_size2 && (tiledCol+w*RTS) < part_size1 &&
+      (tiledCol+offset_j+part_size1+w*RTS) < rows  && (i+offset_i) < rows ) {
+    Asub[col+w*RTS][row] =
+      A((i+offset_i),(tiledCol+offset_j+part_size1+w*RTS));
+   } else {
+    Asub[col+w*RTS][row] = 0.0;
+   }
+
+   if ( (j+w*RTS) < part_size1 && tiledRow < part_size2 &&
+        (tiledRow+offset_i) < rows && (j+offset_j+w*RTS) < rows ) {
+    Bsub[col+w*RTS][row] = A((tiledRow+offset_i),(j+offset_j+w*RTS));
+   } else {
+    Bsub[col+w*RTS][row] = 0.0;
+   }
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int k = 0; k < TS2; k++) {
+   for (int w = 0; w < WPT; w++) {
+    acc[w] += Asub[k][row]*Bsub[col+w*RTS][k];
+   }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+ }
+
+ for (int w = 0; w < WPT; w++) {
+  if ( i < part_size2 && (j+w*RTS) < part_size1 &&
+       i < sizeM && (j+w*RTS) < sizeM ) {
+   temp(i,(j+w*RTS)) = acc[w];
+  }
+ }
+}
+
+__kernel void lower_tri_inv_step3(
+                __global double* A,
+                __global int* sizes,
+                __global double* temp,
+                int repeat,
+                int remainder,
+                int part_size_fixed,
+                int rows) {
+ int n = get_global_id(2)*2;
+ double sum = 0;
+ int part_size1 = 0, part_size2 = 0;
+ int offset_i, offset_j;
+ for (int r = n*repeat; r < (n+1)*repeat; r++)
+  part_size1 += sizes[r];
+ for (int r = (n+1)*repeat; r < (n+2)*repeat; r++)
+  part_size2 += sizes[r];
+
+ int sizeM = repeat*(part_size_fixed+1);
+ offset_i = (n+1)*repeat*part_size_fixed;
+ offset_j = n*repeat*part_size_fixed;
+ if ( ((n+1)*repeat) <= remainder )
+  offset_i += (n+1)*repeat;
+ else
+  offset_i += remainder;
+
+ if ( (n*repeat) <= remainder )
+  offset_j += n*repeat;
+ else
+  offset_j += remainder;
+
+ const int row = get_local_id(0);
+ const int col = get_local_id(1);
+  const int i = TS2*get_group_id(0) + row;
+  const int j = TS2*get_group_id(1) + col;
+
+  __local double Asub[TS2][TS2];
+  __local double Bsub[TS2][TS2];
+
+  double acc[WPT];
+  for (int w = 0; w < WPT; w++) {
+      acc[w] = 0.0f;
+  }
+
+  const int numTiles = (part_size1+TS2-1)/TS2;
+
+ sum = 0;
+ for (int t = 0; t < numTiles; t++) {
+  for (int w = 0; w < WPT; w++) {
+   const int tiledRow = TS2*t + row;
+   const int tiledCol = TS2*t + col;
+   if ( i < part_size2 && (tiledCol+w*RTS) < part_size1 &&
+        i < sizeM && (tiledCol+w*RTS) < sizeM ) {
+    Asub[col+w*RTS][row] =
+        temp(i,(tiledCol+w*RTS));
+   } else {
+    Asub[col+w*RTS][row] = 0.0;
+   }
+   if ( (j+w*RTS) < part_size1 && (j+offset_j+w*RTS) < rows &&
+        (tiledRow+offset_i-part_size1) < rows ) {
+    Bsub[col+w*RTS][row] =
+        A((tiledRow+offset_i-part_size1),(j+offset_j+w*RTS));
+   } else {
+    Bsub[col+w*RTS][row] = 0.0;
+   }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  for (int k = 0; k < TS2; k++) {
+   for (int w = 0; w < WPT; w++) {
+    acc[w] += Asub[k][row]*Bsub[col+w*RTS][k];
+   }
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+ }
+ for (int w = 0; w < WPT; w++) {
+  if ( i < part_size2 && (j+w*RTS) < part_size1 &&
+      (i+offset_i) < rows && (j+offset_j+w*RTS) < rows ) {
+   A((i+offset_i),(j+offset_j+w*RTS)) = -acc[w];
+  }
+ }
+}
+)====="
--- a/stan/math/prim/mat/fun/kern_gpu/matrix_multiply.cl
+++ b/stan/math/prim/mat/fun/kern_gpu/matrix_multiply.cl
+R"=====(
+#ifdef cl_khr_fp64
+    #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)
+    #pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#else
+    #error "Double not supported by OpenCL implementation."
+#endif
+
+#define a(i,j)  a[j*rows+i]
+#define b(i,j)  b[j*rows+i]
+#define c(i,j)  c[j*rows+i]
+
+__kernel void scalar_mul_diagonal(
+          __global double *a,
+          double scalar,
+          unsigned int rows,
+          unsigned int cols) {
+    int i = get_global_id(0);
+    if ( i < rows && i < cols ) {
+     a(i,i) *= scalar;
+    }
+}
+
+__kernel void scalar_mul(
+          __global double *a,
+          __global double *b,
+          double scalar,
+          unsigned int rows,
+          unsigned int cols) {
+    int i = get_global_id(0);
+    int j = get_global_id(1);
+    if ( i < rows && j < cols ) {
+     a(i,j) = b(i,j)*scalar;
+    }
+}
+
+#define A(i,j)  A[j*M+i]
+#define B(i,j)  B[j*K+i]
+#define C(i,j)  C[j*M+i]
+
+#define WPT 4
+#define RTS TS/WPT
+__kernel void basic_multiply(const int M, const int N, const int K,
+                      const __global double* A,
+                      const __global double* B,
+                      __global double* C) {
+        
+    const int row = get_local_id(0);
+    const int col = get_local_id(1);
+    const int i = TS*get_group_id(0) + row;
+    const int j = TS*get_group_id(1) + col;
+    
+    __local double Asub[TS][TS];
+    __local double Bsub[TS][TS];
+ 
+    double acc[WPT];
+    for (int w=0; w<WPT; w++) {
+        acc[w] = 0.0;
+    }
+    
+    const int numTiles = (K+TS-1)/TS;
+    for (int t=0; t<numTiles; t++) {
+        for (int w=0; w<WPT; w++) {
+            const int tiled_i = TS*t + row;
+            const int tiled_j = TS*t + col;
+            if ( i < M && (tiled_j + w*RTS) < K ) {
+              Asub[col + w*RTS][row] = A[(tiled_j + w*RTS)*M + i];
+            }else{
+              Asub[col + w*RTS][row] = 0.0;
+            }
+            if ( tiled_i < K && (j + w*RTS) < N ) {
+              Bsub[col + w*RTS][row] = B[(j + w*RTS)*K + tiled_i];
+            }else{
+              Bsub[col + w*RTS][row] = 0.0;
+            }
+        }
+        
+        
+        barrier(CLK_LOCAL_MEM_FENCE);
+ 
+        for (int k=0; k<TS; k++) {
+            for (int w=0; w<WPT; w++) {
+                acc[w] += Asub[k][row] * Bsub[col + w*RTS][k];
+            }
+        }
+ 
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+ 
+    for (int w=0; w<WPT; w++) {
+      if ( (j + w*RTS) < N && i < M ) {
+        C[(j + w*RTS)*M + i] = acc[w];
+      }
+    }
+}
+
+#define WPT1 4
+#define RTS1 TS1/WPT1
+__kernel void multiply_self_transposed(const int M, const int N, const int K,
+                      const __global double* A,
+                      const __global double* B,
+                      __global double* C) {
+    
+    const int row = get_local_id(0); 
+    const int col = get_local_id(1); 
+    const int i = TS1*get_group_id(0) + row; 
+    const int j = TS1*get_group_id(1) + col; 
+    const int jMin = TS1*get_group_id(1);
+    const int iMax = TS1*get_group_id(0)  + get_local_size(0);
+    
+    __local double Asub[TS1][TS1];
+    __local double Bsub[TS1][TS1];
+ 
+    double acc[WPT1];
+    for (int w=0; w<WPT1; w++) {
+        acc[w] = 0.0;
+    }
+    
+    
+    const int numTiles = (K+TS1-1)/TS1;
+    for (int t=0; t<numTiles; t++) {
+        if(jMin <= iMax){
+        
+        for (int w=0; w<WPT1; w++) {
+            const int tiled_i = TS1*t + row;
+            const int tiled_j = TS1*t + col;
+            if ( i < M && (tiled_j + w*RTS1) < K ) {
+              Asub[col + w*RTS1][row] = A[(tiled_j + w*RTS1)*M + i];
+            }else{
+              Asub[col + w*RTS1][row] = 0.0;
+            }
+            if ( tiled_i < K && (j + w*RTS1) < N ) {
+              Bsub[col + w*RTS1][row] = B[(j + w*RTS1)*K + tiled_i];
+            }else{
+              Bsub[col + w*RTS1][row] = 0.0;
+            }
+        }
+        }
+        
+        barrier(CLK_LOCAL_MEM_FENCE);
+ 
+        
+        for (int k=0; k<TS1; k++) {
+            for (int w=0; w<WPT1; w++) {
+              if((j + w*RTS1)<=i){
+                acc[w] += Asub[k][row] * Bsub[col + w*RTS1][k];
+              }
+            }
+        }
+ 
+        
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+ 
+    for (int w=0; w<WPT1; w++) {
+      if ( (j + w*RTS1) < N && i < M  && (j + w*RTS1)<=i ) {
+        C[(j + w*RTS1)*M + i] = acc[w];
+        C[i*M + (j + w*RTS1)] = acc[w];
+      }
+    }
+ }
+)====="
--- a/test/unit/math/prim/mat/fun/kernel_initialization_test.cpp
+++ b/test/unit/math/prim/mat/fun/kernel_initialization_test.cpp
+#ifdef STAN_OPENCL
+#include <stan/math/prim/mat.hpp>
+#include <gtest/gtest.h>
+#include <CL/cl.hpp>
+
+TEST(MathMatrix, kernel_initialize) {
+  stan::math::get_context();
+  cl::Kernel kernel_transpose = stan::math::get_kernel("transpose");
+  EXPECT_NO_THROW(cl::Kernel kernel_transpose
+                  = stan::math::get_kernel("transpose"));
+  EXPECT_NO_THROW(cl::Kernel kernel_copy = stan::math::get_kernel("copy"));
+  EXPECT_NO_THROW(cl::Kernel kernel_zeros = stan::math::get_kernel("zeros"));
+  EXPECT_NO_THROW(cl::Kernel kernel_identity
+                  = stan::math::get_kernel("identity"));
+  EXPECT_NO_THROW(cl::Kernel kernel_copy_triangular
+                  = stan::math::get_kernel("copy_triangular"));
+  EXPECT_NO_THROW(cl::Kernel kernel_scalar_copy_triangular_transposed
+                  = stan::math::get_kernel("copy_triangular_transposed"));
+  EXPECT_NO_THROW(cl::Kernel kernel_add = stan::math::get_kernel("add"));
+  EXPECT_NO_THROW(cl::Kernel kernel_subtract
+                  = stan::math::get_kernel("subtract"));
+  EXPECT_NO_THROW(cl::Kernel kernel_copy_submatrix
+                  = stan::math::get_kernel("copy_submatrix"));
+  EXPECT_NO_THROW(cl::Kernel kernel_scalar_mul_diagonal
+                  = stan::math::get_kernel("scalar_mul_diagonal"));
+  EXPECT_NO_THROW(cl::Kernel kernel_scalar_mul
+                  = stan::math::get_kernel("scalar_mul"));
+  EXPECT_NO_THROW(cl::Kernel kernel_basic_multiply
+                  = stan::math::get_kernel("basic_multiply"));
+  EXPECT_NO_THROW(cl::Kernel kernel_multiply_self_transposed
+                  = stan::math::get_kernel("multiply_self_transposed"));
+  EXPECT_NO_THROW(cl::Kernel kernel_lower_tri_inv_step1
+                  = stan::math::get_kernel("lower_tri_inv_step1"));
+  EXPECT_NO_THROW(cl::Kernel kernel_lower_tri_inv_step2
+                  = stan::math::get_kernel("lower_tri_inv_step2"));
+  EXPECT_NO_THROW(cl::Kernel kernel_lower_tri_inv_step3
+                  = stan::math::get_kernel("lower_tri_inv_step3"));
+  EXPECT_NO_THROW(cl::Kernel kernel_chol_block
+                  = stan::math::get_kernel("cholesky_block"));
+  EXPECT_NO_THROW(cl::Kernel kernel_check_nan
+                  = stan::math::get_kernel("check_nan"));
+  EXPECT_NO_THROW(cl::Kernel kernel_check_diagonal_zeros
+                  = stan::math::get_kernel("check_diagonal_zeros"));
+  EXPECT_NO_THROW(cl::Kernel kernel_check_symmetric
+                  = stan::math::get_kernel("check_symmetric"));
+}
+
+#else
+
+#include <gtest/gtest.h>
+TEST(MathMatrix, kernel_initializeDummy) { EXPECT_NO_THROW(); }
+#endif