Merge branch 'develop' of https://github.com/stan-dev/math into feature/issue-838-linseq

6adb7122 · Rayleigh L · 44a66ffe · 159f3dfe · 44a66ffe · 6adb7122
Commit 6adb7122 authored 6 years ago by Rayleigh L
Hide whitespace changes
Inline Side-by-side

Showing

with 725 additions and 258 deletions
+725 -258
--- a/stan/math/gpu/kernels/check_nan_kernel.cl
+++ b/stan/math/gpu/kernels/check_nan_kernel.cl
-R"(
-#ifndef A
-#define A(i, j)  A[j * rows + i]
-#endif
-/**
- * Check if the <code>matrix_gpu</code> has NaN values
- *
- * @param[in] A The matrix to check.
- * @param rows The number of rows in matrix A.
- * @param cols The number of columns in matrix A.
- * @param[out] flag the flag to be written to if any diagonal is zero.
- *
- * @note Kernel for stan/math/gpu/err/check_nan.hpp
- */
-__kernel void is_nan(__global double *A, int rows, int cols,
-  __global int *flag) {
-  const int i = get_global_id(0);
-  const int j = get_global_id(1);
-  if (i < rows && j < cols) {
-    if (isnan(A(i, j))) {
-      flag[0] = 1;
-    }
-  }
-};)"
--- a/stan/math/gpu/kernels/check_symmetric.hpp
+++ b/stan/math/gpu/kernels/check_symmetric.hpp
+#ifndef STAN_MATH_GPU_KERNELS_CHECK_SYMMETRIC_HPP
+#define STAN_MATH_GPU_KERNELS_CHECK_SYMMETRIC_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char *is_symmetric_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Check if the <code>matrix_gpu</code> is symmetric
+     *
+     * @param[in] A The matrix to check.
+     * @param rows The number of rows in matrix A.
+     * @param cols The number of columns in matrix A.
+     * @param[out] flag the flag to be written to if any diagonal is zero.
+     * @param tolerance The numerical tolerance to check wheter
+     *   two values are equal
+     * @note Code is a <code>const char*</code> held in
+     * <code>is_symmetric_kernel_code.</code>
+     * Kernel for stan/math/gpu/err/check_symmetric.hpp.
+     *  This kernel uses the helper macros available in helpers.cl.
+     */
+    __kernel void is_symmetric(
+        __global read_only double *A, __global write_only int *flag,
+        read_only unsigned int rows, read_only unsigned int cols,
+        read_only double tolerance) {
+      const int i = get_global_id(0);
+      const int j = get_global_id(1);
+      if (i < rows && j < cols) {
+        double diff = fabs(A(i, j) - A(j, i));
+        if (diff > tolerance) {
+          flag[0] = 0;
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/check_symmetric.hpp check_symmetric() \endlink
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, int, int, const double>
+    check_symmetric("is_symmetric", is_symmetric_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/check_symmetric_kernel.cl
+++ b/stan/math/gpu/kernels/check_symmetric_kernel.cl
-R"(
-#ifndef A
-#define A(i, j)  A[j * rows + i]
-#endif
-/**
- * Check if the <code>matrix_gpu</code> is symmetric
- *
- * @param[in] A The matrix to check.
- * @param rows The number of rows in matrix A.
- * @param cols The number of columns in matrix A.
- * @param[out] flag the flag to be written to if any diagonal is zero.
- *
- * @note Kernel for stan/math/gpu/err/check_symmetric.hpp
- */
-__kernel void is_symmetric(__global double *A, int rows, int cols,
-  __global int *flag, double tolerance) {
-  const int i = get_global_id(0);
-  const int j = get_global_id(1);
-  if (i < rows && j < cols) {
-    double diff = fabs(A(i, j) - A(j, i));
-    if (diff > tolerance) {
-      flag[0] = 0;
-    }
-  }
-};)"
--- a/stan/math/gpu/kernels/copy.hpp
+++ b/stan/math/gpu/kernels/copy.hpp
+#ifndef STAN_MATH_GPU_KERNELS_COPY_HPP
+#define STAN_MATH_GPU_KERNELS_COPY_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char *copy_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Copy one matrix to another
+     * @param[in] A The matrix to copy.
+     * @param[out] B The matrix to copy A to.
+     * @param rows The number of rows in A.
+     * @param cols The number of cols in A.
+     * @note Code is a <code>const char*</code> held in
+     * <code>copy_kernel_code.</code>
+     * Kernel used in math/gpu/matrix_gpu.hpp.
+     *  This kernel uses the helper macros available in helpers.cl.
+     */
+    __kernel void copy(
+        __global read_only double *A, __global write_only double *B,
+        read_only unsigned int rows, read_only unsigned int cols) {
+      int i = get_global_id(0);
+      int j = get_global_id(1);
+      if (i < rows && j < cols) {
+        B(i, j) = A(i, j);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/copy.hpp copy() \endlink
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, int, int> copy(
+    "copy", copy_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/copy_matrix_kernel.cl
+++ b/stan/math/gpu/kernels/copy_matrix_kernel.cl
-R"(
-#ifndef A
-#define A(i, j)  A[j * rows + i]
-#endif
-
-#ifndef B
-#define B(i, j)  B[j * rows + i]
-#endif
-/**
- * Copy one matrix to another
- * @param[in] A The matrix to copy.
- * @param[out] B The matrix to copy A to.
- * @param rows The number of rows in A.
- * @param cols The number of cols in A.
- *
- * @note Kernel used in math/gpu/matrix_gpu.hpp
- */
-__kernel void copy(__global double *A, __global double *B, unsigned int rows,
-  unsigned int cols) {
-  int i = get_global_id(0);
-  int j = get_global_id(1);
-  if (i < rows && j < cols) {
-    B(i, j) = A(i, j);
-  }
-};)"
--- a/stan/math/gpu/kernels/copy_submatrix_kernel.cl
+++ b/stan/math/gpu/kernels/copy_submatrix_kernel.cl
-R"(
-#ifndef src
-#define src(i,j) src[j * src_rows + i]
-#endif
-#ifndef dst
-#define dst(i,j) dst[j * dst_rows + i]
-#endif
-/**
- * Copies a submatrix of the source matrix to
- * the destination matrix. The submatrix to copy
- * starts at (0, 0)
- * and is of size size_rows x size_cols.
- * The submatrix is copied to the
- * destination matrix starting at
- * (dst_offset_rows, dst_offset_cols)
- *
- * @param[in] src The source matrix.
- * @param[out] dst The destination submatrix.
- * @param dst_offset_i The offset row in dst.
- * @param dst_offset_j The offset column in dst.
- * @param size_i The number of rows in the submatrix.
- * @param size_j The number of columns in the submatrix.
- * @param src_rows The number of rows in the source matrix.
- * @param src_cols The number of cols in the source matrix.
- * @param src_rows The number of rows in the destination matrix.
- * @param dst_cols The number of cols in the destination matrix.
- *
- * @note used in math/gpu/copy_submatrix_opencl.hpp
- */
-__kernel void copy_submatrix(__global double *src, __global double *dst,
-  unsigned int dst_offset_i, unsigned int dst_offset_j,
-  unsigned int size_i, unsigned int size_j,
-  unsigned int src_rows, unsigned int src_cols,
-  unsigned int dst_rows, unsigned int dst_cols) {
-  int i = get_global_id(0);
-  int j = get_global_id(1);
-  if ((i + dst_offset_i) < dst_rows && (j + dst_offset_j) < dst_cols) {
-    dst((dst_offset_i + i), (dst_offset_j + j)) =
-      src((0 + i),(0 + j));
-  }
-};)"
--- a/stan/math/gpu/kernels/copy_triangular.hpp
+++ b/stan/math/gpu/kernels/copy_triangular.hpp
+#ifndef STAN_MATH_GPU_KERNELS_COPY_TRIANGULAR_HPP
+#define STAN_MATH_GPU_KERNELS_COPY_TRIANGULAR_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char *copy_triangular_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Copies the lower or upper
+     * triangular of the source matrix to
+     * the destination matrix.
+     * Both matrices are stored on the GPU.
+     *
+     * @param[out] A Output matrix to copy triangular to.
+     * @param[in] B The matrix to copy the triangular from.
+     * @param rows The number of rows of B.
+     * @param cols The number of cols of B.
+     * @param lower_upper determines
+     * which part of the matrix to copy:
+     *  LOWER: 0 - copies the lower triangular
+     *  UPPER: 1 - copes the upper triangular
+     * @note Code is a <code>const char*</code> held in
+     * <code>copy_triangular_kernel_code.</code>
+     * Used in math/gpu/copy_triangular_opencl.hpp.
+     *  This kernel uses the helper macros available in helpers.cl.
+     */
+    __kernel void copy_triangular(
+        __global write_only double *A, __global read_only double *B,
+        read_only unsigned int rows, read_only unsigned int cols,
+        read_only unsigned int lower_upper) {
+      int i = get_global_id(0);
+      int j = get_global_id(1);
+      if (i < rows && j < cols) {
+        if (!lower_upper && j <= i) {
+          A(i, j) = B(i, j);
+        } else if (!lower_upper) {
+          A(i, j) = 0;
+        } else if (lower_upper && j >= i) {
+          A(i, j) = B(i, j);
+        } else if (lower_upper && j < i) {
+          A(i, j) = 0;
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/copy_triangular.hpp copy_triangular() \endlink
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, int, int, TriangularViewGPU>
+    copy_triangular("copy_triangular", copy_triangular_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/copy_triangular_matrix_kernel.cl
+++ b/stan/math/gpu/kernels/copy_triangular_matrix_kernel.cl
-R"(
-#ifndef A
-#define A(i, j)  A[j * rows + i]
-#endif
-
-#ifndef B
-#define B(i, j)  B[j * rows + i]
-#endif
-/**
- * Copies the lower or upper
- * triangular of the source matrix to
- * the destination matrix.
- * Both matrices are stored on the GPU.
- *
- * @param[out] A Output matrix to copy triangular to.
- * @param[in] B The matrix to copy the triangular from.
- * @param rows The number of rows of B.
- * @param cols The number of cols of B.
- * @param lower_upper determines
- * which part of the matrix to copy:
- *  LOWER: 0 - copies the lower triangular
- *  UPPER: 1 - copes the upper triangular
- *
- * @note Used in math/gpu/copy_triangular_opencl.hpp
- */
-__kernel void copy_triangular(__global double *A, __global double *B,
-  unsigned int rows, unsigned int cols, unsigned int lower_upper) {
-  int i = get_global_id(0);
-  int j = get_global_id(1);
-  if (i < rows && j < cols) {
-    if (!lower_upper && j <= i) {
-      A(i, j) = B(i, j);
-    } else if (!lower_upper) {
-      A(i, j) = 0;
-    } else if (lower_upper && j >= i) {
-      A(i, j) = B(i, j);
-    } else if (lower_upper && j < i) {
-      A(i, j) = 0;
-    }
-  }
-};)"
--- a/stan/math/gpu/kernels/helpers.hpp
+++ b/stan/math/gpu/kernels/helpers.hpp
+#ifndef STAN_MATH_GPU_KERNELS_HELPERS_HPP
+#define STAN_MATH_GPU_KERNELS_HELPERS_HPP
+#ifdef STAN_OPENCL
+
+#include <string>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+
+/*
+ * Defines some helper macros for the kernels
+ */
+std::string helpers =
+    R"(
+    // Matrix access helpers
+  #ifndef A
+  #define A(i,j) A[j * rows + i]
+  #endif
+  #ifndef B
+  #define B(i,j) B[j * rows + i]
+  #endif
+  #ifndef C
+  #define C(i,j) C[j * rows + i]
+  #endif
+    // Transpose
+  #ifndef BT
+  #define BT(i,j) B[j * cols + i]
+  #endif
+  #ifndef AT
+  #define AT(i,j) A[j * cols + i]
+  #endif
+    // Moving between two buffers
+  #ifndef src
+  #define src(i,j) src[j * src_rows + i]
+  #endif
+  #ifndef dst
+  #define dst(i,j) dst[j * dst_rows + i]
+  #endif
+
+  // The local memory column for each thread block
+  #define THREAD_BLOCK_SIZE_COL THREAD_BLOCK_SIZE/WORK_PER_THREAD
+
+  )";
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/identity.hpp
+++ b/stan/math/gpu/kernels/identity.hpp
+#ifndef STAN_MATH_GPU_KERNELS_IDENTITY_HPP
+#define STAN_MATH_GPU_KERNELS_IDENTITY_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char* identity_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Makes an identity matrix on the GPU
+     *
+     * @param[in,out] A The identity matrix output.
+     * @param rows The number of rows for A.
+     * @param cols The number of cols for A.
+     * @note Code is a <code>const char*</code> held in
+     * <code>identity_kernel_code.</code>
+     *  Used in math/gpu/identity_opencl.hpp.
+     *  This kernel uses the helper macros available in helpers.cl.
+     */
+    __kernel void identity(__global write_only double* A,
+                           read_only unsigned int rows,
+                           read_only unsigned int cols) {
+      int i = get_global_id(0);
+      int j = get_global_id(1);
+      if (i < rows && j < cols) {
+        if (i == j) {
+          A(i, j) = 1.0;
+        } else {
+          A(i, j) = 0.0;
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/transpose.hpp transpose() \endlink
+ */
+const global_range_kernel<cl::Buffer, int, int> identity("identity",
+                                                         identity_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/identity_matrix_kernel.cl
+++ b/stan/math/gpu/kernels/identity_matrix_kernel.cl
-R"(
-#ifndef A
-#define A(i, j)  A[j * rows + i]
-#endif
-
-/**
- * Makes an identity matrix on the GPU
- *
- * @param[in,out] A The identity matrix output.
- * @param rows The number of rows for A.
- * @param cols The number of cols for A.
- *
- * @note Used in math/gpu/identity_opencl.hpp
- */
-__kernel void identity(__global double *A, unsigned int rows,
-  unsigned int cols) {
-  int i = get_global_id(0);
-  int j = get_global_id(1);
-  if (i < rows && j < cols) {
-    if (i == j) {
-      A(i, j) = 1.0;
-    } else {
-      A(i, j) = 0.0;
-    }
-  }
-};)"
--- a/stan/math/gpu/kernels/matrix_multiply.hpp
+++ b/stan/math/gpu/kernels/matrix_multiply.hpp
+#ifndef STAN_MATH_GPU_KERNELS_MATRIX_MULTIPLY_HPP
+#define STAN_MATH_GPU_KERNELS_MATRIX_MULTIPLY_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char* matrix_multiply_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Matrix multiplication on the GPU
+     *
+     * @param[in] A the left matrix in matrix multiplication
+     * @param[in] B the right matrix in matrix multiplication
+     * @param[out] C the output matrix
+     * @param[in] M Number of rows for matrix A
+     * @param[in] N Number of rows for matrix B
+     * @param[in] K Number of cols for matrix A and number of rows for matrix B
+     */
+    __kernel void matrix_multiply(
+        const __global read_only double* A, const __global read_only double* B,
+        __global write_only double* C, const read_only int M,
+        const read_only int N, const read_only int K) {
+      // thread index inside the thread_block
+      const int thread_block_row = get_local_id(0);
+      const int thread_block_col = get_local_id(1);
+      // global thread index
+      const int i = THREAD_BLOCK_SIZE * get_group_id(0) + thread_block_row;
+      const int j = THREAD_BLOCK_SIZE * get_group_id(1) + thread_block_col;
+
+      // local memory
+      __local double A_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
+      __local double B_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
+
+      double acc[WORK_PER_THREAD];
+      for (int w = 0; w < WORK_PER_THREAD; w++) {
+        acc[w] = 0.0;
+      }
+
+      const int num_tiles = (K + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE;
+      // iterate over all tiles
+      for (int tile_ind = 0; tile_ind < num_tiles; tile_ind++) {
+        // each thread copies WORK_PER_THREAD values to the local
+        // memory
+        for (int w = 0; w < WORK_PER_THREAD; w++) {
+          const int tiled_i = THREAD_BLOCK_SIZE * tile_ind + thread_block_row;
+          const int tiled_j = THREAD_BLOCK_SIZE * tile_ind + thread_block_col;
+
+          A_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
+                 [thread_block_row]
+              = A[(tiled_j + w * THREAD_BLOCK_SIZE_COL) * M + i];
+
+          B_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
+                 [thread_block_row]
+              = B[(j + w * THREAD_BLOCK_SIZE_COL) * K + tiled_i];
+        }
+        // wait until all tile values are loaded to the local memory
+        barrier(CLK_LOCAL_MEM_FENCE);
+        for (int block_ind = 0; block_ind < THREAD_BLOCK_SIZE; block_ind++) {
+          for (int w = 0; w < WORK_PER_THREAD; w++) {
+            acc[w] += A_local[block_ind][thread_block_row]
+                      * B_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
+                               [block_ind];
+          }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+      // save the values
+      for (int w = 0; w < WORK_PER_THREAD; w++) {
+        // each thread saves WORK_PER_THREAD values
+        C[(j + w * THREAD_BLOCK_SIZE_COL) * M + i] = acc[w];
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/matrix_multiply.hpp add() \endlink
+ */
+const local_range_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int, int, int>
+    matrix_multiply("matrix_multiply", matrix_multiply_kernel_code,
+                    {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}});
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/multiply_transpose.hpp
+++ b/stan/math/gpu/kernels/multiply_transpose.hpp
+#ifndef STAN_MATH_GPU_KERNELS_MULTIPLY_TRANSPOSE_HPP
+#define STAN_MATH_GPU_KERNELS_MULTIPLY_TRANSPOSE_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char* multiply_transpose_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Matrix multiplication of the form A*A^T on the GPU
+     *
+     * @param[in] A matrix A
+     * @param[out] B the output matrix
+     * @param[in] M Number of rows for matrix A
+     * @param[in] N Number of cols for matrix A and the number of rows for
+     * matrix A^T
+     */
+    __kernel void multiply_transpose(
+        const __global read_only double* A, __global write_only double* B,
+        const read_only int M, const read_only int N) {
+      // thread index inside the thread block
+      const int thread_block_row = get_local_id(0);
+      const int thread_block_col = get_local_id(1);
+
+      // global thread index
+      const int i = THREAD_BLOCK_SIZE * get_group_id(0) + thread_block_row;
+      const int j = THREAD_BLOCK_SIZE * get_group_id(1) + thread_block_col;
+
+      // indexes that determine the last indexes that need to compute
+      // in order to remove the unnecesary multiplications in the special
+      // multiplication of A*A^T
+      const int jMin = THREAD_BLOCK_SIZE * get_group_id(1);
+      const int iMax = THREAD_BLOCK_SIZE * get_group_id(0) + get_local_size(0);
+
+      // local memory
+      __local double A_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
+      __local double B_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
+
+      double acc[WORK_PER_THREAD];
+      for (int w = 0; w < WORK_PER_THREAD; w++) {
+        acc[w] = 0.0;
+      }
+
+      const int numTiles = (N + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE;
+      // iterate over all tiles
+      for (int tile_ind = 0; tile_ind < numTiles; tile_ind++) {
+        // in each tile
+        const int tiled_i = THREAD_BLOCK_SIZE * tile_ind + thread_block_row;
+        const int tiled_j = THREAD_BLOCK_SIZE * tile_ind + thread_block_col;
+        // if the data needs to be loaded to local memory
+        if (jMin <= iMax) {
+          // each thread copies WORK_PER_THREAD values to the
+          // local memory
+          for (int w = 0; w < WORK_PER_THREAD; w++) {
+            A_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
+                   [thread_block_row]
+                = A[i + (tiled_j + w * THREAD_BLOCK_SIZE_COL) * M];
+            B_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
+                   [thread_block_row]
+                = A[(j + w * THREAD_BLOCK_SIZE_COL) + tiled_i * M];
+          }
+        }
+        // wait till all tile values are loaded to the local memory
+        barrier(CLK_LOCAL_MEM_FENCE);
+        // multiply the tile products
+        for (int block_ind = 0; block_ind < THREAD_BLOCK_SIZE; block_ind++) {
+          // each thread multiplies WORK_PER_THREAD values
+          for (int w = 0; w < WORK_PER_THREAD; w++) {
+            if (jMin <= iMax) {
+              if ((j + w * THREAD_BLOCK_SIZE_COL) <= i) {
+                acc[w] += A_local[block_ind][thread_block_row]
+                          * B_local[thread_block_col
+                                    + w * THREAD_BLOCK_SIZE_COL][block_ind];
+              }
+            }
+          }
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+      }
+      // save the values
+      for (int w = 0; w < WORK_PER_THREAD; w++) {
+        // each thread saves WORK_PER_THREAD values
+        if ((j + w * THREAD_BLOCK_SIZE_COL) <= i) {
+          B[i + (j + w * THREAD_BLOCK_SIZE_COL) * M] = acc[w];
+          B[(j + w * THREAD_BLOCK_SIZE_COL) + i * M] = acc[w];
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/multiply_transpose.hpp add() \endlink
+ */
+const local_range_kernel<cl::Buffer, cl::Buffer, int, int> multiply_transpose(
+    "multiply_transpose", multiply_transpose_kernel_code,
+    {{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 4}});
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/scalar_mul.hpp
+++ b/stan/math/gpu/kernels/scalar_mul.hpp
+#ifndef STAN_MATH_GPU_KERNELS_SCALAR_MUL_HPP
+#define STAN_MATH_GPU_KERNELS_SCALAR_MUL_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char *scalar_mul_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Multiplication of the matrix A with a scalar
+     *
+     * @param[in] A input matrix
+     * @param[in] B output matrix
+     * @param[in] scalar the value with which to multiply A
+     * @param[in] rows the number of rows in A
+     * @param[in] cols the number of columns in A
+     */
+    __kernel void scalar_mul(__global double *A, const __global double *B,
+                             const double scalar, const unsigned int rows,
+                             const unsigned int cols) {
+      int i = get_global_id(0);
+      int j = get_global_id(1);
+      if (i < rows && j < cols) {
+        A(i, j) = B(i, j) * scalar;
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/scalar_mul.hpp add() \endlink
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, double, int, int> scalar_mul(
+    "scalar_mul", scalar_mul_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/scalar_mul_diagonal.hpp
+++ b/stan/math/gpu/kernels/scalar_mul_diagonal.hpp
+#ifndef STAN_MATH_GPU_KERNELS_SCALAR_MUL_DIAGONAL_HPP
+#define STAN_MATH_GPU_KERNELS_SCALAR_MUL_DIAGONAL_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char *scalar_mul_diagonal_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Multiplication of the matrix A diagonal with a scalar
+     *
+     * @param[in, out] A matrix A
+     * @param[in] scalar the value with which to multiply the diagonal of A
+     * @param[in] rows the number of rows in A
+     * @param[in] min_dim the size of the smaller dimension of A
+     */
+    __kernel void scalar_mul_diagonal(__global read_write double *A,
+                                      const read_only double scalar,
+                                      const read_only unsigned int rows,
+                                      const read_only unsigned int min_dim) {
+      int i = get_global_id(0);
+      if (i < min_dim) {
+        A(i, i) *= scalar;
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/scalar_mul_diagonal.hpp add() \endlink
+ */
+const global_range_kernel<cl::Buffer, double, int, int> scalar_mul_diagonal(
+    "scalar_mul_diagonal", scalar_mul_diagonal_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/sub_block.hpp
+++ b/stan/math/gpu/kernels/sub_block.hpp
+#ifndef STAN_MATH_GPU_KERNELS_SUB_BLOCK_HPP
+#define STAN_MATH_GPU_KERNELS_SUB_BLOCK_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char *sub_block_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Copies a submatrix of the source matrix to
+     * the destination matrix. The submatrix to copy
+     * starts at (0, 0)
+     * and is of size size_rows x size_cols.
+     * The submatrix is copied to the
+     * destination matrix starting at
+     * (dst_offset_rows, dst_offset_cols)
+     *
+     * @param[in] src The source matrix.
+     * @param[out] dst The destination submatrix.
+     * @param src_offset_i The offset row in src.
+     * @param src_offset_j The offset column in src.
+     * @param dst_offset_i The offset row in dst.
+     * @param dst_offset_j The offset column in dst.
+     * @param size_i The number of rows in the submatrix.
+     * @param size_j The number of columns in the submatrix.
+     * @param src_rows The number of rows in the source matrix.
+     * @param src_cols The number of cols in the source matrix.
+     * @param src_rows The number of rows in the destination matrix.
+     * @param dst_cols The number of cols in the destination matrix.
+     * @param dst_rows The number of rows in the destination matrix.
+     * @note Code is a <code>const char*</code> held in
+     * <code>sub_block_kernel_code.</code>
+     * Used in math/gpu/copy_submatrix_opencl.hpp.
+     *  This kernel uses the helper macros available in helpers.cl.
+     *
+     */
+    __kernel void sub_block(
+        __global read_only double *src, read_write __global double *dst,
+        read_only unsigned int src_offset_i,
+        read_only unsigned int src_offset_j,
+        read_only unsigned int dst_offset_i,
+        read_only unsigned int dst_offset_j, read_only unsigned int size_i,
+        read_only unsigned int size_j, read_only unsigned int src_rows,
+        read_only unsigned int src_cols, read_only unsigned int dst_rows,
+        read_only unsigned int dst_cols) {
+      int i = get_global_id(0);
+      int j = get_global_id(1);
+      if ((i + src_offset_i) < src_rows && (j + src_offset_j) < src_cols
+          && (i + dst_offset_i) < dst_rows && (j + dst_offset_j) < dst_cols) {
+        dst((dst_offset_i + i), (dst_offset_j + j))
+            = src((src_offset_i + i), (src_offset_j + j));
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/sub_block.hpp sub_block() \endlink
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, int, int, int, int, int, int,
+                          int, int, int, int>
+    sub_block("sub_block", sub_block_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/sub_block_kernel.cl
+++ b/stan/math/gpu/kernels/sub_block_kernel.cl
-R"(
-#ifndef src
-#define src(i,j) src[j * src_rows + i]
-#endif
-#ifndef dst
-#define dst(i,j) dst[j * dst_rows + i]
-#endif
-/**
- * Copies a submatrix of the source matrix to
- * the destination matrix. The submatrix to copy
- * starts at (0, 0)
- * and is of size size_rows x size_cols.
- * The submatrix is copied to the
- * destination matrix starting at
- * (dst_offset_rows, dst_offset_cols)
- *
- * @param[in] src The source matrix.
- * @param[out] dst The destination submatrix.
- * @param src_offset_i The offset row in src.
- * @param src_offset_j The offset column in src.
- * @param dst_offset_i The offset row in dst.
- * @param dst_offset_j The offset column in dst.
- * @param size_i The number of rows in the submatrix.
- * @param size_j The number of columns in the submatrix.
- * @param src_rows The number of rows in the source matrix.
- * @param src_cols The number of cols in the source matrix.
- * @param src_rows The number of rows in the destination matrix.
- * @param dst_cols The number of cols in the destination matrix.
- *
- * @note used in math/gpu/copy_submatrix_opencl.hpp
- */
- __kernel void copy_submatrix(__global double *src, __global double *dst,
-   unsigned int src_offset_i, unsigned int src_offset_j,
-   unsigned int dst_offset_i, unsigned int dst_offset_j,
-   unsigned int size_i, unsigned int size_j,
-   unsigned int src_rows, unsigned int src_cols,
-   unsigned int dst_rows, unsigned int dst_cols) {
-   int i = get_global_id(0);
-   int j = get_global_id(1);
-   if ((i + src_offset_i) < src_rows && (j + src_offset_j) < src_cols &&
-    (i + dst_offset_i) < dst_rows && (j + dst_offset_j) < dst_cols) {
-     dst((dst_offset_i + i), (dst_offset_j + j)) =
-       src((src_offset_i + i),(src_offset_j + j));
-   }
- };)"
--- a/stan/math/gpu/kernels/subtract.hpp
+++ b/stan/math/gpu/kernels/subtract.hpp
+#ifndef STAN_MATH_GPU_KERNELS_SUBTRACT_HPP
+#define STAN_MATH_GPU_KERNELS_SUBTRACT_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char *subtract_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Matrix subtraction on the GPU Subtracts the second matrix
+     * from the first matrix and stores the result in the third matrix (C=A-B).
+     *
+     * @param[out] C The output matrix.
+     * @param[in] B RHS input matrix.
+     * @param[in] A LHS input matrix.
+     * @param rows The number of rows for matrix A.
+     * @param cols The number of columns for matrix A.
+     * @note Code is a <code>const char*</code> held in
+     * <code>subtract_kernel_code.</code>
+     * Used in math/gpu/subtract_opencl.hpp
+     *  This kernel uses the helper macros available in helpers.cl.
+     */
+    __kernel void subtract(
+        __global write_only double *C, __global read_only double *A,
+        __global read_only double *B, read_only unsigned int rows,
+        read_only unsigned int cols) {
+      int i = get_global_id(0);
+      int j = get_global_id(1);
+      if (i < rows && j < cols) {
+        C(i, j) = A(i, j) - B(i, j);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/subtract.hpp subtract() \endlink
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int, int>
+    subtract("subtract", subtract_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/subtract_matrix_kernel.cl
+++ b/stan/math/gpu/kernels/subtract_matrix_kernel.cl
-R"(
-#ifndef A
-#define A(i, j)  A[j * rows + i]
-#endif
-#ifndef B
-#define B(i, j)  B[j * rows + i]
-#endif
-#ifndef C
-#define C(i, j)  C[j * rows + i]
-#endif
-
-/**
- * Matrix subtraction on the GPU Subtracts the second matrix
- * from the first matrix and stores the result in the third matrix (C=A-B).
- *
- * @param[out] C The output matrix.
- * @param[in] B RHS input matrix.
- * @param[in] A LHS input matrix.
- * @param rows The number of rows for matrix A.
- * @param cols The number of columns for matrix A.
- *
- * @note Used in math/gpu/subtract_opencl.hpp
- */
-__kernel void subtract(__global double *C, __global double *A,
-  __global double *B, unsigned int rows, unsigned int cols) {
-  int i = get_global_id(0);
-  int j = get_global_id(1);
-  if (i < rows && j < cols) {
-    C(i, j) = A(i, j) - B(i, j);
-  }
-};)"
--- a/stan/math/gpu/kernels/transpose.hpp
+++ b/stan/math/gpu/kernels/transpose.hpp
+#ifndef STAN_MATH_GPU_KERNELS_TRANSPOSE_HPP
+#define STAN_MATH_GPU_KERNELS_TRANSPOSE_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+const char *transpose_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Takes the transpose of the matrix on the GPU.
+     *
+     * @param[out] B The output matrix to hold transpose of A.
+     * @param[in] A The input matrix to transpose into B.
+     * @param rows The number of rows for A.
+     * @param cols The number of columns for A.
+     * @note Code is a <code>const char*</code> held in
+     * <code>transpose_kernel_code.</code>
+     * This kernel uses the helper macros available in helpers.cl.
+     */
+    __kernel void transpose(
+        __global read_write double *B, __global read_only double *A,
+        read_only unsigned int rows, read_only unsigned int cols) {
+      int i = get_global_id(0);
+      int j = get_global_id(1);
+      if (i < rows && j < cols) {
+        BT(j, i) = A(i, j);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/transpose.hpp transpose() \endlink
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, int, int> transpose(
+    "transpose", transpose_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif