Merge branch 'develop' of https://github.com/stan-dev/math into feature/issue-838-linseq

6adb7122 · Rayleigh L · 44a66ffe · 159f3dfe · 6adb7122 · 6adb7122
Commit 6adb7122 authored 6 years ago by Rayleigh L
Hide whitespace changes
Inline Side-by-side

Showing

with 459 additions and 134 deletions
+459 -134
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -34,7 +34,10 @@ pipeline {
        'Run additional distribution tests on RowVectors (takes 5x as long)',
        name: 'withRowVector')
    }
-    options { skipDefaultCheckout() }
+    options {
+        skipDefaultCheckout()
+        preserveStashes(buildCount: 7)
+    }
    stages {
        stage('Kill previous builds') {
            when {
@@ -126,25 +129,37 @@ pipeline {
                sh "echo CC=${env.CXX} -Werror > make/local"
                sh "make -j${env.PARALLEL} test-headers"
            }
-            post {
-                always {
-                    warnings canRunOnFailed: true, consoleParsers: [[parserName: 'GNU C Compiler 4 (gcc)'], [parserName: 'Clang (LLVM based)']]
-                    deleteDir()
-                }
-            }
+            post { always { deleteDir() } }
        }
-        stage('Linux Unit with MPI') {
-            agent { label 'linux' }
-            steps {
-                deleteDir()
-                unstash 'MathSetup'
-                sh "echo CC=${MPICXX} >> make/local"
-                sh "echo STAN_MPI=true >> make/local"
-                runTests("test/unit")
+        stage('Always-run tests part 1') {
+            parallel {
+                stage('Linux Unit with MPI') {
+                    agent { label 'linux' }
+                    steps {
+                        deleteDir()
+                        unstash 'MathSetup'
+                        sh "echo CC=${MPICXX} >> make/local"
+                        sh "echo STAN_MPI=true >> make/local"
+                        runTests("test/unit")
+                    }
+                    post { always { retry(3) { deleteDir() } } }
+                }
+                stage('GPU Tests') {
+                    agent { label "gpu" }
+                    steps {
+                        deleteDir()
+                        unstash 'MathSetup'
+                        sh "echo CC=${env.CXX} -Werror > make/local"
+                        sh "echo STAN_OPENCL=true>> make/local"
+                        sh "echo OPENCL_PLATFORM_ID=0>> make/local"
+                        sh "echo OPENCL_DEVICE_ID=1>> make/local"
+                        runTests("test/unit/math/gpu")
+                    }
+                    post { always { retry(3) { deleteDir() } } }
+                }
            }
-            post { always { retry(3) { deleteDir() } } }
        }
-        stage('Always-run tests') {
+        stage('Always-run tests part 2') {
            parallel {
                stage('Distribution tests') {
                    agent { label "distribution-tests" }
@@ -170,17 +185,17 @@ pipeline {
                        }
                        failure {
                            echo "Distribution tests failed. Check out dist.log.zip artifact for test logs."
-                        }
+                            }
                    }
                }
-                stage('Mac Unit with Threading') {
-                    agent  { label 'osx' }
+                stage('Threading tests') {
+                    agent any
                    steps {
                        deleteDir()
                        unstash 'MathSetup'
                        sh "echo CC=${env.CXX} -Werror > make/local"
                        sh "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
-                        runTests("test/unit")
+                        runTests("test/unit -f thread")
                    }
                    post { always { retry(3) { deleteDir() } } }
                }
@@ -197,7 +212,7 @@ pipeline {
                        sh "echo CC=${env.CXX} -Werror > make/local"
                        sh "echo STAN_OPENCL=true>> make/local"
                        sh "echo OPENCL_PLATFORM_ID=0>> make/local"
-                        sh "echo OPENCL_DEVICE_ID=0>> make/local"
+                        sh "echo OPENCL_DEVICE_ID=1>> make/local"
                        runTests("test/unit")
                    }
                    post { always { retry(3) { deleteDir() } } }
@@ -213,6 +228,17 @@ pipeline {
                    }
                    post { always { retry(3) { deleteDir() } } }
                }
+                stage('Mac Unit with Threading') {
+                    agent  { label 'osx' }
+                    steps {
+                        deleteDir()
+                        unstash 'MathSetup'
+                        sh "echo CC=${env.CXX} -Werror > make/local"
+                        sh "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
+                        runTests("test/unit")
+                    }
+                    post { always { retry(3) { deleteDir() } } }
+                }
            }
        }
        stage('Upstream tests') {
@@ -252,7 +278,7 @@ pipeline {
    post {
        always {
            node("osx || linux") {
-                warnings canRunOnFailed: true, consoleParsers: [[parserName: 'GNU C Compiler 4 (gcc)'], [parserName: 'Clang (LLVM based)']]
+                warnings canRunOnFailed: true, consoleParsers: [[parserName: 'Clang (LLVM based)']]
            }
        }
        success {

--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ If this is in the file `/path/to/foo/foo.cpp`, then you can compile and run this

 ```
 > cd /path/to/foo
-> clang++ -std=c++11 -I /path/to/stan-math -I /path/to/Eigen -I /path/to/boost -I /path/to/sundials foo.cpp
+> clang++ -std=c++1y -I /path/to/stan-math -I /path/to/Eigen -I /path/to/boost -I /path/to/sundials foo.cpp
 > ./a.out
 log normal(1 | 2, 3)=-2.07311
 ```
@@ -54,7 +54,7 @@ The `-I` includes provide paths pointing to the four necessary includes:
 Note that the paths should *not* include the final directories `stan`, `Eigen`, or `boost` on the paths.  An example of a real instantiation:

 ```
-clang++ -std=c++11 -I ~/stan-dev/math -I ~/stan-dev/math/lib/eigen_3.3.3/ -I ~/stan-dev/math/lib/boost_1.66.0/ -I ~/stan-dev/math/lib/sundials_3.1.0/include foo.cpp
+clang++ -std=c++1y -I ~/stan-dev/math -I ~/stan-dev/math/lib/eigen_3.3.3/ -I ~/stan-dev/math/lib/boost_1.66.0/ -I ~/stan-dev/math/lib/sundials_3.1.0/include foo.cpp
 ```

 The following directories all exist below the links given to `-I`: `~/stan-dev/math/stan` and `~/stan-dev/math/lib/eigen_3.3.3/Eigen` and `~/stan-dev/math/lib/boost_1.66.0/boost` and `~/stan-dev/math/lib/sundials_3.1.0/include`.

--- a/make/libraries
+++ b/make/libraries
@@ -68,12 +68,16 @@ $(BOOST)/user-config.jam:
 	echo "# In case of a compiler mismatch used by mpicxx and" >> $(BOOST)/user-config.jam
 	echo "# the compiler used for Stan, consider configuring" >> $(BOOST)/user-config.jam
 	echo "# the boost toolset here" >> $(BOOST)/user-config.jam
+	echo "# Moreover, should your mpicxx command live in a" >> $(BOOST)/user-config.jam
+	echo "# in a non-standard directory, then consider to tell" >> $(BOOST)/user-config.jam
+	echo "# boost mpi using this syntax:" >> $(BOOST)/user-config.jam
+	echo "#using mpi : /path/to/mpicxx ;" >> $(BOOST)/user-config.jam
 	echo "using mpi ;" >> $(BOOST)/user-config.jam

 $(BOOST_LIB)/mpi.so: $(BOOST)/user-config.jam
 	@mkdir -p $(dir $@)
 	cd $(BOOST); ./bootstrap.sh
-	cd $(BOOST); ./b2 --user-config=user-config.jam --layout=system --with-mpi --with-serialization -j$(BOOST_PARALLEL_BUILD) variant=release link=shared threading=multi runtime-link=shared
+	cd $(BOOST); ./b2 --user-config=user-config.jam --layout=system --with-mpi --with-serialization -j$(BOOST_PARALLEL_BUILD) variant=release link=shared threading=multi runtime-link=shared hardcode-dll-paths=true dll-path="$(BOOST_LIB_ABS)"

 $(BOOST_LIB)/libboost_serialization.so: $(BOOST_LIB)/mpi.so


--- a/make/setup_mpi
+++ b/make/setup_mpi
@@ -9,5 +9,5 @@
 ifdef STAN_MPI
  LIBMPI = $(BOOST_LIB)/libboost_serialization$(DLL) $(BOOST_LIB)/libboost_mpi$(DLL) $(MATH)bin/math/prim/arr/functor/mpi_cluster_inst.o
  CXXFLAGS_MPI = -DSTAN_MPI
-  LDFLAGS_MPI ?= -Wl,-lboost_mpi -Wl,-lboost_serialization -Wl,-L,"$(BOOST_LIB_ABS)" -Wl,-rpath,"$(BOOST_LIB_ABS)"
+  LDFLAGS_MPI ?= -Wl,-L,"$(BOOST_LIB_ABS)" -Wl,-rpath,"$(BOOST_LIB_ABS)"
 endif
--- a/stan/math/fwd/scal/meta/operands_and_partials.hpp
+++ b/stan/math/fwd/scal/meta/operands_and_partials.hpp
@@ -46,6 +46,13 @@ class ops_partials_edge<Dx, fvar<Dx> > {
 * This is the specialization for when the return type is fvar,
 * which should be for forward mode and all higher-order cases.
 *
+ * NB: since ops_partials_edge.partials_ and ops_partials_edge.partials_vec
+ * are sometimes represented internally as a broadcast_array, we need to take
+ * care with assignments to them. Indeed, we can assign any right hand side
+ * which allows for indexing to a broadcast_array. The resulting behaviour is
+ * that the entry for the first index is what gets assigned. The most common
+ * use-case should be where the rhs is some container of length 1.
+ *
 * @tparam Op1 type of the first operand
 * @tparam Op2 type of the second operand
 * @tparam Op3 type of the third operand

--- a/stan/math/gpu/add.hpp
+++ b/stan/math/gpu/add.hpp
@@ -2,6 +2,7 @@
 #define STAN_MATH_GPU_ADD_HPP
 #ifdef STAN_OPENCL
 #include <stan/math/gpu/matrix_gpu.hpp>
+#include <stan/math/gpu/kernels/add.hpp>
 #include <stan/math/gpu/err/check_matching_dims.hpp>
 #include <CL/cl.hpp>

@@ -26,14 +27,10 @@ inline matrix_gpu add(const matrix_gpu& A, const matrix_gpu& B) {
  if (C.size() == 0) {
    return C;
  }
-  cl::Kernel kernel = opencl_context.get_kernel("add");
  cl::CommandQueue cmdQueue = opencl_context.queue();
  try {
-    opencl_context.set_kernel_args(kernel, C.buffer(), A.buffer(), B.buffer(),
-                                   A.rows(), A.cols());
-    cmdQueue.enqueueNDRangeKernel(kernel, cl::NullRange,
-                                  cl::NDRange(A.rows(), A.cols()),
-                                  cl::NullRange, NULL, NULL);
+    opencl_kernels::add(cl::NDRange(A.rows(), A.cols()), C.buffer(), A.buffer(),
+                        B.buffer(), A.rows(), A.cols());
  } catch (const cl::Error& e) {
    check_opencl_error("add", e);
  }

--- a/stan/math/gpu/constants.hpp
+++ b/stan/math/gpu/constants.hpp
+#ifndef STAN_MATH_GPU_CONSTANTS_HPP
+#define STAN_MATH_GPU_CONSTANTS_HPP
+#ifdef STAN_OPENCL
+namespace stan {
+namespace math {
+/**
+ * Selects which part of a matrix a GPU routine should act on.
+ * Used as the template argument of copy_triangular(), where Lower and Upper
+ * pick the lower and upper triangular parts respectively.
+ */
+enum class TriangularViewGPU {
+  Lower = 0,
+  Upper = 1,
+  Entire = 2
+};
+/**
+ * Selects the direction of a triangular mapping.
+ * NOTE(review): the consumers of this enum are not visible here; presumably
+ * it names which triangle is copied onto the other - confirm at the call site.
+ */
+enum class TriangularMapGPU {
+  UpperToLower = 0,
+  LowerToUpper = 1
+};
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/prim/mat/fun/opencl_copy.hpp
+++ b/stan/math/prim/mat/fun/opencl_copy.hpp
@@ -3,7 +3,9 @@
 #ifdef STAN_OPENCL

 #include <stan/math/gpu/opencl_context.hpp>
+#include <stan/math/gpu/kernel_cl.hpp>
 #include <stan/math/gpu/matrix_gpu.hpp>
+#include <stan/math/gpu/kernels/copy.hpp>
 #include <stan/math/prim/mat/fun/Eigen.hpp>
 #include <stan/math/prim/scal/err/check_size_match.hpp>
 #include <CL/cl.hpp>
@@ -110,15 +112,8 @@ inline void copy(matrix_gpu& dst, const matrix_gpu& src) {
       * see the matrix_gpu(matrix_gpu&) constructor
       *  for explanation
       */
-      cl::CommandQueue& cmdQueue = opencl_context.queue();
-      cl::Kernel kernel = opencl_context.get_kernel("copy");
-      kernel.setArg(0, src.buffer());
-      kernel.setArg(1, dst.buffer());
-      kernel.setArg(2, dst.rows());
-      kernel.setArg(3, dst.cols());
-      cmdQueue.enqueueNDRangeKernel(kernel, cl::NullRange,
-                                    cl::NDRange(dst.rows(), dst.cols()),
-                                    cl::NullRange, NULL, NULL);
+      opencl_kernels::copy(cl::NDRange(dst.rows(), dst.cols()), src.buffer(),
+                           dst.buffer(), dst.rows(), dst.cols());
    } catch (const cl::Error& e) {
      std::cout << e.err() << std::endl;
      check_opencl_error("copy GPU->GPU", e);

--- a/stan/math/gpu/copy_triangular.hpp
+++ b/stan/math/gpu/copy_triangular.hpp
 #ifndef STAN_MATH_GPU_COPY_TRIANGULAR_HPP
 #define STAN_MATH_GPU_COPY_TRIANGULAR_HPP
 #ifdef STAN_OPENCL
+#include <stan/math/gpu/constants.hpp>
 #include <stan/math/gpu/matrix_gpu.hpp>
+#include <stan/math/gpu/copy.hpp>
+#include <stan/math/gpu/kernels/copy_triangular.hpp>
 #include <CL/cl.hpp>

 namespace stan {
@@ -16,27 +19,24 @@ namespace math {
 * @param src the source matrix
 * @tparam triangular_map int to describe
 * which part of the matrix to copy:
- * Lower - copies the lower triangular
- * Upper - copes the upper triangular
+ * TriangularViewGPU::Lower - copies the lower triangular
+ * TriangularViewGPU::Upper - copies the upper triangular
 *
 * @return the matrix with the copied content
 *
 */
-template <int triangular_map>
+template <TriangularViewGPU triangular_view = TriangularViewGPU::Entire>
 inline matrix_gpu copy_triangular(const matrix_gpu& src) {
  if (src.size() == 0 || src.size() == 1) {
    matrix_gpu dst(src);
    return dst;
  }
  matrix_gpu dst(src.rows(), src.cols());
-  cl::Kernel kernel = opencl_context.get_kernel("copy_triangular");
  cl::CommandQueue cmdQueue = opencl_context.queue();
  try {
-    opencl_context.set_kernel_args(kernel, dst.buffer(), src.buffer(),
-                                   dst.rows(), dst.cols(), triangular_map);
-    cmdQueue.enqueueNDRangeKernel(kernel, cl::NullRange,
-                                  cl::NDRange(dst.rows(), dst.cols()),
-                                  cl::NullRange, NULL, NULL);
+    opencl_kernels::copy_triangular(cl::NDRange(dst.rows(), dst.cols()),
+                                    dst.buffer(), src.buffer(), dst.rows(),
+                                    dst.cols(), triangular_view);
  } catch (const cl::Error& e) {
    check_opencl_error("copy_triangular", e);
  }

--- a/stan/math/gpu/diagonal_multiply.hpp
+++ b/stan/math/gpu/diagonal_multiply.hpp
+#ifndef STAN_MATH_GPU_DIAGONAL_MULTIPLY_HPP
+#define STAN_MATH_GPU_DIAGONAL_MULTIPLY_HPP
+#ifdef STAN_OPENCL
+#include <stan/math/gpu/matrix_gpu.hpp>
+#include <stan/math/gpu/kernels/scalar_mul_diagonal.hpp>
+#include <Eigen/Dense>
+
+namespace stan {
+namespace math {
+/**
+ * Returns a copy of a GPU matrix whose diagonal entries have been
+ * multiplied by a scalar.
+ *
+ * The input is never modified; the kernel runs on the copy. An empty
+ * matrix is returned unchanged without launching the kernel.
+ *
+ * @param A input matrix
+ * @param scalar scalar
+ * @return copy of the input matrix with the diagonal multiplied by scalar
+ */
+inline matrix_gpu diagonal_multiply(const matrix_gpu& A, const double scalar) {
+  matrix_gpu result(A);
+  if (result.size() != 0) {
+    // A rectangular matrix has only min(rows, cols) diagonal entries.
+    const int diag_len
+        = (result.cols() < result.rows()) ? result.cols() : result.rows();
+    try {
+      // One work item per diagonal entry.
+      opencl_kernels::scalar_mul_diagonal(cl::NDRange(diag_len),
+                                          result.buffer(), scalar,
+                                          result.rows(), diag_len);
+    } catch (const cl::Error& e) {
+      check_opencl_error("diagonal_multiply", e);
+    }
+  }
+  return result;
+}
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
--- a/stan/math/gpu/err/check_diagonal_zeros.hpp
+++ b/stan/math/gpu/err/check_diagonal_zeros.hpp
@@ -2,6 +2,7 @@
 #define STAN_MATH_GPU_ERR_CHECK_DIAGONAL_ZEROS_HPP
 #ifdef STAN_OPENCL
 #include <stan/math/gpu/matrix_gpu.hpp>
+#include <stan/math/gpu/kernels/check_diagonal_zeros.hpp>
 #include <stan/math/prim/scal/err/domain_error.hpp>

 namespace stan {
@@ -20,9 +21,6 @@ inline void check_diagonal_zeros(const char* function, const char* name,
                                 const matrix_gpu& y) {
  if (y.size() == 0)
    return;
-
-  cl::Kernel kernel_check_diagonal_zeros
-      = opencl_context.get_kernel("is_zero_on_diagonal");
  cl::CommandQueue cmd_queue = opencl_context.queue();
  cl::Context ctx = opencl_context.context();
  try {
@@ -30,13 +28,9 @@ inline void check_diagonal_zeros(const char* function, const char* name,
    cl::Buffer buffer_flag(ctx, CL_MEM_READ_WRITE, sizeof(int));
    cmd_queue.enqueueWriteBuffer(buffer_flag, CL_TRUE, 0, sizeof(int),
                                 &zero_on_diagonal_flag);
-    opencl_context.set_kernel_args(kernel_check_diagonal_zeros, y.buffer(),
-                                   y.rows(), y.cols(), buffer_flag);
-
-    cmd_queue.enqueueNDRangeKernel(kernel_check_diagonal_zeros, cl::NullRange,
-                                   cl::NDRange(y.rows(), y.cols()),
-                                   cl::NullRange);
-
+    opencl_kernels::check_diagonal_zeros(cl::NDRange(y.rows(), y.cols()),
+                                         y.buffer(), buffer_flag, y.rows(),
+                                         y.cols());
    cmd_queue.enqueueReadBuffer(buffer_flag, CL_TRUE, 0, sizeof(int),
                                &zero_on_diagonal_flag);
    //  if zeros were found on the diagonal

--- a/stan/math/gpu/err/check_nan.hpp
+++ b/stan/math/gpu/err/check_nan.hpp
@@ -2,6 +2,7 @@
 #define STAN_MATH_GPU_ERR_CHECK_NAN_HPP
 #ifdef STAN_OPENCL
 #include <stan/math/gpu/matrix_gpu.hpp>
+#include <stan/math/gpu/kernels/check_nan.hpp>
 #include <stan/math/prim/scal/err/domain_error.hpp>

 namespace stan {
@@ -21,7 +22,6 @@ inline void check_nan(const char* function, const char* name,
  if (y.size() == 0)
    return;

-  cl::Kernel kernel_check_nan = opencl_context.get_kernel("is_nan");
  cl::CommandQueue cmd_queue = opencl_context.queue();
  cl::Context& ctx = opencl_context.context();
  try {
@@ -29,13 +29,8 @@ inline void check_nan(const char* function, const char* name,
    cl::Buffer buffer_nan_flag(ctx, CL_MEM_READ_WRITE, sizeof(int));
    cmd_queue.enqueueWriteBuffer(buffer_nan_flag, CL_TRUE, 0, sizeof(int),
                                 &nan_flag);
-    opencl_context.set_kernel_args(kernel_check_nan, y.buffer(), y.rows(),
-                                   y.cols(), buffer_nan_flag);
-
-    cmd_queue.enqueueNDRangeKernel(kernel_check_nan, cl::NullRange,
-                                   cl::NDRange(y.rows(), y.cols()),
-                                   cl::NullRange);
-
+    opencl_kernels::check_nan(cl::NDRange(y.rows(), y.cols()), y.buffer(),
+                              buffer_nan_flag, y.rows(), y.cols());
    cmd_queue.enqueueReadBuffer(buffer_nan_flag, CL_TRUE, 0, sizeof(int),
                                &nan_flag);
    //  if NaN values were found in the matrix

--- a/stan/math/gpu/err/check_symmetric.hpp
+++ b/stan/math/gpu/err/check_symmetric.hpp
@@ -3,6 +3,7 @@
 #ifdef STAN_OPENCL
 #include <stan/math/gpu/matrix_gpu.hpp>
 #include <stan/math/prim/scal/err/domain_error.hpp>
+#include <stan/math/gpu/kernels/check_symmetric.hpp>

 namespace stan {
 namespace math {
@@ -21,7 +22,6 @@ inline void check_symmetric(const char* function, const char* name,
  if (y.size() == 0)
    return;
  check_square(function, name, y);
-  cl::Kernel kernel_check_symmetric = opencl_context.get_kernel("is_symmetric");
  cl::CommandQueue cmd_queue = opencl_context.queue();
  cl::Context& ctx = opencl_context.context();
  try {
@@ -29,14 +29,9 @@ inline void check_symmetric(const char* function, const char* name,
    cl::Buffer buffer_symmetric_flag(ctx, CL_MEM_READ_WRITE, sizeof(int));
    cmd_queue.enqueueWriteBuffer(buffer_symmetric_flag, CL_TRUE, 0, sizeof(int),
                                 &symmetric_flag);
-    opencl_context.set_kernel_args(kernel_check_symmetric, y.buffer(), y.rows(),
-                                   y.cols(), buffer_symmetric_flag,
-                                   math::CONSTRAINT_TOLERANCE);
-
-    cmd_queue.enqueueNDRangeKernel(kernel_check_symmetric, cl::NullRange,
-                                   cl::NDRange(y.rows(), y.cols()),
-                                   cl::NullRange);
-
+    opencl_kernels::check_symmetric(cl::NDRange(y.rows(), y.cols()), y.buffer(),
+                                    buffer_symmetric_flag, y.rows(), y.cols(),
+                                    math::CONSTRAINT_TOLERANCE);
    cmd_queue.enqueueReadBuffer(buffer_symmetric_flag, CL_TRUE, 0, sizeof(int),
                                &symmetric_flag);
    //  if the matrix is not symmetric

--- a/stan/math/gpu/identity.hpp
+++ b/stan/math/gpu/identity.hpp
@@ -2,6 +2,7 @@
 #define STAN_MATH_GPU_IDENTITY_HPP
 #ifdef STAN_OPENCL
 #include <stan/math/gpu/matrix_gpu.hpp>
+#include <stan/math/gpu/kernels/identity.hpp>
 #include <CL/cl.hpp>

 namespace stan {
@@ -20,14 +21,11 @@ inline matrix_gpu identity(int rows_cols) {
  if (rows_cols == 0) {
    return A;
  }
-  cl::Kernel kernel = opencl_context.get_kernel("identity");
  cl::CommandQueue cmdQueue = opencl_context.queue();

  try {
-    opencl_context.set_kernel_args(kernel, A.buffer(), A.rows(), A.cols());
-    cmdQueue.enqueueNDRangeKernel(kernel, cl::NullRange,
-                                  cl::NDRange(A.rows(), A.cols()),
-                                  cl::NullRange, NULL, NULL);
+    opencl_kernels::identity(cl::NDRange(A.rows(), A.cols()), A.buffer(),
+                             A.rows(), A.cols());
  } catch (const cl::Error& e) {
    check_opencl_error("identity", e);
  }

--- a/stan/math/gpu/kernel_cl.hpp
+++ b/stan/math/gpu/kernel_cl.hpp
+#ifndef STAN_MATH_GPU_KERNEL_CL_HPP
+#define STAN_MATH_GPU_KERNEL_CL_HPP
+#ifdef STAN_OPENCL
+#include <stan/math/gpu/opencl_context.hpp>
+#include <stan/math/gpu/kernels/helpers.hpp>
+#include <CL/cl.hpp>
+#include <string>
+#include <algorithm>
+#include <map>
+#include <vector>
+
+// Used for importing the opencl kernels at compile time.
+// There has been much discussion about the best ways to do this:
+// https://github.com/bstatcomp/math/pull/7
+// and https://github.com/stan-dev/math/pull/966
+#ifndef STRINGIFY
+#define STRINGIFY(src) #src
+#endif
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+
+/**
+ * Compile an OpenCL kernel.
+ *
+ * The kernel source is prefixed with the contents of
+ * <code>opencl_kernels::helpers</code> and built for the devices of the
+ * global <code>opencl_context</code>. Every entry of <code>options</code> is
+ * passed to the OpenCL compiler as a <code>-D&lt;key&gt;=&lt;value&gt;</code>
+ * macro definition.
+ *
+ * Declared <code>inline</code> because this definition lives in a header that
+ * is included from several translation units.
+ *
+ * @param name The name for the kernel
+ * @param source A string literal containing the code for the kernel.
+ * @param options The values of macros to be passed at compile time.
+ * @return The compiled <code>cl::Kernel</code> named <code>name</code>.
+ * @note The macros defined in kernels/helpers.hpp are included in the kernel
+ *  compilation for ease of writing and reading kernels.
+ * @note On a build failure the program build log is forwarded to
+ *  <code>system_error</code>; any other OpenCL error is forwarded to
+ *  <code>check_opencl_error</code>.
+ */
+inline auto compile_kernel(const char* name, const char* source,
+                           std::map<const char*, int> options) {
+  // Turn every (macro, value) pair into a " -Dmacro=value" compiler flag.
+  std::string kernel_opts = "";
+  for (auto&& comp_opts : options) {
+    kernel_opts += std::string(" -D") + comp_opts.first + "="
+                   + std::to_string(comp_opts.second);
+  }
+  // Helper macros come first so that the kernel body can use them.
+  std::string kernel_source(opencl_kernels::helpers);
+  kernel_source.append(source);
+  // Declared outside the try block so the build log can be queried below.
+  cl::Program program;
+  try {
+    cl::Program::Sources src(
+        1, std::make_pair(kernel_source.c_str(), kernel_source.size()));
+    program = cl::Program(opencl_context.context(), src);
+    program.build({opencl_context.device()}, kernel_opts.c_str());
+
+    return cl::Kernel(program, name);
+  } catch (const cl::Error& e) {
+    // A failed build is reported together with the compiler's build log.
+    if (e.err() == CL_BUILD_PROGRAM_FAILURE) {
+      std::string buildlog = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(
+          opencl_context.device()[0]);
+      system_error("compile_kernel", name, e.err(), buildlog.c_str());
+    } else {
+      check_opencl_error(name, e);
+    }
+  }
+  // Not expected to be reached: the error handlers above are expected to
+  // throw. Kept so that every path yields a cl::Kernel for return deduction.
+  return cl::Kernel();
+}
+
+/**
+ * Holds one compiled OpenCL kernel together with the macro options it was
+ * compiled with, and hands out callable wrappers around that kernel.
+ *
+ * @tparam Args Parameter pack of all kernel argument types.
+ */
+template <typename... Args>
+class kernel_functor {
+ private:
+  cl::Kernel kernel_;
+  std::map<const char*, int> opts_;
+
+ public:
+  /**
+   * Compiles the kernel on construction.
+   *
+   * The options supplied by the caller are merged with
+   * <code>opencl_context.base_opts()</code>. The merge uses
+   * <code>std::map::insert</code>, so a key that is already present in
+   * <code>options</code> keeps the caller's value.
+   *
+   * @param name The name for the kernel.
+   * @param source A string literal containing the code for the kernel.
+   * @param options The values of macros to be passed at compile time.
+   */
+  kernel_functor(const char* name, const char* source,
+                 std::map<const char*, int> options) {
+    const auto context_defaults = opencl_context.base_opts();
+    options.insert(context_defaults.begin(), context_defaults.end());
+    opts_ = options;
+    kernel_ = compile_kernel(name, source, opts_);
+  }
+
+  /**
+   * @return A <code>cl::make_kernel</code> wrapper around the compiled kernel.
+   */
+  auto operator()() const {
+    return cl::make_kernel<Args...>(kernel_);
+  }
+
+  /**
+   * @return The options that the kernel was compiled with.
+   */
+  const std::map<const char*, int>& get_opts() const {
+    return opts_;
+  }
+};
+
+/**
+ * Kernel launcher for kernels that are configured by the global work size
+ * only.
+ *
+ * @tparam Args Parameter pack of all kernel argument types.
+ */
+template <typename... Args>
+struct global_range_kernel {
+  const kernel_functor<Args...> make_functor;
+  /**
+   * Compiles the kernel and stores it for later launches.
+   *
+   * @param name The name for the kernel
+   * @param source A string literal containing the code for the kernel.
+   * @param options The values of macros to be passed at compile time.
+   */
+  global_range_kernel(const char* name, const char* source,
+                      const std::map<const char*, int> options = {})
+      : make_functor(name, source, options) {}
+  /**
+   * Enqueues the kernel on the queue of <code>opencl_context</code> and waits
+   * on the event returned by the launch.
+   *
+   * @param global_thread_size The global work size.
+   * @param args The arguments to pass to the kernel.
+   */
+  auto operator()(cl::NDRange global_thread_size, Args... args) const {
+    cl::EnqueueArgs launch_cfg(opencl_context.queue(), global_thread_size);
+    auto launch = make_functor();
+    launch(launch_cfg, args...).wait();
+  }
+};
+/**
+ * Kernel launcher for kernels that are configured by both the global work
+ * size and the local (thread block) work size.
+ *
+ * @tparam Args Parameter pack of all kernel argument types.
+ */
+template <typename... Args>
+struct local_range_kernel {
+  const kernel_functor<Args...> make_functor;
+  /**
+   * Compiles a kernel that needs the global thread size and the thread
+   * block size at launch, and stores it for later launches.
+   *
+   * @param name The name for the kernel
+   * @param source A string literal containing the code for the kernel.
+   * @param options The values of macros to be passed at compile time.
+   */
+  local_range_kernel(const char* name, const char* source,
+                     const std::map<const char*, int> options = {})
+      : make_functor(name, source, options) {}
+  /**
+   * Enqueues the kernel on the queue of <code>opencl_context</code> and waits
+   * on the event returned by the launch.
+   *
+   * @param global_thread_size The global work size.
+   * @param thread_block_size The thread block size.
+   * @param args The arguments to pass to the kernel.
+   */
+  auto operator()(cl::NDRange global_thread_size, cl::NDRange thread_block_size,
+                  Args... args) const {
+    cl::EnqueueArgs launch_cfg(opencl_context.queue(), global_thread_size,
+                               thread_block_size);
+    auto launch = make_functor();
+    launch(launch_cfg, args...).wait();
+  }
+};
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
--- a/stan/math/gpu/kernels/add.hpp
+++ b/stan/math/gpu/kernels/add.hpp
+#ifndef STAN_MATH_GPU_KERNELS_ADD_HPP
+#define STAN_MATH_GPU_KERNELS_ADD_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+// The pointer is defined in a header, so it gets internal linkage (static)
+// to avoid duplicate symbols when several translation units include it.
+static const char *add_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Matrix addition on the GPU
+     *
+     * Each work item (i, j) inside the matrix bounds writes
+     * C(i, j) = A(i, j) + B(i, j); work items outside the bounds do nothing.
+     *
+     * @param[out] C Output matrix.
+     * @param[in] A LHS of matrix addition.
+     * @param[in] B RHS of matrix addition.
+     * @param rows Number of rows for matrix A.
+     * @param cols Number of cols for matrix A.
+     * @note Code is a <code>const char*</code> held in
+     * <code>add_kernel_code.</code>
+     * This kernel uses the helper macros available in kernels/helpers.hpp.
+     */
+    __kernel void add(__global write_only double *C,
+                      __global read_only double *A,
+                      __global read_only double *B, read_only unsigned int rows,
+                      read_only unsigned int cols) {
+      int i = get_global_id(0);
+      int j = get_global_id(1);
+      if (i < rows && j < cols) {
+        C(i, j) = A(i, j) + B(i, j);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/add.hpp add() \endlink
+ *
+ * Launcher for the "add" kernel compiled from <code>add_kernel_code</code>.
+ * It is called with the global work size followed by the kernel arguments
+ * in the order of the kernel signature: the output buffer C, the input
+ * buffers A and B, then the number of rows and the number of columns.
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int, int> add(
+    "add", add_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/add_matrix_kernel.cl
+++ b/stan/math/gpu/kernels/add_matrix_kernel.cl
-R"(
-#ifndef A
-#define A(i, j)  A[j * rows + i]
-#endif
-#ifndef B
-#define B(i, j)  B[j * rows + i]
-#endif
-#ifndef C
-#define C(i, j)  C[j * rows + i]
-#endif
-/**
- * Matrix addition on the GPU
- *
- * @param[out] C Output matrix.
- * @param[in] A LHS of matrix addition.
- * @param[in] B RHS of matrix addition.
- * @param rows Number of rows for matrix A.
- * @param cols Number of rows for matrix B.
- *
- */
-__kernel void add(__global double *C, __global double *A, __global double *B,
-  unsigned int rows, unsigned int cols) {
-  int i = get_global_id(0);
-  int j = get_global_id(1);
-  if (i < rows && j < cols) {
-    C(i, j) = A(i, j) + B(i, j);
-  }
-};)"
--- a/stan/math/gpu/kernels/check_diagonal_zeros.hpp
+++ b/stan/math/gpu/kernels/check_diagonal_zeros.hpp
+#ifndef STAN_MATH_GPU_KERNELS_CHECK_DIAGONAL_ZEROS_HPP
+#define STAN_MATH_GPU_KERNELS_CHECK_DIAGONAL_ZEROS_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+// The pointer is defined in a header, so it gets internal linkage (static)
+// to avoid duplicate symbols when several translation units include it.
+static const char *is_zero_on_diagonal_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Check if the <code>matrix_gpu</code> has zeros on the diagonal
+     *
+     * Only the first global id is used: work item i inspects A(i, i) and
+     * sets the flag to 1 when that entry equals zero. The flag is never
+     * reset by the kernel, so the caller must initialise it.
+     *
+     * @param[in] A Matrix to check.
+     * @param[out] flag the flag to be written to if any diagonal is zero.
+     * @param rows The number of rows for A.
+     * @param cols The number of cols of A.
+     * @note Code is a <code>const char*</code> held in
+     * <code>is_zero_on_diagonal_kernel_code.</code>
+     * Kernel for stan/math/gpu/err/check_diagonal_zeros.hpp.
+     * This kernel uses the helper macros available in kernels/helpers.hpp.
+     */
+    __kernel void is_zero_on_diagonal(
+        __global read_only double *A, __global int *flag,
+        read_only unsigned int rows, read_only unsigned int cols) {
+      const int i = get_global_id(0);
+      if (i < rows && i < cols) {
+        if (A(i, i) == 0) {
+          flag[0] = 1;
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/check_diagonal_zeros.hpp
+ * check_diagonal_zeros() \endlink
+ *
+ * Launcher for the "is_zero_on_diagonal" kernel compiled from
+ * <code>is_zero_on_diagonal_kernel_code</code>. It is called with the global
+ * work size followed by the kernel arguments in the order of the kernel
+ * signature: the matrix buffer, the flag buffer, then the number of rows
+ * and the number of columns.
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, int, int>
+    check_diagonal_zeros("is_zero_on_diagonal",
+                         is_zero_on_diagonal_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
--- a/stan/math/gpu/kernels/check_diagonal_zeros_kernel.cl
+++ b/stan/math/gpu/kernels/check_diagonal_zeros_kernel.cl
-R"(
-#ifndef A
-#define A(i, j)  A[j * rows + i]
-#endif
-/**
- * Check if the <code>matrix_gpu</code> has zeros on the diagonal
- *
- * @param[in] A Matrix to check.
- * @param rows The number of rows for A.
- * @param cols The number of cols of A.
- * @param[out] flag the flag to be written to if any diagonal is zero.
- *
- * @note Kernel for stan/math/gpu/err/check_diagonal_zeros.hpp
- */
-__kernel void is_zero_on_diagonal(__global double *A, int rows, int cols,
-  __global int *flag) {
-  const int i = get_global_id(0);
-  if (i < rows && i < cols) {
-    if (A(i, i) == 0) {
-      flag[0] = 1;
-    }
-  }
-};)"
--- a/stan/math/gpu/kernels/check_nan.hpp
+++ b/stan/math/gpu/kernels/check_nan.hpp
+#ifndef STAN_MATH_GPU_KERNELS_CHECK_NAN_HPP
+#define STAN_MATH_GPU_KERNELS_CHECK_NAN_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/gpu/kernel_cl.hpp>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+// The pointer is defined in a header, so it gets internal linkage (static)
+// to avoid duplicate symbols when several translation units include it.
+static const char *is_nan_kernel_code = STRINGIFY(
+    // \endcond
+    /**
+     * Check if the <code>matrix_gpu</code> has NaN values
+     *
+     * Each work item (i, j) inside the matrix bounds sets the flag to 1
+     * when A(i, j) is NaN. The flag is never reset by the kernel, so the
+     * caller must initialise it.
+     *
+     * @param[in] A The matrix to check.
+     * @param[out] flag the flag to be written to if any element is NaN.
+     * @param rows The number of rows in matrix A.
+     * @param cols The number of columns in matrix A.
+     * @note Code is a <code>const char*</code> held in
+     * <code>is_nan_kernel_code.</code>
+     *  Kernel for stan/math/gpu/err/check_nan.hpp.
+     *  This kernel uses the helper macros available in kernels/helpers.hpp.
+     */
+    __kernel void is_nan(
+        __global read_only double *A, __global write_only int *flag,
+        read_only unsigned int rows, read_only unsigned int cols) {
+      const int i = get_global_id(0);
+      const int j = get_global_id(1);
+      if (i < rows && j < cols) {
+        if (isnan(A(i, j))) {
+          flag[0] = 1;
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/**
+ * See the docs for \link kernels/check_nan.hpp is_nan() \endlink
+ *
+ * Launcher for the "is_nan" kernel compiled from
+ * <code>is_nan_kernel_code</code>. It is called with the global work size
+ * followed by the kernel arguments in the order of the kernel signature:
+ * the matrix buffer, the flag buffer, then the number of rows and the
+ * number of columns.
+ */
+const global_range_kernel<cl::Buffer, cl::Buffer, int, int> check_nan(
+    "is_nan", is_nan_kernel_code);
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif