Commit 6adb7122 authored by Rayleigh L's avatar Rayleigh L
Browse files

Merge branch 'develop' of https://github.com/stan-dev/math into feature/issue-838-linseq

parents 44a66ffe 159f3dfe
Showing with 725 additions and 258 deletions
+725 -258
R"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
/**
* Check if the <code>matrix_gpu</code> has NaN values
*
* @param[in] A The matrix to check.
* @param rows The number of rows in matrix A.
* @param cols The number of columns in matrix A.
* @param[out] flag the flag to be written to if any diagonal is zero.
*
* @note Kernel for stan/math/gpu/err/check_nan.hpp
*/
__kernel void is_nan(__global double *A, int rows, int cols,
__global int *flag) {
const int i = get_global_id(0);
const int j = get_global_id(1);
if (i < rows && j < cols) {
if (isnan(A(i, j))) {
flag[0] = 1;
}
}
};)"
#ifndef STAN_MATH_GPU_KERNELS_CHECK_SYMMETRIC_HPP
#define STAN_MATH_GPU_KERNELS_CHECK_SYMMETRIC_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char *is_symmetric_kernel_code = STRINGIFY(
// \endcond
/**
* Check if the <code>matrix_gpu</code> is symmetric
*
* @param[in] A The matrix to check.
* @param rows The number of rows in matrix A.
* @param cols The number of columns in matrix A.
* @param[out] flag the flag to be written to if any diagonal is zero.
* @param tolerance The numerical tolerance to check wheter
* two values are equal
* @note Code is a <code>const char*</code> held in
* <code>is_symmetric_kernel_code.</code>
* Kernel for stan/math/gpu/err/check_symmetric.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void is_symmetric(
__global read_only double *A, __global write_only int *flag,
read_only unsigned int rows, read_only unsigned int cols,
read_only double tolerance) {
const int i = get_global_id(0);
const int j = get_global_id(1);
if (i < rows && j < cols) {
double diff = fabs(A(i, j) - A(j, i));
if (diff > tolerance) {
flag[0] = 0;
}
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/check_symmetric.hpp check_symmetric() \endlink
*/
const global_range_kernel<cl::Buffer, cl::Buffer, int, int, const double>
check_symmetric("is_symmetric", is_symmetric_kernel_code);
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
R"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
/**
* Check if the <code>matrix_gpu</code> is symmetric
*
* @param[in] A The matrix to check.
* @param rows The number of rows in matrix A.
* @param cols The number of columns in matrix A.
* @param[out] flag the flag to be written to if any diagonal is zero.
*
* @note Kernel for stan/math/gpu/err/check_symmetric.hpp
*/
__kernel void is_symmetric(__global double *A, int rows, int cols,
__global int *flag, double tolerance) {
const int i = get_global_id(0);
const int j = get_global_id(1);
if (i < rows && j < cols) {
double diff = fabs(A(i, j) - A(j, i));
if (diff > tolerance) {
flag[0] = 0;
}
}
};)"
#ifndef STAN_MATH_GPU_KERNELS_COPY_HPP
#define STAN_MATH_GPU_KERNELS_COPY_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char *copy_kernel_code = STRINGIFY(
// \endcond
/**
* Copy one matrix to another
* @param[in] A The matrix to copy.
* @param[out] B The matrix to copy A to.
* @param rows The number of rows in A.
* @param cols The number of cols in A.
* @note Code is a <code>const char*</code> held in
* <code>copy_kernel_code.</code>
* Kernel used in math/gpu/matrix_gpu.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void copy(
__global read_only double *A, __global write_only double *B,
read_only unsigned int rows, read_only unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
B(i, j) = A(i, j);
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/copy.hpp copy() \endlink
*/
const global_range_kernel<cl::Buffer, cl::Buffer, int, int> copy(
"copy", copy_kernel_code);
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
R"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
#ifndef B
#define B(i, j) B[j * rows + i]
#endif
/**
* Copy one matrix to another
* @param[in] A The matrix to copy.
* @param[out] B The matrix to copy A to.
* @param rows The number of rows in A.
* @param cols The number of cols in A.
*
* @note Kernel used in math/gpu/matrix_gpu.hpp
*/
__kernel void copy(__global double *A, __global double *B, unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
B(i, j) = A(i, j);
}
};)"
R"(
#ifndef src
#define src(i,j) src[j * src_rows + i]
#endif
#ifndef dst
#define dst(i,j) dst[j * dst_rows + i]
#endif
/**
* Copies a submatrix of the source matrix to
* the destination matrix. The submatrix to copy
* starts at (0, 0)
* and is of size size_rows x size_cols.
* The submatrix is copied to the
* destination matrix starting at
* (dst_offset_rows, dst_offset_cols)
*
* @param[in] src The source matrix.
* @param[out] dst The destination submatrix.
* @param dst_offset_i The offset row in dst.
* @param dst_offset_j The offset column in dst.
* @param size_i The number of rows in the submatrix.
* @param size_j The number of columns in the submatrix.
* @param src_rows The number of rows in the source matrix.
* @param src_cols The number of cols in the source matrix.
* @param src_rows The number of rows in the destination matrix.
* @param dst_cols The number of cols in the destination matrix.
*
* @note used in math/gpu/copy_submatrix_opencl.hpp
*/
__kernel void copy_submatrix(__global double *src, __global double *dst,
unsigned int dst_offset_i, unsigned int dst_offset_j,
unsigned int size_i, unsigned int size_j,
unsigned int src_rows, unsigned int src_cols,
unsigned int dst_rows, unsigned int dst_cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ((i + dst_offset_i) < dst_rows && (j + dst_offset_j) < dst_cols) {
dst((dst_offset_i + i), (dst_offset_j + j)) =
src((0 + i),(0 + j));
}
};)"
#ifndef STAN_MATH_GPU_KERNELS_COPY_TRIANGULAR_HPP
#define STAN_MATH_GPU_KERNELS_COPY_TRIANGULAR_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char *copy_triangular_kernel_code = STRINGIFY(
// \endcond
/**
* Copies the lower or upper
* triangular of the source matrix to
* the destination matrix.
* Both matrices are stored on the GPU.
*
* @param[out] A Output matrix to copy triangular to.
* @param[in] B The matrix to copy the triangular from.
* @param rows The number of rows of B.
* @param cols The number of cols of B.
* @param lower_upper determines
* which part of the matrix to copy:
* LOWER: 0 - copies the lower triangular
* UPPER: 1 - copes the upper triangular
* @note Code is a <code>const char*</code> held in
* <code>copy_triangular_kernel_code.</code>
* Used in math/gpu/copy_triangular_opencl.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void copy_triangular(
__global write_only double *A, __global read_only double *B,
read_only unsigned int rows, read_only unsigned int cols,
read_only unsigned int lower_upper) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
if (!lower_upper && j <= i) {
A(i, j) = B(i, j);
} else if (!lower_upper) {
A(i, j) = 0;
} else if (lower_upper && j >= i) {
A(i, j) = B(i, j);
} else if (lower_upper && j < i) {
A(i, j) = 0;
}
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/copy_triangular.hpp copy_triangular() \endlink
*/
const global_range_kernel<cl::Buffer, cl::Buffer, int, int, TriangularViewGPU>
copy_triangular("copy_triangular", copy_triangular_kernel_code);
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
R"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
#ifndef B
#define B(i, j) B[j * rows + i]
#endif
/**
* Copies the lower or upper
* triangular of the source matrix to
* the destination matrix.
* Both matrices are stored on the GPU.
*
* @param[out] A Output matrix to copy triangular to.
* @param[in] B The matrix to copy the triangular from.
* @param rows The number of rows of B.
* @param cols The number of cols of B.
* @param lower_upper determines
* which part of the matrix to copy:
* LOWER: 0 - copies the lower triangular
* UPPER: 1 - copes the upper triangular
*
* @note Used in math/gpu/copy_triangular_opencl.hpp
*/
__kernel void copy_triangular(__global double *A, __global double *B,
unsigned int rows, unsigned int cols, unsigned int lower_upper) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
if (!lower_upper && j <= i) {
A(i, j) = B(i, j);
} else if (!lower_upper) {
A(i, j) = 0;
} else if (lower_upper && j >= i) {
A(i, j) = B(i, j);
} else if (lower_upper && j < i) {
A(i, j) = 0;
}
}
};)"
#ifndef STAN_MATH_GPU_KERNELS_HELPERS_HPP
#define STAN_MATH_GPU_KERNELS_HELPERS_HPP
#ifdef STAN_OPENCL
#include <string>
namespace stan {
namespace math {
namespace opencl_kernels {
/*
* Defines some helper macros for the kernels
*/
std::string helpers =
R"(
// Matrix access helpers
#ifndef A
#define A(i,j) A[j * rows + i]
#endif
#ifndef B
#define B(i,j) B[j * rows + i]
#endif
#ifndef C
#define C(i,j) C[j * rows + i]
#endif
// Transpose
#ifndef BT
#define BT(i,j) B[j * cols + i]
#endif
#ifndef AT
#define AT(i,j) A[j * cols + i]
#endif
// Moving between two buffers
#ifndef src
#define src(i,j) src[j * src_rows + i]
#endif
#ifndef dst
#define dst(i,j) dst[j * dst_rows + i]
#endif
// The local memory column for each thread block
#define THREAD_BLOCK_SIZE_COL THREAD_BLOCK_SIZE/WORK_PER_THREAD
)";
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
#ifndef STAN_MATH_GPU_KERNELS_IDENTITY_HPP
#define STAN_MATH_GPU_KERNELS_IDENTITY_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char* identity_kernel_code = STRINGIFY(
// \endcond
/**
* Makes an identity matrix on the GPU
*
* @param[in,out] A The identity matrix output.
* @param rows The number of rows for A.
* @param cols The number of cols for A.
* @note Code is a <code>const char*</code> held in
* <code>identity_kernel_code.</code>
* Used in math/gpu/identity_opencl.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void identity(__global write_only double* A,
read_only unsigned int rows,
read_only unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
if (i == j) {
A(i, j) = 1.0;
} else {
A(i, j) = 0.0;
}
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/transpose.hpp transpose() \endlink
*/
const global_range_kernel<cl::Buffer, int, int> identity("identity",
identity_kernel_code);
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
R"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
/**
* Makes an identity matrix on the GPU
*
* @param[in,out] A The identity matrix output.
* @param rows The number of rows for A.
* @param cols The number of cols for A.
*
* @note Used in math/gpu/identity_opencl.hpp
*/
__kernel void identity(__global double *A, unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
if (i == j) {
A(i, j) = 1.0;
} else {
A(i, j) = 0.0;
}
}
};)"
#ifndef STAN_MATH_GPU_KERNELS_MATRIX_MULTIPLY_HPP
#define STAN_MATH_GPU_KERNELS_MATRIX_MULTIPLY_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char* matrix_multiply_kernel_code = STRINGIFY(
// \endcond
/**
* Matrix multiplication on the GPU
*
* @param[in] A the left matrix in matrix multiplication
* @param[in] B the right matrix in matrix multiplication
* @param[out] C the output matrix
* @param[in] M Number of rows for matrix A
* @param[in] N Number of rows for matrix B
* @param[in] K Number of cols for matrix A and number of rows for matrix B
*/
__kernel void matrix_multiply(
const __global read_only double* A, const __global read_only double* B,
__global write_only double* C, const read_only int M,
const read_only int N, const read_only int K) {
// thread index inside the thread_block
const int thread_block_row = get_local_id(0);
const int thread_block_col = get_local_id(1);
// global thread index
const int i = THREAD_BLOCK_SIZE * get_group_id(0) + thread_block_row;
const int j = THREAD_BLOCK_SIZE * get_group_id(1) + thread_block_col;
// local memory
__local double A_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
__local double B_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
double acc[WORK_PER_THREAD];
for (int w = 0; w < WORK_PER_THREAD; w++) {
acc[w] = 0.0;
}
const int num_tiles = (K + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE;
// iterate over all tiles
for (int tile_ind = 0; tile_ind < num_tiles; tile_ind++) {
// each thread copies WORK_PER_THREAD values to the local
// memory
for (int w = 0; w < WORK_PER_THREAD; w++) {
const int tiled_i = THREAD_BLOCK_SIZE * tile_ind + thread_block_row;
const int tiled_j = THREAD_BLOCK_SIZE * tile_ind + thread_block_col;
A_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
[thread_block_row]
= A[(tiled_j + w * THREAD_BLOCK_SIZE_COL) * M + i];
B_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
[thread_block_row]
= B[(j + w * THREAD_BLOCK_SIZE_COL) * K + tiled_i];
}
// wait until all tile values are loaded to the local memory
barrier(CLK_LOCAL_MEM_FENCE);
for (int block_ind = 0; block_ind < THREAD_BLOCK_SIZE; block_ind++) {
for (int w = 0; w < WORK_PER_THREAD; w++) {
acc[w] += A_local[block_ind][thread_block_row]
* B_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
[block_ind];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// save the values
for (int w = 0; w < WORK_PER_THREAD; w++) {
// each thread saves WORK_PER_THREAD values
C[(j + w * THREAD_BLOCK_SIZE_COL) * M + i] = acc[w];
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/matrix_multiply.hpp add() \endlink
*/
const local_range_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int, int, int>
matrix_multiply("matrix_multiply", matrix_multiply_kernel_code,
{{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}});
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
#ifndef STAN_MATH_GPU_KERNELS_MULTIPLY_TRANSPOSE_HPP
#define STAN_MATH_GPU_KERNELS_MULTIPLY_TRANSPOSE_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char* multiply_transpose_kernel_code = STRINGIFY(
// \endcond
/**
* Matrix multiplication of the form A*A^T on the GPU
*
* @param[in] A matrix A
* @param[out] B the output matrix
* @param[in] M Number of rows for matrix A
* @param[in] N Number of cols for matrix A and the number of rows for
* matrix A^T
*/
__kernel void multiply_transpose(
const __global read_only double* A, __global write_only double* B,
const read_only int M, const read_only int N) {
// thread index inside the thread block
const int thread_block_row = get_local_id(0);
const int thread_block_col = get_local_id(1);
// global thread index
const int i = THREAD_BLOCK_SIZE * get_group_id(0) + thread_block_row;
const int j = THREAD_BLOCK_SIZE * get_group_id(1) + thread_block_col;
// indexes that determine the last indexes that need to compute
// in order to remove the unnecesary multiplications in the special
// multiplication of A*A^T
const int jMin = THREAD_BLOCK_SIZE * get_group_id(1);
const int iMax = THREAD_BLOCK_SIZE * get_group_id(0) + get_local_size(0);
// local memory
__local double A_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
__local double B_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
double acc[WORK_PER_THREAD];
for (int w = 0; w < WORK_PER_THREAD; w++) {
acc[w] = 0.0;
}
const int numTiles = (N + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE;
// iterate over all tiles
for (int tile_ind = 0; tile_ind < numTiles; tile_ind++) {
// in each tile
const int tiled_i = THREAD_BLOCK_SIZE * tile_ind + thread_block_row;
const int tiled_j = THREAD_BLOCK_SIZE * tile_ind + thread_block_col;
// if the data needs to be loaded to local memory
if (jMin <= iMax) {
// each thread copies WORK_PER_THREAD values to the
// local memory
for (int w = 0; w < WORK_PER_THREAD; w++) {
A_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
[thread_block_row]
= A[i + (tiled_j + w * THREAD_BLOCK_SIZE_COL) * M];
B_local[thread_block_col + w * THREAD_BLOCK_SIZE_COL]
[thread_block_row]
= A[(j + w * THREAD_BLOCK_SIZE_COL) + tiled_i * M];
}
}
// wait till all tile values are loaded to the local memory
barrier(CLK_LOCAL_MEM_FENCE);
// multiply the tile products
for (int block_ind = 0; block_ind < THREAD_BLOCK_SIZE; block_ind++) {
// each thread multiplies WORK_PER_THREAD values
for (int w = 0; w < WORK_PER_THREAD; w++) {
if (jMin <= iMax) {
if ((j + w * THREAD_BLOCK_SIZE_COL) <= i) {
acc[w] += A_local[block_ind][thread_block_row]
* B_local[thread_block_col
+ w * THREAD_BLOCK_SIZE_COL][block_ind];
}
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// save the values
for (int w = 0; w < WORK_PER_THREAD; w++) {
// each thread saves WORK_PER_THREAD values
if ((j + w * THREAD_BLOCK_SIZE_COL) <= i) {
B[i + (j + w * THREAD_BLOCK_SIZE_COL) * M] = acc[w];
B[(j + w * THREAD_BLOCK_SIZE_COL) + i * M] = acc[w];
}
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/multiply_transpose.hpp add() \endlink
*/
const local_range_kernel<cl::Buffer, cl::Buffer, int, int> multiply_transpose(
"multiply_transpose", multiply_transpose_kernel_code,
{{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 4}});
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
#ifndef STAN_MATH_GPU_KERNELS_SCALAR_MUL_HPP
#define STAN_MATH_GPU_KERNELS_SCALAR_MUL_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char *scalar_mul_kernel_code = STRINGIFY(
// \endcond
/**
* Multiplication of the matrix A with a scalar
*
* @param[in] A input matrix
* @param[in] B output matrix
* @param[in] scalar the value with which to multiply A
* @param[in] rows the number of rows in A
* @param[in] cols the number of columns in A
*/
__kernel void scalar_mul(__global double *A, const __global double *B,
const double scalar, const unsigned int rows,
const unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
A(i, j) = B(i, j) * scalar;
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/scalar_mul.hpp add() \endlink
*/
const global_range_kernel<cl::Buffer, cl::Buffer, double, int, int> scalar_mul(
"scalar_mul", scalar_mul_kernel_code);
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
#ifndef STAN_MATH_GPU_KERNELS_SCALAR_MUL_DIAGONAL_HPP
#define STAN_MATH_GPU_KERNELS_SCALAR_MUL_DIAGONAL_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char *scalar_mul_diagonal_kernel_code = STRINGIFY(
// \endcond
/**
* Multiplication of the matrix A diagonal with a scalar
*
* @param[in, out] A matrix A
* @param[in] scalar the value with which to multiply the diagonal of A
* @param[in] rows the number of rows in A
* @param[in] min_dim the size of the smaller dimension of A
*/
__kernel void scalar_mul_diagonal(__global read_write double *A,
const read_only double scalar,
const read_only unsigned int rows,
const read_only unsigned int min_dim) {
int i = get_global_id(0);
if (i < min_dim) {
A(i, i) *= scalar;
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/scalar_mul_diagonal.hpp add() \endlink
*/
const global_range_kernel<cl::Buffer, double, int, int> scalar_mul_diagonal(
"scalar_mul_diagonal", scalar_mul_diagonal_kernel_code);
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
#ifndef STAN_MATH_GPU_KERNELS_SUB_BLOCK_HPP
#define STAN_MATH_GPU_KERNELS_SUB_BLOCK_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char *sub_block_kernel_code = STRINGIFY(
// \endcond
/**
* Copies a submatrix of the source matrix to
* the destination matrix. The submatrix to copy
* starts at (0, 0)
* and is of size size_rows x size_cols.
* The submatrix is copied to the
* destination matrix starting at
* (dst_offset_rows, dst_offset_cols)
*
* @param[in] src The source matrix.
* @param[out] dst The destination submatrix.
* @param src_offset_i The offset row in src.
* @param src_offset_j The offset column in src.
* @param dst_offset_i The offset row in dst.
* @param dst_offset_j The offset column in dst.
* @param size_i The number of rows in the submatrix.
* @param size_j The number of columns in the submatrix.
* @param src_rows The number of rows in the source matrix.
* @param src_cols The number of cols in the source matrix.
* @param src_rows The number of rows in the destination matrix.
* @param dst_cols The number of cols in the destination matrix.
* @param dst_rows The number of rows in the destination matrix.
* @note Code is a <code>const char*</code> held in
* <code>sub_block_kernel_code.</code>
* Used in math/gpu/copy_submatrix_opencl.hpp.
* This kernel uses the helper macros available in helpers.cl.
*
*/
__kernel void sub_block(
__global read_only double *src, read_write __global double *dst,
read_only unsigned int src_offset_i,
read_only unsigned int src_offset_j,
read_only unsigned int dst_offset_i,
read_only unsigned int dst_offset_j, read_only unsigned int size_i,
read_only unsigned int size_j, read_only unsigned int src_rows,
read_only unsigned int src_cols, read_only unsigned int dst_rows,
read_only unsigned int dst_cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ((i + src_offset_i) < src_rows && (j + src_offset_j) < src_cols
&& (i + dst_offset_i) < dst_rows && (j + dst_offset_j) < dst_cols) {
dst((dst_offset_i + i), (dst_offset_j + j))
= src((src_offset_i + i), (src_offset_j + j));
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/sub_block.hpp sub_block() \endlink
*/
const global_range_kernel<cl::Buffer, cl::Buffer, int, int, int, int, int, int,
int, int, int, int>
sub_block("sub_block", sub_block_kernel_code);
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
R"(
#ifndef src
#define src(i,j) src[j * src_rows + i]
#endif
#ifndef dst
#define dst(i,j) dst[j * dst_rows + i]
#endif
/**
* Copies a submatrix of the source matrix to
* the destination matrix. The submatrix to copy
* starts at (0, 0)
* and is of size size_rows x size_cols.
* The submatrix is copied to the
* destination matrix starting at
* (dst_offset_rows, dst_offset_cols)
*
* @param[in] src The source matrix.
* @param[out] dst The destination submatrix.
* @param src_offset_i The offset row in src.
* @param src_offset_j The offset column in src.
* @param dst_offset_i The offset row in dst.
* @param dst_offset_j The offset column in dst.
* @param size_i The number of rows in the submatrix.
* @param size_j The number of columns in the submatrix.
* @param src_rows The number of rows in the source matrix.
* @param src_cols The number of cols in the source matrix.
* @param src_rows The number of rows in the destination matrix.
* @param dst_cols The number of cols in the destination matrix.
*
* @note used in math/gpu/copy_submatrix_opencl.hpp
*/
__kernel void copy_submatrix(__global double *src, __global double *dst,
unsigned int src_offset_i, unsigned int src_offset_j,
unsigned int dst_offset_i, unsigned int dst_offset_j,
unsigned int size_i, unsigned int size_j,
unsigned int src_rows, unsigned int src_cols,
unsigned int dst_rows, unsigned int dst_cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ((i + src_offset_i) < src_rows && (j + src_offset_j) < src_cols &&
(i + dst_offset_i) < dst_rows && (j + dst_offset_j) < dst_cols) {
dst((dst_offset_i + i), (dst_offset_j + j)) =
src((src_offset_i + i),(src_offset_j + j));
}
};)"
#ifndef STAN_MATH_GPU_KERNELS_SUBTRACT_HPP
#define STAN_MATH_GPU_KERNELS_SUBTRACT_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char *subtract_kernel_code = STRINGIFY(
// \endcond
/**
* Matrix subtraction on the GPU Subtracts the second matrix
* from the first matrix and stores the result in the third matrix (C=A-B).
*
* @param[out] C The output matrix.
* @param[in] B RHS input matrix.
* @param[in] A LHS input matrix.
* @param rows The number of rows for matrix A.
* @param cols The number of columns for matrix A.
* @note Code is a <code>const char*</code> held in
* <code>subtract_kernel_code.</code>
* Used in math/gpu/subtract_opencl.hpp
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void subtract(
__global write_only double *C, __global read_only double *A,
__global read_only double *B, read_only unsigned int rows,
read_only unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
C(i, j) = A(i, j) - B(i, j);
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/subtract.hpp subtract() \endlink
*/
const global_range_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int, int>
subtract("subtract", subtract_kernel_code);
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
R"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
#ifndef B
#define B(i, j) B[j * rows + i]
#endif
#ifndef C
#define C(i, j) C[j * rows + i]
#endif
/**
* Matrix subtraction on the GPU Subtracts the second matrix
* from the first matrix and stores the result in the third matrix (C=A-B).
*
* @param[out] C The output matrix.
* @param[in] B RHS input matrix.
* @param[in] A LHS input matrix.
* @param rows The number of rows for matrix A.
* @param cols The number of columns for matrix A.
*
* @note Used in math/gpu/subtract_opencl.hpp
*/
__kernel void subtract(__global double *C, __global double *A,
__global double *B, unsigned int rows, unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
C(i, j) = A(i, j) - B(i, j);
}
};)"
#ifndef STAN_MATH_GPU_KERNELS_TRANSPOSE_HPP
#define STAN_MATH_GPU_KERNELS_TRANSPOSE_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char *transpose_kernel_code = STRINGIFY(
// \endcond
/**
* Takes the transpose of the matrix on the GPU.
*
* @param[out] B The output matrix to hold transpose of A.
* @param[in] A The input matrix to transpose into B.
* @param rows The number of rows for A.
* @param cols The number of columns for A.
* @note Code is a <code>const char*</code> held in
* <code>transpose_kernel_code.</code>
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void transpose(
__global read_write double *B, __global read_only double *A,
read_only unsigned int rows, read_only unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
BT(j, i) = A(i, j);
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/transpose.hpp transpose() \endlink
*/
const global_range_kernel<cl::Buffer, cl::Buffer, int, int> transpose(
"transpose", transpose_kernel_code);
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment