Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Francesco Brarda
stan-math-petsc
Commits
6adb7122
Commit
6adb7122
authored
6 years ago
by
Rayleigh L
Browse files
Options
Download
Plain Diff
Merge branch 'develop' of
https://github.com/stan-dev/math
into feature/issue-838-linseq
parents
44a66ffe
159f3dfe
Changes
163
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
stan/math/gpu/kernels/check_nan_kernel.cl
+0
-24
stan/math/gpu/kernels/check_nan_kernel.cl
stan/math/gpu/kernels/check_symmetric.hpp
+54
-0
stan/math/gpu/kernels/check_symmetric.hpp
stan/math/gpu/kernels/check_symmetric_kernel.cl
+0
-25
stan/math/gpu/kernels/check_symmetric_kernel.cl
stan/math/gpu/kernels/copy.hpp
+47
-0
stan/math/gpu/kernels/copy.hpp
stan/math/gpu/kernels/copy_matrix_kernel.cl
+0
-25
stan/math/gpu/kernels/copy_matrix_kernel.cl
stan/math/gpu/kernels/copy_submatrix_kernel.cl
+0
-41
stan/math/gpu/kernels/copy_submatrix_kernel.cl
stan/math/gpu/kernels/copy_triangular.hpp
+64
-0
stan/math/gpu/kernels/copy_triangular.hpp
stan/math/gpu/kernels/copy_triangular_matrix_kernel.cl
+0
-41
stan/math/gpu/kernels/copy_triangular_matrix_kernel.cl
stan/math/gpu/kernels/helpers.hpp
+49
-0
stan/math/gpu/kernels/helpers.hpp
stan/math/gpu/kernels/identity.hpp
+51
-0
stan/math/gpu/kernels/identity.hpp
stan/math/gpu/kernels/identity_matrix_kernel.cl
+0
-26
stan/math/gpu/kernels/identity_matrix_kernel.cl
stan/math/gpu/kernels/matrix_multiply.hpp
+92
-0
stan/math/gpu/kernels/matrix_multiply.hpp
stan/math/gpu/kernels/multiply_transpose.hpp
+108
-0
stan/math/gpu/kernels/multiply_transpose.hpp
stan/math/gpu/kernels/scalar_mul.hpp
+45
-0
stan/math/gpu/kernels/scalar_mul.hpp
stan/math/gpu/kernels/scalar_mul_diagonal.hpp
+44
-0
stan/math/gpu/kernels/scalar_mul_diagonal.hpp
stan/math/gpu/kernels/sub_block.hpp
+73
-0
stan/math/gpu/kernels/sub_block.hpp
stan/math/gpu/kernels/sub_block_kernel.cl
+0
-45
stan/math/gpu/kernels/sub_block_kernel.cl
stan/math/gpu/kernels/subtract.hpp
+51
-0
stan/math/gpu/kernels/subtract.hpp
stan/math/gpu/kernels/subtract_matrix_kernel.cl
+0
-31
stan/math/gpu/kernels/subtract_matrix_kernel.cl
stan/math/gpu/kernels/transpose.hpp
+47
-0
stan/math/gpu/kernels/transpose.hpp
with
725 additions
and
258 deletions
+725
-258
stan/math/gpu/kernels/check_nan_kernel.cl
deleted
100644 → 0
View file @
44a66ffe
R
"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
/**
* Check if the <code>matrix_gpu</code> has NaN values
*
* @param[in] A The matrix to check.
* @param rows The number of rows in matrix A.
* @param cols The number of columns in matrix A.
* @param[out] flag the flag to be written to if any diagonal is zero.
*
* @note Kernel for stan/math/gpu/err/check_nan.hpp
*/
__kernel void is_nan(__global double *A, int rows, int cols,
__global int *flag) {
const int i = get_global_id(0);
const int j = get_global_id(1);
if (i < rows && j < cols) {
if (isnan(A(i, j))) {
flag[0] = 1;
}
}
};)"
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/check_symmetric.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_CHECK_SYMMETRIC_HPP
#define STAN_MATH_GPU_KERNELS_CHECK_SYMMETRIC_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
is_symmetric_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Check if the <code>matrix_gpu</code> is symmetric
*
* @param[in] A The matrix to check.
* @param rows The number of rows in matrix A.
* @param cols The number of columns in matrix A.
* @param[out] flag the flag to be written to if any diagonal is zero.
* @param tolerance The numerical tolerance to check wheter
* two values are equal
* @note Code is a <code>const char*</code> held in
* <code>is_symmetric_kernel_code.</code>
* Kernel for stan/math/gpu/err/check_symmetric.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel
void
is_symmetric
(
__global
read_only
double
*
A
,
__global
write_only
int
*
flag
,
read_only
unsigned
int
rows
,
read_only
unsigned
int
cols
,
read_only
double
tolerance
)
{
const
int
i
=
get_global_id
(
0
);
const
int
j
=
get_global_id
(
1
);
if
(
i
<
rows
&&
j
<
cols
)
{
double
diff
=
fabs
(
A
(
i
,
j
)
-
A
(
j
,
i
));
if
(
diff
>
tolerance
)
{
flag
[
0
]
=
0
;
}
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/check_symmetric.hpp check_symmetric() \endlink
*/
const
global_range_kernel
<
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
,
const
double
>
check_symmetric
(
"is_symmetric"
,
is_symmetric_kernel_code
);
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/check_symmetric_kernel.cl
deleted
100644 → 0
View file @
44a66ffe
R
"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
/**
* Check if the <code>matrix_gpu</code> is symmetric
*
* @param[in] A The matrix to check.
* @param rows The number of rows in matrix A.
* @param cols The number of columns in matrix A.
* @param[out] flag the flag to be written to if any diagonal is zero.
*
* @note Kernel for stan/math/gpu/err/check_symmetric.hpp
*/
__kernel void is_symmetric(__global double *A, int rows, int cols,
__global int *flag, double tolerance) {
const int i = get_global_id(0);
const int j = get_global_id(1);
if (i < rows && j < cols) {
double diff = fabs(A(i, j) - A(j, i));
if (diff > tolerance) {
flag[0] = 0;
}
}
};)"
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/copy.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_COPY_HPP
#define STAN_MATH_GPU_KERNELS_COPY_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
copy_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Copy one matrix to another
* @param[in] A The matrix to copy.
* @param[out] B The matrix to copy A to.
* @param rows The number of rows in A.
* @param cols The number of cols in A.
* @note Code is a <code>const char*</code> held in
* <code>copy_kernel_code.</code>
* Kernel used in math/gpu/matrix_gpu.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel
void
copy
(
__global
read_only
double
*
A
,
__global
write_only
double
*
B
,
read_only
unsigned
int
rows
,
read_only
unsigned
int
cols
)
{
int
i
=
get_global_id
(
0
);
int
j
=
get_global_id
(
1
);
if
(
i
<
rows
&&
j
<
cols
)
{
B
(
i
,
j
)
=
A
(
i
,
j
);
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/copy.hpp copy() \endlink
*/
const
global_range_kernel
<
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
>
copy
(
"copy"
,
copy_kernel_code
);
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/copy_matrix_kernel.cl
deleted
100644 → 0
View file @
44a66ffe
R
"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
#ifndef B
#define B(i, j) B[j * rows + i]
#endif
/**
* Copy one matrix to another
* @param[in] A The matrix to copy.
* @param[out] B The matrix to copy A to.
* @param rows The number of rows in A.
* @param cols The number of cols in A.
*
* @note Kernel used in math/gpu/matrix_gpu.hpp
*/
__kernel void copy(__global double *A, __global double *B, unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
B(i, j) = A(i, j);
}
};)"
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/copy_submatrix_kernel.cl
deleted
100644 → 0
View file @
44a66ffe
R
"(
#ifndef src
#define src(i,j) src[j * src_rows + i]
#endif
#ifndef dst
#define dst(i,j) dst[j * dst_rows + i]
#endif
/**
* Copies a submatrix of the source matrix to
* the destination matrix. The submatrix to copy
* starts at (0, 0)
* and is of size size_rows x size_cols.
* The submatrix is copied to the
* destination matrix starting at
* (dst_offset_rows, dst_offset_cols)
*
* @param[in] src The source matrix.
* @param[out] dst The destination submatrix.
* @param dst_offset_i The offset row in dst.
* @param dst_offset_j The offset column in dst.
* @param size_i The number of rows in the submatrix.
* @param size_j The number of columns in the submatrix.
* @param src_rows The number of rows in the source matrix.
* @param src_cols The number of cols in the source matrix.
* @param src_rows The number of rows in the destination matrix.
* @param dst_cols The number of cols in the destination matrix.
*
* @note used in math/gpu/copy_submatrix_opencl.hpp
*/
__kernel void copy_submatrix(__global double *src, __global double *dst,
unsigned int dst_offset_i, unsigned int dst_offset_j,
unsigned int size_i, unsigned int size_j,
unsigned int src_rows, unsigned int src_cols,
unsigned int dst_rows, unsigned int dst_cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ((i + dst_offset_i) < dst_rows && (j + dst_offset_j) < dst_cols) {
dst((dst_offset_i + i), (dst_offset_j + j)) =
src((0 + i),(0 + j));
}
};)"
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/copy_triangular.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_COPY_TRIANGULAR_HPP
#define STAN_MATH_GPU_KERNELS_COPY_TRIANGULAR_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
copy_triangular_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Copies the lower or upper
* triangular of the source matrix to
* the destination matrix.
* Both matrices are stored on the GPU.
*
* @param[out] A Output matrix to copy triangular to.
* @param[in] B The matrix to copy the triangular from.
* @param rows The number of rows of B.
* @param cols The number of cols of B.
* @param lower_upper determines
* which part of the matrix to copy:
* LOWER: 0 - copies the lower triangular
* UPPER: 1 - copes the upper triangular
* @note Code is a <code>const char*</code> held in
* <code>copy_triangular_kernel_code.</code>
* Used in math/gpu/copy_triangular_opencl.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel
void
copy_triangular
(
__global
write_only
double
*
A
,
__global
read_only
double
*
B
,
read_only
unsigned
int
rows
,
read_only
unsigned
int
cols
,
read_only
unsigned
int
lower_upper
)
{
int
i
=
get_global_id
(
0
);
int
j
=
get_global_id
(
1
);
if
(
i
<
rows
&&
j
<
cols
)
{
if
(
!
lower_upper
&&
j
<=
i
)
{
A
(
i
,
j
)
=
B
(
i
,
j
);
}
else
if
(
!
lower_upper
)
{
A
(
i
,
j
)
=
0
;
}
else
if
(
lower_upper
&&
j
>=
i
)
{
A
(
i
,
j
)
=
B
(
i
,
j
);
}
else
if
(
lower_upper
&&
j
<
i
)
{
A
(
i
,
j
)
=
0
;
}
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/copy_triangular.hpp copy_triangular() \endlink
*/
const
global_range_kernel
<
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
,
TriangularViewGPU
>
copy_triangular
(
"copy_triangular"
,
copy_triangular_kernel_code
);
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/copy_triangular_matrix_kernel.cl
deleted
100644 → 0
View file @
44a66ffe
R
"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
#ifndef B
#define B(i, j) B[j * rows + i]
#endif
/**
* Copies the lower or upper
* triangular of the source matrix to
* the destination matrix.
* Both matrices are stored on the GPU.
*
* @param[out] A Output matrix to copy triangular to.
* @param[in] B The matrix to copy the triangular from.
* @param rows The number of rows of B.
* @param cols The number of cols of B.
* @param lower_upper determines
* which part of the matrix to copy:
* LOWER: 0 - copies the lower triangular
* UPPER: 1 - copes the upper triangular
*
* @note Used in math/gpu/copy_triangular_opencl.hpp
*/
__kernel void copy_triangular(__global double *A, __global double *B,
unsigned int rows, unsigned int cols, unsigned int lower_upper) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
if (!lower_upper && j <= i) {
A(i, j) = B(i, j);
} else if (!lower_upper) {
A(i, j) = 0;
} else if (lower_upper && j >= i) {
A(i, j) = B(i, j);
} else if (lower_upper && j < i) {
A(i, j) = 0;
}
}
};)"
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/helpers.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_HELPERS_HPP
#define STAN_MATH_GPU_KERNELS_HELPERS_HPP
#ifdef STAN_OPENCL
#include <string>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
/*
* Defines some helper macros for the kernels
*/
std
::
string
helpers
=
R"(
// Matrix access helpers
#ifndef A
#define A(i,j) A[j * rows + i]
#endif
#ifndef B
#define B(i,j) B[j * rows + i]
#endif
#ifndef C
#define C(i,j) C[j * rows + i]
#endif
// Transpose
#ifndef BT
#define BT(i,j) B[j * cols + i]
#endif
#ifndef AT
#define AT(i,j) A[j * cols + i]
#endif
// Moving between two buffers
#ifndef src
#define src(i,j) src[j * src_rows + i]
#endif
#ifndef dst
#define dst(i,j) dst[j * dst_rows + i]
#endif
// The local memory column for each thread block
#define THREAD_BLOCK_SIZE_COL THREAD_BLOCK_SIZE/WORK_PER_THREAD
)"
;
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/identity.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_IDENTITY_HPP
#define STAN_MATH_GPU_KERNELS_IDENTITY_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
identity_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Makes an identity matrix on the GPU
*
* @param[in,out] A The identity matrix output.
* @param rows The number of rows for A.
* @param cols The number of cols for A.
* @note Code is a <code>const char*</code> held in
* <code>identity_kernel_code.</code>
* Used in math/gpu/identity_opencl.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel
void
identity
(
__global
write_only
double
*
A
,
read_only
unsigned
int
rows
,
read_only
unsigned
int
cols
)
{
int
i
=
get_global_id
(
0
);
int
j
=
get_global_id
(
1
);
if
(
i
<
rows
&&
j
<
cols
)
{
if
(
i
==
j
)
{
A
(
i
,
j
)
=
1.0
;
}
else
{
A
(
i
,
j
)
=
0.0
;
}
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/transpose.hpp transpose() \endlink
*/
const
global_range_kernel
<
cl
::
Buffer
,
int
,
int
>
identity
(
"identity"
,
identity_kernel_code
);
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/identity_matrix_kernel.cl
deleted
100644 → 0
View file @
44a66ffe
R
"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
/**
* Makes an identity matrix on the GPU
*
* @param[in,out] A The identity matrix output.
* @param rows The number of rows for A.
* @param cols The number of cols for A.
*
* @note Used in math/gpu/identity_opencl.hpp
*/
__kernel void identity(__global double *A, unsigned int rows,
unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
if (i == j) {
A(i, j) = 1.0;
} else {
A(i, j) = 0.0;
}
}
};)"
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/matrix_multiply.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_MATRIX_MULTIPLY_HPP
#define STAN_MATH_GPU_KERNELS_MATRIX_MULTIPLY_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
matrix_multiply_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Matrix multiplication on the GPU
*
* @param[in] A the left matrix in matrix multiplication
* @param[in] B the right matrix in matrix multiplication
* @param[out] C the output matrix
* @param[in] M Number of rows for matrix A
* @param[in] N Number of rows for matrix B
* @param[in] K Number of cols for matrix A and number of rows for matrix B
*/
__kernel
void
matrix_multiply
(
const
__global
read_only
double
*
A
,
const
__global
read_only
double
*
B
,
__global
write_only
double
*
C
,
const
read_only
int
M
,
const
read_only
int
N
,
const
read_only
int
K
)
{
// thread index inside the thread_block
const
int
thread_block_row
=
get_local_id
(
0
);
const
int
thread_block_col
=
get_local_id
(
1
);
// global thread index
const
int
i
=
THREAD_BLOCK_SIZE
*
get_group_id
(
0
)
+
thread_block_row
;
const
int
j
=
THREAD_BLOCK_SIZE
*
get_group_id
(
1
)
+
thread_block_col
;
// local memory
__local
double
A_local
[
THREAD_BLOCK_SIZE
][
THREAD_BLOCK_SIZE
];
__local
double
B_local
[
THREAD_BLOCK_SIZE
][
THREAD_BLOCK_SIZE
];
double
acc
[
WORK_PER_THREAD
];
for
(
int
w
=
0
;
w
<
WORK_PER_THREAD
;
w
++
)
{
acc
[
w
]
=
0.0
;
}
const
int
num_tiles
=
(
K
+
THREAD_BLOCK_SIZE
-
1
)
/
THREAD_BLOCK_SIZE
;
// iterate over all tiles
for
(
int
tile_ind
=
0
;
tile_ind
<
num_tiles
;
tile_ind
++
)
{
// each thread copies WORK_PER_THREAD values to the local
// memory
for
(
int
w
=
0
;
w
<
WORK_PER_THREAD
;
w
++
)
{
const
int
tiled_i
=
THREAD_BLOCK_SIZE
*
tile_ind
+
thread_block_row
;
const
int
tiled_j
=
THREAD_BLOCK_SIZE
*
tile_ind
+
thread_block_col
;
A_local
[
thread_block_col
+
w
*
THREAD_BLOCK_SIZE_COL
]
[
thread_block_row
]
=
A
[(
tiled_j
+
w
*
THREAD_BLOCK_SIZE_COL
)
*
M
+
i
];
B_local
[
thread_block_col
+
w
*
THREAD_BLOCK_SIZE_COL
]
[
thread_block_row
]
=
B
[(
j
+
w
*
THREAD_BLOCK_SIZE_COL
)
*
K
+
tiled_i
];
}
// wait until all tile values are loaded to the local memory
barrier
(
CLK_LOCAL_MEM_FENCE
);
for
(
int
block_ind
=
0
;
block_ind
<
THREAD_BLOCK_SIZE
;
block_ind
++
)
{
for
(
int
w
=
0
;
w
<
WORK_PER_THREAD
;
w
++
)
{
acc
[
w
]
+=
A_local
[
block_ind
][
thread_block_row
]
*
B_local
[
thread_block_col
+
w
*
THREAD_BLOCK_SIZE_COL
]
[
block_ind
];
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
);
}
// save the values
for
(
int
w
=
0
;
w
<
WORK_PER_THREAD
;
w
++
)
{
// each thread saves WORK_PER_THREAD values
C
[(
j
+
w
*
THREAD_BLOCK_SIZE_COL
)
*
M
+
i
]
=
acc
[
w
];
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/matrix_multiply.hpp add() \endlink
*/
const
local_range_kernel
<
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
,
int
>
matrix_multiply
(
"matrix_multiply"
,
matrix_multiply_kernel_code
,
{{
"THREAD_BLOCK_SIZE"
,
32
},
{
"WORK_PER_THREAD"
,
8
}});
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/multiply_transpose.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_MULTIPLY_TRANSPOSE_HPP
#define STAN_MATH_GPU_KERNELS_MULTIPLY_TRANSPOSE_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
multiply_transpose_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Matrix multiplication of the form A*A^T on the GPU
*
* @param[in] A matrix A
* @param[out] B the output matrix
* @param[in] M Number of rows for matrix A
* @param[in] N Number of cols for matrix A and the number of rows for
* matrix A^T
*/
__kernel
void
multiply_transpose
(
const
__global
read_only
double
*
A
,
__global
write_only
double
*
B
,
const
read_only
int
M
,
const
read_only
int
N
)
{
// thread index inside the thread block
const
int
thread_block_row
=
get_local_id
(
0
);
const
int
thread_block_col
=
get_local_id
(
1
);
// global thread index
const
int
i
=
THREAD_BLOCK_SIZE
*
get_group_id
(
0
)
+
thread_block_row
;
const
int
j
=
THREAD_BLOCK_SIZE
*
get_group_id
(
1
)
+
thread_block_col
;
// indexes that determine the last indexes that need to compute
// in order to remove the unnecesary multiplications in the special
// multiplication of A*A^T
const
int
jMin
=
THREAD_BLOCK_SIZE
*
get_group_id
(
1
);
const
int
iMax
=
THREAD_BLOCK_SIZE
*
get_group_id
(
0
)
+
get_local_size
(
0
);
// local memory
__local
double
A_local
[
THREAD_BLOCK_SIZE
][
THREAD_BLOCK_SIZE
];
__local
double
B_local
[
THREAD_BLOCK_SIZE
][
THREAD_BLOCK_SIZE
];
double
acc
[
WORK_PER_THREAD
];
for
(
int
w
=
0
;
w
<
WORK_PER_THREAD
;
w
++
)
{
acc
[
w
]
=
0.0
;
}
const
int
numTiles
=
(
N
+
THREAD_BLOCK_SIZE
-
1
)
/
THREAD_BLOCK_SIZE
;
// iterate over all tiles
for
(
int
tile_ind
=
0
;
tile_ind
<
numTiles
;
tile_ind
++
)
{
// in each tile
const
int
tiled_i
=
THREAD_BLOCK_SIZE
*
tile_ind
+
thread_block_row
;
const
int
tiled_j
=
THREAD_BLOCK_SIZE
*
tile_ind
+
thread_block_col
;
// if the data needs to be loaded to local memory
if
(
jMin
<=
iMax
)
{
// each thread copies WORK_PER_THREAD values to the
// local memory
for
(
int
w
=
0
;
w
<
WORK_PER_THREAD
;
w
++
)
{
A_local
[
thread_block_col
+
w
*
THREAD_BLOCK_SIZE_COL
]
[
thread_block_row
]
=
A
[
i
+
(
tiled_j
+
w
*
THREAD_BLOCK_SIZE_COL
)
*
M
];
B_local
[
thread_block_col
+
w
*
THREAD_BLOCK_SIZE_COL
]
[
thread_block_row
]
=
A
[(
j
+
w
*
THREAD_BLOCK_SIZE_COL
)
+
tiled_i
*
M
];
}
}
// wait till all tile values are loaded to the local memory
barrier
(
CLK_LOCAL_MEM_FENCE
);
// multiply the tile products
for
(
int
block_ind
=
0
;
block_ind
<
THREAD_BLOCK_SIZE
;
block_ind
++
)
{
// each thread multiplies WORK_PER_THREAD values
for
(
int
w
=
0
;
w
<
WORK_PER_THREAD
;
w
++
)
{
if
(
jMin
<=
iMax
)
{
if
((
j
+
w
*
THREAD_BLOCK_SIZE_COL
)
<=
i
)
{
acc
[
w
]
+=
A_local
[
block_ind
][
thread_block_row
]
*
B_local
[
thread_block_col
+
w
*
THREAD_BLOCK_SIZE_COL
][
block_ind
];
}
}
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
);
}
// save the values
for
(
int
w
=
0
;
w
<
WORK_PER_THREAD
;
w
++
)
{
// each thread saves WORK_PER_THREAD values
if
((
j
+
w
*
THREAD_BLOCK_SIZE_COL
)
<=
i
)
{
B
[
i
+
(
j
+
w
*
THREAD_BLOCK_SIZE_COL
)
*
M
]
=
acc
[
w
];
B
[(
j
+
w
*
THREAD_BLOCK_SIZE_COL
)
+
i
*
M
]
=
acc
[
w
];
}
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/multiply_transpose.hpp add() \endlink
*/
const
local_range_kernel
<
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
>
multiply_transpose
(
"multiply_transpose"
,
multiply_transpose_kernel_code
,
{{
"THREAD_BLOCK_SIZE"
,
32
},
{
"WORK_PER_THREAD"
,
4
}});
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/scalar_mul.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_SCALAR_MUL_HPP
#define STAN_MATH_GPU_KERNELS_SCALAR_MUL_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
scalar_mul_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Multiplication of the matrix A with a scalar
*
* @param[in] A input matrix
* @param[in] B output matrix
* @param[in] scalar the value with which to multiply A
* @param[in] rows the number of rows in A
* @param[in] cols the number of columns in A
*/
__kernel
void
scalar_mul
(
__global
double
*
A
,
const
__global
double
*
B
,
const
double
scalar
,
const
unsigned
int
rows
,
const
unsigned
int
cols
)
{
int
i
=
get_global_id
(
0
);
int
j
=
get_global_id
(
1
);
if
(
i
<
rows
&&
j
<
cols
)
{
A
(
i
,
j
)
=
B
(
i
,
j
)
*
scalar
;
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/scalar_mul.hpp add() \endlink
*/
const
global_range_kernel
<
cl
::
Buffer
,
cl
::
Buffer
,
double
,
int
,
int
>
scalar_mul
(
"scalar_mul"
,
scalar_mul_kernel_code
);
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/scalar_mul_diagonal.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_SCALAR_MUL_DIAGONAL_HPP
#define STAN_MATH_GPU_KERNELS_SCALAR_MUL_DIAGONAL_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
scalar_mul_diagonal_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Multiplication of the matrix A diagonal with a scalar
*
* @param[in, out] A matrix A
* @param[in] scalar the value with which to multiply the diagonal of A
* @param[in] rows the number of rows in A
* @param[in] min_dim the size of the smaller dimension of A
*/
__kernel
void
scalar_mul_diagonal
(
__global
read_write
double
*
A
,
const
read_only
double
scalar
,
const
read_only
unsigned
int
rows
,
const
read_only
unsigned
int
min_dim
)
{
int
i
=
get_global_id
(
0
);
if
(
i
<
min_dim
)
{
A
(
i
,
i
)
*=
scalar
;
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/scalar_mul_diagonal.hpp add() \endlink
*/
const
global_range_kernel
<
cl
::
Buffer
,
double
,
int
,
int
>
scalar_mul_diagonal
(
"scalar_mul_diagonal"
,
scalar_mul_diagonal_kernel_code
);
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/sub_block.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_SUB_BLOCK_HPP
#define STAN_MATH_GPU_KERNELS_SUB_BLOCK_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
sub_block_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Copies a submatrix of the source matrix to
* the destination matrix. The submatrix to copy
* starts at (0, 0)
* and is of size size_rows x size_cols.
* The submatrix is copied to the
* destination matrix starting at
* (dst_offset_rows, dst_offset_cols)
*
* @param[in] src The source matrix.
* @param[out] dst The destination submatrix.
* @param src_offset_i The offset row in src.
* @param src_offset_j The offset column in src.
* @param dst_offset_i The offset row in dst.
* @param dst_offset_j The offset column in dst.
* @param size_i The number of rows in the submatrix.
* @param size_j The number of columns in the submatrix.
* @param src_rows The number of rows in the source matrix.
* @param src_cols The number of cols in the source matrix.
* @param src_rows The number of rows in the destination matrix.
* @param dst_cols The number of cols in the destination matrix.
* @param dst_rows The number of rows in the destination matrix.
* @note Code is a <code>const char*</code> held in
* <code>sub_block_kernel_code.</code>
* Used in math/gpu/copy_submatrix_opencl.hpp.
* This kernel uses the helper macros available in helpers.cl.
*
*/
__kernel
void
sub_block
(
__global
read_only
double
*
src
,
read_write
__global
double
*
dst
,
read_only
unsigned
int
src_offset_i
,
read_only
unsigned
int
src_offset_j
,
read_only
unsigned
int
dst_offset_i
,
read_only
unsigned
int
dst_offset_j
,
read_only
unsigned
int
size_i
,
read_only
unsigned
int
size_j
,
read_only
unsigned
int
src_rows
,
read_only
unsigned
int
src_cols
,
read_only
unsigned
int
dst_rows
,
read_only
unsigned
int
dst_cols
)
{
int
i
=
get_global_id
(
0
);
int
j
=
get_global_id
(
1
);
if
((
i
+
src_offset_i
)
<
src_rows
&&
(
j
+
src_offset_j
)
<
src_cols
&&
(
i
+
dst_offset_i
)
<
dst_rows
&&
(
j
+
dst_offset_j
)
<
dst_cols
)
{
dst
((
dst_offset_i
+
i
),
(
dst_offset_j
+
j
))
=
src
((
src_offset_i
+
i
),
(
src_offset_j
+
j
));
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/sub_block.hpp sub_block() \endlink
*/
const
global_range_kernel
<
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
,
int
>
sub_block
(
"sub_block"
,
sub_block_kernel_code
);
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/sub_block_kernel.cl
deleted
100644 → 0
View file @
44a66ffe
R
"(
#ifndef src
#define src(i,j) src[j * src_rows + i]
#endif
#ifndef dst
#define dst(i,j) dst[j * dst_rows + i]
#endif
/**
* Copies a submatrix of the source matrix to
* the destination matrix. The submatrix to copy
* starts at (0, 0)
* and is of size size_rows x size_cols.
* The submatrix is copied to the
* destination matrix starting at
* (dst_offset_rows, dst_offset_cols)
*
* @param[in] src The source matrix.
* @param[out] dst The destination submatrix.
* @param src_offset_i The offset row in src.
* @param src_offset_j The offset column in src.
* @param dst_offset_i The offset row in dst.
* @param dst_offset_j The offset column in dst.
* @param size_i The number of rows in the submatrix.
* @param size_j The number of columns in the submatrix.
* @param src_rows The number of rows in the source matrix.
* @param src_cols The number of cols in the source matrix.
* @param src_rows The number of rows in the destination matrix.
* @param dst_cols The number of cols in the destination matrix.
*
* @note used in math/gpu/copy_submatrix_opencl.hpp
*/
__kernel void copy_submatrix(__global double *src, __global double *dst,
unsigned int src_offset_i, unsigned int src_offset_j,
unsigned int dst_offset_i, unsigned int dst_offset_j,
unsigned int size_i, unsigned int size_j,
unsigned int src_rows, unsigned int src_cols,
unsigned int dst_rows, unsigned int dst_cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if ((i + src_offset_i) < src_rows && (j + src_offset_j) < src_cols &&
(i + dst_offset_i) < dst_rows && (j + dst_offset_j) < dst_cols) {
dst((dst_offset_i + i), (dst_offset_j + j)) =
src((src_offset_i + i),(src_offset_j + j));
}
};)"
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/subtract.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_SUBTRACT_HPP
#define STAN_MATH_GPU_KERNELS_SUBTRACT_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
subtract_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Matrix subtraction on the GPU Subtracts the second matrix
* from the first matrix and stores the result in the third matrix (C=A-B).
*
* @param[out] C The output matrix.
* @param[in] B RHS input matrix.
* @param[in] A LHS input matrix.
* @param rows The number of rows for matrix A.
* @param cols The number of columns for matrix A.
* @note Code is a <code>const char*</code> held in
* <code>subtract_kernel_code.</code>
* Used in math/gpu/subtract_opencl.hpp
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel
void
subtract
(
__global
write_only
double
*
C
,
__global
read_only
double
*
A
,
__global
read_only
double
*
B
,
read_only
unsigned
int
rows
,
read_only
unsigned
int
cols
)
{
int
i
=
get_global_id
(
0
);
int
j
=
get_global_id
(
1
);
if
(
i
<
rows
&&
j
<
cols
)
{
C
(
i
,
j
)
=
A
(
i
,
j
)
-
B
(
i
,
j
);
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/subtract.hpp subtract() \endlink
*/
const
global_range_kernel
<
cl
::
Buffer
,
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
>
subtract
(
"subtract"
,
subtract_kernel_code
);
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/subtract_matrix_kernel.cl
deleted
100644 → 0
View file @
44a66ffe
R
"(
#ifndef A
#define A(i, j) A[j * rows + i]
#endif
#ifndef B
#define B(i, j) B[j * rows + i]
#endif
#ifndef C
#define C(i, j) C[j * rows + i]
#endif
/**
* Matrix subtraction on the GPU Subtracts the second matrix
* from the first matrix and stores the result in the third matrix (C=A-B).
*
* @param[out] C The output matrix.
* @param[in] B RHS input matrix.
* @param[in] A LHS input matrix.
* @param rows The number of rows for matrix A.
* @param cols The number of columns for matrix A.
*
* @note Used in math/gpu/subtract_opencl.hpp
*/
__kernel void subtract(__global double *C, __global double *A,
__global double *B, unsigned int rows, unsigned int cols) {
int i = get_global_id(0);
int j = get_global_id(1);
if (i < rows && j < cols) {
C(i, j) = A(i, j) - B(i, j);
}
};)"
This diff is collapsed.
Click to expand it.
stan/math/gpu/kernels/transpose.hpp
0 → 100644
View file @
6adb7122
#ifndef STAN_MATH_GPU_KERNELS_TRANSPOSE_HPP
#define STAN_MATH_GPU_KERNELS_TRANSPOSE_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace
stan
{
namespace
math
{
namespace
opencl_kernels
{
// \cond
const
char
*
transpose_kernel_code
=
STRINGIFY
(
// \endcond
/**
* Takes the transpose of the matrix on the GPU.
*
* @param[out] B The output matrix to hold transpose of A.
* @param[in] A The input matrix to transpose into B.
* @param rows The number of rows for A.
* @param cols The number of columns for A.
* @note Code is a <code>const char*</code> held in
* <code>transpose_kernel_code.</code>
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel
void
transpose
(
__global
read_write
double
*
B
,
__global
read_only
double
*
A
,
read_only
unsigned
int
rows
,
read_only
unsigned
int
cols
)
{
int
i
=
get_global_id
(
0
);
int
j
=
get_global_id
(
1
);
if
(
i
<
rows
&&
j
<
cols
)
{
BT
(
j
,
i
)
=
A
(
i
,
j
);
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/transpose.hpp transpose() \endlink
*/
const
global_range_kernel
<
cl
::
Buffer
,
cl
::
Buffer
,
int
,
int
>
transpose
(
"transpose"
,
transpose_kernel_code
);
}
// namespace opencl_kernels
}
// namespace math
}
// namespace stan
#endif
#endif
This diff is collapsed.
Click to expand it.
Prev
1
2
3
4
5
6
…
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Projects
Groups
Snippets
Help