Unverified Commit 9603f93f authored by Steve Bronder

Changed the names of the kernels and added some parts of the documentation Rok posted to the code docs

Showing with 169 additions and 135 deletions
@@ -8,22 +8,27 @@ namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char* lower_tri_inverse_step1_kernel_code = STRINGIFY(
const char* diagonal_inverse_lower_triangular_kernel_code = STRINGIFY(
// \endcond
/**
* Calculates the inverse with no blocking
* Calculates in-place submatrix inversions along the matrix diagonal.
*
* In the special case that the thread block size is larger than the input
* matrix A, this kernel performs the complete lower triangular inverse
* of matrix A. Otherwise, lower triangular inverses are calculated in place
* for submatrices along the diagonal, each the size of the thread block.
*
* @param[in,out] A The input matrix.
* @param[in, out] tmp_inv A temporary matrix for storing partial
* calculations.
* @param rows The number of rows for A.
* @note Code is a <code>const char*</code> held in
* <code>lower_tri_inverse_step1_kernel_code.</code>
* <code>diagonal_inverse_lower_triangular_kernel_code.</code>
* Used in math/gpu/lower_tri_inverse.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void lower_tri_inverse_step1(__global double* A,
__global double* tmp_inv, int rows) {
__kernel void diagonal_inverse_lower_triangular(
__global double* A, __global double* tmp_inv, int rows) {
int index = get_local_id(0);
int group = get_group_id(0);
int block_size = get_local_size(0);
@@ -50,11 +55,14 @@ const char* lower_tri_inverse_step1_kernel_code = STRINGIFY(
// \endcond
/**
* See the docs for \link kernels/lower_tri_inverse_step1.hpp add() \endlink
* See the docs for \link kernels/diagonal_inverse_lower_triangular.hpp add()
* \endlink
*/
const local_range_kernel<cl::Buffer, cl::Buffer, int> lower_tri_inverse_step1(
"lower_tri_inverse_step1", lower_tri_inverse_step1_kernel_code,
{{"THREAD_BLOCK_SIZE", 32}});
const local_range_kernel<cl::Buffer, cl::Buffer, int>
diagonal_inverse_lower_triangular(
"diagonal_inverse_lower_triangular",
diagonal_inverse_lower_triangular_kernel_code,
{{"THREAD_BLOCK_SIZE", 32}});
} // namespace opencl_kernels
} // namespace math
......
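For context, here is a rough host-side sketch of what the diagonal step computes, written with Eigen rather than OpenCL; the block-size handling and padding are simplified assumptions, not the kernel's actual logic.

#include <Eigen/Dense>
#include <algorithm>

// Host-side analogue of diagonal_inverse_lower_triangular: invert each
// block_size x block_size submatrix on the diagonal of a lower triangular
// matrix in place, via forward substitution against the identity.
inline void diagonal_inverse_blocks(Eigen::MatrixXd& A, int block_size) {
  const int n = static_cast<int>(A.rows());
  for (int start = 0; start < n; start += block_size) {
    const int size = std::min(block_size, n - start);
    const Eigen::MatrixXd block = A.block(start, start, size, size);
    A.block(start, start, size, size)
        = block.triangularView<Eigen::Lower>().solve(
              Eigen::MatrixXd::Identity(size, size));
  }
}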
@@ -8,10 +8,20 @@ namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char* lower_tri_inverse_step2_kernel_code = STRINGIFY(
const char* inverse_lower_triangular_multiply_kernel_code = STRINGIFY(
// \endcond
/**
* Calculates the intermediate products in the lower_tri_inverse
* Calculates B = C * A. C is an inverse matrix and A is lower triangular.
*
* This kernel is used in the final iteration of the batched lower
* triangular inversion.
* The full inverse requires calculating the lower left rectangular
* block of the inverse, C3 = -C2*A3*C1, where C2 is the inverse of the
* bottom right lower triangular block, C1 is the inverse of the upper
* left lower triangular block, and A3 is the lower left rectangular
* block of the original lower triangular matrix. This kernel takes the
* output from
* <code>negative_rectangular_lower_triangular_multiply</code> and applies
* the submatrix multiplication to get the final output for C3.
*
* @param[in] A input matrix that is being inverted.
* @param[out] temp output matrix with results of the batched matrix
@@ -19,13 +29,13 @@ const char* lower_tri_inverse_step2_kernel_code = STRINGIFY(
* @param A_rows The number of rows for A.
* @param rows The number of rows in a single matrix of the batch
* @note Code is a <code>const char*</code> held in
* <code>lower_tri_inverse_step2_kernel_code.</code>
* <code>inverse_lower_triangular_multiply_kernel_code.</code>
* Used in math/gpu/lower_tri_inverse.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void lower_tri_inverse_step2(__global double* A,
__global double* temp,
const int A_rows, const int rows) {
__kernel void inverse_lower_triangular_multiply(
__global double* A, __global double* temp, const int A_rows,
const int rows) {
int result_matrix_id = get_global_id(2);
int offset = result_matrix_id * rows * 2;
// thread index inside the thread_block
@@ -100,10 +110,10 @@ const char* lower_tri_inverse_step2_kernel_code = STRINGIFY(
* See the docs for \link kernels/matrix_multiply.hpp add() \endlink
*/
const local_range_kernel<cl::Buffer, cl::Buffer, int, int>
lower_tri_inverse_step2("lower_tri_inverse_step2",
lower_tri_inverse_step2_kernel_code,
{{"THREAD_BLOCK_SIZE", 32},
{"WORK_PER_THREAD", 8}});
inverse_lower_triangular_multiply(
"inverse_lower_triangular_multiply",
inverse_lower_triangular_multiply_kernel_code,
{{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}});
} // namespace opencl_kernels
} // namespace math
......
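The C3 = -C2*A3*C1 identity in the comments above is the standard 2 x 2 block form of the inverse of a lower triangular matrix; as a short reminder (not part of the patch):

A = \begin{pmatrix} A_1 & 0 \\ A_3 & A_2 \end{pmatrix}, \qquad
A^{-1} = \begin{pmatrix} C_1 & 0 \\ C_3 & C_2 \end{pmatrix},

A A^{-1} = I \;\Rightarrow\; A_1 C_1 = I, \quad A_3 C_1 + A_2 C_3 = 0, \quad A_2 C_2 = I,

\text{so } C_1 = A_1^{-1}, \quad C_2 = A_2^{-1}, \quad C_3 = -A_2^{-1} A_3 A_1^{-1} = -C_2 A_3 C_1.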
#ifndef STAN_MATH_GPU_KERNELS_LOWER_TRI_INVERSE_STEP3_HPP
#define STAN_MATH_GPU_KERNELS_LOWER_TRI_INVERSE_STEP3_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char* lower_tri_inverse_step3_kernel_code = STRINGIFY(
// \endcond
/**
* Calculates the products to calculate parts of the lower_tri_inverse
*
* @param[in, out] A Input matrix that is being inverted.
* @param[in] temp Temporary matrix with the intermediate results.
* @param A_rows Number of rows for A.
* @param rows The number of rows in a single matrix of the batch
* @note Code is a <code>const char*</code> held in
* <code>lower_tri_inverse_step3_kernel_code.</code>
* Used in math/gpu/lower_tri_inverse.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void lower_tri_inverse_step3(__global double* A,
const __global double* temp,
const int A_rows, const int rows) {
int result_matrix_id = get_global_id(2);
int offset = result_matrix_id * rows * 2;
// thread index inside the thread_block
const int thread_block_row = get_local_id(0);
const int thread_block_col = get_local_id(1);
// global thread index
const int i = THREAD_BLOCK_SIZE * get_group_id(0) + thread_block_row;
const int j = THREAD_BLOCK_SIZE * get_group_id(1) + thread_block_col;
// local memory
__local double temp_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
__local double C1_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
double acc[WORK_PER_THREAD] = {0};
const int num_tiles = (rows + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE;
// iterate over all tiles
for (int tile_ind = 0; tile_ind < num_tiles; tile_ind++) {
// each thread copies WORK_PER_THREAD values to the local
// memory
for (int w = 0; w < WORK_PER_THREAD; w++) {
const int tiled_i = THREAD_BLOCK_SIZE * tile_ind + thread_block_row;
const int tiled_j = THREAD_BLOCK_SIZE * tile_ind + thread_block_col;
const int temp_global_col = tiled_j + w * THREAD_BLOCK_SIZE_COL;
const int C1_global_col = offset + j + w * THREAD_BLOCK_SIZE_COL;
const int C1_global_row = tiled_i + offset;
const int local_col = thread_block_col + w * THREAD_BLOCK_SIZE_COL;
const int local_row = thread_block_row;
if ((temp_global_col) < rows && i < rows) {
temp_local[local_col][local_row]
= temp[result_matrix_id * rows * rows + temp_global_col * rows
+ i];
} else {
temp_local[local_col][local_row] = 0.0;
}
if (C1_global_col <= C1_global_row) {
C1_local[local_col][local_row]
= A[C1_global_col * A_rows + C1_global_row];
} else {
C1_local[local_col][local_row] = 0;
}
}
// wait until all tile values are loaded to the local memory
barrier(CLK_LOCAL_MEM_FENCE);
for (int block_ind = 0; block_ind < THREAD_BLOCK_SIZE; block_ind++) {
for (int w = 0; w < WORK_PER_THREAD; w++) {
const int local_col = thread_block_col + w * THREAD_BLOCK_SIZE_COL;
const int local_row = thread_block_row;
acc[w] += temp_local[block_ind][local_row]
* C1_local[local_col][block_ind];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// save the values
const int A_global_row = i + rows + offset;
const int A_global_col_offset = offset + j;
for (int w = 0; w < WORK_PER_THREAD; w++) {
const int A_global_col
= A_global_col_offset + w * THREAD_BLOCK_SIZE_COL;
// each thread saves WORK_PER_THREAD values
A[A_global_col * A_rows + i + rows + offset] = -acc[w];
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/matrix_multiply.hpp add() \endlink
*/
const local_range_kernel<cl::Buffer, cl::Buffer, int, int>
lower_tri_inverse_step3("lower_tri_inverse_step3",
lower_tri_inverse_step3_kernel_code,
{{"THREAD_BLOCK_SIZE", 32},
{"WORK_PER_THREAD", 8}});
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
#ifndef STAN_MATH_GPU_KERNELS_LOWER_TRI_INVERSE_STEP3_HPP
#define STAN_MATH_GPU_KERNELS_LOWER_TRI_INVERSE_STEP3_HPP
#ifdef STAN_OPENCL
#include <stan/math/gpu/kernel_cl.hpp>
namespace stan {
namespace math {
namespace opencl_kernels {
// \cond
const char* negative_rectangular_lower_triangular_multiply_kernel_code
= STRINGIFY(
// \endcond
/**
* Calculates C = -B * A where B is rectangular and A is lower
* triangular.
* The full inverse requires calculating the lower left rectangular
* block of the inverse, C3 = -C2*A3*C1, where C2 is the inverse of the
* bottom right lower triangular block, C1 is the inverse of the upper
* left lower triangular block, and A3 is the lower left rectangular
* block of the original lower triangular matrix. This kernel performs
* the submatrix multiplications in parallel and includes optimizations
* that account for the lower triangular structure of the input matrix A.
*
* @param[in, out] A Input matrix that is being inverted.
* @param[in] temp Temporary matrix with the intermediate results.
* @param A_rows Number of rows for A.
* @param rows The number of rows in a single matrix of the batch
* @note Code is a <code>const char*</code> held in
* <code>negative_rectangular_lower_triangular_multiply_kernel_code.</code>
* Used in math/gpu/lower_tri_inverse.hpp.
* This kernel uses the helper macros available in helpers.cl.
*/
__kernel void negative_rectangular_lower_triangular_multiply(
__global double* A, const __global double* temp, const int A_rows,
const int rows) {
int result_matrix_id = get_global_id(2);
int offset = result_matrix_id * rows * 2;
// thread index inside the thread_block
const int thread_block_row = get_local_id(0);
const int thread_block_col = get_local_id(1);
// global thread index
const int i = THREAD_BLOCK_SIZE * get_group_id(0) + thread_block_row;
const int j = THREAD_BLOCK_SIZE * get_group_id(1) + thread_block_col;
// local memory
__local double temp_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
__local double C1_local[THREAD_BLOCK_SIZE][THREAD_BLOCK_SIZE];
double acc[WORK_PER_THREAD] = {0};
const int num_tiles
= (rows + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE;
// iterate over all tiles
for (int tile_ind = 0; tile_ind < num_tiles; tile_ind++) {
// each thread copies WORK_PER_THREAD values to the local
// memory
for (int w = 0; w < WORK_PER_THREAD; w++) {
const int tiled_i
= THREAD_BLOCK_SIZE * tile_ind + thread_block_row;
const int tiled_j
= THREAD_BLOCK_SIZE * tile_ind + thread_block_col;
const int temp_global_col = tiled_j + w * THREAD_BLOCK_SIZE_COL;
const int C1_global_col = offset + j + w * THREAD_BLOCK_SIZE_COL;
const int C1_global_row = tiled_i + offset;
const int local_col
= thread_block_col + w * THREAD_BLOCK_SIZE_COL;
const int local_row = thread_block_row;
if ((temp_global_col) < rows && i < rows) {
temp_local[local_col][local_row]
= temp[result_matrix_id * rows * rows
+ temp_global_col * rows + i];
} else {
temp_local[local_col][local_row] = 0.0;
}
if (C1_global_col <= C1_global_row) {
C1_local[local_col][local_row]
= A[C1_global_col * A_rows + C1_global_row];
} else {
C1_local[local_col][local_row] = 0;
}
}
// wait until all tile values are loaded to the local memory
barrier(CLK_LOCAL_MEM_FENCE);
for (int block_ind = 0; block_ind < THREAD_BLOCK_SIZE;
block_ind++) {
for (int w = 0; w < WORK_PER_THREAD; w++) {
const int local_col
= thread_block_col + w * THREAD_BLOCK_SIZE_COL;
const int local_row = thread_block_row;
acc[w] += temp_local[block_ind][local_row]
* C1_local[local_col][block_ind];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// save the values
const int A_global_row = i + rows + offset;
const int A_global_col_offset = offset + j;
for (int w = 0; w < WORK_PER_THREAD; w++) {
const int A_global_col
= A_global_col_offset + w * THREAD_BLOCK_SIZE_COL;
// each thread saves WORK_PER_THREAD values
A[A_global_col * A_rows + i + rows + offset] = -acc[w];
}
}
// \cond
);
// \endcond
/**
* See the docs for \link kernels/matrix_multiply.hpp add() \endlink
*/
const local_range_kernel<cl::Buffer, cl::Buffer, int, int>
negative_rectangular_lower_triangular_multiply(
"negative_rectangular_lower_triangular_multiply",
negative_rectangular_lower_triangular_multiply_kernel_code,
{{"THREAD_BLOCK_SIZE", 32}, {"WORK_PER_THREAD", 8}});
} // namespace opencl_kernels
} // namespace math
} // namespace stan
#endif
#endif
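A rough host-side sketch of the combined effect of the two multiply kernels on one pair of diagonal blocks, written with Eigen; the exact split of the product between the two kernels is an assumption here, only the combined result C3 = -C2*A3*C1 is documented above.

#include <Eigen/Dense>

// Given the already inverted diagonal blocks C1 and C2 and the original
// off-diagonal block A3, compute the off-diagonal block of the inverse.
inline Eigen::MatrixXd off_diagonal_inverse_block(const Eigen::MatrixXd& C2,
                                                  const Eigen::MatrixXd& A3,
                                                  const Eigen::MatrixXd& C1) {
  // one kernel is assumed to write an intermediate product into a temporary
  // buffer; the other multiplies by the remaining factor and negates
  const Eigen::MatrixXd temp = A3 * C1;
  return -(C2 * temp);  // C3 = -C2 * A3 * C1
}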
@@ -3,9 +3,9 @@
#ifdef STAN_OPENCL
#include <stan/math/gpu/matrix_gpu.hpp>
#include <stan/math/gpu/kernels/lower_tri_inverse_step1.hpp>
#include <stan/math/gpu/kernels/lower_tri_inverse_step2.hpp>
#include <stan/math/gpu/kernels/lower_tri_inverse_step3.hpp>
#include <stan/math/gpu/kernels/diagonal_inverse_lower_triangular.hpp>
#include <stan/math/gpu/kernels/inverse_lower_triangular_multiply.hpp>
#include <stan/math/gpu/kernels/negative_rectangular_lower_triangular_multiply.hpp>
#include <stan/math/gpu/identity.hpp>
#include <stan/math/gpu/err/check_square.hpp>
@@ -16,7 +16,6 @@ namespace stan {
namespace math {
/**
* Computes the inverse of the lower triangular matrix
* that resides in the GPU global memory
*
* @param A matrix on the GPU
*
@@ -63,8 +62,9 @@ inline matrix_gpu lower_triangular_inverse(const matrix_gpu& A) {
inv_padded.zeros<stan::math::TriangularViewGPU::Entire>();
int work_per_thread
= opencl_kernels::lower_tri_inverse_step2.make_functor.get_opts().at(
"WORK_PER_THREAD");
= opencl_kernels::inverse_lower_triangular_multiply.make_functor
.get_opts()
.at("WORK_PER_THREAD");
// the number of blocks in the first step
// each block is inverted using regular forward substitution
int parts = inv_padded.rows() / thread_block_size_1D;
@@ -75,7 +75,7 @@ inline matrix_gpu lower_triangular_inverse(const matrix_gpu& A) {
cl::NDRange(parts, thread_block_size_1D, thread_block_size_1D),
temp.buffer(), thread_block_size_1D, temp.size());
// spawn parts thread blocks, each responsible for one block
opencl_kernels::lower_tri_inverse_step1(
opencl_kernels::diagonal_inverse_lower_triangular(
cl::NDRange(parts * thread_block_size_1D),
cl::NDRange(thread_block_size_1D), inv_padded.buffer(), temp.buffer(),
inv_padded.rows());
@@ -106,10 +106,10 @@ inline matrix_gpu lower_triangular_inverse(const matrix_gpu& A) {
auto result_work_dim = result_matrix_dim / work_per_thread;
auto result_ndrange
= cl::NDRange(result_matrix_dim_x, result_work_dim, parts);
opencl_kernels::lower_tri_inverse_step2(
opencl_kernels::inverse_lower_triangular_multiply(
result_ndrange, ndrange_2d, inv_padded.buffer(), temp.buffer(),
inv_padded.rows(), result_matrix_dim);
opencl_kernels::lower_tri_inverse_step3(
opencl_kernels::negative_rectangular_lower_triangular_multiply(
result_ndrange, ndrange_2d, inv_padded.buffer(), temp.buffer(),
inv_padded.rows(), result_matrix_dim);
// if this is the last submatrix, end
......
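As a small worked example of the launch arithmetic above (the values are made up, and the local range is an assumption based on the visible call sites, not the exact host code): with THREAD_BLOCK_SIZE = 32 and WORK_PER_THREAD = 8, each thread accumulates eight result columns, so the second global dimension shrinks by a factor of eight.

#include <iostream>

int main() {
  const int thread_block_size = 32;   // THREAD_BLOCK_SIZE kernel option
  const int work_per_thread = 8;      // WORK_PER_THREAD kernel option
  const int result_matrix_dim = 256;  // hypothetical size of one submatrix
  const int parts = 4;                // hypothetical number of submatrices

  // the second global dimension is divided by WORK_PER_THREAD
  const int result_work_dim = result_matrix_dim / work_per_thread;  // 32
  std::cout << "global NDRange: (" << result_matrix_dim << ", "
            << result_work_dim << ", " << parts << ")\n";
  // assumed local range: one thread block, with the second dimension
  // also divided by WORK_PER_THREAD
  std::cout << "local NDRange:  (" << thread_block_size << ", "
            << thread_block_size / work_per_thread << ")\n";
  return 0;
}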