Stan Math Library  2.20.0
reverse mode automatic differentiation
opencl_context.hpp
Go to the documentation of this file.
1 #ifndef STAN_MATH_OPENCL_OPENCL_CONTEXT_HPP
2 #define STAN_MATH_OPENCL_OPENCL_CONTEXT_HPP
3 #ifdef STAN_OPENCL
4 
5 #define DEVICE_FILTER CL_DEVICE_TYPE_ALL
6 #ifndef OPENCL_DEVICE_ID
7 #error OPENCL_DEVICE_ID_NOT_SET
8 #endif
9 #ifndef OPENCL_PLATFORM_ID
10 #error OPENCL_PLATFORM_ID_NOT_SET
11 #endif
12 
16 
#include <CL/cl.hpp>
#include <cerrno>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
33 namespace stan {
34 namespace math {
35 namespace opencl {
44 template <int N>
45 inline cl::size_t<N> to_size_t(const size_t (&values)[N]) {
46  throw std::domain_error("cl::size_t<N> is not supported for N != 3");
47 }
48 
56 template <>
57 inline cl::size_t<3> to_size_t(const size_t (&values)[3]) {
58  cl::size_t<3> s;
59  for (size_t i = 0; i < 3; i++)
60  s[i] = values[i];
61  return s;
62 }
63 } // namespace opencl
82  friend class opencl_context;
83 
84  private:
101  try {
102  // platform
103  cl::Platform::get(&platforms_);
104  if (OPENCL_PLATFORM_ID >= platforms_.size()) {
105  system_error("OpenCL Initialization", "[Platform]", -1,
106  "CL_INVALID_PLATFORM");
107  }
108  platform_ = platforms_[OPENCL_PLATFORM_ID];
109  platform_name_ = platform_.getInfo<CL_PLATFORM_NAME>();
110  platform_.getDevices(DEVICE_FILTER, &devices_);
111  if (devices_.size() == 0) {
112  system_error("OpenCL Initialization", "[Device]", -1,
113  "CL_DEVICE_NOT_FOUND");
114  }
115  if (OPENCL_DEVICE_ID >= devices_.size()) {
116  system_error("OpenCL Initialization", "[Device]", -1,
117  "CL_INVALID_DEVICE");
118  }
119  device_ = devices_[OPENCL_DEVICE_ID];
120  // context and queue
121  cl_command_queue_properties device_properties;
122  device_.getInfo<cl_command_queue_properties>(CL_DEVICE_QUEUE_PROPERTIES,
123  &device_properties);
124  device_.getInfo<size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE,
125  &max_thread_block_size_);
126 
127  context_ = cl::Context(device_);
128  if (device_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
129  command_queue_ = cl::CommandQueue(
130  context_, device_, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, nullptr);
131  } else {
132  command_queue_ = cl::CommandQueue(context_, device_, 0, nullptr);
133  }
134  int thread_block_size_sqrt
135  = static_cast<int>(sqrt(static_cast<double>(max_thread_block_size_)));
136  // Does a compile time check of the maximum allowed
137  // dimension of a square thread block size
138  // WG size of (32,32) works on all recent GPUs but would fail on some
139  // older integrated GPUs or CPUs
140  if (thread_block_size_sqrt < base_opts_["THREAD_BLOCK_SIZE"]) {
141  base_opts_["THREAD_BLOCK_SIZE"] = thread_block_size_sqrt;
142  base_opts_["WORK_PER_THREAD"] = 1;
143  }
144  if (max_thread_block_size_ < base_opts_["LOCAL_SIZE_"]) {
145  // must be a power of base_opts_["REDUCTION_STEP_SIZE"]
146  const int p = std::log(max_thread_block_size_)
147  / std::log(base_opts_["REDUCTION_STEP_SIZE"]);
148  base_opts_["LOCAL_SIZE_"]
149  = std::pow(base_opts_["REDUCTION_STEP_SIZE"], p);
150  }
151  // Thread block size for the Cholesky
152  // TODO(Steve): This should be tuned in a higher part of the stan language
153  if (max_thread_block_size_ >= 256) {
154  tuning_opts_.cholesky_min_L11_size = 256;
155  } else {
156  tuning_opts_.cholesky_min_L11_size = max_thread_block_size_;
157  }
158  } catch (const cl::Error& e) {
159  check_opencl_error("opencl_context", e);
160  }
161  }
162 
163  protected:
164  cl::Context context_; // Manages the the device, queue, platform, memory,etc.
165  cl::CommandQueue command_queue_; // job queue for device, one per device
166  std::vector<cl::Platform> platforms_; // Vector of available platforms
167  cl::Platform platform_; // The platform for compiling kernels
168  std::string platform_name_; // The platform such as NVIDIA OpenCL or AMD SDK
169  std::vector<cl::Device> devices_; // All available OpenCL devices
170  cl::Device device_; // The selected OpenCL device
171  std::string device_name_; // The name of OpenCL device
172  size_t max_thread_block_size_; // The maximum size of a block of workers on
173  // the device
174 
175  // Holds Default parameter values for each Kernel.
176  typedef std::map<const char*, int> map_base_opts;
177  map_base_opts base_opts_
178  = {{"LOWER", static_cast<int>(TriangularViewCL::Lower)},
179  {"UPPER", static_cast<int>(TriangularViewCL::Upper)},
180  {"ENTIRE", static_cast<int>(TriangularViewCL::Entire)},
181  {"UPPER_TO_LOWER", static_cast<int>(TriangularMapCL::UpperToLower)},
182  {"LOWER_TO_UPPER", static_cast<int>(TriangularMapCL::LowerToUpper)},
183  {"THREAD_BLOCK_SIZE", 32},
184  {"WORK_PER_THREAD", 8},
185  {"REDUCTION_STEP_SIZE", 4},
186  {"LOCAL_SIZE_", 64}};
187  // TODO(Steve): Make these tunable during warmup
188  struct tuning_struct {
189  // Used in math/opencl/cholesky_decompose
190  int cholesky_min_L11_size = 256;
191  int cholesky_partition = 4;
192  int cholesky_size_worth_transfer = 1250;
193  // Used in math/rev/mat/fun/cholesky_decompose
194  int cholesky_rev_min_block_size = 512;
195  int cholesky_rev_block_partition = 8;
196  // used in math/opencl/multiply
197  int multiply_split_upper_limit = 2000000;
198  // used in math/prim/mat/fun/mdivide_left_tri
199  // and math/rev/mat/fun/mdivide_left_tri
200  int tri_inverse_size_worth_transfer = 100;
201  } tuning_opts_;
202 
204  static opencl_context_base instance_;
205  return instance_;
206  }
207 
208  opencl_context_base(opencl_context_base const&) = delete;
209  void operator=(opencl_context_base const&) = delete;
210 };
211 
216  public:
217  opencl_context() = default;
218 
224  inline std::string description() const {
225  std::ostringstream msg;
226 
227  msg << "Platform ID: " << OPENCL_DEVICE_ID << "\n";
228  msg << "Platform Name: "
230  .platform_.getInfo<CL_PLATFORM_NAME>()
231  << "\n";
232  msg << "Platform Vendor: "
234  .platform_.getInfo<CL_PLATFORM_VENDOR>()
235  << "\n";
236  msg << "\tDevice " << OPENCL_DEVICE_ID << ": "
237  << "\n";
238  msg << "\t\tDevice Name: "
239  << opencl_context_base::getInstance().device_.getInfo<CL_DEVICE_NAME>()
240  << "\n";
241  msg << "\t\tDevice Type: "
242  << opencl_context_base::getInstance().device_.getInfo<CL_DEVICE_TYPE>()
243  << "\n";
244  msg << "\t\tDevice Vendor: "
246  .device_.getInfo<CL_DEVICE_VENDOR>()
247  << "\n";
248  msg << "\t\tDevice Max Compute Units: "
250  .device_.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()
251  << "\n";
252  msg << "\t\tDevice Global Memory: "
254  .device_.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()
255  << "\n";
256  msg << "\t\tDevice Max Clock Frequency: "
258  .device_.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>()
259  << "\n";
260  msg << "\t\tDevice Max Allocateable Memory: "
262  .device_.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>()
263  << "\n";
264  msg << "\t\tDevice Local Memory: "
266  .device_.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()
267  << "\n";
268  msg << "\t\tDevice Available: "
270  .device_.getInfo<CL_DEVICE_AVAILABLE>()
271  << "\n";
272  return msg.str();
273  }
274 
280  inline std::string capabilities() const {
281  std::vector<cl::Platform> all_platforms;
282  cl::Platform::get(&all_platforms);
283  std::ostringstream msg;
284  int platform_id = 0;
285  int device_id = 0;
286 
287  msg << "Number of Platforms: " << all_platforms.size() << "\n";
288  for (auto plat_iter : all_platforms) {
289  cl::Platform platform(plat_iter);
290 
291  msg << "Platform ID: " << platform_id++ << "\n";
292  msg << "Platform Name: " << platform.getInfo<CL_PLATFORM_NAME>() << "\n";
293  msg << "Platform Vendor: " << platform.getInfo<CL_PLATFORM_VENDOR>()
294  << "\n";
295 
296  try {
297  std::vector<cl::Device> all_devices;
298  platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
299 
300  for (auto device_iter : all_devices) {
301  cl::Device device(device_iter);
302 
303  msg << "\tDevice " << device_id++ << ": "
304  << "\n";
305  msg << "\t\tDevice Name: " << device.getInfo<CL_DEVICE_NAME>()
306  << "\n";
307  msg << "\t\tDevice Type: " << device.getInfo<CL_DEVICE_TYPE>()
308  << "\n";
309  msg << "\t\tDevice Vendor: " << device.getInfo<CL_DEVICE_VENDOR>()
310  << "\n";
311  msg << "\t\tDevice Max Compute Units: "
312  << device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>() << "\n";
313  msg << "\t\tDevice Global Memory: "
314  << device.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>() << "\n";
315  msg << "\t\tDevice Max Clock Frequency: "
316  << device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>() << "\n";
317  msg << "\t\tDevice Max Allocateable Memory: "
318  << device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>() << "\n";
319  msg << "\t\tDevice Local Memory: "
320  << device.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() << "\n";
321  msg << "\t\tDevice Available: "
322  << device.getInfo<CL_DEVICE_AVAILABLE>() << "\n";
323  }
324  } catch (const cl::Error& e) {
325  // if one of the platforms have no devices that match the device type
326  // it will throw the error == -1 (DEVICE_NOT_FOUND)
327  // other errors will throw a system error
328  if (e.err() == -1) {
329  msg << "\tno (OpenCL) devices in the platform with ID " << platform_id
330  << "\n";
331  } else {
332  check_opencl_error("capabilities", e);
333  }
334  }
335  }
336  return msg.str();
337  }
338 
345  inline cl::Context& context() {
347  }
353  inline cl::CommandQueue& queue() {
355  }
361  }
369  inline int max_thread_block_size() {
371  }
372 
378  }
379 
383  inline std::vector<cl::Device> device() {
385  }
386 
390  inline std::vector<cl::Platform> platform() {
392  }
393 };
395 } // namespace math
396 } // namespace stan
397 
398 #endif
399 #endif
#define DEVICE_FILTER
The opencl_context_base class represents an OpenCL context in the standard Meyers singleton design pattern.
std::vector< cl::Device > device()
Returns a vector containing the OpenCL device used to create the context.
std::vector< cl::Platform > platforms_
std::string capabilities() const
Returns the description of the OpenCL platforms and devices that are available.
fvar< T > sqrt(const fvar< T > &x)
Definition: sqrt.hpp:13
std::string description() const
Returns the description of the OpenCL platform and device that is used.
void system_error(const char *function, const char *name, const int &y, const char *msg1, const char *msg2)
Throw a system error with a consistently formatted message.
std::map< const char *, int > map_base_opts
static opencl_context_base & getInstance()
fvar< T > log(const fvar< T > &x)
Definition: log.hpp:12
The API to access the methods and values in opencl_context_base.
opencl_context_base::tuning_struct & tuning_opts()
Returns the thread block size for the Cholesky Decompositions L_11.
std::vector< cl::Platform > platform()
Returns a vector containing the OpenCL platform used to create the context.
checking OpenCL error numbers
cl::Context & context()
Returns the reference to the OpenCL context.
T get(const std::vector< T > &x, size_t n)
Returns the n-th element of the provided std::vector.
Definition: get.hpp:16
cl::size_t< N > to_size_t(const size_t(&values)[N])
A helper function to convert an array to a cl::size_t<N>.
void domain_error(const char *function, const char *name, const T &y, const char *msg1, const char *msg2)
Throw a domain error with a consistently formatted message.
opencl_context_base::map_base_opts base_opts()
Returns a copy of the map of kernel defines.
double e()
Return the base of the natural logarithm.
Definition: constants.hpp:87
int max_thread_block_size()
Returns the maximum thread block size defined by CL_DEVICE_MAX_WORK_GROUP_SIZE for the device in the ...
static opencl_context opencl_context
struct stan::math::opencl_context_base::tuning_struct tuning_opts_
fvar< T > pow(const fvar< T > &x1, const fvar< T > &x2)
Definition: pow.hpp:16
std::vector< cl::Device > devices_
void check_opencl_error(const char *function, const cl::Error &e)
Throws a domain error specifying the OpenCL error that occurred.
cl::CommandQueue & queue()
Returns the reference to the active OpenCL command queue for the device.

     [ Stan Home Page ] © 2011–2018, Stan Development Team.