Merge pull request #1066 from stan-dev/feature/issue-1062-ode-speedup

Feature/issue 1062 ode speedup

Merge pull request #1066 from stan-dev/feature/issue-1062-ode-speedup
Feature/issue 1062 ode speedup
0784a82f · wds15 · GitHub · d3254859 · fd7846f3 · 0784a82f
Unverified Commit 0784a82f authored 6 years ago by wds15 Committed by GitHub 6 years ago
Hide whitespace changes
Inline Side-by-side

Showing

with 94 additions and 32 deletions
+94 -32
--- a/stan/math/rev/arr/functor/coupled_ode_system.hpp
+++ b/stan/math/rev/arr/functor/coupled_ode_system.hpp
@@ -37,6 +37,20 @@ namespace math {
 * to the second base system equation, and so on through the last base
 * system equation.
 *
+ * <p>Note: Calculating the sensitivity system requires the Jacobian
+ * of the base ODE RHS wrt to the parameters theta. The parameter
+ * vector theta is constant for successive calls to the exposed
+ * operator(). For this reason, the parameter vector theta is copied
+ * upon construction onto the nochain var autodiff tape which is used
+ * in the the nested autodiff performed in the operator() of this
+ * adaptor. Doing so reduces the size of the nested autodiff and
+ * speeds up autodiff. As a side effect, the parameter vector theta
+ * will remain on the nochain autodiff part of the autodiff tape being
+ * in use even after destruction of the given instance. Moreover, the
+ * adjoint zeroing for the nested system does not cover the theta
+ * parameter vector part of the nochain autodiff tape and is therefore
+ * set to zero using a dedicated loop.
+ *
 * @tparam F base ode system functor. Must provide
 *   <code>operator()(double t, std::vector<double> y, std::vector<var> theta,
 *          std::vector<double> x, std::vector<int>x_int, std::ostream*
@@ -47,7 +61,7 @@ struct coupled_ode_system<F, double, var> {
  const F& f_;
  const std::vector<double>& y0_dbl_;
  const std::vector<var>& theta_;
-  const std::vector<double> theta_dbl_;
+  std::vector<var> theta_nochain_;
  const std::vector<double>& x_;
  const std::vector<int>& x_int_;
  const size_t N_;
@@ -74,13 +88,15 @@ struct coupled_ode_system<F, double, var> {
      : f_(f),
        y0_dbl_(y0),
        theta_(theta),
-        theta_dbl_(value_of(theta)),
        x_(x),
        x_int_(x_int),
        N_(y0.size()),
        M_(theta.size()),
        size_(N_ + N_ * M_),
-        msgs_(msgs) {}
+        msgs_(msgs) {
+    for (const var& p : theta)
+      theta_nochain_.emplace_back(var(new vari(p.val(), false)));
+  }

  /**
   * Calculates the derivative of the coupled ode system with respect
@@ -103,11 +119,9 @@ struct coupled_ode_system<F, double, var> {
    try {
      start_nested();

-      vector<var> y_vars(z.begin(), z.begin() + N_);
-
-      vector<var> theta_vars(theta_dbl_.begin(), theta_dbl_.end());
+      const vector<var> y_vars(z.begin(), z.begin() + N_);

-      vector<var> dy_dt_vars = f_(t, y_vars, theta_vars, x_, x_int_, msgs_);
+      vector<var> dy_dt_vars = f_(t, y_vars, theta_nochain_, x_, x_int_, msgs_);

      check_size_match("coupled_ode_system", "dz_dt", dy_dt_vars.size(),
                       "states", N_);
@@ -120,7 +134,7 @@ struct coupled_ode_system<F, double, var> {
          // orders derivatives by equation (i.e. if there are 2 eqns
          // (y1, y2) and 2 parameters (a, b), dy_dt will be ordered as:
          // dy1_dt, dy2_dt, dy1_da, dy2_da, dy1_db, dy2_db
-          double temp_deriv = theta_vars[j].adj();
+          double temp_deriv = theta_nochain_[j].adj();
          const size_t offset = N_ + N_ * j;
          for (size_t k = 0; k < N_; k++)
            temp_deriv += z[offset + k] * y_vars[k].adj();
@@ -129,6 +143,12 @@ struct coupled_ode_system<F, double, var> {
        }

        set_zero_all_adjoints_nested();
+        // Parameters stored on the outer (non-nested) nochain stack are not
+        // reset to zero by the last call. This is done as a separate step here.
+        // See efficiency note above on template specalization for more details
+        // on this.
+        for (size_t j = 0; j < M_; ++j)
+          theta_nochain_[j].vi_->set_zero_adjoint();
      }
    } catch (const std::exception& e) {
      recover_memory_nested();
@@ -226,7 +246,6 @@ template <typename F>
 struct coupled_ode_system<F, var, double> {
  const F& f_;
  const std::vector<var>& y0_;
-  const std::vector<double> y0_dbl_;
  const std::vector<double>& theta_dbl_;
  const std::vector<double>& x_;
  const std::vector<int>& x_int_;
@@ -253,7 +272,6 @@ struct coupled_ode_system<F, var, double> {
                     const std::vector<int>& x_int, std::ostream* msgs)
      : f_(f),
        y0_(y0),
-        y0_dbl_(value_of(y0)),
        theta_dbl_(theta),
        x_(x),
        x_int_(x_int),
@@ -283,7 +301,7 @@ struct coupled_ode_system<F, var, double> {
    try {
      start_nested();

-      vector<var> y_vars(z.begin(), z.begin() + N_);
+      const vector<var> y_vars(z.begin(), z.begin() + N_);

      vector<var> dy_dt_vars = f_(t, y_vars, theta_dbl_, x_, x_int_, msgs_);

@@ -339,7 +357,7 @@ struct coupled_ode_system<F, var, double> {
  std::vector<double> initial_state() const {
    std::vector<double> initial(size_, 0.0);
    for (size_t i = 0; i < N_; i++)
-      initial[i] = y0_dbl_[i];
+      initial[i] = value_of(y0_[i]);
    for (size_t i = 0; i < N_; i++)
      initial[N_ + i * N_ + i] = 1.0;
    return initial;
@@ -406,6 +424,20 @@ struct coupled_ode_system<F, var, double> {
 * parameters with respect to the second base system equation, and
 * so on through the last base system equation.
 *
+ * <p>Note: Calculating the sensitivity system requires the Jacobian
+ * of the base ODE RHS wrt to the parameters theta. The parameter
+ * vector theta is constant for successive calls to the exposed
+ * operator(). For this reason, the parameter vector theta is copied
+ * upon construction onto the nochain var autodiff tape which is used
+ * in the the nested autodiff performed in the operator() of this
+ * adaptor. Doing so reduces the size of the nested autodiff and
+ * speeds up autodiff. As a side effect, the parameter vector theta
+ * will remain on the nochain autodiff part of the autodiff tape being
+ * in use even after destruction of the given instance. Moreover, the
+ * adjoint zeroing for the nested system does not cover the theta
+ * parameter vector part of the nochain autodiff tape and is therefore
+ * set to zero using a dedicated loop.
+ *
 * @tparam F base ode system functor. Must provide
 *   <code>operator()(double t, std::vector<var> y, std::vector<var> theta,
 *          std::vector<double> x, std::vector<int>x_int, std::ostream*
@@ -415,9 +447,8 @@ template <typename F>
 struct coupled_ode_system<F, var, var> {
  const F& f_;
  const std::vector<var>& y0_;
-  const std::vector<double> y0_dbl_;
  const std::vector<var>& theta_;
-  const std::vector<double> theta_dbl_;
+  std::vector<var> theta_nochain_;
  const std::vector<double>& x_;
  const std::vector<int>& x_int_;
  const size_t N_;
@@ -443,15 +474,16 @@ struct coupled_ode_system<F, var, var> {
                     const std::vector<int>& x_int, std::ostream* msgs)
      : f_(f),
        y0_(y0),
-        y0_dbl_(value_of(y0)),
        theta_(theta),
-        theta_dbl_(value_of(theta)),
        x_(x),
        x_int_(x_int),
        N_(y0.size()),
        M_(theta.size()),
        size_(N_ + N_ * (N_ + M_)),
-        msgs_(msgs) {}
+        msgs_(msgs) {
+    for (const var& p : theta)
+      theta_nochain_.emplace_back(var(new vari(p.val(), false)));
+  }

  /**
   * Calculates the derivative of the coupled ode system with respect
@@ -474,11 +506,9 @@ struct coupled_ode_system<F, var, var> {
    try {
      start_nested();

-      vector<var> y_vars(z.begin(), z.begin() + N_);
-
-      vector<var> theta_vars(theta_dbl_.begin(), theta_dbl_.end());
+      const vector<var> y_vars(z.begin(), z.begin() + N_);

-      vector<var> dy_dt_vars = f_(t, y_vars, theta_vars, x_, x_int_, msgs_);
+      vector<var> dy_dt_vars = f_(t, y_vars, theta_nochain_, x_, x_int_, msgs_);

      check_size_match("coupled_ode_system", "dz_dt", dy_dt_vars.size(),
                       "states", N_);
@@ -500,7 +530,7 @@ struct coupled_ode_system<F, var, var> {
        }

        for (size_t j = 0; j < M_; j++) {
-          double temp_deriv = theta_vars[j].adj();
+          double temp_deriv = theta_nochain_[j].adj();
          const size_t offset = N_ + N_ * N_ + N_ * j;
          for (size_t k = 0; k < N_; k++)
            temp_deriv += z[offset + k] * y_vars[k].adj();
@@ -509,6 +539,12 @@ struct coupled_ode_system<F, var, var> {
        }

        set_zero_all_adjoints_nested();
+        // Parameters stored on the outer (non-nested) nochain stack are not
+        // reset to zero by the last call. This is done as a separate step here.
+        // See efficiency note above on template specalization for more details
+        // on this.
+        for (size_t j = 0; j < M_; ++j)
+          theta_nochain_[j].vi_->set_zero_adjoint();
      }
    } catch (const std::exception& e) {
      recover_memory_nested();
@@ -545,7 +581,7 @@ struct coupled_ode_system<F, var, var> {
  std::vector<double> initial_state() const {
    std::vector<double> initial(size_, 0.0);
    for (size_t i = 0; i < N_; i++)
-      initial[i] = y0_dbl_[i];
+      initial[i] = value_of(y0_[i]);
    for (size_t i = 0; i < N_; i++)
      initial[N_ + i * N_ + i] = 1.0;
    return initial;

--- a/stan/math/rev/core/var.hpp
+++ b/stan/math/rev/core/var.hpp
@@ -21,7 +21,7 @@ static void grad(vari* vi);
 * Independent (input) and dependent (output) variables for gradients.
 *
 * This class acts as a smart pointer, with resources managed by
- * an agenda-based memory manager scoped to a single gradient
+ * an arena-based memory manager scoped to a single gradient
 * calculation.
 *
 * An var is constructed with a double and used like any

--- a/stan/math/rev/mat/functor/cvodes_ode_data.hpp
+++ b/stan/math/rev/mat/functor/cvodes_ode_data.hpp
@@ -148,11 +148,11 @@ class cvodes_ode_data {
   */
  inline void rhs(double t, const double y[], double dy_dt[]) const {
    const std::vector<double> y_vec(y, y + N_);
-    const std::vector<double> dy_dt_vec
+    const std::vector<double>& dy_dt_vec
        = f_(t, y_vec, theta_dbl_, x_, x_int_, msgs_);
    check_size_match("cvodes_ode_data", "dz_dt", dy_dt_vec.size(), "states",
                     N_);
-    std::copy(dy_dt_vec.begin(), dy_dt_vec.end(), dy_dt);
+    std::move(dy_dt_vec.begin(), dy_dt_vec.end(), dy_dt);
  }

  /**
@@ -163,14 +163,13 @@ class cvodes_ode_data {
   * y to be the initial of the coupled ode system.
   */
  inline int jacobian_states(double t, const double y[], SUNMatrix J) const {
-    const std::vector<double> y_vec(y, y + N_);
    start_nested();
-    std::vector<var> y_vec_var(y_vec.begin(), y_vec.end());
+    const std::vector<var> y_vec_var(y, y + N_);
    coupled_ode_system<F, var, double> ode_jacobian(f_, y_vec_var, theta_dbl_,
                                                    x_, x_int_, msgs_);
-    std::vector<double> jacobian_y(ode_jacobian.size(), 0);
+    std::vector<double>&& jacobian_y = std::vector<double>(ode_jacobian.size());
    ode_jacobian(ode_jacobian.initial_state(), jacobian_y, t);
-    std::copy(jacobian_y.begin() + N_, jacobian_y.end(), SM_DATA_D(J));
+    std::move(jacobian_y.begin() + N_, jacobian_y.end(), SM_DATA_D(J));
    recover_memory_nested();
    return 0;
  }
@@ -184,14 +183,14 @@ class cvodes_ode_data {
  inline void rhs_sens(double t, const double y[], N_Vector* yS,
                       N_Vector* ySdot) const {
    std::vector<double> z(coupled_state_.size());
-    std::vector<double> dz_dt(coupled_state_.size());
+    std::vector<double>&& dz_dt = std::vector<double>(coupled_state_.size());
    std::copy(y, y + N_, z.begin());
    for (std::size_t s = 0; s < S_; s++)
      std::copy(NV_DATA_S(yS[s]), NV_DATA_S(yS[s]) + N_,
                z.begin() + (s + 1) * N_);
    coupled_ode_(z, dz_dt, t);
    for (std::size_t s = 0; s < S_; s++)
-      std::copy(dz_dt.begin() + (s + 1) * N_, dz_dt.begin() + (s + 2) * N_,
+      std::move(dz_dt.begin() + (s + 1) * N_, dz_dt.begin() + (s + 2) * N_,
                NV_DATA_S(ySdot[s]));
  }
 };

--- a/test/unit/math/rev/arr/functor/coupled_ode_system_test.cpp
+++ b/test/unit/math/rev/arr/functor/coupled_ode_system_test.cpp
@@ -18,6 +18,8 @@ struct StanAgradRevOde : public ::testing::Test {
 TEST_F(StanAgradRevOde, coupled_ode_system_dv) {
  using stan::math::coupled_ode_system;

+  stan::math::start_nested();
+
  harm_osc_ode_fun harm_osc;

  std::vector<stan::math::var> theta;
@@ -39,15 +41,22 @@ TEST_F(StanAgradRevOde, coupled_ode_system_dv) {
  z0.push_back(1.0);
  z0.push_back(2.0);

+  std::size_t stack_size = stan::math::nested_size();
+
  coupled_ode_system<harm_osc_ode_fun, double, stan::math::var> system(
      harm_osc, y0, theta, x, x_int, &msgs);

+  EXPECT_EQ(stack_size, stan::math::nested_size())
+      << "expecting no new things on the stack";
+
  system(z0, dz_dt, t0);

  EXPECT_FLOAT_EQ(0.5, dz_dt[0]);
  EXPECT_FLOAT_EQ(-1.075, dz_dt[1]);
  EXPECT_FLOAT_EQ(2, dz_dt[2]);
  EXPECT_FLOAT_EQ(-1.8, dz_dt[3]);
+
+  stan::math::recover_memory_nested();
 }
 TEST_F(StanAgradRevOde, decouple_states_dv) {
  using stan::math::coupled_ode_system;
@@ -193,6 +202,8 @@ TEST_F(StanAgradRevOde, memory_recovery_exception_dv) {
 TEST_F(StanAgradRevOde, coupled_ode_system_vd) {
  using stan::math::coupled_ode_system;

+  stan::math::start_nested();
+
  harm_osc_ode_fun harm_osc;

  std::vector<double> theta;
@@ -219,9 +230,14 @@ TEST_F(StanAgradRevOde, coupled_ode_system_vd) {
  y0_var.push_back(1.0);
  y0_var.push_back(0.5);

+  std::size_t stack_size = stan::math::nested_size();
+
  coupled_ode_system<harm_osc_ode_fun, stan::math::var, double> system(
      harm_osc, y0_var, theta, x, x_int, &msgs);

+  EXPECT_EQ(stack_size, stan::math::nested_size())
+      << "expecting no new things on the stack";
+
  system(z0, dz_dt, t0);

  EXPECT_FLOAT_EQ(0.5, dz_dt[0]);
@@ -230,6 +246,8 @@ TEST_F(StanAgradRevOde, coupled_ode_system_vd) {
  EXPECT_FLOAT_EQ(-1.0 * 1.0 - 0.15 * 0.0, dz_dt[3]);
  EXPECT_FLOAT_EQ(0.0 * 0.0 + 1.0 * 1.0, dz_dt[4]);
  EXPECT_FLOAT_EQ(-1.0 * 0.0 - 0.15 * 1.0, dz_dt[5]);
+
+  stan::math::recover_memory_nested();
 }
 TEST_F(StanAgradRevOde, decouple_states_vd) {
  using stan::math::coupled_ode_system;
@@ -374,6 +392,7 @@ TEST_F(StanAgradRevOde, memory_recovery_exception_vd) {
 TEST_F(StanAgradRevOde, coupled_ode_system_vv) {
  using stan::math::coupled_ode_system;

+  stan::math::start_nested();
  const size_t N = 2;
  const size_t M = 1;
  const size_t z_size = N + N * N + N * M;
@@ -386,9 +405,15 @@ TEST_F(StanAgradRevOde, coupled_ode_system_vv) {
  theta_var.push_back(0.15);

  harm_osc_ode_fun harm_osc;
+
+  std::size_t stack_size = stan::math::nested_size();
+
  coupled_ode_system<harm_osc_ode_fun, stan::math::var, stan::math::var> system(
      harm_osc, y0_var, theta_var, x, x_int, &msgs);

+  EXPECT_EQ(stack_size, stan::math::nested_size())
+      << "expecting no new things on the stack";
+
  std::vector<double> z0(z_size, 0);
  z0[0] = 1.0;
  z0[1] = 0.5;
@@ -419,6 +444,8 @@ TEST_F(StanAgradRevOde, coupled_ode_system_vv) {
  EXPECT_FLOAT_EQ(-0.15, dz_dt[5]);
  EXPECT_FLOAT_EQ(0, dz_dt[6]);
  EXPECT_FLOAT_EQ(-0.5, dz_dt[7]);
+
+  stan::math::recover_memory_nested();
 }
 TEST_F(StanAgradRevOde, decouple_states_vv) {
  using stan::math::coupled_ode_system;