6 years ago · 71d638ef32
--- a/internal/ceres/CMakeLists.txt
+++ b/internal/ceres/CMakeLists.txt
@@ -514,4 +514,7 @@ if (BUILD_BENCHMARKS)
 
				 
			
 
				   add_executable(invert_psd_matrix_benchmark invert_psd_matrix_benchmark.cc)
			
 
				   add_dependencies_to_benchmark(invert_psd_matrix_benchmark)
			
 
				+
			
 
				+  add_executable(schur_eliminator_benchmark schur_eliminator_benchmark.cc)
			
 
				+  add_dependencies_to_benchmark(schur_eliminator_benchmark)
			
 
				 endif (BUILD_BENCHMARKS)
			
--- a/internal/ceres/schur_eliminator.h
+++ b/internal/ceres/schur_eliminator.h
@@ -1,5 +1,5 @@
 
				 // Ceres Solver - A fast non-linear least squares minimizer
			
 
				-// Copyright 2015 Google Inc. All rights reserved.
			
 
				+// Copyright 2019 Google Inc. All rights reserved.
			
 
				 // http://ceres-solver.org/
			
 
				 //
			
 
				 // Redistribution and use in source and binary forms, with or without
			
@@ -36,10 +36,12 @@
 
				 #include <mutex>
			
 
				 #include <vector>
			
 
				 
			
 
				+#include "Eigen/Dense"
			
 
				 #include "ceres/block_random_access_matrix.h"
			
 
				 #include "ceres/block_sparse_matrix.h"
			
 
				 #include "ceres/block_structure.h"
			
 
				 #include "ceres/internal/eigen.h"
			
 
				+#include "ceres/internal/port.h"
			
 
				 #include "ceres/linear_solver.h"
			
 
				 
			
 
				 namespace ceres {
			
@@ -220,14 +222,13 @@ class SchurEliminatorBase {
 
				 // level linear algebra routines.
			
 
				 template <int kRowBlockSize = Eigen::Dynamic,
			
 
				           int kEBlockSize = Eigen::Dynamic,
			
 
				-          int kFBlockSize = Eigen::Dynamic >
			
 
				+          int kFBlockSize = Eigen::Dynamic>
			
 
				 class SchurEliminator : public SchurEliminatorBase {
			
 
				  public:
			
 
				   explicit SchurEliminator(const LinearSolver::Options& options)
			
 
				-      : num_threads_(options.num_threads),
			
 
				-        context_(options.context) {
			
 
				+      : num_threads_(options.num_threads), context_(options.context) {
			
 
				     CHECK(context_ != nullptr);
			
 
				-}
			
 
				+  }
			
 
				 
			
 
				   // SchurEliminatorBase Interface
			
 
				   virtual ~SchurEliminator();
			
@@ -306,12 +307,11 @@ class SchurEliminator : public SchurEliminatorBase {
 
				                              int row_block_index,
			
 
				                              BlockRandomAccessMatrix* lhs);
			
 
				 
			
 
				-
			
 
				   void NoEBlockRowsUpdate(const BlockSparseMatrix* A,
			
 
				-                             const double* b,
			
 
				-                             int row_block_counter,
			
 
				-                             BlockRandomAccessMatrix* lhs,
			
 
				-                             double* rhs);
			
 
				+                          const double* b,
			
 
				+                          int row_block_counter,
			
 
				+                          BlockRandomAccessMatrix* lhs,
			
 
				+                          double* rhs);
			
 
				 
			
 
				   void NoEBlockRowOuterProduct(const BlockSparseMatrix* A,
			
 
				                                int row_block_index,
			
@@ -357,7 +357,268 @@ class SchurEliminator : public SchurEliminatorBase {
 
				 
			
 
				   // Locks for the blocks in the right hand side of the reduced linear
			
 
				   // system.
			
 
				- std::vector<std::mutex*> rhs_locks_;
			
 
				+  std::vector<std::mutex*> rhs_locks_;
			
 
				+};
			
 
				+
			
 
				+// SchurEliminatorForOneFBlock specializes the SchurEliminatorBase interface for
			
 
				+// the case where there is exactly one f-block and all three parameters
			
 
				+// kRowBlockSize, kEBlockSize and KFBlockSize are known at compile time. This is
			
 
				+// the case for some two view bundle adjustment problems which have very
			
 
				+// stringent latency requirements.
			
 
				+//
			
 
				+// Under these assumptions, we can simplify the more general algorithm
			
 
				+// implemented by SchurEliminatorImpl significantly. Two of the major
			
 
				+// contributors to the increased performance are:
			
 
				+//
			
 
				+// 1. Simpler loop structure and less use of dynamic memory. Almost everything
			
 
				+//    is on the stack and benefits from aligned memory as well as fixed sized
			
 
				+//    vectorization. We are also able to reason about temporaries and control
			
 
				+//    their lifetimes better.
			
 
				+// 2. Use of inverse() over llt().solve(Identity).
			
 
				+template <int kRowBlockSize = Eigen::Dynamic,
			
 
				+          int kEBlockSize = Eigen::Dynamic,
			
 
				+          int kFBlockSize = Eigen::Dynamic>
			
 
				+class SchurEliminatorForOneFBlock : public SchurEliminatorBase {
			
 
				+ public:
			
 
				+  virtual ~SchurEliminatorForOneFBlock() {}
			
 
				+  void Init(int num_eliminate_blocks,
			
 
				+            bool assume_full_rank_ete,
			
 
				+            const CompressedRowBlockStructure* bs) override {
			
 
				+    CHECK_GT(num_eliminate_blocks, 0)
			
 
				+        << "SchurComplementSolver cannot be initialized with "
			
 
				+        << "num_eliminate_blocks = 0.";
			
 
				+    CHECK_EQ(bs->cols.size() - num_eliminate_blocks, 1);
			
 
				+
			
 
				+    num_eliminate_blocks_ = num_eliminate_blocks;
			
 
				+    const int num_row_blocks = bs->rows.size();
			
 
				+    chunks_.clear();
			
 
				+    int r = 0;
			
 
				+    // Iterate over the row blocks of A, and detect the chunks. The
			
 
				+    // matrix should already have been ordered so that all rows
			
 
				+    // containing the same y block are vertically contiguous.
			
 
				+    while (r < num_row_blocks) {
			
 
				+      const int e_block_id = bs->rows[r].cells.front().block_id;
			
 
				+      if (e_block_id >= num_eliminate_blocks_) {
			
 
				+        break;
			
 
				+      }
			
 
				+
			
 
				+      chunks_.push_back(Chunk());
			
 
				+      Chunk& chunk = chunks_.back();
			
 
				+      chunk.num_rows = 0;
			
 
				+      chunk.start = r;
			
 
				+      // Add to the chunk until the first block in the row is
			
 
				+      // different than the one in the first row for the chunk.
			
 
				+      while (r + chunk.num_rows < num_row_blocks) {
			
 
				+        const CompressedRow& row = bs->rows[r + chunk.num_rows];
			
 
				+        if (row.cells.front().block_id != e_block_id) {
			
 
				+          break;
			
 
				+        }
			
 
				+        ++chunk.num_rows;
			
 
				+      }
			
 
				+      r += chunk.num_rows;
			
 
				+    }
			
 
				+
			
 
				+    const Chunk& last_chunk = chunks_.back();
			
 
				+    uneliminated_row_begins_ = last_chunk.start + last_chunk.num_rows;
			
 
				+    e_t_e_inverse_matrices_.resize(kEBlockSize * kEBlockSize * chunks_.size());
			
 
				+    std::fill(
			
 
				+        e_t_e_inverse_matrices_.begin(), e_t_e_inverse_matrices_.end(), 0.0);
			
 
				+  }
			
 
				+
			
 
				+  void Eliminate(const BlockSparseMatrix* A,
			
 
				+                 const double* b,
			
 
				+                 const double* D,
			
 
				+                 BlockRandomAccessMatrix* lhs_bram,
			
 
				+                 double* rhs_ptr) override {
			
 
				+    // Since there is only one f-block, we can call GetCell once, and cache its
			
 
				+    // output.
			
 
				+    int r, c, row_stride, col_stride;
			
 
				+    CellInfo* cell_info =
			
 
				+        lhs_bram->GetCell(0, 0, &r, &c, &row_stride, &col_stride);
			
 
				+    typename EigenTypes<kFBlockSize, kFBlockSize>::MatrixRef lhs(
			
 
				+        cell_info->values, kFBlockSize, kFBlockSize);
			
 
				+    typename EigenTypes<kFBlockSize>::VectorRef rhs(rhs_ptr, kFBlockSize);
			
 
				+
			
 
				+    lhs.setZero();
			
 
				+    rhs.setZero();
			
 
				+
			
 
				+    const CompressedRowBlockStructure* bs = A->block_structure();
			
 
				+    const double* values = A->values();
			
 
				+
			
 
				+    // Add the diagonal to the schur complement.
			
 
				+    if (D != nullptr) {
			
 
				+      typename EigenTypes<kFBlockSize>::ConstVectorRef diag(
			
 
				+          D + bs->cols[num_eliminate_blocks_].position, kFBlockSize);
			
 
				+      lhs.diagonal() = diag.array().square().matrix();
			
 
				+    }
			
 
				+
			
 
				+    Eigen::Matrix<double, kEBlockSize, kFBlockSize> tmp;
			
 
				+    Eigen::Matrix<double, kEBlockSize, 1> tmp2;
			
 
				+
			
 
				+    // The following loop works on a block matrix which looks as follows
			
 
				+    // (number of rows can be anything):
			
 
				+    //
			
 
				+    // [e_1 | f_1] = [b1]
			
 
				+    // [e_2 | f_2] = [b2]
			
 
				+    // [e_3 | f_3] = [b3]
			
 
				+    // [e_4 | f_4] = [b4]
			
 
				+    //
			
 
				+    // and computes the following
			
 
				+    //
			
 
				+    // e_t_e = sum_i e_i^T * e_i
			
 
				+    // e_t_e_inverse = inverse(e_t_e)
			
 
				+    // e_t_f = sum_i e_i^T f_i
			
 
				+    // e_t_b = sum_i e_i^T b_i
			
 
				+    // f_t_b = sum_i f_i^T b_i
			
 
				+    //
			
 
				+    // lhs += sum_i f_i^T * f_i - e_t_f^T * e_t_e_inverse * e_t_f
			
 
				+    // rhs += f_t_b - e_t_f^T * e_t_e_inverse * e_t_b
			
 
				+    for (int i = 0; i < chunks_.size(); ++i) {
			
 
				+      const Chunk& chunk = chunks_[i];
			
 
				+      const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
			
 
				+
			
 
				+      // Naming covention, e_t_e = e_block.transpose() * e_block;
			
 
				+      Eigen::Matrix<double, kEBlockSize, kEBlockSize> e_t_e;
			
 
				+      Eigen::Matrix<double, kEBlockSize, kFBlockSize> e_t_f;
			
 
				+      Eigen::Matrix<double, kEBlockSize, 1> e_t_b;
			
 
				+      Eigen::Matrix<double, kFBlockSize, 1> f_t_b;
			
 
				+
			
 
				+      // Add the square of the diagonal to e_t_e.
			
 
				+      if (D != NULL) {
			
 
				+        const typename EigenTypes<kEBlockSize>::ConstVectorRef diag(
			
 
				+            D + bs->cols[e_block_id].position, kEBlockSize);
			
 
				+        e_t_e = diag.array().square().matrix().asDiagonal();
			
 
				+      } else {
			
 
				+        e_t_e.setZero();
			
 
				+      }
			
 
				+      e_t_f.setZero();
			
 
				+      e_t_b.setZero();
			
 
				+      f_t_b.setZero();
			
 
				+
			
 
				+      for (int j = 0; j < chunk.num_rows; ++j) {
			
 
				+        const int row_id = chunk.start + j;
			
 
				+        const auto& row = bs->rows[row_id];
			
 
				+        const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
			
 
				+            e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
			
 
				+        const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
			
 
				+            b + row.block.position, kRowBlockSize);
			
 
				+
			
 
				+        e_t_e.noalias() += e_block.transpose() * e_block;
			
 
				+        e_t_b.noalias() += e_block.transpose() * b_block;
			
 
				+
			
 
				+        if (row.cells.size() == 1) {
			
 
				+          // There is no f block, so there is nothing more to do.
			
 
				+          continue;
			
 
				+        }
			
 
				+
			
 
				+        const typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
			
 
				+            f_block(values + row.cells[1].position, kRowBlockSize, kFBlockSize);
			
 
				+        e_t_f.noalias() += e_block.transpose() * f_block;
			
 
				+        lhs.noalias() += f_block.transpose() * f_block;
			
 
				+        f_t_b.noalias() += f_block.transpose() * b_block;
			
 
				+      }
			
 
				+
			
 
				+      // BackSubstitute computes the same inverse, and this is the key workload
			
 
				+      // there, so caching these inverses makes BackSubstitute essentially free.
			
 
				+      typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
			
 
				+          &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
			
 
				+          kEBlockSize,
			
 
				+          kEBlockSize);
			
 
				+
			
 
				+      // e_t_e is a symmetric positive definite matrix, so the standard way to
			
 
				+      // compute its inverse is via the Cholesky factorization by calling
			
 
				+      // e_t_e.llt().solve(Identity()). However, the inverse() method even
			
 
				+      // though it is not optimized for symmetric matrices is significantly
			
 
				+      // faster for small fixed size (up to 4x4) matrices.
			
 
				+      //
			
 
				+      // https://eigen.tuxfamily.org/dox/group__TutorialLinearAlgebra.html#title3
			
 
				+      e_t_e_inverse.noalias() = e_t_e.inverse();
			
 
				+
			
 
				+      // The use of these two pre-allocated tmp vectors saves temporaries in the
			
 
				+      // expressions for lhs and rhs updates below and has a significant impact
			
 
				+      // on the performance of this method.
			
 
				+      tmp.noalias() = e_t_e_inverse * e_t_f;
			
 
				+      tmp2.noalias() = e_t_e_inverse * e_t_b;
			
 
				+
			
 
				+      lhs.noalias() -= e_t_f.transpose() * tmp;
			
 
				+      rhs.noalias() += f_t_b - e_t_f.transpose() * tmp2;
			
 
				+    }
			
 
				+
			
 
				+    // The rows without any e-blocks can have arbitrary size but only contain
			
 
				+    // the f-block.
			
 
				+    //
			
 
				+    // lhs += f_i^T f_i
			
 
				+    // rhs += f_i^T b_i
			
 
				+    for (int row_id = uneliminated_row_begins_; row_id < bs->rows.size();
			
 
				+         ++row_id) {
			
 
				+      const auto& row = bs->rows[row_id];
			
 
				+      const auto& cell = row.cells[0];
			
 
				+      const typename EigenTypes<Eigen::Dynamic, kFBlockSize>::ConstMatrixRef
			
 
				+          f_block(values + cell.position, row.block.size, kFBlockSize);
			
 
				+      const typename EigenTypes<Eigen::Dynamic>::ConstVectorRef b_block(
			
 
				+          b + row.block.position, row.block.size);
			
 
				+      lhs.noalias() += f_block.transpose() * f_block;
			
 
				+      rhs.noalias() += f_block.transpose() * b_block;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  // This implementation of BackSubstitute depends on Eliminate being called
			
 
				+  // before this. SchurComplementSolver always does this.
			
 
				+  //
			
 
				+  // y_i = e_t_e_inverse * sum_i e_i^T * (b_i - f_i * z);
			
 
				+  void BackSubstitute(const BlockSparseMatrix* A,
			
 
				+                      const double* b,
			
 
				+                      const double* D,
			
 
				+                      const double* z_ptr,
			
 
				+                      double* y) override {
			
 
				+    typename EigenTypes<kFBlockSize>::ConstVectorRef z(z_ptr, kFBlockSize);
			
 
				+    const CompressedRowBlockStructure* bs = A->block_structure();
			
 
				+    const double* values = A->values();
			
 
				+    Eigen::Matrix<double, kEBlockSize, 1> tmp;
			
 
				+    for (int i = 0; i < chunks_.size(); ++i) {
			
 
				+      const Chunk& chunk = chunks_[i];
			
 
				+      const int e_block_id = bs->rows[chunk.start].cells.front().block_id;
			
 
				+      tmp.setZero();
			
 
				+      for (int j = 0; j < chunk.num_rows; ++j) {
			
 
				+        const int row_id = chunk.start + j;
			
 
				+        const auto& row = bs->rows[row_id];
			
 
				+        const typename EigenTypes<kRowBlockSize, kEBlockSize>::ConstMatrixRef
			
 
				+            e_block(values + row.cells[0].position, kRowBlockSize, kEBlockSize);
			
 
				+        const typename EigenTypes<kRowBlockSize>::ConstVectorRef b_block(
			
 
				+            b + row.block.position, kRowBlockSize);
			
 
				+
			
 
				+        if (row.cells.size() == 1) {
			
 
				+          // There is no f block.
			
 
				+          tmp += e_block.transpose() * b_block;
			
 
				+        } else {
			
 
				+          typename EigenTypes<kRowBlockSize, kFBlockSize>::ConstMatrixRef
			
 
				+              f_block(
			
 
				+                  values + row.cells[1].position, kRowBlockSize, kFBlockSize);
			
 
				+          tmp += e_block.transpose() * (b_block - f_block * z);
			
 
				+        }
			
 
				+      }
			
 
				+
			
 
				+      typename EigenTypes<kEBlockSize, kEBlockSize>::MatrixRef e_t_e_inverse(
			
 
				+          &e_t_e_inverse_matrices_[kEBlockSize * kEBlockSize * i],
			
 
				+          kEBlockSize,
			
 
				+          kEBlockSize);
			
 
				+
			
 
				+      typename EigenTypes<kEBlockSize>::VectorRef y_block(
			
 
				+          y + bs->cols[e_block_id].position, kEBlockSize);
			
 
				+      y_block.noalias() = e_t_e_inverse * tmp;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+ private:
			
 
				+  struct Chunk {
			
 
				+    int start = 0;
			
 
				+    int num_rows = 0;
			
 
				+  };
			
 
				+
			
 
				+  std::vector<Chunk> chunks_;
			
 
				+  int num_eliminate_blocks_;
			
 
				+  int uneliminated_row_begins_;
			
 
				+  std::vector<double> e_t_e_inverse_matrices_;
			
 
				 };
			
 
				 
			
 
				 }  // namespace internal
			
--- a/internal/ceres/schur_eliminator_benchmark.cc
+++ b/internal/ceres/schur_eliminator_benchmark.cc
@@ -0,0 +1,222 @@
 
				+// Ceres Solver - A fast non-linear least squares minimizer
			
 
				+// Copyright 2019 Google Inc. All rights reserved.
			
 
				+// http://ceres-solver.org/
			
 
				+//
			
 
				+// Redistribution and use in source and binary forms, with or without
			
 
				+// modification, are permitted provided that the following conditions are met:
			
 
				+//
			
 
				+// * Redistributions of source code must retain the above copyright notice,
			
 
				+//   this list of conditions and the following disclaimer.
			
 
				+// * Redistributions in binary form must reproduce the above copyright notice,
			
 
				+//   this list of conditions and the following disclaimer in the documentation
			
 
				+//   and/or other materials provided with the distribution.
			
 
				+// * Neither the name of Google Inc. nor the names of its contributors may be
			
 
				+//   used to endorse or promote products derived from this software without
			
 
				+//   specific prior written permission.
			
 
				+//
			
 
				+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
			
 
				+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
			
 
				+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
			
 
				+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
			
 
				+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
			
 
				+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
			
 
				+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
			
 
				+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
			
 
				+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
			
 
				+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
			
 
				+// POSSIBILITY OF SUCH DAMAGE.
			
 
				+//
			
 
				+// Authors: sameeragarwal@google.com (Sameer Agarwal)
			
 
				+
			
 
				+#include "Eigen/Dense"
			
 
				+#include "benchmark/benchmark.h"
			
 
				+#include "ceres/block_random_access_dense_matrix.h"
			
 
				+#include "ceres/block_sparse_matrix.h"
			
 
				+#include "ceres/block_structure.h"
			
 
				+#include "ceres/random.h"
			
 
				+#include "ceres/schur_eliminator.h"
			
 
				+
			
 
				+namespace ceres {
			
 
				+namespace internal {
			
 
				+
			
 
				+constexpr int kRowBlockSize = 2;
			
 
				+constexpr int kEBlockSize = 3;
			
 
				+constexpr int kFBlockSize = 6;
			
 
				+
			
 
				+class BenchmarkData {
			
 
				+ public:
			
 
				+  explicit BenchmarkData(const int num_e_blocks) {
			
 
				+    CompressedRowBlockStructure* bs = new CompressedRowBlockStructure;
			
 
				+    bs->cols.resize(num_e_blocks + 1);
			
 
				+    int col_pos = 0;
			
 
				+    for (int i = 0; i < num_e_blocks; ++i) {
			
 
				+      bs->cols[i].position = col_pos;
			
 
				+      bs->cols[i].size = kEBlockSize;
			
 
				+      col_pos += kEBlockSize;
			
 
				+    }
			
 
				+    bs->cols.back().position = col_pos;
			
 
				+    bs->cols.back().size = kFBlockSize;
			
 
				+
			
 
				+    bs->rows.resize(2 * num_e_blocks);
			
 
				+    int row_pos = 0;
			
 
				+    int cell_pos = 0;
			
 
				+    for (int i = 0; i < num_e_blocks; ++i) {
			
 
				+      {
			
 
				+        auto& row = bs->rows[2 * i];
			
 
				+        row.block.position = row_pos;
			
 
				+        row.block.size = kRowBlockSize;
			
 
				+        row_pos += kRowBlockSize;
			
 
				+        auto& cells = row.cells;
			
 
				+        cells.resize(2);
			
 
				+        cells[0].block_id = i;
			
 
				+        cells[0].position = cell_pos;
			
 
				+        cell_pos += kRowBlockSize * kEBlockSize;
			
 
				+        cells[1].block_id = num_e_blocks;
			
 
				+        cells[1].position = cell_pos;
			
 
				+        cell_pos += kRowBlockSize * kFBlockSize;
			
 
				+      }
			
 
				+      {
			
 
				+        auto& row = bs->rows[2 * i + 1];
			
 
				+        row.block.position = row_pos;
			
 
				+        row.block.size = kRowBlockSize;
			
 
				+        row_pos += kRowBlockSize;
			
 
				+        auto& cells = row.cells;
			
 
				+        cells.resize(1);
			
 
				+        cells[0].block_id = i;
			
 
				+        cells[0].position = cell_pos;
			
 
				+        cell_pos += kRowBlockSize * kEBlockSize;
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    matrix_.reset(new BlockSparseMatrix(bs));
			
 
				+    double* values = matrix_->mutable_values();
			
 
				+    for (int i = 0; i < matrix_->num_nonzeros(); ++i) {
			
 
				+      values[i] = RandNormal();
			
 
				+    }
			
 
				+
			
 
				+    b_.resize(matrix_->num_rows());
			
 
				+    b_.setRandom();
			
 
				+
			
 
				+    std::vector<int> blocks(1, kFBlockSize);
			
 
				+    lhs_.reset(new BlockRandomAccessDenseMatrix(blocks));
			
 
				+    diagonal_.resize(matrix_->num_cols());
			
 
				+    diagonal_.setOnes();
			
 
				+    rhs_.resize(kFBlockSize);
			
 
				+
			
 
				+    y_.resize(num_e_blocks * kEBlockSize);
			
 
				+    y_.setZero();
			
 
				+    z_.resize(kFBlockSize);
			
 
				+    z_.setOnes();
			
 
				+  }
			
 
				+
			
 
				+  const BlockSparseMatrix& matrix() const { return *matrix_; }
			
 
				+  const Vector& b() const { return b_; }
			
 
				+  const Vector& diagonal() const { return diagonal_; }
			
 
				+  BlockRandomAccessDenseMatrix* mutable_lhs() { return lhs_.get(); }
			
 
				+  Vector* mutable_rhs() { return &rhs_; }
			
 
				+  Vector* mutable_y() { return &y_; }
			
 
				+  Vector* mutable_z() { return &z_; }
			
 
				+
			
 
				+ private:
			
 
				+  std::unique_ptr<BlockSparseMatrix> matrix_;
			
 
				+  Vector b_;
			
 
				+  std::unique_ptr<BlockRandomAccessDenseMatrix> lhs_;
			
 
				+  Vector rhs_;
			
 
				+  Vector diagonal_;
			
 
				+  Vector z_;
			
 
				+  Vector y_;
			
 
				+};
			
 
				+
			
 
				+void BM_SchurEliminatorEliminate(benchmark::State& state) {
			
 
				+  const int num_e_blocks = state.range(0);
			
 
				+  BenchmarkData data(num_e_blocks);
			
 
				+
			
 
				+  ContextImpl context;
			
 
				+  LinearSolver::Options linear_solver_options;
			
 
				+  linear_solver_options.e_block_size = kEBlockSize;
			
 
				+  linear_solver_options.row_block_size = kRowBlockSize;
			
 
				+  linear_solver_options.f_block_size = kFBlockSize;
			
 
				+  linear_solver_options.context = &context;
			
 
				+  std::unique_ptr<SchurEliminatorBase> eliminator(
			
 
				+      SchurEliminatorBase::Create(linear_solver_options));
			
 
				+
			
 
				+  eliminator->Init(num_e_blocks, true, data.matrix().block_structure());
			
 
				+  for (auto _ : state) {
			
 
				+    eliminator->Eliminate(&data.matrix(),
			
 
				+                          data.b().data(),
			
 
				+                          data.diagonal().data(),
			
 
				+                          data.mutable_lhs(),
			
 
				+                          data.mutable_rhs()->data());
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void BM_SchurEliminatorBackSubstitute(benchmark::State& state) {
			
 
				+  const int num_e_blocks = state.range(0);
			
 
				+  BenchmarkData data(num_e_blocks);
			
 
				+
			
 
				+  ContextImpl context;
			
 
				+  LinearSolver::Options linear_solver_options;
			
 
				+  linear_solver_options.e_block_size = kEBlockSize;
			
 
				+  linear_solver_options.row_block_size = kRowBlockSize;
			
 
				+  linear_solver_options.f_block_size = kFBlockSize;
			
 
				+  linear_solver_options.context = &context;
			
 
				+  std::unique_ptr<SchurEliminatorBase> eliminator(
			
 
				+      SchurEliminatorBase::Create(linear_solver_options));
			
 
				+
			
 
				+  eliminator->Init(num_e_blocks, true, data.matrix().block_structure());
			
 
				+  eliminator->Eliminate(&data.matrix(),
			
 
				+                        data.b().data(),
			
 
				+                        data.diagonal().data(),
			
 
				+                        data.mutable_lhs(),
			
 
				+                        data.mutable_rhs()->data());
			
 
				+  for (auto _ : state) {
			
 
				+    eliminator->BackSubstitute(&data.matrix(),
			
 
				+                               data.b().data(),
			
 
				+                               data.diagonal().data(),
			
 
				+                               data.mutable_z()->data(),
			
 
				+                               data.mutable_y()->data());
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void BM_SchurEliminatorForOneFBlockEliminate(benchmark::State& state) {
			
 
				+  const int num_e_blocks = state.range(0);
			
 
				+  BenchmarkData data(num_e_blocks);
			
 
				+  SchurEliminatorForOneFBlock<2, 3, 6> eliminator;
			
 
				+  eliminator.Init(num_e_blocks, true, data.matrix().block_structure());
			
 
				+  for (auto _ : state) {
			
 
				+    eliminator.Eliminate(&data.matrix(),
			
 
				+                         data.b().data(),
			
 
				+                         data.diagonal().data(),
			
 
				+                         data.mutable_lhs(),
			
 
				+                         data.mutable_rhs()->data());
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+void BM_SchurEliminatorForOneFBlockBackSubstitute(benchmark::State& state) {
			
 
				+  const int num_e_blocks = state.range(0);
			
 
				+  BenchmarkData data(num_e_blocks);
			
 
				+  SchurEliminatorForOneFBlock<2, 3, 6> eliminator;
			
 
				+  eliminator.Init(num_e_blocks, true, data.matrix().block_structure());
			
 
				+  eliminator.Eliminate(&data.matrix(),
			
 
				+                       data.b().data(),
			
 
				+                       data.diagonal().data(),
			
 
				+                       data.mutable_lhs(),
			
 
				+                       data.mutable_rhs()->data());
			
 
				+  for (auto _ : state) {
			
 
				+    eliminator.BackSubstitute(&data.matrix(),
			
 
				+                              data.b().data(),
			
 
				+                              data.diagonal().data(),
			
 
				+                              data.mutable_z()->data(),
			
 
				+                              data.mutable_y()->data());
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+BENCHMARK(BM_SchurEliminatorEliminate)->Range(10, 10000);
			
 
				+BENCHMARK(BM_SchurEliminatorForOneFBlockEliminate)->Range(10, 10000);
			
 
				+BENCHMARK(BM_SchurEliminatorBackSubstitute)->Range(10, 10000);
			
 
				+BENCHMARK(BM_SchurEliminatorForOneFBlockBackSubstitute)->Range(10, 10000);
			
 
				+
			
 
				+}  // namespace internal
			
 
				+}  // namespace ceres
			
 
				+
			
 
				+BENCHMARK_MAIN();
			
--- a/internal/ceres/schur_eliminator_test.cc
+++ b/internal/ceres/schur_eliminator_test.cc
@@ -1,5 +1,5 @@
 
				 // Ceres Solver - A fast non-linear least squares minimizer
			
 
				-// Copyright 2015 Google Inc. All rights reserved.
			
 
				+// Copyright 2019 Google Inc. All rights reserved.
			
 
				 // http://ceres-solver.org/
			
 
				 //
			
 
				 // Redistribution and use in source and binary forms, with or without
			
@@ -31,14 +31,17 @@
 
				 #include "ceres/schur_eliminator.h"
			
 
				 
			
 
				 #include <memory>
			
 
				+
			
 
				 #include "Eigen/Dense"
			
 
				 #include "ceres/block_random_access_dense_matrix.h"
			
 
				 #include "ceres/block_sparse_matrix.h"
			
 
				+#include "ceres/block_structure.h"
			
 
				 #include "ceres/casts.h"
			
 
				 #include "ceres/context_impl.h"
			
 
				 #include "ceres/detect_structure.h"
			
 
				 #include "ceres/internal/eigen.h"
			
 
				 #include "ceres/linear_least_squares_problems.h"
			
 
				+#include "ceres/random.h"
			
 
				 #include "ceres/test_util.h"
			
 
				 #include "ceres/triplet_sparse_matrix.h"
			
 
				 #include "ceres/types.h"
			
@@ -226,5 +229,138 @@ TEST_F(SchurEliminatorTest, VaryingFBlockSizeWithoutStaticStructure) {
 
				   EliminateSolveAndCompare(VectorRef(D.get(), A->num_cols()), false, 1e-14);
			
 
				 }
			
 
				 
			
 
				+TEST(SchurEliminatorForOneFBlock, MatchesSchurEliminator) {
			
 
				+  constexpr int kRowBlockSize = 2;
			
 
				+  constexpr int kEBlockSize = 3;
			
 
				+  constexpr int kFBlockSize = 6;
			
 
				+  constexpr int num_e_blocks = 5;
			
 
				+
			
 
				+  CompressedRowBlockStructure* bs = new CompressedRowBlockStructure;
			
 
				+  bs->cols.resize(num_e_blocks + 1);
			
 
				+  int col_pos = 0;
			
 
				+  for (int i = 0; i < num_e_blocks; ++i) {
			
 
				+    bs->cols[i].position = col_pos;
			
 
				+    bs->cols[i].size = kEBlockSize;
			
 
				+    col_pos += kEBlockSize;
			
 
				+  }
			
 
				+  bs->cols.back().position = col_pos;
			
 
				+  bs->cols.back().size = kFBlockSize;
			
 
				+
			
 
				+  bs->rows.resize(2 * num_e_blocks + 1);
			
 
				+  int row_pos = 0;
			
 
				+  int cell_pos = 0;
			
 
				+  for (int i = 0; i < num_e_blocks; ++i) {
			
 
				+    {
			
 
				+      auto& row = bs->rows[2 * i];
			
 
				+      row.block.position = row_pos;
			
 
				+      row.block.size = kRowBlockSize;
			
 
				+      row_pos += kRowBlockSize;
			
 
				+      auto& cells = row.cells;
			
 
				+      cells.resize(2);
			
 
				+      cells[0].block_id = i;
			
 
				+      cells[0].position = cell_pos;
			
 
				+      cell_pos += kRowBlockSize * kEBlockSize;
			
 
				+      cells[1].block_id = num_e_blocks;
			
 
				+      cells[1].position = cell_pos;
			
 
				+      cell_pos += kRowBlockSize * kFBlockSize;
			
 
				+    }
			
 
				+    {
			
 
				+      auto& row = bs->rows[2 * i + 1];
			
 
				+      row.block.position = row_pos;
			
 
				+      row.block.size = kRowBlockSize;
			
 
				+      row_pos += kRowBlockSize;
			
 
				+      auto& cells = row.cells;
			
 
				+      cells.resize(1);
			
 
				+      cells[0].block_id = i;
			
 
				+      cells[0].position = cell_pos;
			
 
				+      cell_pos += kRowBlockSize * kEBlockSize;
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				+  {
			
 
				+    auto& row = bs->rows.back();
			
 
				+    row.block.position = row_pos;
			
 
				+    row.block.size = kEBlockSize;
			
 
				+    row_pos += kRowBlockSize;
			
 
				+    auto& cells = row.cells;
			
 
				+    cells.resize(1);
			
 
				+    cells[0].block_id = num_e_blocks;
			
 
				+    cells[0].position = cell_pos;
			
 
				+    cell_pos += kEBlockSize * kEBlockSize;
			
 
				+  }
			
 
				+
			
 
				+  BlockSparseMatrix matrix(bs);
			
 
				+  double* values = matrix.mutable_values();
			
 
				+  for (int i = 0; i < matrix.num_nonzeros(); ++i) {
			
 
				+    values[i] = RandNormal();
			
 
				+  }
			
 
				+
			
 
				+  Vector b(matrix.num_rows());
			
 
				+  b.setRandom();
			
 
				+
			
 
				+  Vector diagonal(matrix.num_cols());
			
 
				+  diagonal.setOnes();
			
 
				+
			
 
				+  std::vector<int> blocks(1, kFBlockSize);
			
 
				+  BlockRandomAccessDenseMatrix actual_lhs(blocks);
			
 
				+  BlockRandomAccessDenseMatrix expected_lhs(blocks);
			
 
				+  Vector actual_rhs(kFBlockSize);
			
 
				+  Vector expected_rhs(kFBlockSize);
			
 
				+
			
 
				+  Vector f_sol(kFBlockSize);
			
 
				+  f_sol.setRandom();
			
 
				+  Vector actual_e_sol(num_e_blocks * kEBlockSize);
			
 
				+  actual_e_sol.setZero();
			
 
				+  Vector expected_e_sol(num_e_blocks * kEBlockSize);
			
 
				+  expected_e_sol.setZero();
			
 
				+
			
 
				+  {
			
 
				+    ContextImpl context;
			
 
				+    LinearSolver::Options linear_solver_options;
			
 
				+    linear_solver_options.e_block_size = kEBlockSize;
			
 
				+    linear_solver_options.row_block_size = kRowBlockSize;
			
 
				+    linear_solver_options.f_block_size = kFBlockSize;
			
 
				+    linear_solver_options.context = &context;
			
 
				+    std::unique_ptr<SchurEliminatorBase> eliminator(
			
 
				+        SchurEliminatorBase::Create(linear_solver_options));
			
 
				+    eliminator->Init(num_e_blocks, true, matrix.block_structure());
			
 
				+    eliminator->Eliminate(&matrix, b.data(), diagonal.data(), &expected_lhs,
			
 
				+                          expected_rhs.data());
			
 
				+    eliminator->BackSubstitute(&matrix, b.data(), diagonal.data(), f_sol.data(),
			
 
				+                               actual_e_sol.data());
			
 
				+  }
			
 
				+
			
 
				+  {
			
 
				+    SchurEliminatorForOneFBlock<2, 3, 6> eliminator;
			
 
				+    eliminator.Init(num_e_blocks, true, matrix.block_structure());
			
 
				+    eliminator.Eliminate(&matrix, b.data(), diagonal.data(), &actual_lhs,
			
 
				+                         actual_rhs.data());
			
 
				+    eliminator.BackSubstitute(&matrix, b.data(), diagonal.data(), f_sol.data(),
			
 
				+                              expected_e_sol.data());
			
 
				+  }
			
 
				+  ConstMatrixRef actual_lhsref(actual_lhs.values(), actual_lhs.num_cols(),
			
 
				+                               actual_lhs.num_cols());
			
 
				+  ConstMatrixRef expected_lhsref(expected_lhs.values(), actual_lhs.num_cols(),
			
 
				+                                 actual_lhs.num_cols());
			
 
				+
			
 
				+  EXPECT_NEAR((actual_lhsref - expected_lhsref).norm() / expected_lhsref.norm(),
			
 
				+              0.0, 1e-12)
			
 
				+      << "expected: \n"
			
 
				+      << expected_lhsref << "\nactual: \n"
			
 
				+      << actual_lhsref;
			
 
				+
			
 
				+  EXPECT_NEAR((actual_rhs - expected_rhs).norm() / expected_rhs.norm(), 0.0,
			
 
				+              1e-12)
			
 
				+      << "expected: \n"
			
 
				+      << expected_rhs << "\nactual: \n"
			
 
				+      << actual_rhs;
			
 
				+
			
 
				+  EXPECT_NEAR((actual_e_sol - expected_e_sol).norm() / expected_e_sol.norm(),
			
 
				+              0.0, 1e-12)
			
 
				+      << "expected: \n"
			
 
				+      << expected_e_sol << "\nactual: \n"
			
 
				+      << actual_e_sol;
			
 
				+}
			
 
				+
			
 
				 }  // namespace internal
			
 
				 }  // namespace ceres