|  | @@ -69,6 +69,39 @@ struct RowColLessThan {
 | 
	
		
			
				|  |  |    const int* cols;
 | 
	
		
			
				|  |  |  };
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +void TransposeForCompressedRowSparseStructure(const int num_rows,
 | 
	
		
			
				|  |  | +                                              const int num_cols,
 | 
	
		
			
				|  |  | +                                              const int num_nonzeros,
 | 
	
		
			
				|  |  | +                                              const int *rows,
 | 
	
		
			
				|  |  | +                                              const int *cols,
 | 
	
		
			
				|  |  | +                                              const double *values,
 | 
	
		
			
				|  |  | +                                              int* transpose_rows,
 | 
	
		
			
				|  |  | +                                              int *transpose_cols,
 | 
	
		
			
				|  |  | +                                              double *transpose_values) {
 | 
	
		
			
				|  |  | +  for (int idx = 0; idx < num_nonzeros; ++idx) {
 | 
	
		
			
				|  |  | +    ++transpose_rows[cols[idx] + 1];
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  for (int i = 1; i < num_cols + 1; ++i) {
 | 
	
		
			
				|  |  | +    transpose_rows[i] += transpose_rows[i - 1];
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  for (int r = 0; r < num_rows; ++r) {
 | 
	
		
			
				|  |  | +    for (int idx = rows[r]; idx < rows[r + 1]; ++idx) {
 | 
	
		
			
				|  |  | +      const int c = cols[idx];
 | 
	
		
			
				|  |  | +      const int transpose_idx = transpose_rows[c]++;
 | 
	
		
			
				|  |  | +      transpose_cols[transpose_idx] = r;
 | 
	
		
			
				|  |  | +      if (values) {
 | 
	
		
			
				|  |  | +         transpose_values[transpose_idx] = values[idx];
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  for (int i = num_cols - 1; i > 0 ; --i) {
 | 
	
		
			
				|  |  | +    transpose_rows[i] = transpose_rows[i - 1];
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +  transpose_rows[0] = 0;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  |  }  // namespace
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  // This constructor gives you a semi-initialized CompressedRowSparseMatrix.
 | 
	
	
		
			
				|  | @@ -220,6 +253,16 @@ void CompressedRowSparseMatrix::DeleteRows(int delta_rows) {
 | 
	
		
			
				|  |  |    num_rows_ -= delta_rows;
 | 
	
		
			
				|  |  |    rows_.resize(num_rows_ + 1);
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +  // The rest of the code update block information.
 | 
	
		
			
				|  |  | +  // Immediately return in case of no block information.
 | 
	
		
			
				|  |  | +  if (row_blocks_.empty()) {
 | 
	
		
			
				|  |  | +    return;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // Sanity check for compressed row sparse block information
 | 
	
		
			
				|  |  | +  CHECK_EQ(crsb_rows_.size(), row_blocks_.size() + 1);
 | 
	
		
			
				|  |  | +  CHECK_EQ(crsb_rows_.back(), crsb_cols_.size());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |    // Walk the list of row blocks until we reach the new number of rows
 | 
	
		
			
				|  |  |    // and the drop the rest of the row blocks.
 | 
	
		
			
				|  |  |    int num_row_blocks = 0;
 | 
	
	
		
			
				|  | @@ -230,12 +273,18 @@ void CompressedRowSparseMatrix::DeleteRows(int delta_rows) {
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    row_blocks_.resize(num_row_blocks);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // Update compressed row sparse block (crsb) information.
 | 
	
		
			
				|  |  | +  CHECK_EQ(num_rows, num_rows_);
 | 
	
		
			
				|  |  | +  crsb_rows_.resize(num_row_blocks + 1);
 | 
	
		
			
				|  |  | +  crsb_cols_.resize(crsb_rows_[num_row_blocks]);
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  void CompressedRowSparseMatrix::AppendRows(const CompressedRowSparseMatrix& m) {
 | 
	
		
			
				|  |  |    CHECK_EQ(m.num_cols(), num_cols_);
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -  CHECK(row_blocks_.size() == 0 || m.row_blocks().size() !=0)
 | 
	
		
			
				|  |  | +  CHECK((row_blocks_.size() == 0 && m.row_blocks().size() == 0) ||
 | 
	
		
			
				|  |  | +        (row_blocks_.size() != 0 && m.row_blocks().size() != 0))
 | 
	
		
			
				|  |  |        << "Cannot append a matrix with row blocks to one without and vice versa."
 | 
	
		
			
				|  |  |        << "This matrix has : " << row_blocks_.size() << " row blocks."
 | 
	
		
			
				|  |  |        << "The matrix being appended has: " << m.row_blocks().size()
 | 
	
	
		
			
				|  | @@ -270,9 +319,38 @@ void CompressedRowSparseMatrix::AppendRows(const CompressedRowSparseMatrix& m) {
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    num_rows_ += m.num_rows();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // The rest of the code update block information.
 | 
	
		
			
				|  |  | +  // Immediately return in case of no block information.
 | 
	
		
			
				|  |  | +  if (row_blocks_.empty()) {
 | 
	
		
			
				|  |  | +    return;
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // Sanity check for compressed row sparse block information
 | 
	
		
			
				|  |  | +  CHECK_EQ(crsb_rows_.size(), row_blocks_.size() + 1);
 | 
	
		
			
				|  |  | +  CHECK_EQ(crsb_rows_.back(), crsb_cols_.size());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |    row_blocks_.insert(row_blocks_.end(),
 | 
	
		
			
				|  |  |                       m.row_blocks().begin(),
 | 
	
		
			
				|  |  |                       m.row_blocks().end());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // The rest of the code update compressed row sparse block (crsb) information.
 | 
	
		
			
				|  |  | +  const int num_crsb_nonzeros = crsb_cols_.size();
 | 
	
		
			
				|  |  | +  const int m_num_crsb_nonzeros = m.crsb_cols_.size();
 | 
	
		
			
				|  |  | +  crsb_cols_.resize(num_crsb_nonzeros + m_num_crsb_nonzeros);
 | 
	
		
			
				|  |  | +  std::copy(&m.crsb_cols()[0], &m.crsb_cols()[0] + m_num_crsb_nonzeros,
 | 
	
		
			
				|  |  | +            &crsb_cols_[num_crsb_nonzeros]);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  const int num_crsb_rows = crsb_rows_.size() - 1;
 | 
	
		
			
				|  |  | +  const int m_num_crsb_rows = m.crsb_rows_.size() - 1;
 | 
	
		
			
				|  |  | +  crsb_rows_.resize(num_crsb_rows + m_num_crsb_rows + 1);
 | 
	
		
			
				|  |  | +  std::fill(crsb_rows_.begin() + num_crsb_rows,
 | 
	
		
			
				|  |  | +            crsb_rows_.begin() + num_crsb_rows + m_num_crsb_rows + 1,
 | 
	
		
			
				|  |  | +            crsb_rows_[num_crsb_rows]);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  for (int r = 0; r < m_num_crsb_rows + 1; ++r) {
 | 
	
		
			
				|  |  | +    crsb_rows_[num_crsb_rows + r] += m.crsb_rows()[r];
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  void CompressedRowSparseMatrix::ToTextFile(FILE* file) const {
 | 
	
	
		
			
				|  | @@ -364,6 +442,15 @@ CompressedRowSparseMatrix* CompressedRowSparseMatrix::CreateBlockDiagonalMatrix(
 | 
	
		
			
				|  |  |    *matrix->mutable_row_blocks() = blocks;
 | 
	
		
			
				|  |  |    *matrix->mutable_col_blocks() = blocks;
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +  // Fill compressed row sparse block (crsb) information.
 | 
	
		
			
				|  |  | +  vector<int>& crsb_rows = *matrix->mutable_crsb_rows();
 | 
	
		
			
				|  |  | +  vector<int>& crsb_cols = *matrix->mutable_crsb_cols();
 | 
	
		
			
				|  |  | +  for (int i = 0; i < blocks.size(); ++i) {
 | 
	
		
			
				|  |  | +    crsb_rows.push_back(i);
 | 
	
		
			
				|  |  | +    crsb_cols.push_back(i);
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +  crsb_rows.push_back(blocks.size());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |    CHECK_EQ(idx_cursor, num_nonzeros);
 | 
	
		
			
				|  |  |    CHECK_EQ(col_cursor, num_rows);
 | 
	
		
			
				|  |  |    return matrix;
 | 
	
	
		
			
				|  | @@ -373,40 +460,44 @@ CompressedRowSparseMatrix* CompressedRowSparseMatrix::Transpose() const {
 | 
	
		
			
				|  |  |    CompressedRowSparseMatrix* transpose =
 | 
	
		
			
				|  |  |        new CompressedRowSparseMatrix(num_cols_, num_rows_, num_nonzeros());
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -  int* transpose_rows = transpose->mutable_rows();
 | 
	
		
			
				|  |  | -  int* transpose_cols = transpose->mutable_cols();
 | 
	
		
			
				|  |  | -  double* transpose_values = transpose->mutable_values();
 | 
	
		
			
				|  |  | +  TransposeForCompressedRowSparseStructure(
 | 
	
		
			
				|  |  | +      num_rows(), num_cols(), num_nonzeros(),
 | 
	
		
			
				|  |  | +      rows(), cols(), values(),
 | 
	
		
			
				|  |  | +      transpose->mutable_rows(),
 | 
	
		
			
				|  |  | +      transpose->mutable_cols(),
 | 
	
		
			
				|  |  | +      transpose->mutable_values());
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -  for (int idx = 0; idx < num_nonzeros(); ++idx) {
 | 
	
		
			
				|  |  | -    ++transpose_rows[cols_[idx] + 1];
 | 
	
		
			
				|  |  | +  // The rest of the code update block information.
 | 
	
		
			
				|  |  | +  // Immediately return in case of no block information.
 | 
	
		
			
				|  |  | +  if (row_blocks_.empty()) {
 | 
	
		
			
				|  |  | +    return transpose;
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -  for (int i = 1; i < transpose->num_rows() + 1; ++i) {
 | 
	
		
			
				|  |  | -    transpose_rows[i] += transpose_rows[i - 1];
 | 
	
		
			
				|  |  | -  }
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -  for (int r = 0; r < num_rows(); ++r) {
 | 
	
		
			
				|  |  | -    for (int idx = rows_[r]; idx < rows_[r + 1]; ++idx) {
 | 
	
		
			
				|  |  | -      const int c = cols_[idx];
 | 
	
		
			
				|  |  | -      const int transpose_idx = transpose_rows[c]++;
 | 
	
		
			
				|  |  | -      transpose_cols[transpose_idx] = r;
 | 
	
		
			
				|  |  | -      transpose_values[transpose_idx] = values_[idx];
 | 
	
		
			
				|  |  | -    }
 | 
	
		
			
				|  |  | -  }
 | 
	
		
			
				|  |  | -
 | 
	
		
			
				|  |  | -  for (int i = transpose->num_rows() - 1; i > 0 ; --i) {
 | 
	
		
			
				|  |  | -    transpose_rows[i] = transpose_rows[i - 1];
 | 
	
		
			
				|  |  | -  }
 | 
	
		
			
				|  |  | -  transpose_rows[0] = 0;
 | 
	
		
			
				|  |  | +  // Sanity check for compressed row sparse block information
 | 
	
		
			
				|  |  | +  CHECK_EQ(crsb_rows_.size(), row_blocks_.size() + 1);
 | 
	
		
			
				|  |  | +  CHECK_EQ(crsb_rows_.back(), crsb_cols_.size());
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    *(transpose->mutable_row_blocks()) = col_blocks_;
 | 
	
		
			
				|  |  |    *(transpose->mutable_col_blocks()) = row_blocks_;
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +  // The rest of the code update compressed row sparse block (crsb) information.
 | 
	
		
			
				|  |  | +  vector<int>& transpose_crsb_rows = *transpose->mutable_crsb_rows();
 | 
	
		
			
				|  |  | +  vector<int>& transpose_crsb_cols = *transpose->mutable_crsb_cols();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  transpose_crsb_rows.resize(col_blocks_.size() + 1);
 | 
	
		
			
				|  |  | +  std::fill(transpose_crsb_rows.begin(), transpose_crsb_rows.end(), 0);
 | 
	
		
			
				|  |  | +  transpose_crsb_cols.resize(crsb_cols_.size());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  TransposeForCompressedRowSparseStructure(
 | 
	
		
			
				|  |  | +      row_blocks().size(), col_blocks().size(), crsb_cols().size(),
 | 
	
		
			
				|  |  | +      crsb_rows().data(), crsb_cols().data(), NULL,
 | 
	
		
			
				|  |  | +      transpose_crsb_rows.data(), transpose_crsb_cols.data(), NULL);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |    return transpose;
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  namespace {
 | 
	
		
			
				|  |  | -// A ProductTerm is a term in the outer product of a matrix with
 | 
	
		
			
				|  |  | +// A ProductTerm is a term in the block outer product of a matrix with
 | 
	
		
			
				|  |  |  // itself.
 | 
	
		
			
				|  |  |  struct ProductTerm {
 | 
	
		
			
				|  |  |    ProductTerm(const int row, const int col, const int index)
 | 
	
	
		
			
				|  | @@ -428,72 +519,176 @@ struct ProductTerm {
 | 
	
		
			
				|  |  |    int index;
 | 
	
		
			
				|  |  |  };
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +// Create outerproduct matrix based on the block product information.
 | 
	
		
			
				|  |  | +// The input block product is already sorted.  This function does not
 | 
	
		
			
				|  |  | +// set the sparse rows/cols information.  Instead, it only collects the
 | 
	
		
			
				|  |  | +// nonzeros for each compressed row and puts in row_nnz.
 | 
	
		
			
				|  |  | +// The caller of this function will traverse the block product in a second
 | 
	
		
			
				|  |  | +// round to generate the sparse rows/cols information.
 | 
	
		
			
				|  |  | +// This function also computes the block offset information for
 | 
	
		
			
				|  |  | +// the outerproduct matrix, which is used in outer product computation.
 | 
	
		
			
				|  |  |  CompressedRowSparseMatrix*
 | 
	
		
			
				|  |  | -CompressAndFillProgram(const int num_rows,
 | 
	
		
			
				|  |  | -                       const int num_cols,
 | 
	
		
			
				|  |  | -                       const vector<ProductTerm>& product,
 | 
	
		
			
				|  |  | -                       vector<int>* program) {
 | 
	
		
			
				|  |  | -  CHECK_GT(product.size(), 0);
 | 
	
		
			
				|  |  | +CreateOuterProductMatrix(const int num_cols,
 | 
	
		
			
				|  |  | +                         const vector<int>& blocks,
 | 
	
		
			
				|  |  | +                         const vector<ProductTerm>& product,
 | 
	
		
			
				|  |  | +                         vector<int>* row_nnz) {
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    // Count the number of unique product term, which in turn is the
 | 
	
		
			
				|  |  |    // number of non-zeros in the outer product.
 | 
	
		
			
				|  |  | -  int num_nonzeros = 1;
 | 
	
		
			
				|  |  | +  // Also count the number of non-zeros in each row.
 | 
	
		
			
				|  |  | +  row_nnz->resize(blocks.size());
 | 
	
		
			
				|  |  | +  std::fill(row_nnz->begin(), row_nnz->end(), 0);
 | 
	
		
			
				|  |  | +  (*row_nnz)[product[0].row] = blocks[product[0].col];
 | 
	
		
			
				|  |  | +  int num_nonzeros = blocks[product[0].row] * blocks[product[0].col];
 | 
	
		
			
				|  |  |    for (int i = 1; i < product.size(); ++i) {
 | 
	
		
			
				|  |  | +    // Each (row, col) block counts only once.
 | 
	
		
			
				|  |  | +    // This check depends on product sorted on (row, col).
 | 
	
		
			
				|  |  |      if (product[i].row != product[i - 1].row ||
 | 
	
		
			
				|  |  |          product[i].col != product[i - 1].col) {
 | 
	
		
			
				|  |  | -      ++num_nonzeros;
 | 
	
		
			
				|  |  | +      (*row_nnz)[product[i].row] += blocks[product[i].col];
 | 
	
		
			
				|  |  | +      num_nonzeros += blocks[product[i].row] * blocks[product[i].col];
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    CompressedRowSparseMatrix* matrix =
 | 
	
		
			
				|  |  | -      new CompressedRowSparseMatrix(num_rows, num_cols, num_nonzeros);
 | 
	
		
			
				|  |  | +      new CompressedRowSparseMatrix(num_cols, num_cols, num_nonzeros);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // Compute block offsets for outer product matrix, which is used
 | 
	
		
			
				|  |  | +  // in ComputeOuterProduct.
 | 
	
		
			
				|  |  | +  vector<int>* block_offsets = matrix->mutable_block_offsets();
 | 
	
		
			
				|  |  | +  block_offsets->resize(blocks.size() + 1);
 | 
	
		
			
				|  |  | +  (*block_offsets)[0] = 0;
 | 
	
		
			
				|  |  | +  for (int i = 0; i < blocks.size(); ++i) {
 | 
	
		
			
				|  |  | +    (*block_offsets)[i + 1] = (*block_offsets)[i] + blocks[i];
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  return matrix;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +CompressedRowSparseMatrix*
 | 
	
		
			
				|  |  | +CompressAndFillProgram(const int num_cols,
 | 
	
		
			
				|  |  | +                       const vector<int>& blocks,
 | 
	
		
			
				|  |  | +                       const vector<ProductTerm>& product,
 | 
	
		
			
				|  |  | +                       vector<int>* program) {
 | 
	
		
			
				|  |  | +  CHECK_GT(product.size(), 0);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  vector<int> row_nnz;
 | 
	
		
			
				|  |  | +  CompressedRowSparseMatrix* matrix =
 | 
	
		
			
				|  |  | +      CreateOuterProductMatrix(num_cols, blocks, product, &row_nnz);
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  const vector<int>& block_offsets = matrix->block_offsets();
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    int* crsm_rows = matrix->mutable_rows();
 | 
	
		
			
				|  |  | -  std::fill(crsm_rows, crsm_rows + num_rows + 1, 0);
 | 
	
		
			
				|  |  | +  std::fill(crsm_rows, crsm_rows + num_cols + 1, 0);
 | 
	
		
			
				|  |  |    int* crsm_cols = matrix->mutable_cols();
 | 
	
		
			
				|  |  | -  std::fill(crsm_cols, crsm_cols + num_nonzeros, 0);
 | 
	
		
			
				|  |  | +  std::fill(crsm_cols, crsm_cols + matrix->num_nonzeros(), 0);
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    CHECK_NOTNULL(program)->clear();
 | 
	
		
			
				|  |  |    program->resize(product.size());
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -  // Iterate over the sorted product terms. This means each row is
 | 
	
		
			
				|  |  | -  // filled one at a time, and we are able to assign a position in the
 | 
	
		
			
				|  |  | -  // values array to each term.
 | 
	
		
			
				|  |  | +  // Non zero elements are not stored consecutively across rows in a block.
 | 
	
		
			
				|  |  | +  // We seperate nonzero into three categories:
 | 
	
		
			
				|  |  | +  //   nonzeros in all previous row blocks counted in nnz
 | 
	
		
			
				|  |  | +  //   nonzeros in current row counted in row_nnz
 | 
	
		
			
				|  |  | +  //   nonzeros in previous col blocks of current row counted in col_nnz
 | 
	
		
			
				|  |  |    //
 | 
	
		
			
				|  |  | -  // If terms repeat, i.e., they contribute to the same entry in the
 | 
	
		
			
				|  |  | -  // result matrix), then they do not affect the sparsity structure of
 | 
	
		
			
				|  |  | -  // the result matrix.
 | 
	
		
			
				|  |  | +  // Give an element (j, k) within a block such that j and k
 | 
	
		
			
				|  |  | +  // represent the relative position to the starting row and starting col of
 | 
	
		
			
				|  |  | +  // the block, the row and col for the element is
 | 
	
		
			
				|  |  | +  //   block_offsets[current.row] + j
 | 
	
		
			
				|  |  | +  //   block_offsets[current.col] + k
 | 
	
		
			
				|  |  | +  // The total number of nonzero to the element is
 | 
	
		
			
				|  |  | +  //   nnz + row_nnz[current.row] * j + col_nnz + k
 | 
	
		
			
				|  |  | +  //
 | 
	
		
			
				|  |  | +  // program keeps col_nnz for block product, which is used later for
 | 
	
		
			
				|  |  | +  // outerproduct computation.
 | 
	
		
			
				|  |  | +  //
 | 
	
		
			
				|  |  | +  // There is no special handling for diagonal blocks as we generate
 | 
	
		
			
				|  |  | +  // BLOCK triangular matrix (diagonal block is full block) instead of
 | 
	
		
			
				|  |  | +  // standard triangular matrix.
 | 
	
		
			
				|  |  |    int nnz = 0;
 | 
	
		
			
				|  |  | -  crsm_cols[0] = product[0].col;
 | 
	
		
			
				|  |  | -  crsm_rows[product[0].row + 1]++;
 | 
	
		
			
				|  |  | -  (*program)[product[0].index] = nnz;
 | 
	
		
			
				|  |  | +  int col_nnz = 0;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // Process first product term.
 | 
	
		
			
				|  |  | +  for (int j = 0; j < blocks[product[0].row]; ++j) {
 | 
	
		
			
				|  |  | +    crsm_rows[block_offsets[product[0].row] + j + 1] =
 | 
	
		
			
				|  |  | +        row_nnz[product[0].row];
 | 
	
		
			
				|  |  | +    for (int k = 0; k < blocks[product[0].col]; ++k) {
 | 
	
		
			
				|  |  | +      crsm_cols[row_nnz[product[0].row] * j + k]
 | 
	
		
			
				|  |  | +          = block_offsets[product[0].col] + k;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  (*program)[product[0].index] = 0;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // Process rest product terms.
 | 
	
		
			
				|  |  |    for (int i = 1; i < product.size(); ++i) {
 | 
	
		
			
				|  |  |      const ProductTerm& previous = product[i - 1];
 | 
	
		
			
				|  |  |      const ProductTerm& current = product[i];
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      // Sparsity structure is updated only if the term is not a repeat.
 | 
	
		
			
				|  |  |      if (previous.row != current.row || previous.col != current.col) {
 | 
	
		
			
				|  |  | -      crsm_cols[++nnz] = current.col;
 | 
	
		
			
				|  |  | -      crsm_rows[current.row + 1]++;
 | 
	
		
			
				|  |  | +      col_nnz += blocks[previous.col];
 | 
	
		
			
				|  |  | +      if (previous.row != current.row) {
 | 
	
		
			
				|  |  | +        nnz += col_nnz * blocks[previous.row];
 | 
	
		
			
				|  |  | +        col_nnz = 0;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        for (int j = 0; j < blocks[current.row]; ++j) {
 | 
	
		
			
				|  |  | +          crsm_rows[block_offsets[current.row] + j + 1] =
 | 
	
		
			
				|  |  | +              row_nnz[current.row];
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      for (int j = 0; j < blocks[current.row]; ++j) {
 | 
	
		
			
				|  |  | +        for (int k = 0; k < blocks[current.col]; ++k) {
 | 
	
		
			
				|  |  | +          crsm_cols[nnz + row_nnz[current.row] * j + col_nnz + k]
 | 
	
		
			
				|  |  | +              = block_offsets[current.col] + k;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    // All terms get assigned the position in the values array where
 | 
	
		
			
				|  |  | -    // their value is accumulated.
 | 
	
		
			
				|  |  | -    (*program)[current.index] = nnz;
 | 
	
		
			
				|  |  | +    (*program)[current.index] = col_nnz;
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -  for (int i = 1; i < num_rows + 1; ++i) {
 | 
	
		
			
				|  |  | +  for (int i = 1; i < num_cols + 1; ++i) {
 | 
	
		
			
				|  |  |      crsm_rows[i] += crsm_rows[i - 1];
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    return matrix;
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +// input is a matrix of dimesion <row_block_size, input_cols>
 | 
	
		
			
				|  |  | +// output is a matrix of dimension <col_block1_size, output_cols>
 | 
	
		
			
				|  |  | +//
 | 
	
		
			
				|  |  | +// Implement block multiplication O = I1' * I2.
 | 
	
		
			
				|  |  | +// I1 is block(0, col_block1_begin, row_block_size, col_block1_size) of input
 | 
	
		
			
				|  |  | +// I2 is block(0, col_block2_begin, row_block_size, col_block2_size) of input
 | 
	
		
			
				|  |  | +// O is block(0, 0, col_block1_size, col_block2_size) of output
 | 
	
		
			
				|  |  | +void ComputeBlockMultiplication(const int row_block_size,
 | 
	
		
			
				|  |  | +                                const int col_block1_size,
 | 
	
		
			
				|  |  | +                                const int col_block2_size,
 | 
	
		
			
				|  |  | +                                const int col_block1_begin,
 | 
	
		
			
				|  |  | +                                const int col_block2_begin,
 | 
	
		
			
				|  |  | +                                const int input_cols,
 | 
	
		
			
				|  |  | +                                const double *input,
 | 
	
		
			
				|  |  | +                                const int output_cols,
 | 
	
		
			
				|  |  | +                                double *output) {
 | 
	
		
			
				|  |  | +  for (int r = 0; r < row_block_size; ++r) {
 | 
	
		
			
				|  |  | +    for (int idx1 = 0; idx1 < col_block1_size; ++idx1) {
 | 
	
		
			
				|  |  | +      for (int idx2 = 0; idx2 < col_block2_size; ++idx2) {
 | 
	
		
			
				|  |  | +        output[output_cols * idx1 + idx2] +=
 | 
	
		
			
				|  |  | +            input[input_cols * r + col_block1_begin + idx1] *
 | 
	
		
			
				|  |  | +            input[input_cols * r + col_block2_begin + idx2];
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +  }
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  |  }  // namespace
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  CompressedRowSparseMatrix*
 | 
	
		
			
				|  |  |  CompressedRowSparseMatrix::CreateOuterProductMatrixAndProgram(
 | 
	
		
			
				|  |  |        const CompressedRowSparseMatrix& m,
 | 
	
		
			
				|  |  | +      const int stype,
 | 
	
		
			
				|  |  |        vector<int>* program) {
 | 
	
		
			
				|  |  |    CHECK_NOTNULL(program)->clear();
 | 
	
		
			
				|  |  |    CHECK_GT(m.num_nonzeros(), 0)
 | 
	
	
		
			
				|  | @@ -501,60 +696,152 @@ CompressedRowSparseMatrix::CreateOuterProductMatrixAndProgram(
 | 
	
		
			
				|  |  |                  << "you found a bug in Ceres. Please report it.";
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    vector<ProductTerm> product;
 | 
	
		
			
				|  |  | -  const vector<int>& row_blocks = m.row_blocks();
 | 
	
		
			
				|  |  | -  int row_block_begin = 0;
 | 
	
		
			
				|  |  | -  // Iterate over row blocks
 | 
	
		
			
				|  |  | -  for (int row_block = 0; row_block < row_blocks.size(); ++row_block) {
 | 
	
		
			
				|  |  | -    const int row_block_end = row_block_begin + row_blocks[row_block];
 | 
	
		
			
				|  |  | -    // Compute the outer product terms for just one row per row block.
 | 
	
		
			
				|  |  | -    const int r = row_block_begin;
 | 
	
		
			
				|  |  | -    // Compute the lower triangular part of the product.
 | 
	
		
			
				|  |  | -    for (int idx1 = m.rows()[r]; idx1 < m.rows()[r + 1]; ++idx1) {
 | 
	
		
			
				|  |  | -      for (int idx2 = m.rows()[r]; idx2 <= idx1; ++idx2) {
 | 
	
		
			
				|  |  | -        product.push_back(ProductTerm(m.cols()[idx1],
 | 
	
		
			
				|  |  | -                                      m.cols()[idx2],
 | 
	
		
			
				|  |  | -                                      product.size()));
 | 
	
		
			
				|  |  | +  const vector<int>& col_blocks = m.col_blocks();
 | 
	
		
			
				|  |  | +  const vector<int>& crsb_rows = m.crsb_rows();
 | 
	
		
			
				|  |  | +  const vector<int>& crsb_cols = m.crsb_cols();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // Give input matrix m in Compressed Row Sparse Block format
 | 
	
		
			
				|  |  | +  //     (row_block, col_block)
 | 
	
		
			
				|  |  | +  // represent each block multiplication
 | 
	
		
			
				|  |  | +  //     (row_block, col_block1)' X (row_block, col_block2)
 | 
	
		
			
				|  |  | +  // by its product term index and sort the product terms
 | 
	
		
			
				|  |  | +  //     (col_block1, col_block2, index)
 | 
	
		
			
				|  |  | +  //
 | 
	
		
			
				|  |  | +  // Due to the compression on rows, col_block is accessed through idx to
 | 
	
		
			
				|  |  | +  // crsb_cols.  So col_block is accessed as crsb_cols[idx] in the code.
 | 
	
		
			
				|  |  | +  for (int row_block = 1; row_block < crsb_rows.size(); ++row_block) {
 | 
	
		
			
				|  |  | +    for (int idx1 = crsb_rows[row_block - 1]; idx1 < crsb_rows[row_block];
 | 
	
		
			
				|  |  | +         ++idx1) {
 | 
	
		
			
				|  |  | +      if (stype > 0) { // Lower triangular matrix.
 | 
	
		
			
				|  |  | +        for (int idx2 = crsb_rows[row_block - 1]; idx2 <= idx1; ++idx2) {
 | 
	
		
			
				|  |  | +          product.push_back(ProductTerm(crsb_cols[idx1], crsb_cols[idx2],
 | 
	
		
			
				|  |  | +                                        product.size()));
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +      else { // Upper triangular matrix.
 | 
	
		
			
				|  |  | +        for (int idx2 = idx1; idx2 < crsb_rows[row_block]; ++idx2) {
 | 
	
		
			
				|  |  | +          product.push_back(ProductTerm(crsb_cols[idx1], crsb_cols[idx2],
 | 
	
		
			
				|  |  | +                                        product.size()));
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  | -    row_block_begin = row_block_end;
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  | -  CHECK_EQ(row_block_begin, m.num_rows());
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |    sort(product.begin(), product.end());
 | 
	
		
			
				|  |  | -  return CompressAndFillProgram(m.num_cols(), m.num_cols(), product, program);
 | 
	
		
			
				|  |  | +  return CompressAndFillProgram(m.num_cols(), col_blocks, product, program);
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | +// Give input matrix m in Compressed Row Sparse Block format
 | 
	
		
			
				|  |  | +//     (row_block, col_block)
 | 
	
		
			
				|  |  | +// compute outerproduct m' * m as sum of block multiplications
 | 
	
		
			
				|  |  | +//     (row_block, col_block1)' X (row_block, col_block2)
 | 
	
		
			
				|  |  | +//
 | 
	
		
			
				|  |  | +// Given row_block of the input matrix m, we use m_row_begin to represent
 | 
	
		
			
				|  |  | +// the starting row of the row block and m_row_nnz to represent number of
 | 
	
		
			
				|  |  | +// nonzero in each row of the row block, then the rows belonging to
 | 
	
		
			
				|  |  | +// the row block can be represented as a dense matrix starting at
 | 
	
		
			
				|  |  | +//     m.values() + m.rows()[m_row_begin]
 | 
	
		
			
				|  |  | +// with dimension
 | 
	
		
			
				|  |  | +//     <m.row_blocks()[row_block], m_row_nnz>
 | 
	
		
			
				|  |  | +//
 | 
	
		
			
				|  |  | +// Then each input matrix block (row_block, col_block) can be represented as
 | 
	
		
			
				|  |  | +// a block of above dense matrix starting at position
 | 
	
		
			
				|  |  | +//     (0, m_col_nnz)
 | 
	
		
			
				|  |  | +// with size
 | 
	
		
			
				|  |  | +//     <m.row_blocks()[row_block], m.col_blocks()[col_block]>
 | 
	
		
			
				|  |  | +// where m_col_nnz is the number of nonzero before col_block in each row.
 | 
	
		
			
				|  |  | +//
 | 
	
		
			
				|  |  | +// The outerproduct block is represented similarly with m_row_begin,
 | 
	
		
			
				|  |  | +// m_row_nnz, m_col_nnz, etc. replaced by row_begin, row_nnz, col_nnz, etc.
 | 
	
		
			
				|  |  | +// The difference is, m_row_begin and m_col_nnz is counted during the
 | 
	
		
			
				|  |  | +// traverse of block multiplication, while row_begin and col_nnz are got
 | 
	
		
			
				|  |  | +// from pre-computed block_offsets and program.
 | 
	
		
			
				|  |  | +//
 | 
	
		
			
				|  |  | +// Due to the compression on rows, col_block is accessed through
 | 
	
		
			
				|  |  | +// idx to crsb_col vector. So col_block is accessed as crsb_col[idx]
 | 
	
		
			
				|  |  | +// in the code.
 | 
	
		
			
				|  |  | +//
 | 
	
		
			
				|  |  | +// Note this function produces a triangular matrix in block unit (i.e.
 | 
	
		
			
				|  |  | +// diagonal block is a normal block) instead of standard triangular matrix.
 | 
	
		
			
				|  |  | +// So there is no special handling for diagonal blocks.
 | 
	
		
			
				|  |  |  void CompressedRowSparseMatrix::ComputeOuterProduct(
 | 
	
		
			
				|  |  |      const CompressedRowSparseMatrix& m,
 | 
	
		
			
				|  |  | +    const int stype,
 | 
	
		
			
				|  |  |      const vector<int>& program,
 | 
	
		
			
				|  |  |      CompressedRowSparseMatrix* result) {
 | 
	
		
			
				|  |  |    result->SetZero();
 | 
	
		
			
				|  |  |    double* values = result->mutable_values();
 | 
	
		
			
				|  |  | -  const vector<int>& row_blocks = m.row_blocks();
 | 
	
		
			
				|  |  | +  const int* rows = result->rows();
 | 
	
		
			
				|  |  | +  const vector<int>& block_offsets = result->block_offsets();
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |    int cursor = 0;
 | 
	
		
			
				|  |  | -  int row_block_begin = 0;
 | 
	
		
			
				|  |  |    const double* m_values = m.values();
 | 
	
		
			
				|  |  |    const int* m_rows = m.rows();
 | 
	
		
			
				|  |  | -  // Iterate over row blocks.
 | 
	
		
			
				|  |  | -  for (int row_block = 0; row_block < row_blocks.size(); ++row_block) {
 | 
	
		
			
				|  |  | -    const int row_block_end = row_block_begin + row_blocks[row_block];
 | 
	
		
			
				|  |  | -    const int saved_cursor = cursor;
 | 
	
		
			
				|  |  | -    for (int r = row_block_begin; r < row_block_end; ++r) {
 | 
	
		
			
				|  |  | -      // Reuse the program segment for each row in this row block.
 | 
	
		
			
				|  |  | -      cursor = saved_cursor;
 | 
	
		
			
				|  |  | -      const int row_begin = m_rows[r];
 | 
	
		
			
				|  |  | -      const int row_end = m_rows[r + 1];
 | 
	
		
			
				|  |  | -      for (int idx1 = row_begin; idx1 < row_end; ++idx1) {
 | 
	
		
			
				|  |  | -        const double v1 =  m_values[idx1];
 | 
	
		
			
				|  |  | -        for (int idx2 = row_begin; idx2 <= idx1; ++idx2, ++cursor) {
 | 
	
		
			
				|  |  | -          values[program[cursor]] += v1 * m_values[idx2];
 | 
	
		
			
				|  |  | +  const vector<int>& row_blocks = m.row_blocks();
 | 
	
		
			
				|  |  | +  const vector<int>& col_blocks = m.col_blocks();
 | 
	
		
			
				|  |  | +  const vector<int>& crsb_rows = m.crsb_rows();
 | 
	
		
			
				|  |  | +  const vector<int>& crsb_cols = m.crsb_cols();
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +#define COL_BLOCK1 (crsb_cols[idx1])
 | 
	
		
			
				|  |  | +#define COL_BLOCK2 (crsb_cols[idx2])
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +  // Iterate row blocks.
 | 
	
		
			
				|  |  | +  for (int row_block = 0, m_row_begin = 0; row_block < row_blocks.size();
 | 
	
		
			
				|  |  | +       m_row_begin += row_blocks[row_block++]) {
 | 
	
		
			
				|  |  | +    // Non zeros are not stored consecutively across rows in a block.
 | 
	
		
			
				|  |  | +    // The gaps between rows is the number of nonzeros of the
 | 
	
		
			
				|  |  | +    // input matrix compressed row.
 | 
	
		
			
				|  |  | +    const int m_row_nnz =
 | 
	
		
			
				|  |  | +        m_rows[m_row_begin + 1] - m_rows[m_row_begin];
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    // Iterate (col_block1 x col_block2).
 | 
	
		
			
				|  |  | +    for (int idx1 = crsb_rows[row_block], m_col_nnz1 = 0;
 | 
	
		
			
				|  |  | +         idx1 < crsb_rows[row_block + 1];
 | 
	
		
			
				|  |  | +         m_col_nnz1 += col_blocks[COL_BLOCK1], ++idx1) {
 | 
	
		
			
				|  |  | +      // Non zeros are not stored consecutively across rows in a block.
 | 
	
		
			
				|  |  | +      // The gaps between rows is the number of nonzeros of the
 | 
	
		
			
				|  |  | +      // outerproduct matrix compressed row.
 | 
	
		
			
				|  |  | +      const int row_begin = block_offsets[COL_BLOCK1];
 | 
	
		
			
				|  |  | +      const int row_nnz = rows[row_begin + 1] - rows[row_begin];
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +      if (stype > 0) {  // Lower triangular matrix.
 | 
	
		
			
				|  |  | +        for (int idx2 = crsb_rows[row_block], m_col_nnz2 = 0;
 | 
	
		
			
				|  |  | +             idx2 <= idx1;
 | 
	
		
			
				|  |  | +             m_col_nnz2 += col_blocks[COL_BLOCK2], ++idx2, ++cursor) {
 | 
	
		
			
				|  |  | +          int col_nnz = program[cursor];
 | 
	
		
			
				|  |  | +          ComputeBlockMultiplication(row_blocks[row_block],
 | 
	
		
			
				|  |  | +                                     col_blocks[COL_BLOCK1],
 | 
	
		
			
				|  |  | +                                     col_blocks[COL_BLOCK2],
 | 
	
		
			
				|  |  | +                                     m_col_nnz1,
 | 
	
		
			
				|  |  | +                                     m_col_nnz2,
 | 
	
		
			
				|  |  | +                                     m_row_nnz,
 | 
	
		
			
				|  |  | +                                     m_values + m_rows[m_row_begin],
 | 
	
		
			
				|  |  | +                                     row_nnz,
 | 
	
		
			
				|  |  | +                                     values + rows[row_begin] + col_nnz);
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +      }
 | 
	
		
			
				|  |  | +      else { // Upper triangular matrix.
 | 
	
		
			
				|  |  | +        for (int idx2 = idx1, m_col_nnz2 = m_col_nnz1;
 | 
	
		
			
				|  |  | +             idx2 < crsb_rows[row_block + 1];
 | 
	
		
			
				|  |  | +             m_col_nnz2 += col_blocks[COL_BLOCK2], ++idx2, ++cursor) {
 | 
	
		
			
				|  |  | +          int col_nnz = program[cursor];
 | 
	
		
			
				|  |  | +          ComputeBlockMultiplication(row_blocks[row_block],
 | 
	
		
			
				|  |  | +                                     col_blocks[COL_BLOCK1],
 | 
	
		
			
				|  |  | +                                     col_blocks[COL_BLOCK2],
 | 
	
		
			
				|  |  | +                                     m_col_nnz1,
 | 
	
		
			
				|  |  | +                                     m_col_nnz2,
 | 
	
		
			
				|  |  | +                                     m_row_nnz,
 | 
	
		
			
				|  |  | +                                     m_values + m_rows[m_row_begin],
 | 
	
		
			
				|  |  | +                                     row_nnz,
 | 
	
		
			
				|  |  | +                                     values + rows[row_begin] + col_nnz);
 | 
	
		
			
				|  |  |          }
 | 
	
		
			
				|  |  |        }
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  | -    row_block_begin = row_block_end;
 | 
	
		
			
				|  |  |    }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -  CHECK_EQ(row_block_begin, m.num_rows());
 | 
	
		
			
				|  |  | +#undef COL_BLOCK1
 | 
	
		
			
				|  |  | +#undef COL_BLOCK2
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |    CHECK_EQ(cursor, program.size());
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  |  
 |