Home / Class / apply_svd_cusolver_gesvd Class — pytorch Architecture

apply_svd_cusolver_gesvd Class — pytorch Architecture

Architecture documentation for the apply_svd_cusolver_gesvd class in BatchLinearAlgebraLib.cpp from the pytorch codebase.

Entity Profile

Source Code

aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp lines 244–314

// Computes the singular value decomposition of each matrix in the batched
// tensor `A` using cuSOLVER's legacy (non-batched) gesvd driver, one call per
// batch element. Singular values go into `S`; if `compute_uv` is true, the
// left/right singular vectors go into `U` and `V`. Per-batch cuSOLVER status
// codes are written into `infos` (one int per processed element).
//
//  A                     batch of m-by-n F-contiguous matrices; requires m >= n.
//  U, S, V               output tensors. Note: V receives V, not V^H (see the
//                        conj-workspace handling below).
//  infos                 int tensor holding one gesvd status code per element.
//  full_matrices         true  -> request all singular vectors (job 'A'),
//                        false -> only the leading min(m, n) columns (job 'S').
//  compute_uv            false -> singular values only (job 'N'); U/V untouched.
//  calculate_all_batches true  -> process every batch element of A,
//                        false -> process only the indices listed in `batches`.
//  batches               batch indices to process when calculate_all_batches
//                        is false (presumably used to re-run failed elements —
//                        confirm against the caller).
template<typename scalar_t>
static void apply_svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
  const Tensor& infos, bool full_matrices, bool compute_uv,
  const bool calculate_all_batches,
  const std::vector<int64_t>& batches
) {
  // value_t is the real type underlying scalar_t (e.g. float for
  // c10::complex<float>) — singular values are always real.
  using value_t = typename c10::scalar_value_type<scalar_t>::type;
  auto A_data = A.data_ptr<scalar_t>();
  auto S_data = S.data_ptr<value_t>();
  auto A_stride = matrixStride(A);
  auto S_stride = S.size(-1);

  int m = cuda_int_cast(A.size(-2), "m");
  int n = cuda_int_cast(A.size(-1), "n");
  auto k = std::min(m, n);
  // LAPACK-style leading dimensions must be >= 1 even for empty matrices.
  int lda = std::max<int>(1, m);
  int ldvh = std::max<int>(1, n);

  // The legacy cusolverDn<t>gesvd driver only handles tall/square inputs;
  // wide inputs must have been transposed before dispatching here.
  TORCH_INTERNAL_ASSERT(m >= n, "cusolver gesvd only supports matrix with sizes m >= n");

  // jobu/jobvt characters: 'A' = all vectors, 'S' = first min(m, n), 'N' = none.
  char job = compute_uv ? (full_matrices ? 'A' : 'S') : 'N';
  auto handle = at::cuda::getCurrentCUDASolverDnHandle();

  // Workspace-size query (lwork = -1 sentinel is overwritten by the query).
  int lwork = -1;
  at::cuda::solver::gesvd_buffersize<scalar_t>(handle, m, n, &lwork);
  TORCH_INTERNAL_ASSERT(lwork >= 0, "gesvd_buffersize failed to get needed buffer size, got lwork = ", lwork);

  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
  const auto dataPtr_work = allocator.allocate(sizeof(scalar_t)*lwork);
  // NOTE(review): per cuSOLVER docs rwork is only consumed for complex dtypes;
  // min(m, n) real elements covers the documented requirement.
  const auto dataPtr_rwork = allocator.allocate(sizeof(value_t)*std::min(m, n));

  // nb. We can do this .view() because V is a batch of F-contig matrices
  const auto V_view = compute_uv ? V.view({-1, n, V.size(-1)})
                                 : Tensor{};
  // V is F-contig. Since this function computes Vh, we need an auxiliary F-conj-transposed matrix to hold Vh
  const auto Vh_workspace = compute_uv ?  at::empty({n, full_matrices ? n : k},
                                              A.options().memory_format(at::MemoryFormat::Contiguous)).conj()
                                       : Tensor{};
  const auto Vh_ptr = compute_uv ? Vh_workspace.data_ptr<scalar_t>()
                                 : nullptr;

  const auto U_stride = compute_uv ? matrixStride(U) : 0;
  const auto U_ptr = compute_uv ? U.data_ptr<scalar_t>() : nullptr;

  // Route batches.size() through cuda_int_cast as well (matching the
  // batchCount branch) so an out-of-int-range count fails loudly instead of
  // silently narrowing size_t -> int.
  int batchsize = calculate_all_batches ? cuda_int_cast(batchCount(A), "batch size")
                                        : cuda_int_cast(batches.size(), "batch size");


  // gesvd has no batched variant, so loop over the selected elements.
  for(int _i = 0; _i < batchsize; _i++){
    int i = calculate_all_batches ? _i : batches[_i];

    at::cuda::solver::gesvd<scalar_t>(
      handle, job, job, m, n,
      A_data + i * A_stride,
      lda,
      S_data + i * S_stride,
      compute_uv ? U_ptr + i * U_stride : nullptr,
      lda, // ldu == lda: U has m rows, same as A
      compute_uv ? Vh_ptr : nullptr,
      ldvh,
      reinterpret_cast<scalar_t*>(dataPtr_work.get()),
      lwork,
      reinterpret_cast<value_t*>(dataPtr_rwork.get()),
      infos.data_ptr<int>() + i
    );

    if (compute_uv) {
      // gesvd produced V^H in the conj-transposed workspace; copying it into
      // the F-contig view of V materializes V itself for this element.
      V_view[i].copy_(Vh_workspace);
    }
  }
}

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free