apply_svd_cusolver_gesvdaStridedBatched Class — PyTorch Architecture
Architecture documentation for the apply_svd_cusolver_gesvdaStridedBatched class in BatchLinearAlgebraLib.cpp from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp lines 517–572
// Computes the SVD of a batch of tall matrices (m >= n) via cuSOLVER's
// gesvdaStridedBatched ("approximate SVD") driver.
//
// A             batch of input matrices; the leading dimension is taken from
//               stride(-1), so A is assumed laid out column-major per matrix
//               (NOTE(review): caller is expected to guarantee this — confirm)
// U, S, V       output tensors for the factorization; U and V are only read
//               when compute_uv is true
// infos         int tensor that receives cuSOLVER's per-matrix status codes
// full_matrices not referenced in this body; gesvda produces only
//               rank = S.size(-1) singular triplets (presumably the
//               economy-size case is the only one routed here — verify caller)
// compute_uv    when false, throwaway device buffers are still allocated for
//               U/V because cuSOLVER fails on null vector pointers
template<typename scalar_t>
static void apply_svd_cusolver_gesvdaStridedBatched(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
    const Tensor& infos, bool full_matrices, bool compute_uv) {
#ifndef CUDART_VERSION
  TORCH_CHECK(false, "gesvda: Batched version is supported only with cuBLAS backend.");
#else
  using value_t = typename c10::scalar_value_type<scalar_t>::type;
  int m = cuda_int_cast(A.size(-2), "m");
  int n = cuda_int_cast(A.size(-1), "n");
  TORCH_INTERNAL_ASSERT(m >= n, "cusolver gesvdaStridedBatched requires m >= n");
  int batchsize = cuda_int_cast(batchCount(A), "batch size");
  // Checked int64_t -> int narrowing, consistent with m/n/batchsize above;
  // Tensor::stride() returns int64_t and would otherwise narrow silently.
  int lda = cuda_int_cast(A.stride(-1), "lda");
  int ldu = compute_uv ? cuda_int_cast(U.stride(-1), "ldu") : m;
  int ldv = compute_uv ? cuda_int_cast(V.stride(-1), "ldv") : n;
  auto A_stride = matrixStride(A);
  auto S_stride = S.size(-1);
  auto rank = S_stride; // number of singular values
  // The strides for "empty matrices" are needed to satisfy cusolver.
  auto U_stride = compute_uv ? matrixStride(U) : ldu * rank;
  auto V_stride = compute_uv ? matrixStride(V) : ldv * rank;

  // Need to pass allocated memory to the function, otherwise it fails:
  // cuSOLVER rejects null U/V pointers even in NOVECTOR mode, so allocate
  // scratch device buffers when the caller does not want singular vectors.
  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
  auto dataPtr_U = !compute_uv ? allocator.allocate(sizeof(scalar_t) * batchsize * m * n) : c10::DataPtr{};
  auto dataPtr_V = !compute_uv ? allocator.allocate(sizeof(scalar_t) * batchsize * n * n) : c10::DataPtr{};

  auto A_data = A.data_ptr<scalar_t>();
  auto U_data = compute_uv ? U.data_ptr<scalar_t>() : reinterpret_cast<scalar_t*>(dataPtr_U.get());
  auto S_data = S.data_ptr<value_t>();
  auto V_data = compute_uv ? V.data_ptr<scalar_t>() : reinterpret_cast<scalar_t*>(dataPtr_V.get());
  auto handle = at::cuda::getCurrentCUDASolverDnHandle();
  auto jobz = compute_uv ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;

  // Workspace-size query: cuSOLVER reports the device scratch space needed
  // for this problem shape before the actual factorization call.
  int lwork = -1;
  at::cuda::solver::gesvdaStridedBatched_buffersize<scalar_t>(
    handle, jobz, rank, m, n, A_data, lda, A_stride, S_data, S_stride, U_data, ldu, U_stride, V_data, ldv, V_stride,
    &lwork, batchsize);
  TORCH_INTERNAL_ASSERT(lwork >= 0, "gesvdaStridedBatched_buffersize failed to get needed buffer size, got lwork = ", lwork);
  auto workspace = allocator.allocate(sizeof(scalar_t) * lwork);

  // The residual Frobenius norm is always returned in double.
  // cuSOLVER remark: if the user is confident on the accuracy of singular values and singular vectors,
  // for example, certain conditions hold (required singular value is far from zero),
  // then the performance can be improved by passing a null pointer to h_RnrmF, i.e. no computation of residual norm.
  // Comment: calculation of Frobenius norm is expensive and doesn't affect accuracy of the result
  at::cuda::solver::gesvdaStridedBatched<scalar_t>(
    handle, jobz, rank, m, n, A_data, lda, A_stride, S_data, S_stride, U_data, ldu, U_stride, V_data, ldv, V_stride,
    reinterpret_cast<scalar_t*>(workspace.get()),
    lwork, infos.data_ptr<int>(),
    nullptr, // cuSOLVER h_RnrmF is not calculated: reinterpret_cast<double*>(residual_frobenius_norm.get()),
    batchsize);
#endif
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free