apply_svd_cusolver_gesvd Function — pytorch Architecture
Architecture documentation for the apply_svd_cusolver_gesvd function template in BatchLinearAlgebraLib.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp lines 244–314
// Computes the singular value decomposition of each matrix in the batched
// tensor `A` with cuSOLVER's gesvd driver, one matrix at a time:
//   A[i] = U[i] * diag(S[i]) * V[i]^H
//
// Results are written into the preallocated tensors U, S, V; the cuSOLVER
// status/convergence code for matrix i is written to infos[i].
//
// Arguments:
//   A                      batch of input matrices; must satisfy m >= n
//                          (asserted below). NOTE(review): cuSOLVER gesvd
//                          documents that it destroys its input, so A also
//                          serves as scratch.
//   U, S, V                preallocated outputs. V must be a batch of
//                          F-contiguous matrices (the .view() below relies on
//                          it). U and V are only written when compute_uv.
//   infos                  int tensor receiving one gesvd info code per matrix.
//   full_matrices          true -> full U/V (job 'A'); false -> reduced ('S').
//   compute_uv             false -> singular values only (job 'N').
//   calculate_all_batches  process every matrix in the batch when true;
//   batches                otherwise only the batch indices listed here.
template<typename scalar_t>
static void apply_svd_cusolver_gesvd(const Tensor& A, const Tensor& U, const Tensor& S, const Tensor& V,
const Tensor& infos, bool full_matrices, bool compute_uv,
const bool calculate_all_batches,
const std::vector<int64_t>& batches
) {
// Real type underlying scalar_t: singular values are real even for complex inputs.
using value_t = typename c10::scalar_value_type<scalar_t>::type;
auto A_data = A.data_ptr<scalar_t>();
auto S_data = S.data_ptr<value_t>();
auto A_stride = matrixStride(A); // element distance between consecutive matrices in A
auto S_stride = S.size(-1); // one length-k vector of singular values per matrix
int m = cuda_int_cast(A.size(-2), "m");
int n = cuda_int_cast(A.size(-1), "n");
auto k = std::min(m, n); // number of singular values
int lda = std::max<int>(1, m); // leading dimensions must be >= 1 per LAPACK/cuSOLVER convention
int ldvh = std::max<int>(1, n);
// gesvd only handles tall/square inputs; wide inputs must be transposed by the caller.
TORCH_INTERNAL_ASSERT(m >= n, "cusolver gesvd only supports matrix with sizes m >= n");
// jobu/jobvt: 'A' = all columns, 'S' = first min(m,n) columns, 'N' = skip U/V.
char job = compute_uv ? (full_matrices ? 'A' : 'S') : 'N';
auto handle = at::cuda::getCurrentCUDASolverDnHandle();
// Workspace-size query: cuSOLVER fills in the lwork it needs for an (m, n) problem.
int lwork = -1;
at::cuda::solver::gesvd_buffersize<scalar_t>(handle, m, n, &lwork);
TORCH_INTERNAL_ASSERT(lwork >= 0, "gesvd_buffersize failed to get needed buffer size, got lwork = ", lwork);
// Device scratch buffers, shared across every iteration of the batch loop below.
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
const auto dataPtr_work = allocator.allocate(sizeof(scalar_t)*lwork);
// rwork holds min(m, n) real values -- per the cuSOLVER docs these are the
// unconverged superdiagonal elements when gesvd does not fully converge.
const auto dataPtr_rwork = allocator.allocate(sizeof(value_t)*std::min(m, n));
// nb. We can do this .view() because V is a batch of F-contig matrices
const auto V_view = compute_uv ? V.view({-1, n, V.size(-1)})
: Tensor{};
// V is F-contig. Since this function computes Vh, we need an auxiliary F-conj-transposed matrix to hold Vh
// gesvd writes raw column-major V^H into this buffer (through Vh_ptr). Read
// through this row-major-plus-conj() view, the same memory is seen as V, so
// the copy_ at the bottom of the loop materializes V directly into the output.
// Note: since m >= n is asserted above, k == n, so the shape is n x n either way.
const auto Vh_workspace = compute_uv ? at::empty({n, full_matrices ? n : k},
A.options().memory_format(at::MemoryFormat::Contiguous)).conj()
: Tensor{};
const auto Vh_ptr = compute_uv ? Vh_workspace.data_ptr<scalar_t>()
: nullptr;
// U is written in place, one matrix per batch element.
const auto U_stride = compute_uv ? matrixStride(U) : 0;
const auto U_ptr = compute_uv ? U.data_ptr<scalar_t>() : nullptr;
// Either sweep the whole batch or just the caller-selected indices.
int batchsize = calculate_all_batches ? cuda_int_cast(batchCount(A), "batch size")
: batches.size();
for(int _i = 0; _i < batchsize; _i++){
int i = calculate_all_batches ? _i : batches[_i]; // actual batch index to factor
at::cuda::solver::gesvd<scalar_t>(
handle, job, job, m, n,
A_data + i * A_stride, // input matrix (destroyed by gesvd)
lda,
S_data + i * S_stride, // singular values, in descending order
compute_uv ? U_ptr + i * U_stride : nullptr,
lda,
compute_uv ? Vh_ptr : nullptr, // V^H goes to the shared workspace, not to V
ldvh,
reinterpret_cast<scalar_t*>(dataPtr_work.get()),
lwork,
reinterpret_cast<value_t*>(dataPtr_rwork.get()),
infos.data_ptr<int>() + i // per-matrix status/convergence code
);
if (compute_uv) {
// Copy the conjugate-transposed view of the gesvd output into V[i].
V_view[i].copy_(Vh_workspace);
}
}
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free