apply_ldl_solve_cusolver Function — pytorch Architecture
Architecture documentation for the apply_ldl_solve_cusolver function template in BatchLinearAlgebraLib.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp lines 117–208
// Solves A*X = B in-place (the solution overwrites B) given an LDL^T-style
// factorization of A and its pivot indices, by dispatching to cuSOLVER's
// 64-bit generic-datatype sytrs routine (cusolverDnXsytrs).
//
// Args:
//   A:      factorized matrix (batched); read-only here.
//   pivots: pivot indices from the factorization; converted to int64 below
//           because cusolverDnXsytrs consumes 64-bit pivots.
//   B:      right-hand sides on input, overwritten with the solution.
//   upper:  selects which triangle of A holds the factorization.
template <typename scalar_t>
void apply_ldl_solve_cusolver(
const Tensor& A,
const Tensor& pivots,
const Tensor& B,
bool upper) {
#if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION))
// Built without cuSOLVER support: this path cannot run at all.
TORCH_CHECK(
false,
"Calling torch.linalg.ldl_solve on a CUDA tensor requires compiling ",
"PyTorch with cuSOLVER. Please use PyTorch built with cuSOLVER 11.1.2+ (CUDA 11.3.1+) support.");
#else
// Debug-only sanity checks: callers are expected to hand us non-empty batches.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) > 0);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(pivots.unsqueeze(-1)) > 0);
auto batch_size = batchCount(B);
auto n = A.size(-2);
auto nrhs = B.size(-1);
// Leading dimensions are taken from stride(-1); assumes the caller prepared
// the tensors in batched column-major (Fortran-contiguous) layout — TODO confirm.
auto lda = A.stride(-1);
auto ldb = B.stride(-1);
auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// Element stride between consecutive matrices in the batch; 0 when the
// input is a single (2-D / 1-D pivot) tensor so the loop reuses index 0.
auto a_stride = A.dim() > 2 ? A.stride(-3) : 0;
auto b_stride = B.dim() > 2 ? B.stride(-3) : 0;
auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0;
auto a_data = A.const_data_ptr<scalar_t>();
auto b_data = B.data_ptr<scalar_t>();
// cusolverDnXsytrs takes int64_t pivot indices, so widen once up front.
// Note: pivots_ must stay alive for the whole loop below (it owns the buffer).
auto pivots_ = pivots.to(kLong);
auto pivots_data = pivots_.const_data_ptr<int64_t>();
// needed to run ldl_solve tests in parallel
// see https://github.com/pytorch/pytorch/issues/82894 for examples of failures
c10::cuda::device_synchronize();
auto handle = at::cuda::getCurrentCUDASolverDnHandle();
auto datatype = at::cuda::solver::get_cusolver_datatype<scalar_t>();
// Query the required device- and host-side scratch sizes before solving.
size_t worksize_device = 0;
size_t worksize_host = 0;
TORCH_CUSOLVER_CHECK(cusolverDnXsytrs_bufferSize(
handle,
uplo,
n,
nrhs,
datatype,
a_data,
lda,
pivots_data,
datatype,
b_data,
ldb,
&worksize_device,
&worksize_host));
// allocate workspace storage
// (DataPtr-owned, so both buffers are released when this function returns)
auto& device_allocator = *at::cuda::getCUDADeviceAllocator();
auto workdata_device = device_allocator.allocate(worksize_device);
void* workdata_device_ptr = workdata_device.get();
auto& host_allocator = *at::getCPUAllocator();
auto workdata_host = host_allocator.allocate(worksize_host);
void* workdata_host_ptr = workdata_host.get();
// Scalar int tensor receiving cuSOLVER's status output for each solve.
Tensor info = at::zeros({}, A.options().dtype(at::kInt));
// cusolverDnXsytrs has no batched variant here, so launch one solve per
// matrix, offsetting the data pointers by the batch strides computed above.
for (const auto i : c10::irange(batch_size)) {
const auto* a_working_ptr = &a_data[i * a_stride];
auto* b_working_ptr = &b_data[i * b_stride];
const auto* pivots_working_ptr = &pivots_data[i * pivots_stride];
TORCH_CUSOLVER_CHECK(cusolverDnXsytrs(
handle,
uplo,
n,
nrhs,
datatype,
a_working_ptr,
lda,
pivots_working_ptr,
datatype,
b_working_ptr,
ldb,
workdata_device_ptr,
worksize_device,
workdata_host_ptr,
worksize_host,
info.data_ptr<int>()));
}
// info from sytrs only reports if the i-th parameter is wrong
// so we don't need to check it all the time
// (i.e. a nonzero value indicates a programming error, not bad input data,
// hence the debug-only assertion; info.item() syncs with the device.)
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info.item().toInt() == 0);
#endif
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free