apply_cholesky_cusolver_potrf_looped Class — pytorch Architecture
Architecture documentation for the apply_cholesky_cusolver_potrf_looped class in BatchLinearAlgebraLib.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp lines 713–777
/*
 * Computes the Cholesky factorization of every matrix in a batched input by
 * invoking cuSOLVER's potrf once per matrix in a host-side loop.
 *
 * self_working_copy: batch of square (n x n) matrices; factorized in place
 *                    (potrf overwrites its input with the triangular factor).
 * upper:             if true, compute the upper-triangular factor; otherwise
 *                    the lower-triangular factor.
 * infos:             int tensor with one entry per matrix; each potrf call
 *                    writes its LAPACK-style status code there (0 = success,
 *                    >0 = leading minor of that order is not positive definite).
 *
 * Two code paths: the 64-bit generic API (cusolverDnXpotrf) when
 * USE_CUSOLVER_64_BIT is available, and the legacy typed API otherwise.
 */
template<typename scalar_t>
static void apply_cholesky_cusolver_potrf_looped(const Tensor& self_working_copy, bool upper, const Tensor& infos) {
  auto handle = at::cuda::getCurrentCUDASolverDnHandle();
  const auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  const int64_t n = self_working_copy.size(-1);
  // cuSOLVER requires lda >= 1 even for empty matrices.
  const int64_t lda = std::max<int64_t>(1, n);
  const int64_t batch_size = batchCount(self_working_copy);
  const int64_t matrix_stride = matrixStride(self_working_copy);

  scalar_t* self_working_copy_ptr = self_working_copy.data_ptr<scalar_t>();
  int* infos_ptr = infos.data_ptr<int>();

#ifdef USE_CUSOLVER_64_BIT
  size_t worksize_device;
  size_t worksize_host;
  cusolverDnParams_t params;
  cudaDataType datatype = at::cuda::solver::get_cusolver_datatype<scalar_t>();
  // FIX: source contained the mis-encoded token `¶ms` (HTML-entity
  // corruption of `&params`); the API takes the address of the params handle.
  TORCH_CUSOLVER_CHECK(cusolverDnCreateParams(&params));
  at::cuda::solver::xpotrf_buffersize(
      handle, params, uplo, n, datatype, nullptr, lda, datatype,
      &worksize_device, &worksize_host);

  // Allocate one workspace slice per matrix so the per-iteration calls do
  // not alias each other's scratch space.
  auto& device_allocator = *at::cuda::getCUDADeviceAllocator();
  auto workdata_device = device_allocator.allocate(worksize_device * batch_size);
  void* workdata_device_ptr = workdata_device.get();
  auto& host_allocator = *at::getCPUAllocator();
  auto workdata_host = host_allocator.allocate(worksize_host * batch_size);
  void* workdata_host_ptr = workdata_host.get();

  for (int64_t i = 0; i < batch_size; i++) {
    at::cuda::solver::xpotrf(
      handle, params, uplo, n, datatype,
      self_working_copy_ptr + i * matrix_stride,
      lda, datatype,
      static_cast<char*>(workdata_device_ptr) + i * worksize_device, worksize_device,
      static_cast<char*>(workdata_host_ptr) + i * worksize_host, worksize_host,
      infos_ptr + i
    );
  }

  TORCH_CUSOLVER_CHECK(cusolverDnDestroyParams(params));
#else // USE_CUSOLVER_64_BIT
  // Legacy 32-bit API: dimensions must fit in int; cuda_int_cast checks this.
  int n_32 = cuda_int_cast(n, "n");
  int lda_32 = cuda_int_cast(lda, "lda");
  int lwork;
  at::cuda::solver::potrf_buffersize<scalar_t>(
      handle, uplo, n_32, nullptr, lda_32, &lwork);

  // One device workspace slice of lwork elements per matrix in the batch.
  auto& allocator = *at::cuda::getCUDADeviceAllocator();
  auto work_data = allocator.allocate(sizeof(scalar_t) * lwork * batch_size);
  scalar_t* work_data_ptr = static_cast<scalar_t*>(work_data.get());

  for (int64_t i = 0; i < batch_size; i++) {
    at::cuda::solver::potrf<scalar_t>(
      handle, uplo, n_32,
      self_working_copy_ptr + i * matrix_stride,
      lda_32,
      work_data_ptr + i * lwork,
      lwork,
      infos_ptr + i
    );
  }
#endif // USE_CUSOLVER_64_BIT
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free