apply_triangular_solve_batched Function — pytorch Architecture
Architecture documentation for the apply_triangular_solve_batched template function in BatchLinearAlgebraLibBlas.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp lines 194–218
template <typename scalar_t>
static void apply_triangular_solve_batched(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) {
cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
const auto trans = to_cublas(transpose);
cublasSideMode_t side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
cublasDiagType_t diag = unitriangular ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
auto batch_size = cuda_int_cast(batchCount(A), "batch_size");
// This allows to pass rectangular A and B when left = True
auto m = cuda_int_cast(left ? A.size(-1) : B.size(-2), "m");
auto n = cuda_int_cast(B.size(-1), "n");
auto lda = std::max<int>(1, cuda_int_cast(A.size(-2), "lda"));
auto ldb = std::max<int>(1, cuda_int_cast(B.size(-2), "ldb"));
auto alpha = scalar_t{1};
// cuBLAS batched trsm requires input to be the device array of pointers to device single matrices
Tensor A_ptr_array = get_device_pointers<scalar_t>(A);
Tensor B_ptr_array = get_device_pointers<scalar_t>(B);
auto A_ptr_array_data = reinterpret_cast<scalar_t**>(A_ptr_array.data_ptr());
auto B_ptr_array_data = reinterpret_cast<scalar_t**>(B_ptr_array.data_ptr());
auto handle = at::cuda::getCurrentCUDABlasHandle();
at::cuda::blas::trsmBatched(handle, side, uplo, trans, diag, m, n, &alpha, A_ptr_array_data, lda, B_ptr_array_data, ldb, batch_size);
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free