apply_ldl_solve_cusolver Function — pytorch Architecture
Architecture documentation for the apply_ldl_solve_cusolver function template in BatchLinearAlgebraLib.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp lines 117–208
// Solves A*X = B in-place (the solution overwrites B) given an LDL^T-style
// factorization of A and its pivot indices, by dispatching to cuSOLVER's
// 64-bit generic-datatype sytrs routine (cusolverDnXsytrs).
//
// Args:
//   A:      factorized matrix (batched); read-only here.
//   pivots: pivot indices from the factorization; converted to int64 below
//           because cusolverDnXsytrs consumes 64-bit pivots.
//   B:      right-hand sides on input, overwritten with the solution.
//   upper:  selects which triangle of A holds the factorization.
template <typename scalar_t>
void apply_ldl_solve_cusolver(
const Tensor& A,
const Tensor& pivots,
const Tensor& B,
bool upper) {
#if !(defined(CUDART_VERSION) && defined(CUSOLVER_VERSION))
// Built without cuSOLVER support: this path cannot run at all.
TORCH_CHECK(
false,
"Calling torch.linalg.ldl_solve on a CUDA tensor requires compiling ",
"PyTorch with cuSOLVER. Please use PyTorch built with cuSOLVER 11.1.2+ (CUDA 11.3.1+) support.");
#else
// Debug-only sanity checks: callers are expected to hand us non-empty batches.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(A) > 0);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(batchCount(pivots.unsqueeze(-1)) > 0);
auto batch_size = batchCount(B);
auto n = A.size(-2);
auto nrhs = B.size(-1);
// Leading dimensions are taken from stride(-1); assumes the caller prepared
// the tensors in batched column-major (Fortran-contiguous) layout — TODO confirm.
auto lda = A.stride(-1);
auto ldb = B.stride(-1);
auto uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
// Element stride between consecutive matrices in the batch; 0 when the
// input is a single (2-D / 1-D pivot) tensor so the loop reuses index 0.
auto a_stride = A.dim() > 2 ? A.stride(-3) : 0;
auto b_stride = B.dim() > 2 ? B.stride(-3) : 0;
auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0;
auto a_data = A.const_data_ptr<scalar_t>();
auto b_data = B.data_ptr<scalar_t>();
// cusolverDnXsytrs takes int64_t pivot indices, so widen once up front.
// Note: pivots_ must stay alive for the whole loop below (it owns the buffer).
auto pivots_ = pivots.to(kLong);
auto pivots_data = pivots_.const_data_ptr<int64_t>();
// needed to run ldl_solve tests in parallel
// see https://github.com/pytorch/pytorch/issues/82894 for examples of failures
c10::cuda::device_synchronize();
auto handle = at::cuda::getCurrentCUDASolverDnHandle();
auto datatype = at::cuda::solver::get_cusolver_datatype<scalar_t>();
// Query the required device- and host-side scratch sizes before solving.
size_t worksize_device = 0;
size_t worksize_host = 0;
TORCH_CUSOLVER_CHECK(cusolverDnXsytrs_bufferSize(
handle,
uplo,
n,
nrhs,
datatype,
a_data,
lda,
pivots_data,
datatype,
b_data,
ldb,
&worksize_device,
&worksize_host));
// allocate workspace storage
// (DataPtr-owned, so both buffers are released when this function returns)
auto& device_allocator = *at::cuda::getCUDADeviceAllocator();
auto workdata_device = device_allocator.allocate(worksize_device);
void* workdata_device_ptr = workdata_device.get();
auto& host_allocator = *at::getCPUAllocator();
auto workdata_host = host_allocator.allocate(worksize_host);
void* workdata_host_ptr = workdata_host.get();
// Scalar int tensor receiving cuSOLVER's status output for each solve.
Tensor info = at::zeros({}, A.options().dtype(at::kInt));
// cusolverDnXsytrs has no batched variant here, so launch one solve per
// matrix, offsetting the data pointers by the batch strides computed above.
for (const auto i : c10::irange(batch_size)) {
const auto* a_working_ptr = &a_data[i * a_stride];
auto* b_working_ptr = &b_data[i * b_stride];
const auto* pivots_working_ptr = &pivots_data[i * pivots_stride];
TORCH_CUSOLVER_CHECK(cusolverDnXsytrs(
handle,
uplo,
n,
nrhs,
datatype,
a_working_ptr,
lda,
pivots_working_ptr,
datatype,
b_working_ptr,
ldb,
workdata_device_ptr,
worksize_device,
workdata_host_ptr,
worksize_host,
info.data_ptr<int>()));
}
// info from sytrs only reports if the i-th parameter is wrong
// so we don't need to check it all the time
// (i.e. a nonzero value indicates a programming error, not bad input data,
// hence the debug-only assertion; info.item() syncs with the device.)
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info.item().toInt() == 0);
#endif
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free