Home / Class / apply_syevj_batched Class — PyTorch Architecture

apply_syevj_batched Class — pytorch Architecture

Architecture documentation for the apply_syevj_batched class in BatchLinearAlgebraLib.cpp from the pytorch codebase.

Entity Profile

Source Code

aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp lines 1462–1569

template <typename scalar_t>
static void apply_syevj_batched(const Tensor& values, const Tensor& vectors, const Tensor& infos, bool upper, bool compute_eigenvectors) {
  using value_t = typename c10::scalar_value_type<scalar_t>::type;

  cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
  cusolverEigMode_t jobz = compute_eigenvectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;

  int n = cuda_int_cast(vectors.size(-1), "n");
  int lda = std::max<int>(1, n);
  int batch_size = cuda_int_cast(batchCount(vectors), "batch_size");

  auto vectors_data = vectors.data_ptr<scalar_t>();
  auto values_data = values.data_ptr<value_t>();
  auto infos_data = infos.data_ptr<int>();

#ifndef USE_CUSOLVER_64_BIT_XSYEV_BATCHED
  // syevj_params controls the numerical accuracy of syevj
  // by default the tolerance is set to machine accuracy
  // the maximum number of iteration of Jacobi method by default is 100
  // cuSOLVER documentations says: "15 sweeps are good enough to converge to machine accuracy"
  // LAPACK has SVD routine based on similar Jacobi algorithm (gesvj) and there a maximum of 30 iterations is set
  // Let's use the default values for now
  syevjInfo_t syevj_params;
  TORCH_CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params));
  TORCH_CUSOLVER_CHECK(cusolverDnXsyevjSetSortEig(syevj_params, 1));

  auto handle = at::cuda::getCurrentCUDASolverDnHandle();

  // get the optimal work size and allocate workspace tensor
  int lwork;
  at::cuda::solver::syevjBatched_bufferSize<scalar_t>(
      handle,
      jobz,
      uplo,
      n,
      vectors_data,
      lda,
      values_data,
      &lwork,
      syevj_params,
      batch_size);

  // allocate workspace storage on device
  auto& allocator = *at::cuda::getCUDADeviceAllocator();
  auto work_data = allocator.allocate(sizeof(scalar_t) * lwork);
  at::cuda::solver::syevjBatched<scalar_t>(
      handle,
      jobz,
      uplo,
      n,
      vectors_data,
      lda,
      values_data,
      static_cast<scalar_t*>(work_data.get()),
      lwork,
      infos_data,
      syevj_params,
      batch_size);
  TORCH_CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params));

#else

  cusolverDnParams_t syev_params;
  TORCH_CUSOLVER_CHECK(cusolverDnCreateParams(&syev_params));

  auto handle = at::cuda::getCurrentCUDASolverDnHandle();

  // get the optimal work size and allocate workspace tensor
  size_t worksize_device;
  size_t worksize_host;

  at::cuda::solver::xsyevBatched_bufferSize<scalar_t>(
      handle,
      syev_params,
      jobz,
      uplo,
      n,
      vectors_data,
      lda,
      values_data,
      &worksize_device,
      &worksize_host,
      batch_size);

  // allocate workspace storage on device and host
  auto& device_allocator = *at::cuda::getCUDADeviceAllocator();
  auto work_device_data = device_allocator.allocate(worksize_device);
  auto& host_allocator = *at::getCPUAllocator();
  auto work_host_data = host_allocator.allocate(worksize_host);
  at::cuda::solver::xsyevBatched<scalar_t>(
      handle,
      syev_params,
      jobz,
      uplo,
      n,
      vectors_data,
      lda,
      values_data,
      work_device_data.get(),
      worksize_device,
      work_host_data.get(),
      worksize_host,
      infos_data,
      batch_size);
  TORCH_CUSOLVER_CHECK(cusolverDnDestroyParams(syev_params));

#endif // USE_CUSOLVER_64_BIT_XSYEV_BATCHED
}

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free