CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 Class — pytorch Architecture

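Architecture documentation for the entity indexed as CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 in Conv_v7.cpp from the PyTorch codebase. Note that CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 is a cuDNN enum constant rather than a class; the source shown below is the algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> struct specialization, which uses that constant as its default algorithm.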

Class cpp

Entity Profile

Source Code

aten/src/ATen/native/cudnn/Conv_v7.cpp lines 352–435

template <>
struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
  using perf_t = cudnnConvolutionBwdDataAlgoPerf_t;
  using algo_t = cudnnConvolutionBwdDataAlgo_t;

  static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
  static BenchmarkCache<perf_t>& cache() {
    return bwd_data_algos;
  }

  static std::vector<perf_t> findAlgorithms(
      const ConvolutionArgs& args,
      bool benchmark) {
    static const algo_t algos[] = {
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT,
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD,
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED};
    static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
    static_assert(
        sizeof(algos) / sizeof(algos[0]) == num_algos,
        "Missing cuDNN convolution backward data algorithms.");
    int perf_count;
    c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT>
        perf_results;
    if (!benchmark) {
      AT_CUDNN_CHECK_WITH_SHAPES(
          cudnnGetConvolutionBackwardDataAlgorithm_v7(
              args.handle,
              args.wdesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.idesc.desc(),
              num_algos,
              &perf_count,
              perf_results.data()),
          args);
    } else {
      size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
      Workspace ws(max_ws_size);
      at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind");
      AT_CUDNN_CHECK_WITH_SHAPES(
          cudnnFindConvolutionBackwardDataAlgorithmEx(
              args.handle,
              args.wdesc.desc(),
              args.weight.const_data_ptr(),
              args.odesc.desc(),
              args.output.const_data_ptr(),
              args.cdesc.desc(),
              args.idesc.desc(),
              args.input.data_ptr(),
              num_algos,
              &perf_count,
              perf_results.data(),
              ws.data,
              ws.size),
          args);

      // Free the cached blocks in our caching allocator. They are
      // needed here because the above benchmarking uses a huge amount of
      // memory, e.g. a few GBs.
      c10::cuda::CUDACachingAllocator::emptyCache();
    }
    return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
  }

  static void getWorkspaceSize(
      const ConvolutionArgs& args,
      cudnnConvolutionBwdDataAlgo_t algo,
      size_t* workspaceSize) {
    AT_CUDNN_CHECK_WITH_SHAPES(
        cudnnGetConvolutionBackwardDataWorkspaceSize(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            algo,
            workspaceSize),
        args);
  }
};

Source

View on GitHub

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free