Home / Class / CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM Class — pytorch Architecture

CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM Class — pytorch Architecture

Architecture documentation for the algorithm-search specialization built around CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, defined in Conv_v7.cpp in the pytorch codebase.

Entity Profile

Source Code

aten/src/ATen/native/cudnn/Conv_v7.cpp lines 264–350

// Specialization of algorithm_search for cuDNN *forward* convolution.
// Supplies the perf/algo type aliases, a default algorithm, access to the
// process-wide benchmark cache, and the two strategies for picking an
// algorithm: cuDNN's heuristic query vs. exhaustive on-device benchmarking.
template <>
struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
  using perf_t = cudnnConvolutionFwdAlgoPerf_t;
  using algo_t = cudnnConvolutionFwdAlgo_t;

  // Fallback algorithm used by callers of this trait (outside this excerpt)
  // when no search result is applied.
  static constexpr auto DEFAULT_ALGO =
      CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
  // Shared cache of previously selected forward algorithms, keyed per
  // convolution configuration (fwd_algos is defined elsewhere in this file).
  static BenchmarkCache<perf_t>& cache() {
    return fwd_algos;
  }

  // Returns candidate forward algorithms for the convolution described by
  // `args`, best-first as reported by cuDNN.
  // benchmark == false: ask cuDNN's heuristics (no kernels are run).
  // benchmark == true:  time every algorithm on the actual tensors
  // (expensive, and it writes into args.output as scratch).
  static std::vector<perf_t> findAlgorithms(
      const ConvolutionArgs& args,
      bool benchmark) {
    // Explicit list of all forward algorithms, used only to size the
    // benchmark workspace via getMaxWorkspaceSize below.
    static const algo_t algos[] = {
        CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
        CUDNN_CONVOLUTION_FWD_ALGO_FFT,
        CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
        CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
        CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
        CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
        CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
    };
    static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
    // Keep the hand-written list in sync with the cuDNN enum: if a cuDNN
    // upgrade adds an algorithm, this fails to compile instead of silently
    // under-benchmarking.
    static_assert(
        sizeof(algos) / sizeof(algos[0]) == num_algos,
        "Missing cuDNN convolution forward algorithms");
    int perf_count;
    // Raw output buffer for cuDNN: it writes up to num_algos perf_t entries
    // and reports how many are valid via perf_count. The SmallVector's
    // inline capacity (CUDNN_CONVOLUTION_FWD_ALGO_COUNT) avoids a heap
    // allocation; only .data() is used, never its logical size.
    c10::SmallVector<perf_t, CUDNN_CONVOLUTION_FWD_ALGO_COUNT> perf_results;
    if (!benchmark) {
      // Heuristic path: cuDNN ranks algorithms from the descriptors alone.
      AT_CUDNN_CHECK_WITH_SHAPES(
          cudnnGetConvolutionForwardAlgorithm_v7(
              args.handle,
              args.idesc.desc(),
              args.wdesc.desc(),
              args.cdesc.desc(),
              args.odesc.desc(),
              num_algos,
              &perf_count,
              perf_results.data()),
          args);
    } else {
      // Benchmark path: allocate one workspace big enough for every
      // candidate algorithm, then let cuDNN time them all for real.
      size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
      Workspace ws(max_ws_size);
      // cudnnFind launches kernels and allocates; it is not legal while a
      // CUDA graph is being captured, so fail loudly in that case.
      at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind");
      AT_CUDNN_CHECK_WITH_SHAPES(
          cudnnFindConvolutionForwardAlgorithmEx(
              args.handle,
              args.idesc.desc(),
              args.input.const_data_ptr(),
              args.wdesc.desc(),
              args.weight.const_data_ptr(),
              args.cdesc.desc(),
              args.odesc.desc(),
              args.output.data_ptr(), // NOTE: clobbered as scratch output
              num_algos,
              &perf_count,
              perf_results.data(),
              ws.data,
              ws.size),
          args);

      // Free the cached blocks in our caching allocator. They are
      // needed here because the above benchmarking uses a huge amount of
      // memory, e.g. a few GBs.
      c10::cuda::CUDACachingAllocator::emptyCache();
    }
    // Filter out unsupported / errored entries; returns the usable
    // algorithms in cuDNN's preference order.
    return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
  }

  // Queries the scratch-space size (in bytes) that `algo` requires for the
  // convolution described by `args`, writing it to *workspaceSize.
  static void getWorkspaceSize(
      const ConvolutionArgs& args,
      algo_t algo,
      size_t* workspaceSize) {
    AT_CUDNN_CHECK_WITH_SHAPES(
        cudnnGetConvolutionForwardWorkspaceSize(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            algo,
            workspaceSize),
        args);
  }
};

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free