algorithm_search&lt;cudnnConvolutionFwdAlgoPerf_t&gt; Specialization — pytorch Architecture
Architecture documentation for the algorithm_search&lt;cudnnConvolutionFwdAlgoPerf_t&gt; struct specialization in Conv_v7.cpp from the pytorch codebase. Note that CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM is not a class but a cuDNN enumerator, used here as the specialization's default forward-convolution algorithm.
Entity Profile
Source Code
aten/src/ATen/native/cudnn/Conv_v7.cpp lines 264–350
template <>
struct algorithm_search<cudnnConvolutionFwdAlgoPerf_t> {
using perf_t = cudnnConvolutionFwdAlgoPerf_t;
using algo_t = cudnnConvolutionFwdAlgo_t;
static constexpr auto DEFAULT_ALGO =
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
static BenchmarkCache<perf_t>& cache() {
return fwd_algos;
}
static std::vector<perf_t> findAlgorithms(
const ConvolutionArgs& args,
bool benchmark) {
static const algo_t algos[] = {
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
};
static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
static_assert(
sizeof(algos) / sizeof(algos[0]) == num_algos,
"Missing cuDNN convolution forward algorithms");
int perf_count;
c10::SmallVector<perf_t, CUDNN_CONVOLUTION_FWD_ALGO_COUNT> perf_results;
if (!benchmark) {
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnGetConvolutionForwardAlgorithm_v7(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
args.cdesc.desc(),
args.odesc.desc(),
num_algos,
&perf_count,
perf_results.data()),
args);
} else {
size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
Workspace ws(max_ws_size);
at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind");
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnFindConvolutionForwardAlgorithmEx(
args.handle,
args.idesc.desc(),
args.input.const_data_ptr(),
args.wdesc.desc(),
args.weight.const_data_ptr(),
args.cdesc.desc(),
args.odesc.desc(),
args.output.data_ptr(),
num_algos,
&perf_count,
perf_results.data(),
ws.data,
ws.size),
args);
// Free the cached blocks in our caching allocator. They are
// needed here because the above benchmarking uses a huge amount of
// memory, e.g. a few GBs.
c10::cuda::CUDACachingAllocator::emptyCache();
}
return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
}
static void getWorkspaceSize(
const ConvolutionArgs& args,
algo_t algo,
size_t* workspaceSize) {
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnGetConvolutionForwardWorkspaceSize(
args.handle,
args.idesc.desc(),
args.wdesc.desc(),
args.cdesc.desc(),
args.odesc.desc(),
algo,
workspaceSize),
args);
}
};
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free