algorithm_search&lt;cudnnConvolutionBwdFilterAlgoPerf_t&gt; Specialization — pytorch Architecture
Architecture documentation for the algorithm_search template specialization (backward-filter path; its default algorithm is the cuDNN enum value CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) in Conv_v7.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cudnn/Conv_v7.cpp lines 437–524
template <>
struct algorithm_search<cudnnConvolutionBwdFilterAlgoPerf_t> {
using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
using algo_t = cudnnConvolutionBwdFilterAlgo_t;
static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
static BenchmarkCache<perf_t>& cache() {
return bwd_filter_algos;
}
static std::vector<perf_t> findAlgorithms(
const ConvolutionArgs& args,
bool benchmark) {
static const algo_t algos[] = {
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED,
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING,
};
// NOTE: - 1 because ALGO_WINOGRAD is not implemented
static constexpr int num_algos =
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1;
static_assert(
sizeof(algos) / sizeof(algos[0]) == num_algos,
"Missing cuDNN convolution backward filter algorithms.");
c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT>
perf_results;
int perf_count;
if (!benchmark) {
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnGetConvolutionBackwardFilterAlgorithm_v7(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
args.cdesc.desc(),
args.wdesc.desc(),
num_algos,
&perf_count,
perf_results.data()),
args);
} else {
size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
Workspace ws(max_ws_size);
at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind");
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnFindConvolutionBackwardFilterAlgorithmEx(
args.handle,
args.idesc.desc(),
args.input.const_data_ptr(),
args.odesc.desc(),
args.output.const_data_ptr(),
args.cdesc.desc(),
args.wdesc.desc(),
args.weight.data_ptr(),
num_algos,
&perf_count,
perf_results.data(),
ws.data,
ws.size),
args);
// Free the cached blocks in our caching allocator. They are
// needed here because the above benchmarking uses a huge amount of
// memory, e.g. a few GBs.
c10::cuda::CUDACachingAllocator::emptyCache();
}
return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
}
static void getWorkspaceSize(
const ConvolutionArgs& args,
algo_t algo,
size_t* workspaceSize) {
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnGetConvolutionBackwardFilterWorkspaceSize(
args.handle,
args.idesc.desc(),
args.odesc.desc(),
args.cdesc.desc(),
args.wdesc.desc(),
algo,
workspaceSize),
args);
}
};
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free