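CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 (algorithm_search specialization) — pytorch Architecture
Architecture documentation for the algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> struct specialization in Conv_v7.cpp from the pytorch codebase, which uses the CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 enumerator as its DEFAULT_ALGO.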
Entity Profile
Source Code
aten/src/ATen/native/cudnn/Conv_v7.cpp lines 352–435
template <>
struct algorithm_search<cudnnConvolutionBwdDataAlgoPerf_t> {
using perf_t = cudnnConvolutionBwdDataAlgoPerf_t;
using algo_t = cudnnConvolutionBwdDataAlgo_t;
static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
static BenchmarkCache<perf_t>& cache() {
return bwd_data_algos;
}
static std::vector<perf_t> findAlgorithms(
const ConvolutionArgs& args,
bool benchmark) {
static const algo_t algos[] = {
CUDNN_CONVOLUTION_BWD_DATA_ALGO_0,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD,
CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED};
static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
static_assert(
sizeof(algos) / sizeof(algos[0]) == num_algos,
"Missing cuDNN convolution backward data algorithms.");
int perf_count;
c10::SmallVector<perf_t, CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT>
perf_results;
if (!benchmark) {
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnGetConvolutionBackwardDataAlgorithm_v7(
args.handle,
args.wdesc.desc(),
args.odesc.desc(),
args.cdesc.desc(),
args.idesc.desc(),
num_algos,
&perf_count,
perf_results.data()),
args);
} else {
size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos);
Workspace ws(max_ws_size);
at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind");
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnFindConvolutionBackwardDataAlgorithmEx(
args.handle,
args.wdesc.desc(),
args.weight.const_data_ptr(),
args.odesc.desc(),
args.output.const_data_ptr(),
args.cdesc.desc(),
args.idesc.desc(),
args.input.data_ptr(),
num_algos,
&perf_count,
perf_results.data(),
ws.data,
ws.size),
args);
// Free the cached blocks in our caching allocator. They are
// needed here because the above benchmarking uses a huge amount of
// memory, e.g. a few GBs.
c10::cuda::CUDACachingAllocator::emptyCache();
}
return getValidAlgorithms<perf_t>(perf_results.data(), args, perf_count);
}
static void getWorkspaceSize(
const ConvolutionArgs& args,
cudnnConvolutionBwdDataAlgo_t algo,
size_t* workspaceSize) {
AT_CUDNN_CHECK_WITH_SHAPES(
cudnnGetConvolutionBackwardDataWorkspaceSize(
args.handle,
args.wdesc.desc(),
args.odesc.desc(),
args.cdesc.desc(),
args.idesc.desc(),
algo,
workspaceSize),
args);
}
};
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free