ConvParams Struct Template — pytorch Architecture
Architecture documentation for the ConvParams struct template (which makes heavy use of std::any_of) in Convolution.cpp from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/Convolution.cpp lines 286–591
// Hyper-parameters for a single convolution call, plus the predicates used to
// validate them and to pick an execution backend (cuDNN / MIOpen / MKLDNN /
// NNPACK / XNNPACK / MPS / native CPU kernels).
// T is the size/index element type — presumably int64_t or c10::SymInt given
// the at::symint:: accessors used throughout; confirm at instantiation sites.
template <typename T>
struct ConvParams {
  std::vector<T> stride;
  std::vector<T> padding;
  std::vector<T> dilation;
  bool transposed{};
  // Only compared against stride (is_output_padding_big) and fed to
  // conv_input_size for transposed convolutions.
  std::vector<T> output_padding;
  T groups{};
  bool benchmark{};
  bool deterministic{};
  bool cudnn_enabled{};
  bool allow_tf32{};

  // True if any stride element differs from 1.
  bool is_strided() const {
    return std::any_of(
        stride.cbegin(), stride.cend(), [](const T& s) { return s != 1; });
  }

  // True if any dilation element differs from 1.
  bool is_dilated() const {
    return std::any_of(
        dilation.cbegin(), dilation.cend(), [](const T& d) { return d != 1; });
  }

  // True if any padding element is nonzero.
  bool is_padded() const {
    return std::any_of(
        padding.cbegin(), padding.cend(), [](const T& p) { return p != 0; });
  }

  // True if any output_padding element is negative.
  bool is_output_padding_neg() const {
    return std::any_of(
        output_padding.cbegin(),
        output_padding.cend(),
        [](const T& p) { return p < 0; });
  }

  // True if any output_padding element is >= the corresponding stride
  // element. Assumes output_padding.size() <= stride.size() — TODO confirm
  // the callers guarantee this before indexing stride[i].
  bool is_output_padding_big() const {
    // Revisit this with std::views::zip at C++20.
    for (auto i: c10::irange(output_padding.size())) {
      if (output_padding[i] >= stride[i]) {
        return true;
      }
    }
    return false;
  }

  // True if any padding element is negative.
  bool is_padding_neg() const {
    return std::any_of(
        padding.cbegin(), padding.cend(), [](const T& p) { return p < 0; });
  }

  // True if any dilation element is negative.
  bool is_dilation_neg() const {
    return std::any_of(
        dilation.cbegin(), dilation.cend(), [](const T& d) { return d < 0; });
  }

  // True if any stride element is zero or negative.
  bool is_stride_nonpos() const {
    return std::any_of(
        stride.cbegin(), stride.cend(), [](const T& s) { return s <= 0; });
  }

  // Promote 1-d parameters to 2-d in place by prepending the identity value
  // for each parameter (stride/dilation 1, padding/output_padding 0).
  // No-op when stride already has 2+ elements.
  void view1d_as_2d() {
    if (stride.size() == 1) {
      stride.insert(stride.begin(), 1);
      padding.insert(padding.begin(), 0);
      dilation.insert(dilation.begin(), 1);
      output_padding.insert(output_padding.begin(), 0);
    }
  }

  // Whether the CPU Winograd kernel for 3x3 depthwise convolutions applies:
  // only compiled in for ARM NEON or RISC-V vector (intrinsic version >=
  // 1.2) builds, and then only for contiguous float NCHW tensors with a 3x3
  // single-input-channel filter, unit stride/dilation, not transposed.
  bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const std::optional<at::Tensor>& bias) const {
#if defined(__ARM_NEON__) || (defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=12000)
    // Currently only 3x3 depthwise convolutions on tensors of float are supported.
    return (input.ndimension() == 4) &&
           (at::symint::size<T>(input, 1) == groups) &&
           (weight.ndimension() == 4 ) &&
           (at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0) &&
           (at::symint::size<T>(weight, 1) == 1) &&
           (at::symint::size<T>(weight, 2) == 3) &&
           (at::symint::size<T>(weight, 3) == 3) &&
           (input.device().is_cpu()) &&
           (input.scalar_type() == at::kFloat) &&
           input.is_contiguous() &&
           (weight.device().is_cpu()) &&
           (weight.scalar_type() == at::kFloat) &&
           weight.is_contiguous() &&
           (!bias.has_value() || bias->is_contiguous()) &&
           !is_strided() &&
           !is_dilated() &&
           !transposed;
#else
    return false;
#endif
  }

  // True when either the per-sample input size or the per-sample output size
  // exceeds INT32_MAX — i.e. 32-bit indexing is impossible even if the batch
  // dimension were split. Empty inputs never need 64-bit indexing.
  bool needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const {
    constexpr int64_t int_max = std::numeric_limits<int>::max();
    auto numel_input = at::symint::numel<T>(input);
    // empty input
    if (numel_input == 0) {
      return false;
    }
    // input size can not be reduced to the range of int by splitting the batch dim
    auto n = at::symint::size<T>(input, 0);
    if (numel_input / n > int_max) {
      return true;
    }
    // output size can not be reduced to the range of int by splitting the batch dim
    T outsize = 1;
    if (transposed) {
      // Transposed conv: the output shape is the "input size" of the
      // corresponding forward convolution.
      auto o = conv_input_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, output_padding, stride, dilation, groups);
      outsize = c10::multiply_integers(o.begin() + 1, o.end());
    } else {
      auto o = conv_output_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, stride, dilation);
      outsize = c10::multiply_integers(o.begin() + 1, o.end());
    }
    return outsize > int_max;
  }

  // Whether cuDNN can and should run this convolution. Always false on
  // mobile builds, when cuDNN is not compiled in, when the input is not a
  // CUDA tensor, or when the user disabled cuDNN.
  bool use_cudnn(const at::Tensor& input, const at::Tensor& weight) const {
    // Note [Mobile check segfaults]
    // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
    // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
#if !defined(C10_MOBILE)
    if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
      return false;
    }
    // Queried once per process; assumes the runtime cuDNN version is stable.
    static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
    // Large convs that cannot be batch-split into 32-bit-indexable pieces
    // require cuDNN >= 9.3 with the V8 API enabled.
    if (needs_64bit_indexing_no_split(input, weight)) {
      if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
        TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
                        " if the V8 API is not enabled or before cuDNN version 9.3+."
                        " Consider upgrading cuDNN and/or enabling the V8 API for better efficiency.");
        return false;
      }
    }
    // BFloat16 convs additionally require cuDNN v8 bf16 support.
    if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
      if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
        return false;
      }
    }
    // For the contiguous (NCHW) layout, dilated convs also need explicit
    // backend support; channels_last layouts skip this extra gate.
    if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous) {
      if (is_dilated()) {
        return detail::getCUDAHooks().supportsDilatedConvolutionWithCuDNN() && !is_output_padding_big();
      }
    }
    return !is_output_padding_big();
#else
    return false;
#endif
  }

  // Use cudnn for FP16 depthwise convolutions
  bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
    if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) {
      return false;
    }
    // native kernel doesn't support 64-bit non-splittable case
    if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
      // -1 sentinel means "cuDNN not compiled in"; cached once per process.
      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
      // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
      if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
        if (cudnn_version < 0 || (cudnn_version > 91000 && cudnn_version < 91500)) {
          return false;
        }
      }
      // Same cuDNN >= 9.3 + V8 API requirement as use_cudnn() above.
      if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
        TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
                        " if the V8 API is not enabled or before cuDNN version 9.3+."
                        " Upgrade cuDNN or enable the V8 API to use cuDNN for 64-bit depthwise convolutions.");
        return false;
      } else {
        return true;
      }
    }
    if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
      // always use cudnn_depthwise for channels_last format
      return true;
    }
    if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
      bool kernel_cond = (use_cudnn(input, weight) &&
                          input.scalar_type() == kHalf && // only for FP16
                          weight.scalar_type() == kHalf &&
                          is_depthwise(input, weight) &&
                          input.ndimension() == 4 && // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
                          !is_dilated() && // no dilation supported
                          (stride[0] == stride[1] || at::symint::size<T>(input, 2) == 1) && // square or 1d
                          at::symint::size<T>(input, 1) >= 32); // min 32 channels supported)
      if (kernel_cond) {
        // Fast filter-shape gate passed; defer to the workload heuristic.
        return check_cudnn_depthwise_workload_with_filter<T>(input, stride[1], weight);
      }
      return false;
    } else {
      return false;
    }
  }

  // Whether MIOpen (ROCm) should run this convolution: float/half/bfloat16
  // CUDA-device tensors, MIOpen compiled in, dim within MIOPEN_DIM_MAX, and
  // not grouped+dilated. Gated on the shared cudnn_enabled flag.
  // NOTE(review): bias_defined is accepted but never consulted in this body.
  bool use_miopen(const at::Tensor& input, const at::Tensor& weight, bool bias_defined) const {
    // MIOpen supports 64-bit indexing via miopenSetTensorDescriptorV2 API
    // Reference: https://github.com/ROCm/MIOpen/pull/2838
    return ((input.scalar_type() == at::kFloat) || (input.scalar_type() == at::kHalf) || (input.scalar_type() == at::kBFloat16))
           && cudnn_enabled
           && input.is_cuda()
           && detail::getCUDAHooks().compiledWithMIOpen()
           && input.dim() <= MIOPEN_DIM_MAX
           && !(groups > 1 && is_dilated()) // MIOpen currently does not support dilation with groups of size > 1
           ;
  }

  // Whether oneDNN (MKLDNN) should run this convolution: either the input is
  // already an mkldnn tensor, or it is a CPU tensor of a supported dtype that
  // passes the size/threading heuristics below. Always false when MKLDNN is
  // not compiled in or the user disabled it.
  bool use_mkldnn(const at::Tensor& input, const at::Tensor& weight) const {
#if AT_MKLDNN_ENABLED()
    if (!at::globalContext().userEnabledMkldnn()) {
      return false;
    }
    if (transposed && is_output_padding_big()) {
      return false;
    }
    // Reduced-precision CPU paths only need the device capability check.
    if (input.device().is_cpu() &&
        ((input.scalar_type() == at::kBFloat16 && mkldnn_bf16_device_check()) ||
         (input.scalar_type() == at::kHalf && mkldnn_fp16_device_check()))) {
      return true;
    }
    return (input.is_mkldnn()) || // input is mkldnn Tensor
        (input.device().is_cpu() &&
         input.scalar_type() == kFloat && // only on CPU Float Tensors
         // For 1x1 filters, MKLDNN is faster than THNN when multi-threaded,
         // but THNN is faster when single-threaded.
         (is_strided() || is_dilated() || at::symint::size<T>(input, 0) >= 16 ||
          at::symint::size<T>(weight, -1) != 1 || at::symint::size<T>(weight, -2) != 1 || at::get_num_threads() > 1) &&
         (groups > 1
          || (at::symint::size<T>(weight, -1) > 3 && at::symint::size<T>(weight, -2) > 3)
          || at::symint::size<T>(input, 0) > 1
          || at::symint::size<T>(input, 0)*at::symint::size<T>(input, 1)*at::symint::size<T>(input, 2)*at::symint::size<T>(input, 3) > 20480) // for some case, native is faster
        );
#endif
    // Reached only when AT_MKLDNN_ENABLED() is false.
    return false;
  }

  // Whether NNPACK should run this convolution: CPU float NCHW, no dilation,
  // not transposed, kernel <= 16x16, padding < kernel size, and (on
  // non-mobile builds) a reasonably large batch.
  bool use_nnpack(const at::Tensor& input, const at::Tensor& weight) const {
#if AT_NNPACK_ENABLED()
    return at::globalContext().userEnabledNNPACK() &&
           at::_nnpack_available() &&
           input.device().is_cpu() &&
           input.scalar_type() == kFloat && // only on CPU Float Tensors
           !is_dilated() && // or dilation
           !transposed && // or transposed tensors
           input.ndimension() == 4 && // must be in NCHW format
           weight.ndimension() == 4 &&
           (at::symint::size<T>(weight, 2) < 17) && (at::symint::size<T>(weight, 3) < 17) && // NNPACK only supports kernels up to 16x16
           (padding[0] < at::symint::size<T>(weight, 2)) && (padding[1] < at::symint::size<T>(weight, 3)) // NNPACK only supports padding < kernel_size. See https://github.com/pytorch/pytorch/issues/90142.
#if !defined(C10_MOBILE)
           && at::symint::size<T>(input, 0) >= 16 // ensure large enough batch size to ensure perf, tuneable
#endif
           ;
#endif
    // Reached only when AT_NNPACK_ENABLED() is false.
    return false;
  }

  // Whether XNNPACK should run this convolution. Mobile builds only; always
  // false for transposed convolutions and off-mobile.
  bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight,
                   const at::OptionalArrayRef<T> bias_sizes_opt) const {
#if defined(C10_MOBILE)
    if (!transposed) {
      // NB: for the call here, it MATTERS that we are templated. If you
      // untemplate this to always use SymInt, the function
      // xnnpack_use_convolution2d will always return false
      return (at::symint::size<T>(input, 1) == groups) &&
          xnnpack_use_convolution2d(
              input,
              weight,
              bias_sizes_opt,
              padding,
              stride,
              dilation,
              groups,
              transposed);
    }
#endif
    return false;
  }

  // Whether the MPS (Apple Metal) backend should run this convolution:
  // requires USE_MPS builds, an MPS tensor, and 32-bit-indexable sizes.
  bool use_mps(const at::Tensor& input, const at::Tensor& weight) const {
    // These checks need to be expanded. Currently we have very limited set of
    // checks for MPS.
#ifdef USE_MPS
    if (needs_64bit_indexing_no_split(input, weight)) {
      return false;
    }
    if (!input.is_mps()) {
      return false;
    }
    return true;
#else
    return false;
#endif
  }

  // We currently only have depthwise support for the case where groups ==
  // nInputPlane and nInputPlane == nOutputPlane (the latter due to the lack of
  // a depthwise multiplier)
  bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
    return input.is_cuda() &&
           !transposed &&
           (input.ndimension() == 4 || input.ndimension() == 5) &&
           at::symint::size<T>(input, 1) == groups &&
           groups > 1 && // no point if there is only a single group
           at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0; // output channels must be a multiple of input channels
  }
};
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free