any_of Class — pytorch Architecture

Architecture documentation for the any_of class in Convolution.cpp from the pytorch codebase.
Class cpp
Entity Profile

Source Code

aten/src/ATen/native/Convolution.cpp lines 286–591
template <typename T>
struct ConvParams {
  std::vector<T> stride;
  std::vector<T> padding;
  std::vector<T> dilation;
  bool transposed{};
  std::vector<T> output_padding;
  T groups{};
  bool benchmark{};
  bool deterministic{};
  bool cudnn_enabled{};
  bool allow_tf32{};

  bool is_strided() const {
    return std::any_of(
      stride.cbegin(), stride.cend(), [](const T& s) { return s != 1; });
  }

  bool is_dilated() const {
    return std::any_of(
      dilation.cbegin(), dilation.cend(), [](const T& d) { return d != 1; });
  }

  bool is_padded() const {
    return std::any_of(
      padding.cbegin(), padding.cend(), [](const T& p) { return p != 0; });
  }

  bool is_output_padding_neg() const {
    return std::any_of(
      output_padding.cbegin(),
      output_padding.cend(),
      [](const T& p) { return p < 0; });
  }

  bool is_output_padding_big() const {
    // Revisit this with std::views::zip at C++20.
    for (auto i: c10::irange(output_padding.size())) {
      if (output_padding[i] >= stride[i]) {
        return true;
      }
    }
    return false;
  }

  bool is_padding_neg() const {
    return std::any_of(
      padding.cbegin(), padding.cend(), [](const T& p) { return p < 0; });
  }

  bool is_dilation_neg() const {
    return std::any_of(
      dilation.cbegin(), dilation.cend(), [](const T& d) { return d < 0; });
  }

  bool is_stride_nonpos() const {
    return std::any_of(
      stride.cbegin(), stride.cend(), [](const T& s) { return s <= 0; });
  }

  void view1d_as_2d() {
    if (stride.size() == 1) {
      stride.insert(stride.begin(), 1);
      padding.insert(padding.begin(), 0);
      dilation.insert(dilation.begin(), 1);
      output_padding.insert(output_padding.begin(), 0);
    }
  }

  bool use_cpu_depthwise3x3_winograd(const at::Tensor& input, const at::Tensor& weight, const std::optional<at::Tensor>& bias) const {
#if defined(__ARM_NEON__) || (defined(__riscv_v_intrinsic) && __riscv_v_intrinsic>=12000)
    // Currently only 3x3 depthwise convolutions on tensors of float are supported.
    return (input.ndimension() == 4) &&
           (at::symint::size<T>(input, 1) == groups) &&
           (weight.ndimension() == 4 ) &&
           (at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0) &&
           (at::symint::size<T>(weight, 1) == 1) &&
           (at::symint::size<T>(weight, 2) == 3) &&
           (at::symint::size<T>(weight, 3) == 3) &&
           (input.device().is_cpu()) &&
           (input.scalar_type() == at::kFloat) &&
           input.is_contiguous() &&
           (weight.device().is_cpu()) &&
           (weight.scalar_type() == at::kFloat) &&
           weight.is_contiguous() &&
           (!bias.has_value() || bias->is_contiguous()) &&
           !is_strided() &&
           !is_dilated() &&
           !transposed;
#else
    return false;
#endif
  }

  bool needs_64bit_indexing_no_split(const at::Tensor& input, const at::Tensor& weight) const {
    constexpr int64_t int_max = std::numeric_limits<int>::max();
    auto numel_input = at::symint::numel<T>(input);
    // empty input
    if (numel_input == 0) {
      return false;
    }
    // input size can not be reduced to the range of int by splitting the batch dim
    auto n = at::symint::size<T>(input, 0);
    if (numel_input / n > int_max) {
      return true;
    }
    // output size can not be reduced to the range of int by splitting the batch dim
    T outsize = 1;
    if (transposed) {
      auto o = conv_input_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, output_padding, stride, dilation, groups);
      outsize = c10::multiply_integers(o.begin() + 1, o.end());
    } else {
      auto o = conv_output_size(at::symint::sizes<T>(input), at::symint::sizes<T>(weight), padding, stride, dilation);
      outsize = c10::multiply_integers(o.begin() + 1, o.end());
    }
    return outsize > int_max;
  }

  bool use_cudnn(const at::Tensor& input, const at::Tensor& weight) const {
  // Note [Mobile check segfaults]
  // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
  // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
#if !defined(C10_MOBILE)
    if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
      return false;
    }
    static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
    if (needs_64bit_indexing_no_split(input, weight)) {
      if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
        TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
                        " if the V8 API is not enabled or before cuDNN version 9.3+."
                        " Consider upgrading cuDNN and/or enabling the V8 API for better efficiency.");
        return false;
      }
    }
    if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
      if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
        return false;
      }
    }
    if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous) {
      if (is_dilated()) {
        return detail::getCUDAHooks().supportsDilatedConvolutionWithCuDNN() && !is_output_padding_big();
      }
    }
    return !is_output_padding_big();
#else
    return false;
#endif
  }

  // Use cudnn for FP16 depthwise convolutions
  bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const  {
    if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) {
      return false;
    }
    // native kernel doesn't support 64-bit non-splittable case
    if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
      // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
      if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
        if (cudnn_version < 0 || (cudnn_version > 91000 && cudnn_version < 91500)) {
          return false;
        }
      }

      if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
        TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
                        " if the V8 API is not enabled or before cuDNN version 9.3+."
                        " Upgrade cuDNN or enable the V8 API to use cuDNN for 64-bit depthwise convolutions.");
        return false;
      } else {
        return true;
      }
    }
    if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
      // always use cudnn_depthwise for channels_last format
      return true;
    }
    if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
      bool kernel_cond =  (use_cudnn(input, weight) &&
                           input.scalar_type() == kHalf && // only for FP16
                           weight.scalar_type() == kHalf &&
                           is_depthwise(input, weight) &&
                           input.ndimension() == 4 &&   // TODO: 5-D contiguous depthwise is not supported yet, need benchmarks
                           !is_dilated() && // no dilation supported
                           (stride[0] == stride[1] || at::symint::size<T>(input, 2) == 1) && // square or 1d
                           at::symint::size<T>(input, 1) >= 32); // min 32 channels supported)
      if (kernel_cond) {
        return check_cudnn_depthwise_workload_with_filter<T>(input, stride[1], weight);
      }
      return false;
    } else {
      return false;
    }
  }

  bool use_miopen(const at::Tensor& input, const at::Tensor& weight, bool bias_defined) const  {
    // MIOpen supports 64-bit indexing via miopenSetTensorDescriptorV2 API
    // Reference: https://github.com/ROCm/MIOpen/pull/2838
    return ((input.scalar_type() == at::kFloat) || (input.scalar_type() == at::kHalf) || (input.scalar_type() == at::kBFloat16))
           && cudnn_enabled
           && input.is_cuda()
           && detail::getCUDAHooks().compiledWithMIOpen()
           && input.dim() <= MIOPEN_DIM_MAX
           && !(groups > 1 && is_dilated()) // MIOpen currently does not support dilation with groups of size > 1
           ;
  }
  bool use_mkldnn(const at::Tensor& input, const at::Tensor& weight) const  {
#if AT_MKLDNN_ENABLED()
    if (!at::globalContext().userEnabledMkldnn()) {
      return false;
    }
    if (transposed && is_output_padding_big()) {
      return false;
    }
    if (input.device().is_cpu() &&
        ((input.scalar_type() == at::kBFloat16 && mkldnn_bf16_device_check()) ||
         (input.scalar_type() == at::kHalf && mkldnn_fp16_device_check()))) {
      return true;
    }
    return (input.is_mkldnn()) || // input is mkldnn Tensor
      (input.device().is_cpu() &&
       input.scalar_type() == kFloat && // only on CPU Float Tensors
       // For 1x1 filters, MKLDNN is faster than THNN when multi-threaded,
       // but THNN is faster when single-threaded.
       (is_strided() || is_dilated() || at::symint::size<T>(input, 0) >= 16 ||
        at::symint::size<T>(weight, -1) != 1 || at::symint::size<T>(weight, -2) != 1 || at::get_num_threads() > 1) &&
       (groups > 1
        || (at::symint::size<T>(weight, -1) > 3 && at::symint::size<T>(weight, -2) > 3)
        || at::symint::size<T>(input, 0) > 1
        || at::symint::size<T>(input, 0)*at::symint::size<T>(input, 1)*at::symint::size<T>(input, 2)*at::symint::size<T>(input, 3) > 20480) // for some case, native is faster
        );

#endif
    return false;
  }
  bool use_nnpack(const at::Tensor& input, const at::Tensor& weight) const  {
#if AT_NNPACK_ENABLED()
    return at::globalContext().userEnabledNNPACK() &&
           at::_nnpack_available() &&
           input.device().is_cpu() &&
           input.scalar_type() == kFloat && // only on CPU Float Tensors
           !is_dilated() && // or dilation
           !transposed &&   // or transposed tensors
           input.ndimension() == 4 && // must be in NCHW format
           weight.ndimension() == 4 &&
           (at::symint::size<T>(weight, 2) < 17) && (at::symint::size<T>(weight, 3) < 17) && // NNPACK only supports kernels up to 16x16
           (padding[0] < at::symint::size<T>(weight, 2)) && (padding[1] < at::symint::size<T>(weight, 3)) // NNPACK only supports padding < kernel_size. See https://github.com/pytorch/pytorch/issues/90142.
#if !defined(C10_MOBILE)
           && at::symint::size<T>(input, 0) >= 16 // ensure large enough batch size to ensure perf, tuneable
#endif
       ;
#endif
    return false;
  }
  bool use_xnnpack(const at::Tensor& input, const at::Tensor& weight,
                   const at::OptionalArrayRef<T> bias_sizes_opt) const {
#if defined(C10_MOBILE)
    if (!transposed) {
      // NB: for the call here, it MATTERS that we are templated. If you
      // untemplate this to always use SymInt, the function
      // xnnpack_use_convolution2d will always return false
      return (at::symint::size<T>(input, 1) == groups) &&
              xnnpack_use_convolution2d(
                  input,
                  weight,
                  bias_sizes_opt,
                  padding,
                  stride,
                  dilation,
                  groups,
                  transposed);
    }
#endif
    return false;
  }

  bool use_mps(const at::Tensor& input, const at::Tensor& weight) const {
    // These checks need to be expanded. Currently we have very limited set of
    // checks for MPS.
#ifdef USE_MPS
    if (needs_64bit_indexing_no_split(input, weight)) {
      return false;
    }
    if (!input.is_mps()) {
      return false;
    }
    return true;
#else
    return false;
#endif
  }

  // We currently only have depthwise support for the case where groups ==
  // nInputPlane and nInputPlane == nOutputPlane (the latter due to the lack of
  // a depthwise multiplier)
  bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const  {
    return input.is_cuda() &&
           !transposed &&
           (input.ndimension() == 4 || input.ndimension() == 5) &&
           at::symint::size<T>(input, 1) == groups &&
           groups > 1 && // no point if there is only a single group
           at::symint::size<T>(weight, 0) % at::symint::size<T>(input, 1) == 0; // output channels must be a multiple of input channels
  }
};
Source

View on GitHub
Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free