cpu_adaptive_avg_pool2d_channels_last Function — pytorch Architecture
Architecture documentation for the reduced-precision cpu_adaptive_avg_pool2d_channels_last overload in AdaptiveAvgPoolKernel.cpp from the pytorch codebase. (The entity is a function template, not a class; std::is_same_v is a standard-library variable template that appears only in its SFINAE constraint.)
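The overload participates in overload resolution only when std::is_same_v<scalar_t, at::opmath_type<scalar_t>> is false, i.e. for reduced-precision types such as BFloat16 and Half whose accumulation (opmath) type is float; the complementary overload in the same file covers types that accumulate in themselves. Below is a minimal, self-contained sketch of this enable_if_t/is_same_v dispatch pattern; the opmath stand-in and the kernel names are hypothetical illustrations (with short standing in for BFloat16), not ATen's definitions.

#include <iostream>
#include <type_traits>

// Hypothetical stand-in for at::opmath_type: maps a storage type to its
// accumulation type. Here `short` plays the role of BFloat16.
template <typename T> struct opmath { using type = T; };
template <> struct opmath<short> { using type = float; };
template <typename T> using opmath_type = typename opmath<T>::type;

// Chosen when T accumulates in a wider type (the reduced-precision path).
template <typename T>
std::enable_if_t<!std::is_same_v<T, opmath_type<T>>, void> kernel() {
  std::cout << "widened float accumulation\n";
}

// Chosen when T accumulates in itself (the float/double path).
template <typename T>
std::enable_if_t<std::is_same_v<T, opmath_type<T>>, void> kernel() {
  std::cout << "direct accumulation\n";
}

int main() {
  kernel<short>();  // selects the reduced-precision overload
  kernel<float>();  // selects the direct overload
}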
Entity Profile
Source Code
aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp lines 157–254
template <typename scalar_t>
typename std::enable_if_t<!std::is_same_v<scalar_t, at::opmath_type<scalar_t>>, void>
cpu_adaptive_avg_pool2d_channels_last(
    Tensor& output_,
    const Tensor& input_,
    IntArrayRef output_size) {
  auto memory_format = at::MemoryFormat::ChannelsLast;
  auto input = input_.contiguous(memory_format);
  auto output = output_.contiguous(memory_format);

  auto input_data = input.const_data_ptr<scalar_t>();
  auto output_data = output.data_ptr<scalar_t>();

  int64_t nbatch = input.size(0);
  int64_t channels = input.size(1);
  int64_t input_height = input.size(2);
  int64_t input_width = input.size(3);
  int64_t output_height = output_size[0];
  int64_t output_width = output_size[1];

  using bVec = vec::Vectorized<scalar_t>;
  using fVec = vec::Vectorized<float>;
  // parallel on dim N, H, W
  at::parallel_for(0, nbatch * output_height * output_width, 0, [&](int64_t begin, int64_t end) {
    int64_t n = 0;
    int64_t oh = 0;
    int64_t ow = 0;
    data_index_init(begin, n, nbatch, oh, output_height, ow, output_width);

    // temp buffer for sum, use float as accumulation type
    // can't reuse output buffer to store sum since it is BFloat16/Half
    auto sum_arr = std::make_unique<float []>(channels);
    float* sum = sum_arr.get();

    for (const auto i : c10::irange(begin, end)) {
      int64_t ih0 = start_index(oh, output_height, input_height);
      int64_t ih1 = end_index(oh, output_height, input_height);
      int64_t kh = ih1 - ih0;

      int64_t iw0 = start_index(ow, output_width, input_width);
      int64_t iw1 = end_index(ow, output_width, input_width);
      int64_t kw = iw1 - iw0;

      scalar_t* out = output_data + i * channels;
      int64_t size = channels;

      // Pass I: zero the out lane
      int64_t d1 = 0;
      for (; d1 < size - (size % fVec::size()); d1 += fVec::size()) {
        fVec sum_fvec = fVec(float(0));
        sum_fvec.store(sum + d1);
      }
      for (; d1 < size; d1++) {
        sum[d1] = float(0);
      }
      // Pass II: compute local sum
      for (const auto ih : c10::irange(ih0, ih1)) {
        for (const auto iw : c10::irange(iw0, iw1)) {
          const scalar_t* in = input_data + n * input_height * input_width * channels +
              ih * input_width * channels + iw * channels;

          int64_t d2 = 0;
          for (; d2 < size - (size % bVec::size()); d2 += bVec::size()) {
            bVec data_bvec = bVec::loadu(in + d2);
            auto [data_fvec0, data_fvec1] = convert_to_float<scalar_t>(data_bvec);

            fVec sum_fvec0 = fVec::loadu(sum + d2) + data_fvec0;
            fVec sum_fvec1 = fVec::loadu(sum + d2 + fVec::size()) + data_fvec1;
            sum_fvec0.store(sum + d2);
            sum_fvec1.store(sum + d2 + fVec::size());
          }
          for (; d2 < size; d2++) {
            sum[d2] += float(in[d2]);
          }
        }
      }
      // Pass III: compute local average
      int64_t d3 = 0;
      for (; d3 < size - (size % bVec::size()); d3 += bVec::size()) {
        fVec out_fvec0 = fVec::loadu(sum + d3) / fVec(float(kh * kw));
        fVec out_fvec1 = fVec::loadu(sum + d3 + fVec::size()) / fVec(float(kh * kw));

        bVec out_bvec = convert_from_float<scalar_t>(out_fvec0, out_fvec1);
        out_bvec.store(out + d3);
      }
      for (; d3 < size; d3++) {
        out[d3] = scalar_t(sum[d3] / kh / kw);
      }

      // move on to next output index
      data_index_step(n, nbatch, oh, output_height, ow, output_width);
    }
  });

  if (!output_.is_contiguous(memory_format)) {
    output_.copy_(output);
  }
}
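The kernel parallelizes over flattened (n, oh, ow) output positions. For each position it makes three passes over the channel dimension: Pass I zeroes a per-thread float accumulator, Pass II sums the kh x kw input window into it, and Pass III divides by the window area and narrows the result back to scalar_t. The window bounds come from the start_index/end_index helpers shared by the adaptive pooling kernels. Below is a sketch of the bin arithmetic those helpers are understood to implement; the _sketch suffix marks these as illustrations, not ATen's definitions.

#include <cstdint>

// Output bin `o` of `osize` bins covers input indices [start, end).
// Floor and ceil are computed with integer arithmetic (values non-negative).
int64_t start_index_sketch(int64_t o, int64_t osize, int64_t isize) {
  return (o * isize) / osize;                    // floor(o * isize / osize)
}

int64_t end_index_sketch(int64_t o, int64_t osize, int64_t isize) {
  return ((o + 1) * isize + osize - 1) / osize;  // ceil((o + 1) * isize / osize)
}

Because end - start can differ from bin to bin, kh and kw are recomputed per output position, and the scalar tail loops cover channel counts that are not multiples of the vector width.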
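In practice this overload is not called directly; it is reached through the ATen dispatcher when adaptive average pooling runs on a reduced-precision, channels-last CPU tensor. A minimal usage sketch, assuming a CPU build where this kernel is compiled in:

#include <ATen/ATen.h>

int main() {
  // BFloat16 input in channels-last memory format; adaptive average pooling
  // to a 7x7 output. Accumulation happens in float inside the kernel.
  at::Tensor input = at::randn({1, 8, 32, 32}, at::kBFloat16)
                         .contiguous(at::MemoryFormat::ChannelsLast);
  at::Tensor output = at::adaptive_avg_pool2d(input, {7, 7});
  return 0;
}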