run_parallel_cdist Function — PyTorch Architecture
Architecture documentation for the run_parallel_cdist function template in DistanceOpsKernel.cpp from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cpu/DistanceOpsKernel.cpp lines 203–251
// Fills `result` with pairwise distances between the rows of `t1` and `t2`,
// batched over the leading dimension. The distance formula is supplied by the
// policy type F via its static red/map/finish hooks (e.g. p-norm reduction),
// with `p` forwarded to map/finish. All three tensors are assumed contiguous:
// t1 is (d, r1, m), t2 is (d, r2, m), and result holds d * r1 * r2 entries in
// (batch, row-of-t1, row-of-t2) order.
// NOTE(review): the grain-size expression divides by m — assumes m > 0;
// presumably callers guard the empty-feature case. TODO confirm.
template <typename F>
static void run_parallel_cdist(Tensor& result, const Tensor& t1, const Tensor& t2, const scalar_t p) {
  const scalar_t * const t1_start = t1.const_data_ptr<scalar_t>();
  const scalar_t * const t2_start = t2.const_data_ptr<scalar_t>();
  scalar_t * const res_start = result.data_ptr<scalar_t>();

  const int64_t batch = t1.size(0);
  const int64_t rows1 = t1.size(-2);
  const int64_t rows2 = t2.size(-2);
  const int64_t cols = t1.size(-1);

  // One output element per (batch, row1, row2) triple.
  const int64_t pairs_per_batch = rows1 * rows2;
  const int64_t t1_batch_stride = rows1 * cols;
  const int64_t t2_batch_stride = rows2 * cols;

  parallel_for(0, pairs_per_batch * batch, internal::GRAIN_SIZE / (16 * cols),
               [=](int64_t chunk_begin, int64_t chunk_end) {
    for (const auto idx : c10::irange(chunk_begin, chunk_end)) {
      // Recover (batch, row1, row2) from the flat output index; this matches
      // the row-major layout of `result`.
      const int64_t b = idx / pairs_per_batch;
      const int64_t pair = idx % pairs_per_batch;
      const scalar_t * const row1 = t1_start + b * t1_batch_stride + (pair / rows2) * cols;
      const scalar_t * const row2 = t2_start + b * t2_batch_stride + (pair % rows2) * cols;

      // Reduce the per-coordinate contributions, then apply the final
      // transform (e.g. the 1/p root for a p-norm).
      scalar_t agg = 0;
      for (const auto x : c10::irange(cols)) {
        agg = F::red(agg, F::map(std::abs(row1[x] - row2[x]), p));
      }
      res_start[idx] = F::finish(agg, p);
    }
  });
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free