run_parallel_cdist Function — PyTorch Architecture
Architecture documentation for the run_parallel_cdist function template in DistanceOpsKernel.cpp from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/cpu/DistanceOpsKernel.cpp lines 203–251
// Fills `result` with pairwise distances between the rows of `t1` and `t2`,
// batched over the leading dimension. The distance formula is supplied by the
// policy type F via its static red/map/finish hooks (e.g. p-norm reduction),
// with `p` forwarded to map/finish. All three tensors are assumed contiguous:
// t1 is (d, r1, m), t2 is (d, r2, m), and result holds d * r1 * r2 entries in
// (batch, row-of-t1, row-of-t2) order.
// NOTE(review): the grain-size expression divides by m — assumes m > 0;
// presumably callers guard the empty-feature case. TODO confirm.
template <typename F>
static void run_parallel_cdist(Tensor& result, const Tensor& t1, const Tensor& t2, const scalar_t p) {
  const scalar_t * const t1_start = t1.const_data_ptr<scalar_t>();
  const scalar_t * const t2_start = t2.const_data_ptr<scalar_t>();
  scalar_t * const res_start = result.data_ptr<scalar_t>();

  const int64_t batch = t1.size(0);
  const int64_t rows1 = t1.size(-2);
  const int64_t rows2 = t2.size(-2);
  const int64_t cols = t1.size(-1);

  // One output element per (batch, row1, row2) triple.
  const int64_t pairs_per_batch = rows1 * rows2;
  const int64_t t1_batch_stride = rows1 * cols;
  const int64_t t2_batch_stride = rows2 * cols;

  parallel_for(0, pairs_per_batch * batch, internal::GRAIN_SIZE / (16 * cols),
               [=](int64_t chunk_begin, int64_t chunk_end) {
    for (const auto idx : c10::irange(chunk_begin, chunk_end)) {
      // Recover (batch, row1, row2) from the flat output index; this matches
      // the row-major layout of `result`.
      const int64_t b = idx / pairs_per_batch;
      const int64_t pair = idx % pairs_per_batch;
      const scalar_t * const row1 = t1_start + b * t1_batch_stride + (pair / rows2) * cols;
      const scalar_t * const row2 = t2_start + b * t2_batch_stride + (pair % rows2) * cols;

      // Reduce the per-coordinate contributions, then apply the final
      // transform (e.g. the 1/p root for a p-norm).
      scalar_t agg = 0;
      for (const auto x : c10::irange(cols)) {
        agg = F::red(agg, F::map(std::abs(row1[x] - row2[x]), p));
      }
      res_start[idx] = F::finish(agg, p);
    }
  });
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free