values Class — pytorch Architecture
Architecture documentation for the values class in ComputeSparseTile.h from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/sparse/cuda/ComputeSparseTile.h lines 66–112
/// Greedily picks entries of a 4x4 tile, keeping at most two per row and at
/// most two per column, and reports the picks as a bitmask.
///
/// Each tile entry is tagged with its (row, col) position, the tagged entries
/// are sorted, and the sorted array is then walked from its last element to
/// its first. An entry is kept when neither its row nor its column already
/// holds two kept entries. The sort order itself comes from
/// `TileValueOrderedT` / `Op`, which are declared outside this block.
///
/// @param values  tile accessor; entries are read via `values.at(r, c).get()`
/// @return        mask whose bit `c + 4 * r` is set when entry (r, c) was kept
template <typename Tile4x4Accessor>
CUTLASS_DEVICE Indices4x4 operator()(Tile4x4Accessor values) {
  using TileValueOrdered =
      TileValueOrderedT<typename Tile4x4Accessor::Element, Op>;
  using TileValuesFragment = cutlass::Array<TileValueOrdered, 4 * 4>;

  Indices4x4 picked;
  TileValuesFragment tagged;

  // Flatten the tile row-major, remembering where every value came from.
  CUTLASS_PRAGMA_UNROLL
  for (int r = 0; r < 4; ++r) {
    CUTLASS_PRAGMA_UNROLL
    for (int c = 0; c < 4; ++c) {
      TileValueOrdered& slot = tagged[r * 4 + c];
      slot.parts.value = values.at(r, c).get();
      slot.parts.col = uint2b_t(c);
      slot.parts.row = uint2b_t(r);
    }
  }

  // A sorting network has no data-dependent branches, which avoids warp
  // divergence.
  StaticSort<TileValuesFragment::kElements> network;
  network(tagged);

  // Per-row / per-column pick counters, packed two bits per line:
  //   nothing kept yet -> 00 (0)
  //   one kept         -> 01 (1)
  //   two kept (full)  -> 11 (3)
  uint32_t rowCounts = 0;
  uint32_t colCounts = 0;
  picked = 0;

  // Walk the sorted entries back to front (largest first), keeping every
  // entry whose row and column both still have room.
  CUTLASS_PRAGMA_UNROLL
  for (int pos = tagged.size() - 1; pos >= 0; --pos) {
    auto& entry = tagged[pos];
    const auto rowShift = 2 * entry.parts.row;
    const auto colShift = 2 * entry.parts.col;
    uint32_t inRow = uint2b_t(rowCounts >> rowShift);
    uint32_t inCol = uint2b_t(colCounts >> colShift);

    // Each counter can only read 0, 1 or 3, so "sum below 3" is a cheaper
    // way of writing `inRow != 3 && inCol != 3`.
    bool keep = (inRow + inCol) < 3;
    picked |= keep << (entry.parts.col + 4 * entry.parts.row);

    // OR-ing in (count + keep) advances 00 -> 01 and 01 -> 11 (01 | 10);
    // when `keep` is false the stored bits are simply OR-ed with themselves.
    rowCounts |= (inRow + keep) << rowShift;
    colCounts |= (inCol + keep) << colShift;
  }
  return picked;
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free