VECTOR_WIDTH Class — pytorch Architecture
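Architecture documentation for the entity indexed as the VECTOR_WIDTH class in vec256_zarch.h from the PyTorch codebase. The excerpt below is the z/Architecture specialization of the Vectorized<T> struct; VECTOR_WIDTH is the constant that its size() member divides by the element size.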
Entity Profile
Source Code
aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h lines 369–613
template <typename T>
struct Vectorized<T, std::enable_if_t<is_zarch_implemented<T>()>> {
public:
// Scalar type this specialization is instantiated for.
using value_type = T;
// Vector type of each of the two halves stored in this struct.
using vtype = ZSimdVect<T>;
// Vector type that vecb0()/vecb1() cast the halves to; blendv() passes it as
// the selector argument of vec_sel.
using vmaskType = ZSimdVectBinary<T>;
// Return type of size().
using size_type = int;
// because of gcc inconsistency for int64_t we are obliged to use this, not
// value_type
using ElementType = ZSimdVectElement<T>;
// Both halves bundled as a pair: first = _vec0, second = _vec1.
using vinner_data = std::pair<vtype, vtype>;
private:
// First half: loaded with vec_xl(offset0, ...) / stored with
// vec_xst(..., offset0, ...).
vtype _vec0;
// Second half: loaded with vec_xl(offset16, ...) / stored with
// vec_xst(..., offset16, ...).
vtype _vec1;
public:
// Element count of one Vectorized<T>: VECTOR_WIDTH divided by the size of one
// ElementType. Used as the length of the staging buffers and as the default
// `count` of loadu()/store().
static constexpr size_type size() {
  return static_cast<size_type>(VECTOR_WIDTH / sizeof(ElementType));
}
// Default constructor. It has no member initializers, so _vec0 and _vec1 are
// left uninitialized; assign or load before reading.
Vectorized() {}
// Builds a vector whose two halves are both copies of `v`.
C10_ALWAYS_INLINE Vectorized(vtype v)
    : _vec0(v),
      _vec1(v) {}
// Builds a vector from a pair of halves: `v.first` becomes the first half and
// `v.second` the second. Forwards to the two-vector constructor.
C10_ALWAYS_INLINE Vectorized(const vinner_data& v)
    : Vectorized(v.first, v.second) {}
// Builds a vector from its two halves: `v1` is the first half, `v2` the second.
C10_ALWAYS_INLINE Vectorized(vtype v1, vtype v2)
    : _vec0(v1),
      _vec1(v2) {}
// Broadcast constructor: both halves are filled with vec_splats of `s`.
// `s` is first cast to ElementType, the element type the vector types are
// declared with (see the note on the ElementType alias).
C10_ALWAYS_INLINE Vectorized(T s)
    : _vec0(vec_splats((ElementType)s)),
      _vec1(vec_splats((ElementType)s)) {}
// Generic load helper, used when the source pointer type U is not ElementType
// (that case has its own specialization).
//
// loadu(ptr, count):
//   ptr   - source memory; min(count, size()) * sizeof(ElementType) bytes are
//           read from it (the byte length is based on ElementType, not on U).
//   count - number of elements to load; values above size() are clamped,
//           values <= 0 load nothing.
//   Returns a vector whose first `count` elements come from `ptr` and whose
//   remaining elements are zero.
template <typename U, typename DUMMY = void>
struct LoaduHelper {
  static Vectorized<T> C10_ALWAYS_INLINE
  loadu(const U* ptr, int count = size()) {
    // Zero-initialised staging buffer: elements that are not copied stay 0.
    __at_align__ ElementType tmp_values[size()] = {};
    // Guard against non-positive counts (mirrors StoreHelper). Without it a
    // negative count is converted to a huge size_t in the byte-length
    // computation and memcpy reads/writes far out of bounds.
    if (count > 0) {
      std::memcpy(
          tmp_values, ptr, std::min(count, size()) * sizeof(ElementType));
    }
    return {
        vec_xl(offset0, &(tmp_values[0])),
        vec_xl(offset16, &(tmp_values[0]))};
  }
};
// Load helper specialised for pointers to ElementType.
//
// loadu(ptr, count):
//   ptr   - source elements.
//   count - number of elements to load; values above size() are clamped,
//           values <= 0 load nothing.
//   When count == size() both halves are loaded straight from `ptr`.
//   Otherwise the elements are copied into a zero-filled staging buffer first,
//   so elements at index >= count are zero in the result.
template <typename DUMMY>
struct LoaduHelper<ElementType, DUMMY> {
  static Vectorized<T> C10_ALWAYS_INLINE
  loadu(const ElementType* ptr, int count = size()) {
    if (count == size()) {
      // Full-width load: no staging copy needed.
      return {vec_xl(offset0, ptr), vec_xl(offset16, ptr)};
    }
    __at_align__ ElementType tmp_values[size()] = {};
    // Guard against non-positive counts (mirrors StoreHelper). Without it a
    // negative count is converted to a huge size_t in the byte-length
    // computation and memcpy reads/writes far out of bounds.
    if (count > 0) {
      std::memcpy(
          tmp_values, ptr, std::min(count, size()) * sizeof(ElementType));
    }
    return {
        vec_xl(offset0, &(tmp_values[0])),
        vec_xl(offset16, &(tmp_values[0]))};
  }
};
// Loads up to `count` elements from `ptr` into a new vector. The work is done
// by LoaduHelper<U>, which has a dedicated specialization for
// U == ElementType.
template <typename U>
static Vectorized<T> C10_ALWAYS_INLINE
loadu(const U* ptr, int count = size()) {
  using Loader = LoaduHelper<U>;
  return Loader::loadu(ptr, count);
}
// Partial load covering 8 bytes' worth of elements, i.e.
// 8 / sizeof(ElementType) elements, via loadu().
template <typename U>
static Vectorized<T> C10_ALWAYS_INLINE loadu_one_fourth(const U* ptr) {
  // load only first 8 bytes
  // only intended to be used with uint8_t
  constexpr int kElementsIn8Bytes =
      static_cast<int>(8 / sizeof(ElementType));
  return loadu(ptr, kElementsIn8Bytes);
}
template <typename U, typename DUMMY = void>
struct StoreHelper {
static void C10_ALWAYS_INLINE
store(const Vectorized<T>& vec, U* ptr, int count = size()) {
if (count > 0) {
__at_align__ ElementType tmp_values[size()];
vec_xst(vec._vec0, offset0, &(tmp_values[0]));
vec_xst(vec._vec1, offset16, &(tmp_values[0]));
std::memcpy(
ptr, tmp_values, std::min(count, size()) * sizeof(ElementType));
}
}
};
template <typename DUMMY>
struct StoreHelper<ElementType, DUMMY> {
static void C10_ALWAYS_INLINE
store(const Vectorized<T>& vec, ElementType* ptr, int count = size()) {
if (count == size()) {
vec_xst(vec._vec0, offset0, ptr);
vec_xst(vec._vec1, offset16, ptr);
} else if (count > 0) {
__at_align__ ElementType tmp_values[size()];
vec_xst(vec._vec0, offset0, &(tmp_values[0]));
vec_xst(vec._vec1, offset16, &(tmp_values[0]));
std::memcpy(
ptr, tmp_values, std::min(count, size()) * sizeof(ElementType));
}
}
};
// Writes up to `count` elements of this vector to `ptr`. The work is done by
// StoreHelper<U>, which has a dedicated specialization for U == ElementType.
template <typename U>
void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const {
  using Storer = StoreHelper<U>;
  Storer::store(*this, ptr, count);
}
// Read-only reference to the first half of the vector.
C10_ALWAYS_INLINE const vtype& vec0() const {
  return this->_vec0;
}
// Read-only reference to the second half of the vector.
C10_ALWAYS_INLINE const vtype& vec1() const {
  return this->_vec1;
}
// Returns copies of both halves as a pair (first = _vec0, second = _vec1).
C10_ALWAYS_INLINE vinner_data data() const {
  return vinner_data(_vec0, _vec1);
}
// Implicit conversion to the pair-of-halves representation; yields the same
// pair as data() (first = _vec0, second = _vec1).
C10_ALWAYS_INLINE operator vinner_data() const {
  return vinner_data(_vec0, _vec1);
}
// First half cast to vmaskType, the form blendv() passes to vec_sel as the
// selector.
C10_ALWAYS_INLINE const vmaskType vecb0() const {
  const vmaskType first_half_mask = (vmaskType)_vec0;
  return first_half_mask;
}
// Second half cast to vmaskType, the form blendv() passes to vec_sel as the
// selector.
C10_ALWAYS_INLINE const vmaskType vecb1() const {
  const vmaskType second_half_mask = (vmaskType)_vec1;
  return second_half_mask;
}
// Combines `a` and `b` under control of `mask` by applying
// vec_sel(a_half, b_half, mask_half) to each half independently, with the
// mask halves taken through vecb0()/vecb1().
static Vectorized<T> C10_ALWAYS_INLINE blendv(
    const Vectorized<T>& a,
    const Vectorized<T>& b,
    const Vectorized<T>& mask) {
  const auto first_half = vec_sel(a._vec0, b._vec0, mask.vecb0());
  const auto second_half = vec_sel(a._vec1, b._vec1, mask.vecb1());
  return {first_half, second_half};
}
// Element-wise constructor for 8-byte element types (enabled only when
// sizeof(U) == 8, with U defaulting to T): s1..s2 fill the first half and
// s3..s4 the second.
template <typename U = T, std::enable_if_t<(sizeof(U) == 8), int> = 0>
C10_ALWAYS_INLINE Vectorized(T s1, T s2, T s3, T s4)
    : _vec0{s1, s2},
      _vec1{s3, s4} {}
// Element-wise constructor for 4-byte element types (enabled only when
// sizeof(U) == 4, with U defaulting to T): s1..s4 fill the first half and
// s5..s8 the second.
template <typename U = T, std::enable_if_t<(sizeof(U) == 4), int> = 0>
C10_ALWAYS_INLINE Vectorized(
    T s1, T s2, T s3, T s4,
    T s5, T s6, T s7, T s8)
    : _vec0{s1, s2, s3, s4},
      _vec1{s5, s6, s7, s8} {}
// Element-wise constructor for 2-byte element types (enabled only when
// sizeof(U) == 2, with U defaulting to T): s1..s8 fill the first half and
// s9..s16 the second.
template <typename U = T, std::enable_if_t<(sizeof(U) == 2), int> = 0>
C10_ALWAYS_INLINE Vectorized(
    T s1, T s2, T s3, T s4, T s5, T s6, T s7, T s8,
    T s9, T s10, T s11, T s12, T s13, T s14, T s15, T s16)
    : _vec0{s1, s2, s3, s4, s5, s6, s7, s8},
      _vec1{s9, s10, s11, s12, s13, s14, s15, s16} {}
// Element-wise constructor for 1-byte element types (enabled only when
// sizeof(U) == 1, with U defaulting to T): s1..s16 fill the first half and
// s17..s32 the second.
template <typename U = T, std::enable_if_t<(sizeof(U) == 1), int> = 0>
C10_ALWAYS_INLINE Vectorized(
    T s1, T s2, T s3, T s4, T s5, T s6, T s7, T s8,
    T s9, T s10, T s11, T s12, T s13, T s14, T s15, T s16,
    T s17, T s18, T s19, T s20, T s21, T s22, T s23, T s24,
    T s25, T s26, T s27, T s28, T s29, T s30, T s31, T s32)
    : _vec0{s1, s2, s3, s4, s5, s6, s7, s8,
            s9, s10, s11, s12, s13, s14, s15, s16},
      _vec1{s17, s18, s19, s20, s21, s22, s23, s24,
            s25, s26, s27, s28, s29, s30, s31, s32} {}
// arange for 8-byte element types (4 elements): element i is base + i * step.
// Each term is evaluated in the usual arithmetic type of `base` and `step`
// and converted to T when passed to the constructor.
template <typename step_t, typename U = T>
static std::enable_if_t<sizeof(U) == 8, Vectorized<T>> arange(
    T base = 0,
    step_t step = static_cast<step_t>(1)) {
  const auto e1 = base + step;
  const auto e2 = base + 2 * step;
  const auto e3 = base + 3 * step;
  // Parenthesised construction (not braces) so the conversions back to T are
  // not subject to a narrowing check.
  return Vectorized<T>(base, e1, e2, e3);
}
// arange for 4-byte element types (8 elements): element i is base + i * step.
// Each term is evaluated in the usual arithmetic type of `base` and `step`
// and converted to T when passed to the constructor.
template <typename step_t, typename U = T>
static std::enable_if_t<sizeof(U) == 4, Vectorized<T>> arange(
    T base = 0,
    step_t step = static_cast<step_t>(1)) {
  const auto e1 = base + step;
  const auto e2 = base + 2 * step;
  const auto e3 = base + 3 * step;
  const auto e4 = base + 4 * step;
  const auto e5 = base + 5 * step;
  const auto e6 = base + 6 * step;
  const auto e7 = base + 7 * step;
  // Parenthesised construction (not braces) so the conversions back to T are
  // not subject to a narrowing check.
  return Vectorized<T>(base, e1, e2, e3, e4, e5, e6, e7);
}
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free