Vectorizedqi Class — PyTorch Architecture
Architecture documentation for the Vectorized&lt;c10::qint32&gt; specialization (built on the Vectorizedqi base class) in vec512_qint.h from the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/cpu/vec/vec512/vec512_qint.h lines 338–466
// Vectorized specialization for c10::qint32: sixteen quantized 32-bit
// integers packed into one 512-bit AVX-512 register (the inherited `vals`).
// Provides the dequantize/quantize round-trip and the elementwise ops used
// by quantized CPU kernels.
template <>
struct Vectorized<c10::qint32> : public Vectorizedqi {
  using size_type = int;

  // Number of qint32 lanes per register: 512 bits / 32 bits = 16.
  static constexpr size_type size() {
    return 16;
  }

  // qint32 lanes are already 32-bit, so a single float vector (and a single
  // int vector) covers the whole register.
  static constexpr int float_num_vecs() {
    return 1;
  }
  static constexpr int int_num_vecs() {
    return 1;
  }

  using float_vec_return_type = std::array<Vectorized<float>, 1>;
  using int_vec_return_type = std::array<Vectorized<c10::qint32>, 1>;
  using value_type = c10::qint32::underlying;

 public:
  using Vectorizedqi::Vectorizedqi;
  Vectorized() {}
  Vectorized(__m512i vals_) {
    vals = vals_;
  }

  // Broadcast constructor: replicate a single quantized value into all lanes.
  Vectorized(const c10::qint32& val) {
    value_type uw = val.val_;
    vals = _mm512_set1_epi32(uw);
  }

  // Store `count` elements to `ptr`. A full store uses one unaligned 512-bit
  // write; a partial store copies only the requested number of bytes.
  void store(void* ptr, int count = size()) const {
    if (count != size()) {
      memcpy(ptr, &vals, count * sizeof(value_type));
    } else {
      _mm512_storeu_si512((__m512i*)ptr, vals);
    }
  }

  // Full-register unaligned load.
  static Vectorized<c10::qint32> loadu(const void* ptr) {
    return Vectorized<c10::qint32>(ptr);
  }

  // Partial load: read `count` elements from `ptr`, zero-filling the
  // remaining lanes.
  static Vectorized<c10::qint32> loadu(const void* ptr, int64_t count) {
    __at_align__ value_type tmp_values[size()];
    // Ensure uninitialized memory does not change the output value. See
    // https://github.com/pytorch/pytorch/issues/32502 for more details. We do
    // not initialize arrays to zero using "={0}" because gcc would compile it
    // to two instructions while a loop would be compiled to one instruction.
    for (const auto i : c10::irange(size())) {
      tmp_values[i] = 0;
    }
    std::memcpy(
        tmp_values,
        reinterpret_cast<const value_type*>(ptr),
        count * sizeof(value_type));
    return loadu(tmp_values);
  }

  // Dequantize using a precomputed (scale * -zero_point) term:
  //   result = float(vals) * scale + scale_zp_premul
  // `zero_point` is unused here because it is folded into the premul term.
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point [[maybe_unused]],
      Vectorized<float> scale_zp_premul) const {
    __m512 float_vals = _mm512_cvtepi32_ps(vals);
    return {vec::fmadd(scale, Vectorized<float>(float_vals), scale_zp_premul)};
  }

  // Dequantize without the premultiplied term:
  //   result = (float(vals) - zero_point) * scale
  float_vec_return_type dequantize(
      Vectorized<float> scale,
      Vectorized<float> zero_point) const {
    __m512 float_vals = _mm512_cvtepi32_ps(vals);
    return {(Vectorized<float>(float_vals) - zero_point) * scale};
  }

  // Quantize 16 floats to qint32 via the scalar reference implementation.
  // `inverse_scale` is part of the shared quantize() interface but unused by
  // the 32-bit path.
  static Vectorized<c10::qint32> quantize(
      const float_vec_return_type& rhs,
      float scale,
      int32_t zero_point,
      float inverse_scale [[maybe_unused]]) {
    Vectorized<c10::qint32> retval;
    auto rhs_data = (__m512)rhs[0];
    // Use size() rather than a hardcoded 16 so the element count stays in
    // sync with the register width.
    at::native::quantize_vec<c10::qint32, /*precision=*/32>(
        scale,
        zero_point,
        (float*)&rhs_data,
        (c10::qint32*)&retval.vals,
        size());
    return retval;
  }

  // Lane-wise maximum/minimum against another vector.
  Vectorized<c10::qint32> maximum(Vectorized<c10::qint32> b) const {
    return _mm512_max_epi32(vals, b.vals);
  }
  Vectorized<c10::qint32> minimum(Vectorized<c10::qint32> b) const {
    return _mm512_min_epi32(vals, b.vals);
  }

  // ReLU in the quantized domain: clamp below at the zero-point.
  Vectorized<c10::qint32> relu(Vectorized<c10::qint32> zero_point) const {
    return maximum(zero_point);
  }

  // ReLU6: clamp lanes to [zero_point, q_six]. Marked const for consistency
  // with relu/maximum/minimum (it reads but never mutates `vals`); adding
  // const is backward-compatible for all callers.
  Vectorized<c10::qint32> relu6(
      Vectorized<c10::qint32> zero_point,
      Vectorized<c10::qint32> q_six) const {
    return _mm512_min_epi32(
        _mm512_max_epi32(vals, zero_point.vals), q_six.vals);
  }

  // Subtract producing the widened intermediate int domain. For qint32 the
  // underlying type is already 32-bit, so this is a plain lane-wise subtract.
  int_vec_return_type widening_subtract(Vectorized<c10::qint32> b) const {
    return {_mm512_sub_epi32(vals, b)};
  }

  // Requantize intermediate int32 values:
  //   out = cvt(float(inp) * multiplier) + zero_point
  // _mm512_cvtps_epi32 rounds using the current MXCSR mode (typically
  // round-to-nearest-even).
  static Vectorized<c10::qint32> requantize_from_int(
      const int_vec_return_type& inp,
      float multiplier,
      int32_t zero_point) {
    __m512 multiplier_v = _mm512_set1_ps(multiplier);
    __m512i zero_point_v = _mm512_set1_epi32(zero_point);
    __m512 scaled = _mm512_mul_ps(_mm512_cvtepi32_ps(inp[0]), multiplier_v);
    __m512i rounded = _mm512_cvtps_epi32(scaled);
    return _mm512_add_epi32(rounded, zero_point_v);
  }

 private:
  // Load-from-memory constructor (unaligned full-register load). Kept
  // private so untyped pointers go through the public loadu() entry points.
  Vectorized(const void* ptr) {
    vals = _mm512_loadu_si512((const __m512i*)ptr);
  }
};
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free