diff --git a/cpu-fast-inference.patch b/cpu-fast-inference.patch new file mode 100644 index 0000000000000000000000000000000000000000..90143ed61cc16e1dc32d6636e4248799674dc0a8 --- /dev/null +++ b/cpu-fast-inference.patch @@ -0,0 +1,5819 @@ +diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake +index 714abca2a..c5113f059 100644 +--- a/cmake/cpu_extension.cmake ++++ b/cmake/cpu_extension.cmake +@@ -72,17 +72,14 @@ endfunction() + + is_avx512_disabled(AVX512_DISABLED) + +-if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") +- set(APPLE_SILICON_FOUND TRUE) +-else() +- find_isa(${CPUINFO} "avx2" AVX2_FOUND) +- find_isa(${CPUINFO} "avx512f" AVX512_FOUND) +- find_isa(${CPUINFO} "POWER10" POWER10_FOUND) +- find_isa(${CPUINFO} "POWER9" POWER9_FOUND) +- find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support +- find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support +-endif() ++find_isa(${CPUINFO} "avx2" AVX2_FOUND) ++find_isa(${CPUINFO} "avx512f" AVX512_FOUND) ++find_isa(${CPUINFO} "POWER10" POWER10_FOUND) ++find_isa(${CPUINFO} "POWER9" POWER9_FOUND) ++find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support ++find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support + ++add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + + if (AVX512_FOUND AND NOT AVX512_DISABLED) + list(APPEND CXX_COMPILE_FLAGS +@@ -119,16 +116,19 @@ elseif (ASIMD_FOUND) + message(STATUS "ARMv8 or later architecture detected") + if(ARM_BF16_FOUND) + message(STATUS "BF16 extension detected") +- set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16") ++ set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16+i8mm") ++ # set(MARCH_FLAGS "-march=armv8.6-a+bf16+dotprod+fp16+sve+i8mm") + add_compile_definitions(ARM_BF16_SUPPORT) + else() + message(WARNING "BF16 functionality is not available") +- set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") ++ set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16+i8mm") ++ # set(MARCH_FLAGS "-march=armv8.6-a+bf16+dotprod+fp16+sve+i8mm") + endif() +- list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) +-elseif(APPLE_SILICON_FOUND) +- message(STATUS "Apple Silicon Detected") +- set(ENABLE_NUMA OFF) ++ list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS} "-fpermissive" ++ "-O3" "-funroll-loops" "-fomit-frame-pointer" ++ "-ffast-math" "-finline-functions" ++ "-flto" "-ftree-vectorize" ++ "-falign-functions=16" "-falign-loops=16" "-fno-unwind-tables") + else() + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") + endif() +@@ -182,7 +182,9 @@ set(VLLM_EXT_SRC + "csrc/cpu/utils.cpp" + "csrc/cpu/layernorm.cpp" + "csrc/cpu/pos_encoding.cpp" +- "csrc/cpu/torch_bindings.cpp") ++ "csrc/cpu/sysHAX_ops.cpp" ++ "csrc/cpu/torch_bindings.cpp" ++ "csrc/cpu/quantize.cpp") + + if (AVX512_FOUND AND NOT AVX512_DISABLED) + set(VLLM_EXT_SRC +diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp +index 990e99f2f..db71196a4 100644 +--- a/csrc/cpu/cpu_types_arm.hpp ++++ b/csrc/cpu/cpu_types_arm.hpp +@@ -2,6 +2,10 @@ + #include + #include + ++typedef float16_t f16; ++extern float f16_to_f32(f16 h); ++extern f16 f32_to_f16(float h); ++ + namespace vec_op { + + #ifdef ARM_BF16_SUPPORT +@@ -65,96 +69,71 @@ struct FP16Vec8 : public Vec { + }; + + struct FP16Vec16 : public Vec { +- constexpr static int VEC_ELEM_NUM = 16; +- +- float16x8x2_t reg; +- +- explicit FP16Vec16(const void* ptr) { +- reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); +- reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); +- } +- +- explicit 
FP16Vec16(const FP32Vec16& vec); +- +- void save(void* ptr) const { +- vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); +- vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); +- } ++ constexpr static int VEC_ELEM_NUM = 16; ++ ++ union { ++ float16x8x2_t reg; ++ f16 s[VEC_ELEM_NUM]; ++ }; ++ ++ explicit FP16Vec16() { ++ reg.val[0] = reg.val[1] = vdupq_n_f16(0.0f); ++ } ++ ++ explicit FP16Vec16(const void *ptr) { ++ reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); ++ reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); ++ } ++ ++ explicit FP16Vec16(const FP32Vec16& vec); + +- void save(void* ptr, const int elem_num) const { +- int full_blocks = elem_num / 8; +- int remainder = elem_num % 8; ++ explicit FP16Vec16(const FP16Vec8& vec) { ++ reg.val[0] = reg.val[1] = vec.reg; ++ } ++ ++ void save(void *ptr) const { ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); ++ } ++ ++ void save(void *ptr, const int elem_num) const { ++ int full_blocks = elem_num / 8; ++ int remainder = elem_num % 8; ++ ++ if (full_blocks > 0) { ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); ++ if (full_blocks > 1) { ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); ++ } ++ } ++ ++ if (remainder > 0) { ++ float16x8_t temp = reg.val[full_blocks]; ++ for (int i = 0; i < remainder; ++i) { ++ reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); ++ } ++ } ++ } + +- if (full_blocks > 0) { +- vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); +- if (full_blocks > 1) { +- vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); +- } ++ f16 reduce_sum() const { ++ float16x8_t sum = vaddq_f16(reg.val[0], reg.val[1]); ++ float32x4_t t0 = vcvt_f32_f16(vget_low_f16(sum)); ++ float32x4_t t1 = vcvt_f32_f16(vget_high_f16(sum)); ++ return f32_to_f16(vaddvq_f32(vaddq_f32(t0, t1))); + } + +- // Note: below is the unrolled version of the following code: +- // +- // for (int i = 0; i < remainder; ++i) { +- // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = +- // vgetq_lane_f16(temp, i); +- // } +- // +- // For macOS build (Clang), the arm/neon intrinsics function +- // `vgetq_lane_f16` needs the parameter `i` to be constant at compile +- // time. 
+- +- if (remainder > 0) { +- float16x8_t temp = reg.val[full_blocks]; +- __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); +- switch (remainder) { +- case 1: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- break; +- case 2: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- break; +- case 3: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- break; +- case 4: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); +- break; +- case 5: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); +- fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); +- break; +- case 6: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); +- fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); +- fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); +- break; +- case 7: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); +- fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); +- fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); +- fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); +- break; +- +- default: +- break; +- } ++ template ++ float reduce_sub_sum(int idx) { ++ f16 sum = 0.0; ++ constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); ++ uint32_t mask = base_mask << (idx * group_size); ++ unroll_loop([&sum, &mask, this](int i){ ++ int flag = mask & 0x1; ++ mask = mask >> 1; ++ if (flag != 0) sum += s[i]; ++ }); ++ return sum; + } +- } + }; + + #ifdef ARM_BF16_SUPPORT +@@ -550,6 +529,11 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]); + }; + ++inline void fma(FP16Vec16 &acc, FP16Vec16 &a, FP16Vec16 &b) { ++ acc.reg.val[0] = vfmaq_f16(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); ++ acc.reg.val[1] = vfmaq_f16(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); ++} ++ + #ifdef ARM_BF16_SUPPORT + inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { + float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); +diff --git a/csrc/cpu/instruct.h b/csrc/cpu/instruct.h +new file mode 100644 +index 000000000..07eac058b +--- /dev/null ++++ b/csrc/cpu/instruct.h +@@ -0,0 +1,81 @@ ++#pragma once ++ ++#include ++ ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) ++ ++#ifdef __ARM_NEON ++ ++typedef struct ggml_int16x8x2_t { ++ int16x8_t val[2]; ++} ggml_int16x8x2_t; ++ ++inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { ++ ggml_int16x8x2_t res; ++ ++ res.val[0] = vld1q_s16(ptr + 0); ++ res.val[1] = vld1q_s16(ptr + 8); ++ ++ return res; ++} ++ ++typedef struct ggml_uint8x16x2_t { ++ uint8x16_t val[2]; ++} ggml_uint8x16x2_t; ++ ++inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { ++ ggml_uint8x16x2_t res; ++ ++ res.val[0] = vld1q_u8(ptr + 0); ++ res.val[1] = vld1q_u8(ptr + 16); ++ ++ return res; ++} ++ ++typedef struct ggml_uint8x16x4_t { ++ uint8x16_t val[4]; ++} ggml_uint8x16x4_t; ++ ++inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { ++ ggml_uint8x16x4_t res; ++ ++ res.val[0] = vld1q_u8(ptr + 0); ++ res.val[1] = vld1q_u8(ptr + 16); ++ res.val[2] = vld1q_u8(ptr + 32); ++ res.val[3] = vld1q_u8(ptr + 48); ++ ++ return res; ++} ++ ++typedef struct ggml_int8x16x2_t { ++ int8x16_t val[2]; ++} ggml_int8x16x2_t; ++ ++inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { ++ ggml_int8x16x2_t res; ++ ++ res.val[0] = vld1q_s8(ptr + 0); ++ res.val[1] = vld1q_s8(ptr + 16); ++ ++ return res; ++} ++ ++typedef struct ggml_int8x16x4_t { ++ int8x16_t val[4]; ++} ggml_int8x16x4_t; ++ ++inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { ++ ggml_int8x16x4_t res; ++ ++ res.val[0] = vld1q_s8(ptr + 0); ++ res.val[1] = vld1q_s8(ptr + 16); ++ res.val[2] = vld1q_s8(ptr + 32); ++ res.val[3] = vld1q_s8(ptr + 48); ++ ++ return res; ++} ++ ++#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) ++ ++#endif +\ No newline at end of file +diff --git a/csrc/cpu/quantize.cpp b/csrc/cpu/quantize.cpp +new file mode 100644 +index 000000000..2194f8fdf +--- /dev/null ++++ b/csrc/cpu/quantize.cpp +@@ -0,0 +1,3311 @@ ++#include ++#include ++#if __AVX__ || __AVX2__ || __AVX512F__ ++#include ++#include ++#endif ++#ifdef __ARM_FEATURE_SVE ++#include ++#endif ++#include "quantize.h" ++#include "instruct.h" ++ ++static inline int nearest_int(float fval) ++{ ++ float val = fval + 12582912.f; ++ int i; memcpy(&i, &val, sizeof(int)); ++ return (i & 0x007fffff) - 0x00400000; ++} ++ ++#if __AVX__ || __AVX2__ || __AVX512F__ ++static inline float hsum_float_8(const __m256 x) ++{ ++ __m128 res = _mm256_extractf128_ps(x, 1); ++ res = _mm_add_ps(res, _mm256_castps256_ps128(x)); ++ res = _mm_add_ps(res, _mm_movehl_ps(res, res)); ++ res = _mm_add_ss(res, _mm_movehdup_ps(res)); ++ return _mm_cvtss_f32(res); ++} ++#endif ++ ++void dequantize_row_q2_K(const block_q2_K *__restrict__ src, float *__restrict__ dst, int64_t k) ++{ ++ const int nb = k / QK_K; ++ for (int i = 0; i < nb; i++) { ++ const float d = GGML_FP16_TO_FP32(src[i].GGML_COMMON_AGGR.d); ++ const float min = GGML_FP16_TO_FP32(src[i].GGML_COMMON_AGGR.dmin); ++ const uint8_t *q = src[i].qs; ++#if QK_K == 256 ++ int is = 0; ++ float dl, ml; ++ for (int n = 0; n < QK_K; n += 128) { ++ int shift = 0; ++ for (int j = 0; j < 4; ++j) { ++ uint8_t sc = src[i].scales[is++]; ++ dl = d * (sc & 0xF); ++ ml = min * (sc >> 4); ++ for (int l = 0; l < 16; ++l) { ++ *dst++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; ++ } ++ sc = src[i].scales[is++]; ++ dl = d * (sc & 0xF); ++ ml = min * (sc >> 4); ++ for (int l = 0; l < 16; ++l) { ++ *dst++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; ++ } ++ shift += 2; ++ } ++ q += 32; ++ } ++#else ++ float dl1 = d * (src[i].scales[0] & 0xF), ml1 = min * (src[i].scales[0] >> 4); ++ float dl2 = d * (src[i].scales[1] & 
0xF), ml2 = min * (src[i].scales[1] >> 4); ++ float dl3 = d * (src[i].scales[2] & 0xF), ml3 = min * (src[i].scales[2] >> 4); ++ float dl4 = d * (src[i].scales[3] & 0xF), ml4 = min * (src[i].scales[3] >> 4); ++ for (int l = 0; l < 16; ++l) { ++ dst[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1; ++ dst[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2; ++ dst[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3; ++ dst[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4; ++ } ++ dst += QK_K; ++#endif ++ } ++} ++ ++void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) ++{ ++ int64_t i = 0; ++#if defined(__F16C__) ++ //if (ggml_cpu_has_f16c()) { ++ for (; i + 7 < n; i += 8) { ++ __m256 x_vec = _mm256_loadu_ps(x + i); ++ __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); ++ _mm_storeu_si128((__m128i *)(y + i), y_vec); ++ } ++ for(; i + 3 < n; i += 4) { ++ __m128 x_vec = _mm_loadu_ps(x + i); ++ __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); ++ _mm_storel_epi64((__m128i *)(y + i), y_vec); ++ } ++ //} ++#endif ++ for (; i < n; i++) { ++ y[i] = GGML_FP32_TO_FP16(x[i]); ++ } ++} ++ ++ ++void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) ++{ ++ for (int64_t i = 0; i < n; i++) { ++ y[i] = GGML_FP16_TO_FP32(x[i]); ++ } ++} ++ ++#define GGML_F16_SVE_STEP 64 ++#define GGML_F16_SVE_ARR 4 ++#define GGML_F16_SVE_EPR 16 ++ ++void ggml_vec_dot_f16(int n, float * __restrict__ s, size_t bs, ggml_fp16_t * __restrict__ x, size_t bx, ggml_fp16_t * __restrict__ y, size_t by, int nrc) { ++ double sumf = 0.0; ++ ++#if defined(__ARM_FEATURE_SVE) ++ svbool_t pre = svptrue_b16(); ++ ++ const int np = (n & ~(GGML_F16_SVE_STEP - 1)); ++ ++ svfloat16_t sum = {svdup_f16(0.0f)}; ++ ++ svfloat16_t ax; //需要4个256,即4个ax ++ svfloat16_t ay; ++ ++ for (int i = 0; i < np; i += GGML_F16_SVE_STEP) { ++ for (int j = 0; j < GGML_F16_SVE_ARR; j++) { //4个256,即64个f16 ++ ax = svld1_f16(pre, x + i + j * GGML_F16_SVE_EPR); ++ ay = svld1_f16(pre, y + i + j * GGML_F16_SVE_EPR); ++ ++ sum = svmla_f16_z(pre, sum, ax, ay); ++ } ++ } ++ ++ /* 合并 */ ++ sumf = svaddv_f16(pre, sum); ++ /* 剩余的维度 */ ++ for (int i = np; i < n; ++i) { ++ sumf += x[i] * y[i]; ++ } ++ ++#elif defined(GGML_SIMD) ++ const int np = (n & ~(GGML_F16_STEP - 1)); ++ ++ GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; ++ ++ GGML_F16_VEC ax[GGML_F16_ARR]; ++ GGML_F16_VEC ay[GGML_F16_ARR]; ++ ++ for (int i = 0; i < np; i += GGML_F16_STEP) { ++ for (int j = 0; j < GGML_F16_ARR; j++) { ++ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); ++ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); ++ ++ sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); ++ } ++ } ++ ++ // reduce sum0..sum3 to sum0 ++ GGML_F16_VEC_REDUCE(sumf, sum); ++ ++ // leftovers ++ for (int i = np; i < n; ++i) { ++ sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); ++ //sumf += x[i] * y[i]; ++ } ++ ++#else ++ for (int i = 0; i < n; ++i) { ++ sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); ++ } ++#endif ++ ++ *s = sumf; ++} ++ ++void dequantize_row_q4_0(const block_q4_0 * __restrict__ src, float * __restrict__ dst, int64_t k) ++{ ++ static const int qk = QK4_0; ++ const int nb = k / qk; ++ ++ for (int i = 0; i < nb; i++) { ++ const float d = GGML_FP16_TO_FP32(src[i].d); ++ ++ for (int j = 0; j < qk / 2; ++j) { ++ const int x0 = (src[i].qs[j] & 0x0F) - 8; ++ const int x1 = (src[i].qs[j] >> 4) - 8; ++ ++ dst[i*qk + j + 0 ] = x0*d; ++ dst[i*qk + j + qk/2] = x1*d; ++ } ++ } ++} ++ ++void 
dequantize_row_q4_1(const block_q4_1 * __restrict__ src, float * __restrict__ dst, int64_t k) ++{ ++ static const int qk = QK4_1; ++ ++ const int nb = k / qk; ++ ++ for (int i = 0; i < nb; i++) { ++ const float d = GGML_FP16_TO_FP32(src[i].d); ++ const float m = GGML_FP16_TO_FP32(src[i].m); ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const int x0 = (src[i].qs[j] & 0x0F); ++ const int x1 = (src[i].qs[j] >> 4); ++ ++ dst[i*qk + j + 0 ] = x0*d + m; ++ dst[i*qk + j + qk/2] = x1*d + m; ++ } ++ } ++} ++ ++void dequantize_row_q8_0(const block_q8_0 *__restrict__ x, float *__restrict__ y, int64_t k) ++{ ++ static const int qk = QK8_0; ++ ++ const int nb = k / qk; ++ for (int i = 0; i < nb; i++) { ++ const float d = GGML_FP16_TO_FP32(x[i].d); ++ ++ for (int j = 0; j < qk; ++j) { ++ y[i*qk + j] = x[i].qs[j]*d; ++ } ++ } ++} ++ ++/* QK_K == 256 */ ++void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ (void)(nrc); ++ (void)(bx); ++ (void)(by); ++ (void)(bs); ++ ++ const block_q2_K * __restrict__ x = vx; ++ const block_q8_K * __restrict__ y = vy; ++ const int nb = n / QK_K; ++ ++#ifdef __ARM_NEON ++ const uint8x16_t m3 = vdupq_n_u8(0x3); ++ const uint8x16_t m4 = vdupq_n_u8(0xF); ++ ++ const int32x4_t vzero = vdupq_n_s32(0); ++ ++ ggml_int8x16x2_t q2bytes; ++ uint8_t aux[16]; ++ ++ float sum = 0; ++ ++ for (int i = 0; i < nb; ++i) { ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.d); ++ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.dmin); ++ ++ const uint8_t * __restrict__ q2 = x[i].qs; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ const uint8_t * __restrict__ sc = x[i].scales; ++ ++ const uint8x16_t mins_and_scales = vld1q_u8(sc); ++ const uint8x16_t scales = vandq_u8(mins_and_scales, m4); ++ vst1q_u8(aux, scales); ++ ++ const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); ++ const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); ++ const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; ++ const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), ++ vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); ++ const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), ++ vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1]))); ++ sum += dmin * vaddvq_s32(vaddq_s32(s0, s1)); ++ ++ int isum = 0; ++ int is = 0; ++ ++// We use this macro instead of a function call because for some reason ++// the code runs 2-3% slower, even if the function is declared inline ++#define MULTIPLY_ACCUM_WITH_SCALE(index)\ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; ++ ++#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ ++ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\ ++ q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\ ++ q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ ++ MULTIPLY_ACCUM_WITH_SCALE((index)); ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32; ++ ++ ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; ++ q2bytes.val[0] = 
vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); ++ q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); ++ ++ MULTIPLY_ACCUM_WITH_SCALE(0); ++ ++ SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2); ++ SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4); ++ SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6); ++ ++ is += 8; ++ } ++ ++ sum += d * isum; ++ } ++ ++ *s = sum; ++ ++#elif defined __AVX2__ ++ ++ const __m256i m3 = _mm256_set1_epi8(3); ++ const __m128i m4 = _mm_set1_epi8(0xF); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); ++ ++ const uint8_t * __restrict__ q2 = x[i].qs; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); ++ const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); ++ const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); ++ const __m256i mins = _mm256_cvtepi8_epi16(mins8); ++ const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums)); ++ ++ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc); ++ ++ const __m256i all_scales = _mm256_cvtepi8_epi16(scales8); ++ const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); ++ const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); ++ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; ++ ++ __m256i sumi = _mm256_setzero_si256(); ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32; ++ ++ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ ++ const __m256i q2_0 = _mm256_and_si256(q2bits, m3); ++ const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3); ++ const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3); ++ const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3); ++ ++ __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); ++ __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); ++ __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2); ++ __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3); ++ ++ p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0); ++ p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1); ++ p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2); ++ p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3); ++ ++ p0 = _mm256_add_epi32(p0, p1); ++ p2 = _mm256_add_epi32(p2, p3); ++ ++ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2)); ++ } ++ ++ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); ++ ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __AVX__ ++ ++ const __m128i m3 = _mm_set1_epi8(0x3); ++ const __m128i m4 = _mm_set1_epi8(0xF); ++ const __m128i m2 = _mm_set1_epi8(0x2); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.d); ++ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.dmin); ++ ++ const uint8_t * __restrict__ q2 = x[i].qs; ++ const 
int8_t * __restrict__ q8 = y[i].qs; ++ ++ // load mins and scales from block_q2_K.scales[QK_K/16] ++ const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); ++ const __m128i scales16 = _mm_and_si128(mins_and_scales, m4); ++ const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); ++ const __m128i mins_0 = _mm_cvtepi8_epi16(mins16); ++ const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16)); ++ ++ // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2 ++ const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0])); ++ const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); ++ ++ // sumf += -dmin * summs in 32bits*8 ++ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc); ++ ++ const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); ++ const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); ++ const __m128i scales[2] = { scales_0, scales_1 }; ++ ++ __m128i sumi_0 = _mm_setzero_si128(); ++ __m128i sumi_1 = _mm_setzero_si128(); ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K] ++ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ ++ // load 2bits*16*8 from block_q2_K.qs[QK_K/4] ++ __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; ++ const __m128i q2_0 = _mm_and_si128(q2bits, m3); ++ const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); ++ const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); ++ const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); ++ q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; ++ const __m128i q2_1 = _mm_and_si128(q2bits, m3); ++ const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); ++ const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); ++ const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); ++ ++ // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8 ++ __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0); ++ __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1); ++ __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2); ++ __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3); ++ __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4); ++ __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5); ++ __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6); ++ __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7); ++ ++ // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8 ++ __m128i shuffle = _mm_set1_epi16(0x0100); ++ p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3); ++ shuffle = 
_mm_add_epi16(shuffle, m2); ++ p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7); ++ ++ p0 = _mm_add_epi32(p0, p1); ++ p2 = _mm_add_epi32(p2, p3); ++ p4 = _mm_add_epi32(p4, p5); ++ p6 = _mm_add_epi32(p6, p7); ++ ++ // isum in 32bits*4*2 ++ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2)); ++ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6)); ++ } ++ ++ // sumf += dall * isum - dmin * summs in 32bits ++ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); ++ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __riscv_v_intrinsic ++ ++ float sumf = 0; ++ uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const uint8_t * q2 = x[i].qs; ++ const int8_t * q8 = y[i].qs; ++ const uint8_t * sc = x[i].scales; ++ ++ const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); ++ ++ size_t vl = 16; ++ ++ vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl); ++ vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl); ++ ++ vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl); ++ ++ vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl); ++ vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl); ++ vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl)); ++ vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl); ++ vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); ++ ++ sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums); ++ ++ vl = 32; ++ ++ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); ++ vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl); ++ ++ uint8_t is=0; ++ int isum=0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ // load Q2 ++ vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl); ++ ++ vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl); ++ vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl); ++ vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl); ++ vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl); ++ ++ // duplicate scale elements for product ++ vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl); ++ vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl); ++ vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl); ++ vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl); ++ ++ vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl)); ++ vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl)); ++ vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl)); ++ vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl)); ++ ++ // load Q8 ++ vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); ++ vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); ++ vint8m1_t q8_2 = 
__riscv_vle8_v_i8m1(q8+64, vl);
++            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
++
++            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
++            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
++            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
++            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
++
++            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
++            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
++
++            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
++
++            q2+=32; q8+=128; is=8;
++
++        }
++
++        sumf += dall * isum;
++
++    }
++
++    *s = sumf;
++
++#else
++    float sumf = 0;
++    for (int i = 0; i < nb; ++i) {
++        const uint8_t * q2 = x[i].qs;
++        const int8_t * q8 = y[i].qs;
++        const uint8_t * sc = x[i].scales;
++
++        int summs = 0;
++        for (int j = 0; j < 16; ++j) {
++            summs += y[i].bsums[j] * (sc[j] >> 4);
++        }
++
++        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.d);
++        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.dmin);
++
++        int isum = 0;
++        int is = 0;
++        int d;
++        for (int k = 0; k < QK_K/128; ++k) {
++            int shift = 0;
++            for (int j = 0; j < 4; ++j) {
++                d = sc[is++] & 0xF;
++                int isuml = 0;
++                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
++                isum += d * isuml;
++                d = sc[is++] & 0xF;
++                isuml = 0;
++                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
++                isum += d * isuml;
++                shift += 2;
++                q8 += 32;
++            }
++            q2 += 32;
++        }
++        sumf += dall * isum - dmin * summs;
++    }
++    *s = sumf;
++#endif
++}
++
++static float make_qkx2_quants(int n, int nmax, const float *__restrict__ x, const float * __restrict__ weights,
++        uint8_t * __restrict__ L, float * __restrict__ the_min, uint8_t *__restrict__ Laux, // Laux holds the candidate quants of each fine-tuning step
++        float rmin, float rdelta, int nstep, int use_mad)
++{
++    float min = x[0];
++    float max = x[0];
++    float sum_w = weights[0];
++    float sum_x = sum_w * x[0];
++#ifdef HAVE_BUGGY_APPLE_LINKER
++    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
++    for (volatile int i = 1; i < n; ++i) {
++#else
++    for (int i = 1; i < n; ++i) { // find the min and max of the data
++#endif
++        if (x[i] < min) min = x[i];
++        if (x[i] > max) max = x[i];
++        float w = weights[i];
++        sum_w += w; // accumulated weight (sum of |x|)
++        sum_x += w * x[i]; // weighted sum of x (signed, since w = |x|)
++    }
++    if (min > 0) min = 0; // if even the minimum is positive, clamp min to 0
++    if (max == min) { // all elements are identical
++        for (int i = 0; i < n; ++i) L[i] = 0;
++        *the_min = -min;
++        return 0.f; // scale is 0 in this case
++    }
++    float iscale = nmax/(max - min); // quantization step: 2 bits cover a range of 3
++    float scale = 1/iscale; // scale factor
++    float best_mad = 0;
++    for (int i = 0; i < n; ++i) {
++        int l = nearest_int(iscale*(x[i] - min)); // integer this element maps to
++        L[i] = MAX(0, MIN(nmax, l)); // clamp to the representable range, always 0-3
++        float diff = scale * L[i] + min - x[i]; // dequantize to measure the error
++        diff = use_mad ? fabsf(diff) : diff * diff; // 1: use absolute error, otherwise squared error
++        float w = weights[i];
++        best_mad += w * diff; // accumulate error * |element|
++    }
++    if (nstep < 1) { // 0 means no fine-tuning
++        *the_min = -min;
++        return scale;
++    }
++    for (int is = 0; is <= nstep; ++is) { // 15 + 1 iterations
++        iscale = (rmin + rdelta*is + nmax)/(max - min); // -0.5 + 0.1*is + 3: fine-tune the representable range on each step
++        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
++        for (int i = 0; i < n; ++i) {
++            int l = nearest_int(iscale*(x[i] - min));
++            l = MAX(0, MIN(nmax, l));
++            Laux[i] = l;
++            float w = weights[i];
++            sum_l += w*l;
++            sum_l2 += w*l*l;
++            sum_xl += w*l*x[i];
++        }
++        float D = sum_w * sum_l2 - sum_l * sum_l;
++        if (D > 0) {
++            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
++            float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D;
++            if (this_min > 0) {
++                this_min = 0;
++                this_scale = sum_xl / sum_l2;
++            }
++            float mad = 0;
++            for (int i = 0; i < n; ++i) {
++                float diff = this_scale * Laux[i] + this_min - x[i];
++                diff = use_mad ? fabsf(diff) : diff * diff;
++                float w = weights[i];
++                mad += w * diff;
++            }
++            if (mad < best_mad) { // smaller mad means a better fit
++                for (int i = 0; i < n; ++i) {
++                    L[i] = Laux[i];
++                }
++                best_mad = mad;
++                scale = this_scale; // best scale for this block
++                min = this_min;
++            }
++        }
++    }
++    *the_min = -min; // return the best min of the block (negated)
++    return scale;
++}
++
++void quantize_row_q2_K(const float *__restrict__ x, block_q2_K *__restrict__ y, int64_t k)
++{
++    const int nb = k / QK_K; // number of super-blocks
++    uint8_t L[QK_K];
++    uint8_t Laux[16];
++    float weights[16];
++    float mins[QK_K/16];
++    float scales[QK_K/16];
++    const float q4scale = 15.f;
++
++    for (int i = 0; i < nb; i++) {
++        float max_scale = 0; // as we are deducting the min, scales are always positive
++        float max_min = 0;
++        for (int j = 0; j < QK_K/16; ++j) { // a super-block has 16 sub-blocks of 16 elements each
++            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); // use |x| as the weights
++            // find the scale that minimizes the quantization error
++            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, 1);
++            float scale = scales[j];
++            if (scale > max_scale) { // track the largest scale in the super-block
++                max_scale = scale;
++            }
++            float min = mins[j];
++            if (min > max_min) { // make_qkx2_quants returns -min, so this tracks the most negative min in the super-block
++                max_min = min;
++            }
++        }
++
++        if (max_scale > 0) {
++            float iscale = q4scale/max_scale; // re-quantize the scales to 4 bits; the smallest scale is 0, so no min_scale is needed
++            for (int j = 0; j < QK_K/16; ++j) {
++                int l = nearest_int(iscale*scales[j]);
++                y[i].scales[j] = l; // the super-block scales are the re-quantized sub-block scales, stored in the low 4 bits
++            }
++            y[i].GGML_COMMON_AGGR.d = GGML_FP32_TO_FP16(max_scale/q4scale); // d is the super-block scale of the quantized scales
++        } else {
++            for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
++            y[i].GGML_COMMON_AGGR.d = GGML_FP32_TO_FP16(0.f);
++        }
++        if (max_min > 0) { // sub-block mins are also re-quantized to 4 bits
++            float iscale = q4scale/max_min; // baseline is 0 as well
++            for (int j = 0; j < QK_K/16; ++j) {
++                int l = nearest_int(iscale*mins[j]);
++                y[i].scales[j] |= (l << 4); // packed into the high 4 bits
++            }
++            y[i].GGML_COMMON_AGGR.dmin = GGML_FP32_TO_FP16(max_min/q4scale); // quantization factor for the mins
++        } else {
++            y[i].GGML_COMMON_AGGR.dmin = GGML_FP32_TO_FP16(0.f);
++        }
++        for (int j = 0; j < QK_K/16; ++j) {
++            const float d = GGML_FP16_TO_FP32(y[i].GGML_COMMON_AGGR.d) * (y[i].scales[j] & 0xF); // recover the sub-block scale
++            if (!d) continue;
++            const float dm = GGML_FP16_TO_FP32(y[i].GGML_COMMON_AGGR.dmin) * (y[i].scales[j] >> 4); // recover the sub-block min
++            for (int ii = 0; ii < 16; ++ii) {
++                int l = nearest_int((x[16*j + ii] + dm)/d); // q = (x + dm) / d
++                l = MAX(0, MIN(3, l));
++                L[16*j + ii] = l; // range 0-3, so L needs only 2 bits; each element is re-quantized with the sub-block scale and dmin
++            }
++        }
++
++#if QK_K == 256
++        for (int j = 0; j < QK_K; j += 128) {
++            for (int l = 0; l < 32; ++l) { // 32 bytes per pass, packing the values at offsets 0, 32, 64, 96 (128 values per pass)
++                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
++            }
++        }
++#else
++        for (int l = 0; l < 16; ++l) {
++            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
++        }
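++        // Both packing variants store four 2-bit quants per qs byte at shifts 0/2/4/6;
++        // dequantize_row_q2_K above unpacks them with the matching shifts.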
++#endif ++ ++ x += QK_K; ++ ++ } ++} ++ ++void quantize_row_q4_0(const float *__restrict__ x, block_q4_0 *__restrict__ y, int64_t k) ++{ ++ static const int qk = QK4_0; ++ const int nb = k / qk; ++ ++ for (int i = 0; i < nb; i++) { ++ float amax = 0.0f; // absolute max ++ float max = 0.0f; ++ ++ for (int j = 0; j < qk; j++) { ++ const float v = x[i*qk + j]; ++ if (amax < fabsf(v)) { ++ amax = fabsf(v); ++ max = v; ++ } ++ } ++ ++ const float d = max / -8; ++ const float id = d ? 1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const float x0 = x[i*qk + 0 + j]*id; ++ const float x1 = x[i*qk + qk/2 + j]*id; ++ ++ const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); ++ const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); ++ ++ y[i].qs[j] = xi0; ++ y[i].qs[j] |= xi1 << 4; ++ } ++ } ++} ++ ++void quantize_row_q4_1(const float *__restrict__ x, block_q4_1 *__restrict__ y, int64_t k) ++{ ++ const int qk = QK4_1; ++ ++ const int nb = k / qk; ++ ++ for (int i = 0; i < nb; i++) { ++ float min = FLT_MAX; ++ float max = -FLT_MAX; ++ ++ for (int j = 0; j < qk; j++) { ++ const float v = x[i*qk + j]; ++ ++ if (v < min) min = v; ++ if (v > max) max = v; ++ } ++ ++ const float d = (max - min) / ((1 << 4) - 1); ++ const float id = d ? 1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ y[i].m = GGML_FP32_TO_FP16(min); ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const float x0 = (x[i*qk + 0 + j] - min)*id; ++ const float x1 = (x[i*qk + qk/2 + j] - min)*id; ++ ++ const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); ++ const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); ++ ++ y[i].qs[j] = xi0; ++ y[i].qs[j] |= xi1 << 4; ++ } ++ } ++} ++ ++void quantize_row_q8_0(const float *__restrict__ x, block_q8_0 *__restrict__ vy, int64_t k) ++{ ++ const int nb = k / QK8_0; ++ block_q8_0 *__restrict__ y = vy; ++ ++#if defined(__ARM_NEON) ++ for (int i = 0; i < nb; i++) { ++ float32x4_t srcv [8]; ++ float32x4_t asrcv[8]; ++ float32x4_t amaxv[8]; ++ ++ for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); ++ for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); ++ ++ for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); ++ for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); ++ for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); ++ ++ const float amax = vmaxvq_f32(amaxv[0]); ++ ++ const float d = amax / ((1 << 7) - 1); ++ const float id = d ? 
1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ ++ for (int j = 0; j < 8; j++) { ++ const float32x4_t v = vmulq_n_f32(srcv[j], id); ++ const int32x4_t vi = vcvtnq_s32_f32(v); ++ ++ y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); ++ y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); ++ y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); ++ y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); ++ } ++ } ++#elif defined(__wasm_simd128__) ++ for (int i = 0; i < nb; i++) { ++ v128_t srcv [8]; ++ v128_t asrcv[8]; ++ v128_t amaxv[8]; ++ ++ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); ++ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); ++ ++ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); ++ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); ++ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); ++ ++ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), ++ wasm_f32x4_extract_lane(amaxv[0], 1)), ++ MAX(wasm_f32x4_extract_lane(amaxv[0], 2), ++ wasm_f32x4_extract_lane(amaxv[0], 3))); ++ ++ const float d = amax / ((1 << 7) - 1); ++ const float id = d ? 1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ ++ for (int j = 0; j < 8; j++) { ++ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); ++ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); ++ ++ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); ++ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); ++ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); ++ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); ++ } ++ } ++#elif defined(__AVX2__) || defined(__AVX__) ++ for (int i = 0; i < nb; i++) { ++ // Load elements into 4 AVX vectors ++ __m256 v0 = _mm256_loadu_ps( x ); ++ __m256 v1 = _mm256_loadu_ps( x + 8 ); ++ __m256 v2 = _mm256_loadu_ps( x + 16 ); ++ __m256 v3 = _mm256_loadu_ps( x + 24 ); ++ x += 32; ++ ++ // Compute max(abs(e)) for the block ++ const __m256 signBit = _mm256_set1_ps( -0.0f ); ++ __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); ++ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); ++ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); ++ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); ++ ++ __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); ++ max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); ++ max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); ++ const float maxScalar = _mm_cvtss_f32( max4 ); ++ ++ // Quantize these floats ++ const float d = maxScalar / 127.f; ++ y[i].d = GGML_FP32_TO_FP16(d); ++ const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; ++ const __m256 mul = _mm256_set1_ps( id ); ++ ++ // Apply the multiplier ++ v0 = _mm256_mul_ps( v0, mul ); ++ v1 = _mm256_mul_ps( v1, mul ); ++ v2 = _mm256_mul_ps( v2, mul ); ++ v3 = _mm256_mul_ps( v3, mul ); ++ ++ // Round to nearest integer ++ v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); ++ v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); ++ v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); ++ v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); ++ ++ // Convert floats to integers ++ __m256i i0 = _mm256_cvtps_epi32( v0 ); ++ __m256i i1 = _mm256_cvtps_epi32( v1 ); ++ __m256i i2 = _mm256_cvtps_epi32( v2 ); ++ __m256i i3 = _mm256_cvtps_epi32( v3 ); ++ ++#if defined(__AVX2__) ++ // Convert int32 to int16 ++ i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 ++ i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 ++ // Convert int16 to int8 ++ i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 ++ ++ // We got our precious signed bytes, but the order is now wrong ++ // These AVX2 pack instructions process 16-byte pieces independently ++ // The following instruction is fixing the order ++ const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); ++ i0 = _mm256_permutevar8x32_epi32( i0, perm ); ++ ++ _mm256_storeu_si256((__m256i *)y[i].qs, i0); ++#else ++ // Since we don't have in AVX some necessary functions, ++ // we split the registers in half and call AVX2 analogs from SSE ++ __m128i ni0 = _mm256_castsi256_si128( i0 ); ++ __m128i ni1 = _mm256_extractf128_si256( i0, 1); ++ __m128i ni2 = _mm256_castsi256_si128( i1 ); ++ __m128i ni3 = _mm256_extractf128_si256( i1, 1); ++ __m128i ni4 = _mm256_castsi256_si128( i2 ); ++ __m128i ni5 = _mm256_extractf128_si256( i2, 1); ++ __m128i ni6 = _mm256_castsi256_si128( i3 ); ++ __m128i ni7 = _mm256_extractf128_si256( i3, 1); ++ ++ // Convert int32 to int16 ++ ni0 = _mm_packs_epi32( ni0, ni1 ); ++ ni2 = _mm_packs_epi32( ni2, ni3 ); ++ ni4 = _mm_packs_epi32( ni4, ni5 ); ++ ni6 = _mm_packs_epi32( ni6, ni7 ); ++ // Convert int16 to int8 ++ ni0 = _mm_packs_epi16( ni0, ni2 ); ++ ni4 = _mm_packs_epi16( ni4, ni6 ); ++ ++ _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); ++ _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); ++#endif ++ } ++#elif defined(__riscv_v_intrinsic) ++ ++ size_t vl = __riscv_vsetvl_e32m4(QK8_0); ++ ++ for (int i = 0; i < nb; i++) { ++ // load elements ++ vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); ++ ++ vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); ++ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); ++ vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); ++ float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); ++ ++ const float d = amax / ((1 << 7) - 1); ++ const float id = d ? 
1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ ++ vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); ++ ++ // convert to integer ++ vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); ++ vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); ++ ++ // store result ++ __riscv_vse8_v_i8m1(y[i].qs , vs, vl); ++ } ++#else ++ GGML_UNUSED(nb); ++ // scalar ++ quantize_row_q8_0_reference(x, y, k); ++#endif ++} ++ ++void quantize_row_q8_1(const float * __restrict__ x, block_q8_1 * __restrict__ y, int64_t k) ++{ ++ const int nb = k / QK8_1; ++ ++ for (int i = 0; i < nb; i++) { ++ float amax = 0.0f; // absolute max ++ ++ for (int j = 0; j < QK8_1; j++) { ++ const float v = x[i*QK8_1 + j]; ++ amax = MAX(amax, fabsf(v)); ++ } ++ ++ const float d = amax / ((1 << 7) - 1); ++ const float id = d ? 1.0f/d : 0.0f; ++ ++ y[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d = GGML_FP32_TO_FP16(d); ++ ++ int sum = 0; ++ ++ for (int j = 0; j < QK8_1/2; ++j) { ++ const float v0 = x[i*QK8_1 + j]*id; ++ const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; ++ ++ y[i].qs[ j] = roundf(v0); ++ y[i].qs[QK8_1/2 + j] = roundf(v1); ++ ++ sum += y[i].qs[ j]; ++ sum += y[i].qs[QK8_1/2 + j]; ++ } ++ ++ y[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s = GGML_FP32_TO_FP16(sum*d); ++ } ++} ++ ++static float make_qx_quants(int n, int nmax, const float *__restrict__ x, int8_t * __restrict__ L, int rmse_type, ++ const float *__restrict__ qw) ++{ ++ float max = 0; ++ float amax = 0; ++ for (int i = 0; i < n; ++i) { ++ float ax = fabsf(x[i]); ++ if (ax > amax) { amax = ax; max = x[i]; } ++ } ++ if (amax < 1e-30f) { // all zero ++ for (int i = 0; i < n; ++i) { ++ L[i] = 0; ++ } ++ return 0.f; ++ } ++ float iscale = -nmax / max; ++ if (rmse_type == 0) { ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); ++ } ++ return 1/iscale; ++ } ++ int return_early = 0; ++ if (rmse_type < 0) { ++ rmse_type = -rmse_type; ++ return_early = 1; ++ } ++ float sumlx = 0; ++ float suml2 = 0; ++#ifdef HAVE_BUGGY_APPLE_LINKER ++ // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 ++ for (volatile int i = 0; i < n; ++i) { ++#else ++ for (int i = 0; i < n; ++i) { ++#endif ++ int l = nearest_int(iscale * x[i]); ++ l = MAX(-nmax, MIN(nmax-1, l)); ++ L[i] = l + nmax; ++ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); ++ sumlx += w*x[i]*l; ++ suml2 += w*l*l; ++ } ++ float scale = sumlx/suml2; ++ if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; ++ float best = scale * sumlx; ++ for (int is = -9; is <= 9; ++is) { ++ if (is == 0) { ++ continue; ++ } ++ iscale = -(nmax + 0.1f*is) / max; ++ sumlx = suml2 = 0; ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ l = MAX(-nmax, MIN(nmax-1, l)); ++ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? 
fabsf(x[i]) : sqrtf(fabsf(x[i])); ++ sumlx += w*x[i]*l; ++ suml2 += w*l*l; ++ } ++ if (suml2 > 0 && sumlx*sumlx > best*suml2) { ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); ++ } ++ scale = sumlx/suml2; best = scale*sumlx; ++ } ++ } ++ return scale; ++} ++ ++void quantize_row_q6_K(const float * __restrict__ x, block_q6_K *__restrict__ y, int64_t k) ++{ ++ const int64_t nb = k / QK_K; ++ ++ int8_t L[QK_K]; ++ float scales[QK_K/16]; ++ ++ for (int i = 0; i < nb; i++) { ++ float max_scale = 0; ++ float max_abs_scale = 0; ++ ++ for (int ib = 0; ib < QK_K/16; ++ib) { ++ ++ const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL); ++ scales[ib] = scale; ++ ++ const float abs_scale = fabsf(scale); ++ if (abs_scale > max_abs_scale) { ++ max_abs_scale = abs_scale; ++ max_scale = scale; ++ } ++ ++ } ++ ++ if (!max_abs_scale) { ++ memset(&y[i], 0, sizeof(block_q6_K)); ++ y[i].d = GGML_FP32_TO_FP16(0.f); ++ x += QK_K; ++ continue; ++ } ++ ++ float iscale = -128.f/max_scale; ++ y[i].d = GGML_FP32_TO_FP16(1/iscale); ++ for (int ib = 0; ib < QK_K/16; ++ib) { ++ y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); ++ } ++ ++ for (int j = 0; j < QK_K/16; ++j) { ++ float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; ++ if (!d) { ++ continue; ++ } ++ for (int ii = 0; ii < 16; ++ii) { ++ int l = nearest_int(x[16*j + ii]/d); ++ l = MAX(-32, MIN(31, l)); ++ L[16*j + ii] = l + 32; ++ } ++ } ++ ++ uint8_t * __restrict__ ql = y[i].ql; ++ uint8_t * __restrict__ qh = y[i].qh; ++#if QK_K == 256 ++ for (int j = 0; j < QK_K; j += 128) { ++ for (int l = 0; l < 32; ++l) { ++ const uint8_t q1 = L[j + l + 0] & 0xF; ++ const uint8_t q2 = L[j + l + 32] & 0xF; ++ const uint8_t q3 = L[j + l + 64] & 0xF; ++ const uint8_t q4 = L[j + l + 96] & 0xF; ++ ql[l+ 0] = q1 | (q3 << 4); ++ ql[l+32] = q2 | (q4 << 4); ++ qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); ++ } ++ ql += 64; ++ qh += 32; ++ } ++#else ++ for (int l = 0; l < 32; ++l) { ++ const uint8_t q1 = L[l + 0] & 0xF; ++ const uint8_t q2 = L[l + 32] & 0xF; ++ ql[l] = q1 | (q2 << 4); ++ } ++ for (int l = 0; l < 16; ++l) { ++ qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6); ++ } ++#endif ++ ++ x += QK_K; ++ ++ } ++} ++ ++void quantize_row_q8_K(const float *__restrict__ x, block_q8_K *__restrict__ y, int64_t k) ++{ ++ const int64_t nb = k / QK_K; ++ ++ for (int i = 0; i < nb; i++) { ++ float max = 0; ++ float amax = 0; ++ for (int j = 0; j < QK_K; ++j) { ++ float ax = fabsf(x[j]); ++ if (ax > amax) { ++ amax = ax; max = x[j]; ++ } ++ } ++ if (!amax) { ++ y[i].d = 0; ++ memset(y[i].qs, 0, QK_K); ++ x += QK_K; ++ continue; ++ } ++ //const float iscale = -128.f/max; ++ // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward ++ const float iscale = -127.f/max; ++ for (int j = 0; j < QK_K; ++j) { ++ int v = nearest_int(iscale*x[j]); ++ y[i].qs[j] = MIN(127, v); ++ } ++ for (int j = 0; j < QK_K/16; ++j) { ++ int sum = 0; ++ for (int ii = 0; ii < 16; ++ii) { ++ sum += y[i].qs[j*16 + ii]; ++ } ++ y[i].bsums[j] = sum; ++ } ++ y[i].d = 1 / iscale; ++ x += QK_K; ++ } ++} ++ ++static float make_q3_quants(int n, int nmax, const float *__restrict__ x, int8_t * __restrict__ L, int do_rmse) ++{ ++ float max = 0; ++ float amax = 0; ++ for (int i = 0; i < n; ++i) { ++ float ax = fabsf(x[i]); ++ if (ax > amax) { amax = ax; max = x[i]; } ++ } ++ if (!amax) { // 
all zero ++ for (int i = 0; i < n; ++i) { L[i] = 0; } ++ return 0.f; ++ } ++ float iscale = -nmax / max; ++ if (do_rmse) { ++ float sumlx = 0; ++ float suml2 = 0; ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ l = MAX(-nmax, MIN(nmax-1, l)); ++ L[i] = l; ++ float w = x[i]*x[i]; ++ sumlx += w*x[i]*l; ++ suml2 += w*l*l; ++ } ++ for (int itry = 0; itry < 5; ++itry) { ++ int n_changed = 0; ++ for (int i = 0; i < n; ++i) { ++ float w = x[i]*x[i]; ++ float slx = sumlx - w*x[i]*L[i]; ++ if (slx > 0) { ++ float sl2 = suml2 - w*L[i]*L[i]; ++ int new_l = nearest_int(x[i] * sl2 / slx); ++ new_l = MAX(-nmax, MIN(nmax-1, new_l)); ++ if (new_l != L[i]) { ++ slx += w*x[i]*new_l; ++ sl2 += w*new_l*new_l; ++ if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { ++ L[i] = new_l; sumlx = slx; suml2 = sl2; ++ ++n_changed; ++ } ++ } ++ } ++ } ++ if (!n_changed) { ++ break; ++ } ++ } ++ for (int i = 0; i < n; ++i) { ++ L[i] += nmax; ++ } ++ return sumlx / suml2; ++ } ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ l = MAX(-nmax, MIN(nmax-1, l)); ++ L[i] = l + nmax; ++ } ++ return 1/iscale; ++} ++ ++void quantize_row_q3_K(const float *__restrict__ x, block_q3_K *__restrict__ y, int64_t k) ++{ ++ const int nb = k / QK_K; ++ int8_t L[QK_K]; ++ float scales[QK_K / 16]; ++ ++ for (int i = 0; i < nb; i++) { ++ float max_scale = 0; ++ float amax = 0; ++ for (int j = 0; j < QK_K / 16; ++j) { ++ scales[j] = make_q3_quants(16, 4, x + 16 * j, L + 16 * j, 1); ++ float scale = fabsf(scales[j]); ++ if (scale > amax) { ++ amax = scale; max_scale = scales[j]; ++ } ++ } ++ ++#if QK_K == 256 ++ memset(y[i].scales, 0, 12); ++ if (max_scale) { ++ float iscale = -32.f/max_scale; ++ for (int j = 0; j < QK_K/16; ++j) { ++ int8_t l = nearest_int(iscale*scales[j]); ++ l = MAX(-32, MIN(31, l)) + 32; ++ if (j < 8) { ++ y[i].scales[j] = l & 0xF; ++ } else { ++ y[i].scales[j-8] |= ((l & 0xF) << 4); ++ } ++ l >>= 4; ++ y[i].scales[j%4 + 8] |= (l << (2*(j/4))); ++ } ++ y[i].d = GGML_FP32_TO_FP16(1/iscale); ++ } else { ++ y[i].d = GGML_FP32_TO_FP16(0.f); ++ } ++ ++ int8_t sc; ++ for (int j = 0; j < QK_K/16; ++j) { ++ sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; ++ sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; ++ float d = GGML_FP16_TO_FP32(y[i].d) * sc; ++ if (!d) { ++ continue; ++ } ++ for (int ii = 0; ii < 16; ++ii) { ++ int l = nearest_int(x[16*j + ii]/d); ++ l = MAX(-4, MIN(3, l)); ++ L[16*j + ii] = l + 4; ++ } ++ } ++#else ++ if (max_scale) { ++ float iscale = -8.f/max_scale; ++ for (int j = 0; j < QK_K/16; j+=2) { ++ int l1 = nearest_int(iscale*scales[j]); ++ l1 = 8 + MAX(-8, MIN(7, l1)); ++ int l2 = nearest_int(iscale*scales[j+1]); ++ l2 = 8 + MAX(-8, MIN(7, l2)); ++ y[i].scales[j/2] = l1 | (l2 << 4); ++ } ++ y[i].d = GGML_FP32_TO_FP16(1/iscale); ++ } else { ++ for (int j = 0; j < QK_K/16; j+=2) { ++ y[i].scales[j/2] = 0; ++ } ++ y[i].d = GGML_FP32_TO_FP16(0.f); ++ } ++ for (int j = 0; j < QK_K/16; ++j) { ++ int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; ++ float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8); ++ if (!d) { ++ continue; ++ } ++ for (int ii = 0; ii < 16; ++ii) { ++ int l = nearest_int(x[16*j + ii]/d); ++ l = MAX(-4, MIN(3, l)); ++ L[16*j + ii] = l + 4; ++ } ++ } ++#endif ++ ++ memset(y[i].hmask, 0, QK_K/8); ++ // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. 
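++        // With QK_K == 256 the loop below cycles m through QK_K/8 = 32 bytes, so the high bit
++        // of quant j lands in hmask[j % 32] at bit position j / 32 (e.g. j = 70 -> hmask[6], bit 2);
++        // L[j] keeps only its low 2 bits afterwards.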
++ int m = 0; ++ uint8_t hm = 1; ++ for (int j = 0; j < QK_K; ++j) { ++ if (L[j] > 3) { ++ y[i].hmask[m] |= hm; ++ L[j] -= 4; ++ } ++ if (++m == QK_K/8) { ++ m = 0; hm <<= 1; ++ } ++ } ++#if QK_K == 256 ++ for (int j = 0; j < QK_K; j += 128) { ++ for (int l = 0; l < 32; ++l) { ++ y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); ++ } ++ } ++#else ++ for (int l = 0; l < 16; ++l) { ++ y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); ++ } ++#endif ++ ++ x += QK_K; ++ } ++} ++ ++void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ const int qk = QK8_0; ++ const int nb = n / qk; ++ ++ (void)(nrc); ++ (void)(bx); ++ (void)(by); ++ (void)(bs); ++ ++ const block_q4_0 * __restrict__ x = vx; ++ const block_q8_0 * __restrict__ y = vy; ++ ++#if defined(__ARM_FEATURE_MATMUL_INT8) ++ if (nrc == 2) { ++ const block_q4_0 * __restrict__ vx0 = vx; ++ const block_q4_0 * __restrict__ vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); ++ ++ const block_q8_0 * __restrict__ vy0 = vy; ++ const block_q8_0 * __restrict__ vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); ++ ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ ++ for (int i = 0; i < nb; i++) { ++ const block_q4_0 * __restrict__ b_x0 = &vx0[i]; ++ const block_q4_0 * __restrict__ b_x1 = &vx1[i]; ++ const block_q8_0 * __restrict__ b_y0 = &vy0[i]; ++ const block_q8_0 * __restrict__ b_y1 = &vy1[i]; ++ ++ const uint8x16_t m4b = vdupq_n_u8(0x0F); ++ const int8x16_t s8b = vdupq_n_s8(0x8); ++ ++ const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); ++ const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); ++ ++ // 4-bit -> 8-bit ++ const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); ++ const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); ++ const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); ++ const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); ++ ++ // sub 8 ++ const int8x16_t x0_l = vsubq_s8(v0_0l, s8b); ++ const int8x16_t x0_h = vsubq_s8(v0_0h, s8b); ++ const int8x16_t x1_l = vsubq_s8(v0_1l, s8b); ++ const int8x16_t x1_h = vsubq_s8(v0_1h, s8b); ++ ++ // load y ++ const int8x16_t y0_l = vld1q_s8(b_y0->qs); ++ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); ++ const int8x16_t y1_l = vld1q_s8(b_y1->qs); ++ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); ++ ++ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; ++ ++ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ ++ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ ++ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ ++ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), 
vreinterpretq_s64_s8(y1_h)));
++
++            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
++                            l1, r1)), l2, r2)), l3, r3))), scale);
++        }
++        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
++        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
++
++        vst1_f32(s, vget_low_f32(sumv2));
++        vst1_f32(s + bs, vget_high_f32(sumv2));
++        return;
++    }
++#endif
++
++#if defined(__ARM_FEATURE_SVE)
++    float sum[4] = {0.0};
++    int64_t x0_sum, x1_sum, x0_sum_1, x1_sum_1;
++    svbool_t pre = svptrue_b8();
++    svbool_t pre32 = svptrue_b32();
++    svint32_t x0_32_0 = svdup_s32(0);
++    svint32_t x1_32_0 = svdup_s32(0);
++
++    for (int i = 0; i < nb; i += 4) {
++        __builtin_prefetch((char *)&x[i], 0, 3);
++        __builtin_prefetch((char *)&y[i], 0, 3);
++        const block_q4_0 *__restrict__ x0 = &x[i + 0];
++        const block_q4_0 *__restrict__ x1 = &x[i + 1];
++        const block_q4_0 *__restrict__ x2 = &x[i + 2];
++        const block_q4_0 *__restrict__ x3 = &x[i + 3];
++
++        const block_q8_0 *__restrict__ y0 = &y[i + 0];
++        const block_q8_0 *__restrict__ y1 = &y[i + 1];
++        const block_q8_0 *__restrict__ y2 = &y[i + 2];
++        const block_q8_0 *__restrict__ y3 = &y[i + 3];
++
++        svuint8_t x_merge = svld1(pre, x0->qs);
++        memcpy((char *)&x_merge + 16, x1->qs, 16);
++        /* low 4 bits - 8 */
++        svint8_t x_low4_0 = svsub_z(pre, svreinterpret_s8(svand_z(pre, x_merge, 0xf)), 8);
++        svint8_t x_high4_0 = svsub_z(pre, svreinterpret_s8(svlsr_z(pre, x_merge, 4)), 8);
++        /* interleaved storage */
++        svint8_t x_tmp = svext(x_high4_0, x_low4_0, 16);
++        svint8_t x_low4 = svext(x_tmp, x_high4_0, 16);
++        svint8_t x_high4 = svext(x_low4_0, x_tmp, 16);
++        /* load y */
++        svint8_t y0_merge = svld1(pre, y0->qs);
++        svint8_t y1_merge = svld1(pre, y1->qs);
++
++        svuint8_t x_merge1 = svld1(pre, x2->qs);
++        memcpy((char *)&x_merge1 + 16, x3->qs, 16);
++        /* low 4 bits - 8 */
++        svint8_t x_low4_2 = svsub_z(pre, svreinterpret_s8(svand_z(pre, x_merge1, 0xf)), 8);
++        svint8_t x_high4_2 = svsub_z(pre, svreinterpret_s8(svlsr_z(pre, x_merge1, 4)), 8);
++        x_tmp = svext(x_high4_2, x_low4_2, 16);
++        svint8_t x_low4_1 = svext(x_tmp, x_high4_2, 16);
++        svint8_t x_high4_1 = svext(x_low4_2, x_tmp, 16);
++        /* load y */
++        svint8_t y0_merge_1 = svld1(pre, y2->qs);
++        svint8_t y1_merge_1 = svld1(pre, y3->qs);
++
++        svint32_t x0_mul32 = svdot(x0_32_0, x_low4, y0_merge);
++        svint32_t x1_mul32 = svdot(x1_32_0, x_high4, y1_merge);
++        svint32_t x0_mul32_1 = svdot(x0_32_0, x_low4_1, y0_merge_1);
++        svint32_t x1_mul32_1 = svdot(x1_32_0, x_high4_1, y1_merge_1);
++
++        x0_sum = svaddv(pre32, x0_mul32);
++        x1_sum = svaddv(pre32, x1_mul32);
++        x0_sum_1 = svaddv(pre32, x0_mul32_1);
++        x1_sum_1 = svaddv(pre32, x1_mul32_1);
++
++        sum[0] += x0_sum * x0->d * y0->d;
++        sum[1] += x1_sum * x1->d * y1->d;
++        sum[2] += x0_sum_1 * x2->d * y2->d;
++        sum[3] += x1_sum_1 * x3->d * y3->d;
++    }
++    *s = sum[0] + sum[1] + sum[2] + sum[3];
++#elif defined(__ARM_NEON)
++    float32x4_t sumv0 = vdupq_n_f32(0.0f);
++    float32x4_t sumv1 = vdupq_n_f32(0.0f);
++
++    for (int i = 0; i < nb; i += 2) {
++        const block_q4_0 * __restrict__ x0 = &x[i + 0];
++        const block_q4_0 * __restrict__ x1 = &x[i + 1];
++        const block_q8_0 * __restrict__ y0 = &y[i + 0];
++        const block_q8_0 * __restrict__ y1 = &y[i + 1];
++
++        const uint8x16_t m4b = vdupq_n_u8(0x0F);
++        const int8x16_t s8b = vdupq_n_s8(0x8);
++
++        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
++        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
++
++        // 4-bit -> 8-bit
++        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
++        const int8x16_t v0_0h = 
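++        // The NEON fallback handles two blocks per iteration: ggml_vdotq_s32
++        // (vdotq_s32 when the dotprod extension is available, otherwise a
++        // widening multiply-add) reduces the int8 products into int32 lanes,
++        // and vmlaq_n_f32 applies the combined per-block scale d_x * d_y.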
vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); ++ const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); ++ const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); ++ ++ // sub 8 ++ const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); ++ const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); ++ const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); ++ const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); ++ ++ // load y ++ const int8x16_t v1_0l = vld1q_s8(y0->qs); ++ const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); ++ const int8x16_t v1_1l = vld1q_s8(y1->qs); ++ const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); ++ ++ // dot product into int32x4_t ++ const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); ++ const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); ++ ++ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ++ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ++ } ++ ++ *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); ++#elif defined(__AVX2__) ++ // Initialize accumulator with zeros ++ __m256 acc = _mm256_setzero_ps(); ++ ++ // Main loop ++ for (int i = 0; i < nb; ++i) { ++ /* Compute combined scale for the block */ ++ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); ++ ++ __m256i qx = bytes_from_nibbles_32(x[i].qs); ++ ++ // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. ++ const __m256i off = _mm256_set1_epi8( 8 ); ++ qx = _mm256_sub_epi8( qx, off ); ++ ++ __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs); ++ ++ const __m256 q = mul_sum_i8_pairs_float(qx, qy); ++ ++ /* Multiply q with scale and accumulate */ ++ acc = _mm256_fmadd_ps( d, q, acc ); ++ } ++ ++ *s = hsum_float_8(acc); ++#elif defined(__AVX__) ++ // Initialize accumulator with zeros ++ __m256 acc = _mm256_setzero_ps(); ++ ++ // Main loop ++ for (int i = 0; i < nb; ++i) { ++ // Compute combined scale for the block ++ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); ++ ++ const __m128i lowMask = _mm_set1_epi8(0xF); ++ const __m128i off = _mm_set1_epi8(8); ++ ++ const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); ++ ++ __m128i bx_0 = _mm_and_si128(lowMask, tmp); ++ __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); ++ bx_0 = _mm_sub_epi8(bx_0, off); ++ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); ++ ++ bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); ++ by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); ++ bx_0 = _mm_sub_epi8(bx_0, off); ++ const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0); ++ ++ // Convert int32_t to float ++ __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); ++ ++ // Apply the scale, and accumulate ++ acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); ++ } ++ ++ *s = hsum_float_8(acc); ++#elif defined(__SSSE3__) ++ // set constants ++ const __m128i lowMask = _mm_set1_epi8(0xF); ++ const __m128i off = _mm_set1_epi8(8); ++ ++ // Initialize accumulator with zeros ++ __m128 acc_0 = _mm_setzero_ps(); ++ __m128 acc_1 = _mm_setzero_ps(); ++ __m128 acc_2 = _mm_setzero_ps(); ++ __m128 acc_3 = _mm_setzero_ps(); ++ ++ // First round without accumulation ++ { ++ _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); ++ _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); ++ ++ // Compute combined scale for the block 0 and 1 ++ const __m128 d_0_1 = _mm_set1_ps( 
GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); ++ ++ const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); ++ ++ __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); ++ __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); ++ bx_0 = _mm_sub_epi8(bx_0, off); ++ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); ++ ++ __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); ++ __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); ++ bx_1 = _mm_sub_epi8(bx_1, off); ++ const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); ++ ++ _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); ++ _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); ++ ++ // Compute combined scale for the block 2 and 3 ++ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); ++ ++ const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); ++ ++ __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); ++ __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); ++ bx_2 = _mm_sub_epi8(bx_2, off); ++ const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); ++ ++ __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); ++ __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); ++ bx_3 = _mm_sub_epi8(bx_3, off); ++ const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); ++ ++ // Convert int32_t to float ++ __m128 p0 = _mm_cvtepi32_ps(i32_0); ++ __m128 p1 = _mm_cvtepi32_ps(i32_1); ++ __m128 p2 = _mm_cvtepi32_ps(i32_2); ++ __m128 p3 = _mm_cvtepi32_ps(i32_3); ++ ++ // Apply the scale ++ acc_0 = _mm_mul_ps( d_0_1, p0 ); ++ acc_1 = _mm_mul_ps( d_0_1, p1 ); ++ acc_2 = _mm_mul_ps( d_2_3, p2 ); ++ acc_3 = _mm_mul_ps( d_2_3, p3 ); ++ } ++ ++ assert(nb % 2 == 0); // TODO: handle odd nb ++ ++ // Main loop ++ for (int i = 2; i < nb; i+=2) { ++ _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); ++ _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); ++ ++ // Compute combined scale for the block 0 and 1 ++ const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); ++ ++ const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); ++ ++ __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); ++ __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); ++ bx_0 = _mm_sub_epi8(bx_0, off); ++ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); ++ ++ __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); ++ __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); ++ bx_1 = _mm_sub_epi8(bx_1, off); ++ const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); ++ ++ _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); ++ _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); ++ ++ // Compute combined scale for the block 2 and 3 ++ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) ); ++ ++ const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); ++ ++ __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); ++ __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); ++ bx_2 = _mm_sub_epi8(bx_2, off); ++ const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); ++ ++ __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); ++ __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); ++ bx_3 = _mm_sub_epi8(bx_3, off); ++ const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); ++ ++ // Convert int32_t to float ++ __m128 p0 = _mm_cvtepi32_ps(i32_0); ++ __m128 p1 = _mm_cvtepi32_ps(i32_1); ++ __m128 p2 = _mm_cvtepi32_ps(i32_2); ++ __m128 p3 = 
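++        // The partial sums for the first two blocks are converted to float and
++        // scaled with _mm_mul_ps rather than added into the accumulators, which
++        // initializes acc_0..acc_3 and lets the main loop below start at i = 2.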
_mm_cvtepi32_ps(i32_3); ++ ++ // Apply the scale ++ __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); ++ __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); ++ __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); ++ __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); ++ ++ // Acummulate ++ acc_0 = _mm_add_ps(p0_d, acc_0); ++ acc_1 = _mm_add_ps(p1_d, acc_1); ++ acc_2 = _mm_add_ps(p2_d, acc_2); ++ acc_3 = _mm_add_ps(p3_d, acc_3); ++ } ++ ++ *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); ++#elif defined(__riscv_v_intrinsic) ++ float sumf = 0.0; ++ ++ size_t vl = __riscv_vsetvl_e8m1(qk/2); ++ ++ for (int i = 0; i < nb; i++) { ++ // load elements ++ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); ++ ++ vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); ++ vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); ++ ++ // mask and store lower part of x, and then upper part ++ vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); ++ vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); ++ ++ vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); ++ vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); ++ ++ // subtract offset ++ vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl); ++ vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl); ++ ++ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); ++ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); ++ ++ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); ++ ++ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); ++ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); ++ ++ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); ++ ++ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); ++ } ++ ++ *s = sumf; ++#else ++ // scalar ++ float sumf = 0.0; ++ ++ for (int i = 0; i < nb; i++) { ++ int sumi = 0; ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const int v0 = (x[i].qs[j] & 0x0F) - 8; ++ const int v1 = (x[i].qs[j] >> 4) - 8; ++ ++ sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); ++ } ++ ++ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); ++ } ++ ++ *s = sumf; ++#endif ++} ++ ++void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ const int qk = QK8_1; ++ const int nb = n / qk; ++ ++ const block_q4_1 * __restrict__ x = vx; ++ const block_q8_1 * __restrict__ y = vy; ++ ++#if defined(__ARM_FEATURE_MATMUL_INT8_0) ++ if (nrc == 2) { ++ const block_q4_1 * __restrict__ vx0 = vx; ++ const block_q4_1 * __restrict__ vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); ++ const block_q8_1 * __restrict__ vy0 = vy; ++ const block_q8_1 * __restrict__ vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); ++ ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ float32x4_t summs0 = vdupq_n_f32(0.0f); ++ ++ for (int i = 0; i < nb; i++) { ++ const block_q4_1 * __restrict__ b_x0 = &vx0[i]; ++ const block_q4_1 * __restrict__ b_x1 = &vx1[i]; ++ const block_q8_1 * __restrict__ b_y0 = &vy0[i]; ++ const block_q8_1 * __restrict__ b_y1 = &vy1[i]; ++ ++ float32_t summs_t[4] = { ++ GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), ++ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), ++ GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), ++ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s) ++ }; ++ summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); ++ ++ const uint8x16_t m4b = vdupq_n_u8(0x0F); ++ ++ const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); ++ const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); ++ ++ // 4-bit -> 8-bit ++ const 
int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); ++ const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); ++ const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); ++ const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); ++ ++ // load y ++ const int8x16_t y0_l = vld1q_s8(b_y0->qs); ++ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); ++ const int8x16_t y1_l = vld1q_s8(b_y1->qs); ++ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); ++ ++ // mmla into int32x4_t ++ float32_t _scale[4] = { ++ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) ++ }; ++ float32x4_t scale = vld1q_f32(_scale); ++ ++ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ ++ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ ++ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ ++ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), ++ l1, r1)), l2, r2)), l3, r3))), scale); ++ } ++ ++ float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); ++ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); ++ ++ sumv2 = vaddq_f32(sumv2, summs0); ++ ++ vst1_f32(s, vget_low_f32 (sumv2)); ++ vst1_f32(s + bs, vget_high_f32(sumv2)); ++ ++ return; ++ } ++#endif ++ ++ int ib = 0; ++ float sumf = 0; ++ ++ // TODO: add WASM SIMD ++#if defined(__ARM_NEON) ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ float32x4_t sumv1 = vdupq_n_f32(0.0f); ++ ++ float summs = 0; ++ ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q4_1 * __restrict__ x0 = &x[ib + 0]; ++ const block_q4_1 * __restrict__ x1 = &x[ib + 1]; ++ const block_q8_1 * __restrict__ y0 = &y[ib + 0]; ++ const block_q8_1 * __restrict__ y1 = &y[ib + 1]; ++ ++ summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s); ++ ++ const uint8x16_t m4b = vdupq_n_u8(0x0F); ++ ++ const uint8x16_t v0_0 = vld1q_u8(x0->qs); ++ const uint8x16_t v0_1 = vld1q_u8(x1->qs); ++ ++ // 4-bit -> 8-bit ++ const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); ++ const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); ++ const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); ++ const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); ++ ++ // load y ++ const int8x16_t v1_0l = vld1q_s8(y0->qs); ++ const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); ++ const int8x16_t v1_1l = vld1q_s8(y1->qs); ++ const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); ++ ++ // dot product into int32x4_t ++ const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); ++ const int32x4_t p_1 
= ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); ++ ++ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d)); ++ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d)); ++ } ++ ++ sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; ++#elif defined(__AVX2__) || defined(__AVX__) ++ // Initialize accumulator with zeros ++ __m256 acc = _mm256_setzero_ps(); ++ ++ float summs = 0; ++ ++ // Main loop ++ for (; ib < nb; ++ib) { ++ const float d0 = GGML_FP16_TO_FP32(x[ib].d); ++ const float d1 = GGML_FP16_TO_FP32(y[ib].d); ++ ++ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); ++ ++ const __m256 d0v = _mm256_set1_ps( d0 ); ++ const __m256 d1v = _mm256_set1_ps( d1 ); ++ ++ // Compute combined scales ++ const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); ++ ++ // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes ++ const __m256i qx = bytes_from_nibbles_32(x[ib].qs); ++ const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs ); ++ ++ const __m256 xy = mul_sum_us8_pairs_float(qx, qy); ++ ++ // Accumulate d0*d1*x*y ++#if defined(__AVX2__) ++ acc = _mm256_fmadd_ps( d0d1, xy, acc ); ++#else ++ acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); ++#endif ++ } ++ ++ sumf = hsum_float_8(acc) + summs; ++#elif defined(__riscv_v_intrinsic) ++ size_t vl = __riscv_vsetvl_e8m1(qk/2); ++ ++ for (; ib < nb; ++ib) { ++ // load elements ++ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl); ++ ++ vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl); ++ vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl); ++ ++ // mask and store lower part of x, and then upper part ++ vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); ++ vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); ++ ++ vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); ++ vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); ++ ++ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); ++ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); ++ ++ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); ++ ++ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); ++ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); ++ ++ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); ++ ++ sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); ++ } ++ ++#elif defined(__POWER9_VECTOR__) ++ const vector signed char lowMask = vec_splats((signed char)0xF); ++ const vector signed int v0 = vec_splats((int32_t)0); ++ const vector unsigned char v4 = vec_splats((unsigned char)0x4); ++ ++ vector float vsumf0 = vec_splats(0.0f); ++ ++#pragma GCC unroll 4 ++ for (; ib < nb; ++ib) { ++ __builtin_prefetch(x[ib].qs, 0, 1); ++ __builtin_prefetch(y[ib].qs, 0, 1); ++ ++ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); ++ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); ++ vector float vd = vec_mul(vxd, vyd); ++ ++ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m)); ++ vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; ++ vsumf0 = vec_madd(vxmin, vys, vsumf0); ++ ++ vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); ++ vector signed char q8y0 = vec_xl( 0, y[ib].qs); ++ vector signed char q8y1 = vec_xl(16, y[ib].qs); ++ ++ vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, 
lowMask); ++ vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4); ++ ++ vector signed int vsumi0 = v0; ++ ++ vsumi0 = vec_msum(q8y0, q4x0, vsumi0); ++ vsumi0 = vec_msum(q8y1, q4x1, vsumi0); ++ ++ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); ++ } ++ ++ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); ++ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); ++ ++ sumf = vec_extract(vsumf0, 0); ++ ++#elif defined(__loongarch_asx) ++ // Initialize accumulator with zeros ++ __m256 acc = (__m256)__lasx_xvldi(0); ++ ++ float summs = 0; ++ ++ // Main loop ++ for (; ib < nb; ++ib) { ++ const float d0 = GGML_FP16_TO_FP32(x[ib].d); ++ const float d1 = GGML_FP16_TO_FP32(y[ib].d); ++ ++ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); ++ ++ const __m256 d0v = __lasx_xvreplfr2vr_s( d0 ); ++ const __m256 d1v = __lasx_xvreplfr2vr_s( d1 ); ++ ++ // Compute combined scales ++ const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v ); ++ ++ // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes ++ const __m256i qx = bytes_from_nibbles_32(x[ib].qs); ++ const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0); ++ ++ const __m256 xy = mul_sum_us8_pairs_float(qx, qy); ++ ++ // Accumulate d0*d1*x*y ++ acc = __lasx_xvfmadd_s( d0d1, xy, acc ); ++ } ++ ++ sumf = hsum_float_8(acc) + summs; ++#endif ++ for (; ib < nb; ++ib) { ++ int sumi0 = 0; ++ int sumi1 = 0; ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const int v0 = (x[ib].qs[j] & 0x0F); ++ const int v1 = (x[ib].qs[j] >> 4); ++ ++ sumi0 += (v0 * y[ib].qs[j]); ++ sumi1 += (v1 * y[ib].qs[j + qk/2]); ++ } ++ ++ int sumi = sumi0 + sumi1; ++ sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s); ++ } ++ ++ *s = sumf; ++} ++ ++void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ (void)(nrc); ++ (void)(bx); ++ (void)(by); ++ (void)(bs); ++ ++ const uint32_t kmask1 = 0x03030303; ++ const uint32_t kmask2 = 0x0f0f0f0f; ++ ++ const block_q3_K *__restrict__ x = vx; ++ const block_q8_K *__restrict__ y = vy; ++ const int nb = n / QK_K; ++ ++#ifdef __ARM_NEON ++ uint32_t aux[3]; ++ uint32_t utmp[4]; ++ ++ const uint8x16_t m3b = vdupq_n_u8(0x3); ++ const int32x4_t vzero = vdupq_n_s32(0); ++ ++ const uint8x16_t m0 = vdupq_n_u8(1); ++ const uint8x16_t m1 = vshlq_n_u8(m0, 1); ++ const uint8x16_t m2 = vshlq_n_u8(m0, 2); ++ const uint8x16_t m3 = vshlq_n_u8(m0, 3); ++ const int8_t m32 = 32; ++ ++ ggml_int8x16x4_t q3bytes; ++ ++ float sum = 0; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const uint8_t * __restrict__ qh = x[i].hmask; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); ++ ++ ggml_uint8x16x4_t q3h; ++ ++ int32_t isum = 0; ++ ++ // Set up scales ++ memcpy(aux, x[i].scales, 12); ++ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); ++ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); ++ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); ++ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); ++ ++ int8_t * scale = (int8_t *)utmp; ++ for (int j = 0; j < 16; ++j) scale[j] -= m32; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ const ggml_uint8x16x2_t q3bits = 
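++            // Q3_K super-blocks hold 256 quants: the low 2 bits live in qs, the
++            // third (high) bit in hmask, and 16 6-bit scales are packed into the
++            // 12-byte scales field (unpacked above via the kmask1/kmask2 shifts,
++            // then re-centered by subtracting 32). When the high bit is clear,
++            // 4 is subtracted from the 2-bit value, so the quants end up in [-4, 3].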
ggml_vld1q_u8_x2(q3); q3 += 32; ++ const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64; ++ const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64; ++ ++ q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); ++ q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); ++ q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1); ++ q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1); ++ ++ q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0])); ++ q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1])); ++ q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); ++ q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); ++ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; ++ ++ scale += 4; ++ ++ q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); ++ q3h.val[1] = vbicq_u8(m2, qhbits.val[1]); ++ q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1); ++ q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1); ++ ++ q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0])); ++ q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1])); ++ q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); ++ q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); ++ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; ++ ++ scale += 4; ++ ++ if (j == 0) { ++ qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4); ++ qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4); ++ } ++ ++ } ++ sum += d * isum; ++ ++ } ++ ++ *s = sum; ++ ++#elif defined __AVX2__ ++ ++ const __m256i m3 = _mm256_set1_epi8(3); ++ const __m256i mone = _mm256_set1_epi8(1); ++ const __m128i m32 = _mm_set1_epi8(32); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ uint32_t aux[3]; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ // Set up scales ++ memcpy(aux, x[i].scales, 12); ++ __m128i scales128 = _mm_set_epi32( ++ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ++ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), ++ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), ++ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); ++ scales128 = _mm_sub_epi8(scales128, m32); ++ const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); ++ const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); ++ const __m128i h_scales = 
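++        // The 16 re-centered scales are widened to 16-bit and duplicated into
++        // both 128-bit halves so that _mm256_shuffle_epi8 with
++        // get_scale_shuffle_q3k() can broadcast the right scale to each group
++        // of q3/q8 products in the inner loop.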
_mm256_extracti128_si256(all_scales, 1); ++ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; ++ ++ // high bit ++ const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask); ++ ++ // integer accumulator ++ __m256i sumi = _mm256_setzero_si256(); ++ ++ int bit = 0; ++ int is = 0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ // load low 2 bits ++ const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32; ++ ++ // prepare low and high bits ++ const __m256i q3l_0 = _mm256_and_si256(q3bits, m3); ++ const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++ ++bit; ++ ++ const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3); ++ const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++ ++bit; ++ ++ const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3); ++ const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++ ++bit; ++ ++ const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3); ++ const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++ ++bit; ++ ++ // load Q8 quants ++ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ ++ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, ++ // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, ++ // and 2 if the high bit was set) ++ __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); ++ __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); ++ __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2); ++ __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3); ++ ++ __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); ++ __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); ++ __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2); ++ __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3); ++ ++ p16_0 = _mm256_sub_epi16(p16_0, q8s_0); ++ p16_1 = _mm256_sub_epi16(p16_1, q8s_1); ++ p16_2 = _mm256_sub_epi16(p16_2, q8s_2); ++ p16_3 = _mm256_sub_epi16(p16_3, q8s_3); ++ ++ // multiply with scales ++ p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); ++ p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); ++ p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); ++ p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); ++ ++ // accumulate ++ p16_0 = _mm256_add_epi32(p16_0, p16_1); ++ p16_2 = _mm256_add_epi32(p16_2, p16_3); ++ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2)); ++ ++ } ++ ++ // multiply with block scale and accumulate ++ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); ++ ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __AVX__ ++ ++ const __m128i m3 = _mm_set1_epi8(3); ++ const __m128i mone = _mm_set1_epi8(1); ++ const __m128i m32 = _mm_set1_epi8(32); ++ const __m128i m2 = _mm_set1_epi8(2); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ const uint32_t *aux; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ // Set up scales ++ aux = (const uint32_t *)x[i].scales; ++ __m128i scales128 = _mm_set_epi32( ++ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ++ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), ++ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), ++ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); ++ scales128 = _mm_sub_epi8(scales128, m32); ++ const __m128i scales_0 = _mm_cvtepi8_epi16(scales128); ++ const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128)); ++ const __m128i scales[2] = { scales_0, scales_1 }; ++ ++ // high bit *128*2 from block_q3_K.hmask[QK_K/8] ++ const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]); ++ const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]); ++ ++ // integer accumulator ++ __m128i sumi_0 = _mm_setzero_si128(); ++ __m128i sumi_1 = _mm_setzero_si128(); ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4] ++ const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; ++ const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; ++ ++ // prepare low and high bits ++ const int bit = j << 2; ++ ++ const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); ++ const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); ++ const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2); ++ const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2); ++ ++ const 
__m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3); ++ const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3); ++ const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2); ++ const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2); ++ ++ const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3); ++ const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3); ++ const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2); ++ const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2); ++ ++ const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3); ++ const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3); ++ const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2); ++ const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2); ++ ++ // load Q8 quants from block_q8_K.qs[QK_K] ++ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ ++ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, ++ // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, ++ // and 2 if the high bit was set) ++ __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0); ++ __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1); ++ __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2); ++ __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3); ++ __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4); ++ __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5); ++ __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6); ++ __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7); ++ ++ __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0); ++ __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1); ++ __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2); ++ __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3); ++ __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4); ++ __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5); ++ __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6); ++ __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7); ++ ++ p16_0 = _mm_sub_epi16(p16_0, q8s_0); ++ p16_1 = _mm_sub_epi16(p16_1, q8s_1); ++ p16_2 = _mm_sub_epi16(p16_2, q8s_2); ++ p16_3 = _mm_sub_epi16(p16_3, q8s_3); ++ p16_4 = _mm_sub_epi16(p16_4, q8s_4); ++ p16_5 = _mm_sub_epi16(p16_5, q8s_5); ++ p16_6 = _mm_sub_epi16(p16_6, q8s_6); ++ p16_7 = _mm_sub_epi16(p16_7, q8s_7); ++ ++ // multiply with scales ++ __m128i shuffle = _mm_set1_epi16(0x0100); ++ p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7); ++ ++ // accumulate ++ p16_0 = _mm_add_epi32(p16_0, p16_1); ++ p16_2 = _mm_add_epi32(p16_2, p16_3); ++ p16_4 = _mm_add_epi32(p16_4, p16_5); ++ p16_6 = _mm_add_epi32(p16_6, p16_7); ++ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); ++ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6)); ++ ++ } ++ ++ // multiply with block scale and accumulate ++ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); ++ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); ++ ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __riscv_v_intrinsic ++ ++ uint32_t aux[3]; ++ uint32_t utmp[4]; ++ ++ float sumf = 0; ++ for (int i = 0; i < nb; ++i) { ++ ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const uint8_t * __restrict__ qh = x[i].hmask; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ memcpy(aux, x[i].scales, 12); ++ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); ++ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); ++ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); ++ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); ++ ++ int8_t * scale = (int8_t *)utmp; ++ for (int j = 0; j < 16; ++j) scale[j] -= 32; ++ ++ ++ size_t vl = 32; ++ uint8_t m = 1; ++ ++ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); ++ 
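++        // The RVV path loads all 32 hmask bytes once and walks a moving bit m
++        // through them; __riscv_vsub_vx_i8m1_m subtracts 4 only in the lanes
++        // whose high bit is clear (vmseq against 0), mirroring the vbicq trick
++        // used by the NEON path above.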
vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl); ++ ++ int sum_t = 0; ++ ++ for (int j = 0; j < QK_K; j += 128) { ++ ++ vl = 32; ++ ++ // load Q3 ++ vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl); ++ ++ vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl)); ++ vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl)); ++ vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl)); ++ vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl)); ++ ++ // compute mask for subtraction ++ vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); ++ vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); ++ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl); ++ m <<= 1; ++ ++ vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); ++ vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); ++ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl); ++ m <<= 1; ++ ++ vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); ++ vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); ++ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl); ++ m <<= 1; ++ ++ vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); ++ vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); ++ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl); ++ m <<= 1; ++ ++ // load Q8 and take product with Q3 ++ vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl); ++ vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl); ++ vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl); ++ vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl); ++ ++ vl = 16; ++ ++ // retrieve lane to multiply with scale ++ vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl); ++ vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl); ++ vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl); ++ vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl); ++ vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl); ++ vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl); ++ vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl); ++ vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl); ++ ++ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl); ++ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl); ++ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl); ++ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl); ++ ++ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); ++ ++ q3 += 32; q8 += 128; scale += 8; ++ ++ } ++ ++ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; ++ ++ sumf += d*sum_t; ++ ++ } ++ ++ *s = sumf; ++ ++#else ++ // scalar version ++ // This function is written like this so the compiler can manage to vectorize most of it ++ // Using -Ofast, GCC and clang manage to produce code that 
is within a factor of 2 or so from the ++ // manually vectorized version above. Every other version I tried would run at least 4 times slower. ++ // The ideal situation would be if we could just write the code once, and the compiler would ++ // automatically produce the best possible set of machine instructions, instead of us having to manually ++ // write vectorized versions for AVX, ARM_NEON, etc. ++ ++ int8_t aux8[QK_K]; ++ int16_t aux16[8]; ++ float sums [8]; ++ int32_t aux32[8]; ++ memset(sums, 0, 8*sizeof(float)); ++ ++ uint32_t auxs[4]; ++ const int8_t * scales = (const int8_t*)auxs; ++ ++ float sumf = 0; ++ for (int i = 0; i < nb; ++i) { ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const uint8_t * __restrict__ hm = x[i].hmask; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ memset(aux32, 0, 8*sizeof(int32_t)); ++ int8_t * __restrict__ a = aux8; ++ uint8_t m = 1; ++ for (int j = 0; j < QK_K; j += 128) { ++ for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; ++ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); ++ a += 32; m <<= 1; ++ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; ++ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); ++ a += 32; m <<= 1; ++ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; ++ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); ++ a += 32; m <<= 1; ++ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; ++ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); ++ a += 32; m <<= 1; ++ q3 += 32; ++ } ++ a = aux8; ++ ++ memcpy(auxs, x[i].scales, 12); ++ uint32_t tmp = auxs[2]; ++ auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); ++ auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); ++ auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); ++ auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); ++ for (int j = 0; j < QK_K/16; ++j) { ++ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; ++ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; ++ q8 += 8; a += 8; ++ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; ++ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; ++ q8 += 8; a += 8; ++ } ++ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; ++ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; ++ } ++ for (int l = 0; l < 8; ++l) sumf += sums[l]; ++ *s = sumf; ++ ++#endif ++ ++} ++ ++void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ (void)(nrc); ++ (void)(bx); ++ (void)(by); ++ (void)(bs); ++ ++ const block_q6_K * __restrict__ x = vx; ++ const block_q8_K * __restrict__ y = vy; ++ ++ const int nb = n / QK_K; ++ ++#ifdef __ARM_NEON ++ float sum = 0; ++ ++ const uint8x16_t m4b = vdupq_n_u8(0xF); ++ const int32x4_t vzero = vdupq_n_s32(0); ++ //const int8x16_t m32s = vdupq_n_s8(32); ++ ++ const uint8x16_t mone = vdupq_n_u8(3); ++ ++ ggml_int8x16x4_t q6bytes; ++ ggml_uint8x16x4_t q6h; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d_all = GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q6 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const int8_t * __restrict__ scale = x[i].scales; ++ ++ const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); ++ const int8x16_t scales = vld1q_s8(scale); ++ const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}}; ++ ++ const int32x4_t prod = 
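++        // Q6_K stores 6-bit quants with an implicit -32 offset. Rather than
++        // subtracting 32 from every byte (the commented-out m32s variant below),
++        // this path dots the per-16 bsums of the q8_K block with the scales and
++        // subtracts 32 * isum_mins once per super-block at the end.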
vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), ++ vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))), ++ vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])), ++ vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1])))); ++ int32_t isum_mins = vaddvq_s32(prod); ++ ++ int32_t isum = 0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32; ++ ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64; ++ ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; ++ ++ q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); ++ q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); ++ uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2); ++ q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ shifted = vshrq_n_u8(qhbits.val[1], 2); ++ q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ ++ //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s); ++ //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s); ++ //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s); ++ //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s); ++ q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])); ++ q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])); ++ q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])); ++ q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])); ++ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; ++ ++ scale += 4; ++ ++ q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; ++ ++ shifted = vshrq_n_u8(qhbits.val[0], 4); ++ q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ shifted = vshrq_n_u8(qhbits.val[1], 4); ++ q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ shifted = vshrq_n_u8(qhbits.val[0], 6); ++ q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ shifted = vshrq_n_u8(qhbits.val[1], 6); ++ q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ ++ //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s); ++ //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s); ++ //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s); ++ //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s); ++ q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])); ++ q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])); ++ q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])); ++ q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])); ++ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], 
q8bytes.val[1])) * scale[1] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; ++ scale += 4; ++ } ++ //sum += isum * d_all * y[i].d; ++ sum += d_all * y[i].d * (isum - 32 * isum_mins); ++ ++ } ++ *s = sum; ++ ++#elif defined __AVX2__ ++ ++ const __m256i m4 = _mm256_set1_epi8(0xF); ++ const __m256i m2 = _mm256_set1_epi8(3); ++ const __m256i m32s = _mm256_set1_epi8(32); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q4 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); ++ ++ __m256i sumi = _mm256_setzero_si256(); ++ ++ int is = 0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); ++ const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); ++ const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); ++ const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); ++ is += 4; ++ ++ const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; ++ const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; ++ const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32; ++ ++ const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4); ++ const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4); ++ const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4); ++ const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4); ++ ++ const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); ++ const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1); ++ const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2); ++ const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3); ++ ++ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ ++ __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); ++ __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); ++ __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2); ++ __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3); ++ ++ __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); ++ __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); ++ __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2); ++ __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3); ++ ++ p16_0 = _mm256_sub_epi16(p16_0, q8s_0); ++ p16_1 = _mm256_sub_epi16(p16_1, q8s_1); ++ p16_2 = _mm256_sub_epi16(p16_2, q8s_2); ++ p16_3 = _mm256_sub_epi16(p16_3, q8s_3); ++ ++ p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); ++ p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); ++ p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2); ++ p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3); ++ ++ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); ++ sumi = _mm256_add_epi32(sumi, 
_mm256_add_epi32(p16_2, p16_3)); ++ ++ } ++ ++ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __AVX__ ++ ++ const __m128i m4 = _mm_set1_epi8(0xF); ++ const __m128i m3 = _mm_set1_epi8(3); ++ const __m128i m32s = _mm_set1_epi8(32); ++ const __m128i m2 = _mm_set1_epi8(2); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q4 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); ++ ++ __m128i sumi_0 = _mm_setzero_si128(); ++ __m128i sumi_1 = _mm_setzero_si128(); ++ ++ __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; ++ const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16; ++ ++ const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); ++ const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); ++ const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4); ++ const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4); ++ const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4); ++ const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4); ++ const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4); ++ const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4); ++ ++ const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; ++ const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; ++ const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; ++ const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; ++ ++ const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0); ++ const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1); ++ const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2); ++ const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3); ++ const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4); ++ const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5); ++ const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6); ++ const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7); ++ ++ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ ++ __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0); ++ __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1); ++ __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2); ++ __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3); ++ __m128i q8s_4 = 
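++            // _mm_maddubs_epi16 treats its first operand as unsigned, so the
++            // reconstructed 6-bit values (0..63) are multiplied as-is and the
++            // constant 32 * q8 term (q8s_*) is computed separately and subtracted,
++            // which is equivalent to using signed quants in [-32, 31].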
_mm_maddubs_epi16(m32s, q8_4); ++ __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5); ++ __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6); ++ __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7); ++ ++ __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); ++ __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); ++ __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); ++ __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3); ++ __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4); ++ __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5); ++ __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); ++ __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); ++ ++ p16_0 = _mm_sub_epi16(p16_0, q8s_0); ++ p16_1 = _mm_sub_epi16(p16_1, q8s_1); ++ p16_2 = _mm_sub_epi16(p16_2, q8s_2); ++ p16_3 = _mm_sub_epi16(p16_3, q8s_3); ++ p16_4 = _mm_sub_epi16(p16_4, q8s_4); ++ p16_5 = _mm_sub_epi16(p16_5, q8s_5); ++ p16_6 = _mm_sub_epi16(p16_6, q8s_6); ++ p16_7 = _mm_sub_epi16(p16_7, q8s_7); ++ ++ const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); ++ shuffle = _mm_add_epi8(shuffle, m2); ++ const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); ++ shuffle = _mm_add_epi8(shuffle, m2); ++ const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle); ++ shuffle = _mm_add_epi8(shuffle, m2); ++ const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle); ++ shuffle = _mm_add_epi8(shuffle, m2); ++ ++ p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); ++ p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1); ++ p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); ++ p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3); ++ p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); ++ p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5); ++ p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); ++ p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7); ++ ++ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); ++ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); ++ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6)); ++ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7)); ++ ++ } ++ ++ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); ++ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __riscv_v_intrinsic ++ ++ float sumf = 0; ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; ++ ++ const uint8_t * __restrict__ q6 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const int8_t * __restrict__ scale = x[i].scales; ++ ++ size_t vl; ++ ++ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); ++ ++ int sum_t = 0; ++ int is = 0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ vl = 32; ++ ++ // load qh ++ vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl); ++ ++ // load Q6 ++ vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl); ++ vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl); ++ ++ vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl); ++ vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl); ++ vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl); ++ vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl); ++ ++ vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl); ++ vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl); ++ vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, 
vl), 0x03 , vl); ++ vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl); ++ ++ vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl); ++ vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl); ++ vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl); ++ vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl); ++ ++ vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl); ++ vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl); ++ vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl); ++ vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl); ++ ++ // load Q8 and take product ++ vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl); ++ vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl); ++ vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl); ++ vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl); ++ ++ vl = 16; ++ ++ vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl); ++ vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl); ++ vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl); ++ vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl); ++ vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl); ++ vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl); ++ vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl); ++ vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl); ++ ++ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl); ++ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl); ++ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl); ++ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl); ++ ++ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); ++ ++ q6 += 64; qh += 32; q8 += 128; is=8; ++ ++ } ++ ++ sumf += d * sum_t; ++ ++ } ++ ++ *s = sumf; ++ ++#else ++ ++ int8_t aux8[QK_K]; ++ int16_t aux16[8]; ++ float sums [8]; ++ int32_t aux32[8]; ++ memset(sums, 0, 8*sizeof(float)); ++ ++ float sumf = 0; ++ for (int i = 0; i < nb; ++i) { ++ const uint8_t * __restrict__ q4 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ memset(aux32, 0, 8*sizeof(int32_t)); ++ int8_t * __restrict__ a = aux8; ++ for (int j = 0; j < QK_K; j += 128) { ++ for (int l = 0; l < 32; ++l) { ++ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; ++ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; ++ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; ++ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; ++ } ++ a += 128; ++ q4 += 64; ++ qh += 32; ++ } ++ a = aux8; ++ int is = 0; ++ for (int j = 0; j < QK_K/16; ++j) { ++ 
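++            // Scalar q6_K fallback: a[] holds the 6-bit quants already widened
++            // to int8 (the +32 bias removed). Each group of 16 values shares one
++            // signed 8-bit scale; the per-group products with q8 are accumulated
++            // in aux32 and multiplied by the super-block scale d = x[i].d * y[i].d
++            // at the end of the block.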
int scale = x[i].scales[is++]; ++ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; ++ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; ++ q8 += 8; a += 8; ++ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; ++ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; ++ q8 += 8; a += 8; ++ } ++ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; ++ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; ++ } ++ for (int l = 0; l < 8; ++l) sumf += sums[l]; ++ *s = sumf; ++#endif ++} ++ ++void ggml_vec_dot_q8_0_q8_0(int n, float *__restrict__ s, size_t bs, const void *__restrict__ vx, size_t bx, const void *__restrict__ vy, size_t by, int nrc) ++{ ++ const int qk = QK8_0; ++ const int nb = n / qk; ++ ++#if defined(__ARM_FEATURE_MATMUL_INT8) ++ assert((nrc == 2) || (nrc == 1) || (nrc == 16)); ++#else ++ assert(nrc == 1); ++#endif ++ ++ const block_q8_0 *__restrict__ x = vx; ++ const block_q8_0 *__restrict__ y = vy; ++ ++#if defined(__ARM_FEATURE_MATMUL_INT8) ++ if (nrc == 2) { ++ const block_q8_0 * __restrict__ vx0 = vx; ++ const block_q8_0 * __restrict__ vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); ++ const block_q8_0 * __restrict__ vy0 = vy; ++ const block_q8_0 * __restrict__ vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); ++ ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ ++ for (int i = 0; i < nb; i++) { ++ const block_q8_0 * __restrict__ b_x0 = &vx0[i]; ++ const block_q8_0 * __restrict__ b_y0 = &vy0[i]; ++ ++ const block_q8_0 * __restrict__ b_x1 = &vx1[i]; ++ const block_q8_0 * __restrict__ b_y1 = &vy1[i]; ++ ++ const int8x16_t x0_l = vld1q_s8(b_x0->qs); ++ const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); ++ const int8x16_t x1_l = vld1q_s8(b_x1->qs); ++ const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16); ++ ++ // load y ++ const int8x16_t y0_l = vld1q_s8(b_y0->qs); ++ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); ++ const int8x16_t y1_l = vld1q_s8(b_y1->qs); ++ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); ++ ++ float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; ++ float32x4_t scale = vld1q_f32(_scale); ++ ++ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ ++ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ ++ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ ++ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ ++ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), ++ l1, r1)), l2, r2)), l3, r3))), scale); ++ } ++ float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); ++ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); ++ ++ vst1_f32(s, vget_low_f32(sumv2)); ++ vst1_f32(s + bs, vget_high_f32(sumv2)); ++ return; ++ } ++#endif ++ ++ int ib = 0; ++ float sumf = 0; 
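++    // Main q8_0 x q8_0 path: the SVE branch below is vector-length agnostic
++    // (128/256/512-bit), the NEON branch uses ggml_vdotq_s32, and the plain C
++    // loop is the portable fallback. All of them compute, per 32-element block,
++    //   sumf += d_x * d_y * sum_j(x.qs[j] * y.qs[j]).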
++ ++#if defined(__ARM_FEATURE_SVE) ++ svfloat32_t sumv0 = svdup_n_f32(0.0f); ++ svfloat32_t sumv1 = svdup_n_f32(0.0f); ++ ++ const int vector_length = ggml_cpu_get_sve_cnt() * 8; ++ ++ //VLA Implemenation for SVE ++ switch (vector_length) { ++ case 128: ++ { ++ // predicate for activating lanes for 16 Int8 elements ++ const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); ++ const svbool_t pl16 = svptrue_pat_b32(SV_VL4); ++ ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q8_0 *__restrict__ x0 = &x[ib + 0]; ++ const block_q8_0 *__restrict__ x1 = &x[ib + 1]; ++ const block_q8_0 *__restrict__ y0 = &y[ib + 0]; ++ const block_q8_0 *__restrict__ y1 = &y[ib + 1]; ++ ++ // load x ++ const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); ++ const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16); ++ const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); ++ const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); ++ ++ // load y ++ const svint8_t qy0_0 = svld1_s8(ph16, y0->qs); ++ const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); ++ const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); ++ const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); ++ ++ sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, ++ svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), ++ svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ++ sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, ++ svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), ++ svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ++ } ++ ++ sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); ++ } break; ++ case 256: ++ { ++ //printf("sve256"); ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q8_0 *__restrict__ x0 = &x[ib + 0]; ++ const block_q8_0 *__restrict__ x1 = &x[ib + 1]; ++ const block_q8_0 *__restrict__ y0 = &y[ib + 0]; ++ const block_q8_0 *__restrict__ y1 = &y[ib + 1]; ++ ++ // load x ++ const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); ++ const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); ++ ++ // load y ++ const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); ++ const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); ++ ++ sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), ++ svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ++ sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), ++ svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ++ } ++ ++ sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); ++ } break; ++ case 512: ++ { ++ // predicate for activating high 256 bit ++ const svbool_t ph32 = svptrue_pat_b8(SV_VL32); ++ // predicate for activating low 256 bit ++ const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); ++ ++ // predicate for activating high lanes for 8 float32 elements ++ const svbool_t ph8 = svptrue_pat_b32(SV_VL8); ++ // predicate for activating low lanes for 8 float32 elements ++ const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); ++ ++ svfloat32_t sumv00 = svdup_n_f32(0.0f); ++ ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q8_0 *__restrict__ x0 = &x[ib + 0]; ++ const block_q8_0 *__restrict__ x1 = &x[ib + 1]; ++ const block_q8_0 *__restrict__ y0 = &y[ib + 0]; ++ const block_q8_0 *__restrict__ y1 = &y[ib + 1]; ++ ++ //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits ++ // and add them to make one 64 element vector ++ // load x ++ const svint8_t qx_32 = svld1_s8(ph32, x0->qs); ++ 
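++                    // pl32 activates only lanes 32..63, so this load starts at
++                    // x0->qs + 2 and its active lanes read bytes 34..65, which is
++                    // exactly x1->qs (sizeof(block_q8_0) == 2 + 32). Inactive
++                    // lanes are zeroed, so the svadd below packs x0 and x1 into
++                    // one 64-byte vector (the y loads are merged the same way).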
svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); ++ ++ qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); ++ ++ // load y ++ const svint8_t qy_32 = svld1_s8(ph32, y0->qs); ++ svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); ++ ++ qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); ++ ++ // scale creation ++ const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); ++ const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); ++ ++ // duplicate deq1 in first half of vector and deq2 in second half of vector ++ const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); ++ ++ const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); ++ ++ sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); ++ } ++ ++ sumf = svaddv_f32(svptrue_b32(), sumv00); ++ break; ++ } ++ default: ++ assert(false && "Unsupported vector length"); ++ break; ++ } ++#elif defined(__ARM_NEON) ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ float32x4_t sumv1 = vdupq_n_f32(0.0f); ++ ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q8_0 *__restrict__ x0 = &x[ib + 0]; ++ const block_q8_0 *__restrict__ x1 = &x[ib + 1]; ++ const block_q8_0 *__restrict__ y0 = &y[ib + 0]; ++ const block_q8_0 *__restrict__ y1 = &y[ib + 1]; ++ ++ const int8x16_t x0_0 = vld1q_s8(x0->qs); ++ const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); ++ const int8x16_t x1_0 = vld1q_s8(x1->qs); ++ const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); ++ ++ // load y ++ const int8x16_t y0_0 = vld1q_s8(y0->qs); ++ const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); ++ const int8x16_t y1_0 = vld1q_s8(y1->qs); ++ const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); ++ ++ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ++ ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), ++ ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ++ ++ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ++ ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), ++ ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ++ } ++ ++ sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); ++#else ++ /* 不加速 */ ++ for (; ib < nb; ++ib) { ++ int sumi = 0; ++ ++ for (int j = 0; j < qk; j++) { ++ sumi += x[ib].qs[j] * y[ib].qs[j]; ++ } ++ ++ sumf += sumi * (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); ++ } ++#endif ++ *s = sumf; ++} ++ +diff --git a/csrc/cpu/quantize.h b/csrc/cpu/quantize.h +new file mode 100644 +index 000000000..10e988da1 +--- /dev/null ++++ b/csrc/cpu/quantize.h +@@ -0,0 +1,253 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++typedef float16_t ggml_half; ++typedef float32_t ggml_half2; ++typedef float16_t ggml_fp16_t; ++typedef float16_t ggml_float; ++typedef float16_t f16; ++ ++#define QK4_0 32 ++typedef struct { ++ ggml_half d; // delta ++ uint8_t qs[QK4_0 / 2]; // nibbles / quants ggml_half ++} block_q4_0; ++ ++#define QK4_1 32 ++typedef struct { ++ float d; // delta ++ float m; // min ++ uint8_t qs[QK4_1 / 2]; // nibbles / quants ++} block_q4_1; ++ ++#define QK8_0 32 ++typedef struct { ++ ggml_half d; // delta ++ int8_t qs[QK8_0]; // quants ++} block_q8_0; ++ ++#define QK8_1 32 ++typedef struct { ++ union { ++ struct { ++ ggml_half d; // delta ++ ggml_half s; // d * sum(qs[i]) ++ } GGML_COMMON_AGGR_S; ++ ggml_half2 ds; ++ } GGML_COMMON_AGGR_U; ++ int8_t qs[QK8_1]; // quants ++} block_q8_1; ++ ++#define QK_K 256 //目前不支持GGML_QKK_64 ++typedef struct { ++ uint8_t ql[QK_K/2]; // quants, lower 4 bits ++ uint8_t qh[QK_K/4]; // 
quants, upper 2 bits ++ int8_t scales[QK_K/16]; // scales, quantized with 8 bits ++ ggml_half d; // super-block scale ggml_half ++} block_q6_K; ++ ++typedef struct { ++ uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits ++ uint8_t qs[QK_K/4]; ++ union { ++ struct { ++ ggml_half d; // super-block scale for quantized scales ++ ggml_half dmin; // super-block scale for quantized mins ++ } GGML_COMMON_AGGR; ++ ggml_half2 dm; ++ }; ++} block_q2_K; ++ ++typedef struct { ++ uint8_t hmask[QK_K/8]; // quants - high bit ++ uint8_t qs[QK_K/4]; // quants - low 2 bits ++ uint8_t scales[12]; // scales, quantized with 6 bits ++ ggml_half d; // super-block scale ++} block_q3_K; ++ ++typedef struct { ++ float d; // delta ++ int8_t qs[QK_K]; // quants ++ int16_t bsums[QK_K/16]; // sum of quants in groups of 16 ++} block_q8_K; ++ ++#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) ++ ++ ++void dequantize_row_q2_K(const block_q2_K *__restrict__ src, float *__restrict__ dst, int64_t k); ++void dequantize_row_q4_0(const block_q4_0 * __restrict__ src, float * __restrict__ dst, int64_t k); ++void dequantize_row_q4_1(const block_q4_1 * __restrict__ src, float * __restrict__ dst, int64_t k); ++void dequantize_row_q8_0(const block_q8_0 *__restrict__ x, float *__restrict__ y, int64_t k); ++ ++void quantize_row_q8_K(const float *__restrict__ x, block_q8_K *__restrict__ y, int64_t k); ++void quantize_row_q6_K(const float *__restrict__ x, block_q6_K *__restrict__ y, int64_t k); ++void quantize_row_q3_K(const float *__restrict__ x, block_q3_K *__restrict__ y, int64_t k); ++void quantize_row_q2_K(const float *__restrict__ x, block_q2_K *__restrict__ y, int64_t k); ++void quantize_row_q4_0(const float *__restrict__ x, block_q4_0 *__restrict__ y, int64_t k); ++void quantize_row_q4_1(const float *__restrict__ x, block_q4_1 *__restrict__ y, int64_t k); ++void quantize_row_q8_0(const float *__restrict__ x, block_q8_0 *__restrict__ y, int64_t k); ++void quantize_row_q8_1(const float * __restrict__ x, block_q8_1 * __restrict__ y, int64_t k); ++void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q4_0_q8_0(int n, float *__restrict__ s, size_t bs, const void *__restrict__ vx, ++ size_t bx, const void *__restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q8_0_q8_0(int n, float *__restrict__ s, size_t bs, const void *__restrict__ vx, ++ size_t bx, const void *__restrict__ vy, size_t by, int nrc); ++ ++void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n); ++void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n); ++ ++void ggml_vec_dot_f16(int n, float * __restrict__ s, size_t bs, ggml_fp16_t * __restrict__ x, size_t bx, ggml_fp16_t * __restrict__ y, size_t by, int nrc); ++typedef void (*ggml_to_float_t)(const void *__restrict__ x, float *__restrict__ y, int64_t k); ++typedef void (*ggml_vec_dot_t)(int n, float 
*__restrict__ s, size_t bs, const void *__restrict__ x, size_t bx, ++ const void *__restrict__ y, size_t by, int nrc); ++typedef void (*ggml_from_float_t)(const float *__restrict__ x, void *__restrict__ y, int64_t k); ++ ++static inline float GGML_FP16_TO_FP32(ggml_half h) { ++ //ggml_half tmp; ++ //memcpy(&tmp, &h, sizeof(ggml_half)); ++ return (float)h; ++} ++ ++static inline float fp32_from_bits(uint32_t w) { ++ union { ++ uint32_t as_bits; ++ float as_value; ++ } fp32; ++ fp32.as_bits = w; ++ return fp32.as_value; ++} ++ ++static inline uint32_t fp32_to_bits(float f) { ++ union { ++ float as_value; ++ uint32_t as_bits; ++ } fp32; ++ fp32.as_value = f; ++ return fp32.as_bits; ++} ++ ++#define GGML_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) ++#if __AVX__ || __AVX2__ || __AVX512F__ ++static inline uint16_t ggml_compute_fp32_to_fp16(float f) { ++#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) ++ const float scale_to_inf = 0x1.0p+112f; ++ const float scale_to_zero = 0x1.0p-110f; ++#else ++ const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); ++ const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); ++#endif ++ float base = (fabsf(f) * scale_to_inf) * scale_to_zero; ++ ++ const uint32_t w = fp32_to_bits(f); ++ const uint32_t shl1_w = w + w; ++ const uint32_t sign = w & UINT32_C(0x80000000); ++ uint32_t bias = shl1_w & UINT32_C(0xFF000000); ++ if (bias < UINT32_C(0x71000000)) { ++ bias = UINT32_C(0x71000000); ++ } ++ ++ base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; ++ const uint32_t bits = fp32_to_bits(base); ++ const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); ++ const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); ++ const uint32_t nonsign = exp_bits + mantissa_bits; ++ return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); ++} ++#elif defined(__ARM_NEON) ++static inline ggml_half ggml_compute_fp32_to_fp16(float f) { ++ ggml_half res; ++ __fp16 tmp = f; ++ memcpy(&res, &tmp, sizeof(ggml_half)); ++ return res; ++} ++#endif ++ ++#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) ++ ++#define GGML_SIMD ++ ++// F16 NEON ++ ++#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) ++ #define GGML_F16_STEP 32 ++ #define GGML_F16_EPR 8 ++ ++ #define GGML_F16x8 float16x8_t ++ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) ++ #define GGML_F16x8_SET1(x) vdupq_n_f16(x) ++ #define GGML_F16x8_LOAD(x) vld1q_f16((const f16 *)(x)) ++ #define GGML_F16x8_STORE vst1q_f16 ++ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) ++ #define GGML_F16x8_ADD vaddq_f16 ++ #define GGML_F16x8_MUL vmulq_f16 ++ #define GGML_F16x8_REDUCE(res, x) \ ++ do { \ ++ int offset = GGML_F16_ARR >> 1; \ ++ for (int i = 0; i < offset; ++i) { \ ++ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ ++ } \ ++ offset >>= 1; \ ++ for (int i = 0; i < offset; ++i) { \ ++ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ ++ } \ ++ offset >>= 1; \ ++ for (int i = 0; i < offset; ++i) { \ ++ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ ++ } \ ++ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ ++ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ ++ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ ++ } while (0) ++ ++ #define GGML_F16_VEC GGML_F16x8 ++ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO ++ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 ++ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) ++ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((f16 *)(p), (r)[i]) ++ #define GGML_F16_VEC_FMA GGML_F16x8_FMA ++ #define GGML_F16_VEC_ADD GGML_F16x8_ADD ++ #define GGML_F16_VEC_MUL GGML_F16x8_MUL ++ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE ++#else ++ // if FP16 vector arithmetic is not supported, we use FP32 instead ++ // and take advantage of the vcvt_ functions to convert to/from FP16 ++ ++ #define GGML_F16_STEP 16 ++ #define GGML_F16_EPR 4 ++ ++ #define GGML_F32Cx4 float32x4_t ++ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) ++ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) ++ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const f16 *)(x))) ++ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) ++ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) ++ #define GGML_F32Cx4_ADD vaddq_f32 ++ #define GGML_F32Cx4_MUL vmulq_f32 ++ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE ++ ++ #define GGML_F16_VEC GGML_F32Cx4 ++ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO ++ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 ++ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) ++ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((f16 *)(p), r[i]) ++ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA ++ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD ++ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL ++ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE ++#endif ++ ++ #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) ++#endif ++ +diff --git a/csrc/cpu/sysHAX_ops.cpp b/csrc/cpu/sysHAX_ops.cpp +new file mode 100644 +index 000000000..3410388dc +--- /dev/null ++++ b/csrc/cpu/sysHAX_ops.cpp +@@ -0,0 +1,1651 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include"cpu_types.hpp" ++// #include "tensor.h" ++#include "quantize.h" ++#include // Linux ++#include ++ ++// #include "decode.h" ++ ++typedef unsigned int UINT32; ++typedef unsigned long long UINT64; ++typedef float f32; ++// typedef unsigned char bool; ++#define GENERAL_ARCH_BAICHUAN "baichuan" 
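++// sysHAX_ops.cpp implements the CPU-side decode kernels (fp16 dot products,
++// prefill/paged attention, RoPE, SiLU, RMSNorm) plus NUMA-aware weight
++// placement. The 65536-entry expf_f16_table declared below caches expf() for
++// every possible fp16 bit pattern, so the attention softmax can replace a
++// per-element expf() call with a table lookup, e.g. (illustrative):
++//   f16 diff = score - row_max;
++//   float e  = expf_f16_table[*(uint16_t *)&diff];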
++#define EPSILON 1e-6f /* float a1, float a2, if |a1 - a2| < EPSILON, then a1 = a2 */
++
++extern void transpose_v(f16 *vt, const f16 *v, int n_tokens, int dim, int qkv_dim);
++extern void prefill_attention(f16 *out_ptr, const f16 *qkv_ptr, const f16 *vt_ptr, int N_tokens, int N_seqs, const int *seq_lens);
++
++float expf_f16_table[65536];
++
++
++template 
++struct KernelVecType{
++    using q_load_vec_t = void;
++    using k_load_vec_t = void;
++    using v_load_vec_t = void;
++    using q_k_v_vec_t = void;
++    using accum_vec_t = void;
++};
++
++template<>
++struct KernelVecType{
++    using q_load_vec_t = vec_op::FP16Vec8;
++    using k_load_vec_t = vec_op::FP16Vec16;
++    using v_load_vec_t = vec_op::FP16Vec16;
++    using q_k_v_vec_t = vec_op::FP16Vec16;
++    using accum_vec_t = vec_op::FP16Vec16;
++};
++
++/* records the data type of every weight tensor */
++typedef struct {
++    int token_embd_weight;
++    int attn_k_weight;
++    int attn_k_bias;
++    int attn_norm_weight;
++    int attn_q_weight;
++    int attn_q_bias;
++    int attn_v_weight;
++    int attn_v_bias;
++    int ffn_down_weight;
++    int ffn_gate_weight;
++    int ffn_norm_weight;
++    int ffn_up_weight;
++    int attn_output_weight;
++    int output_weight;
++    int output_norm_weight;
++} WeightTypes;
++
++WeightTypes weight_types;
++
++
++enum ggml_type {
++    GGML_TYPE_F32 = 0,
++    GGML_TYPE_F16 = 1,
++    GGML_TYPE_Q4_0 = 2,
++    GGML_TYPE_Q4_1 = 3,
++    GGML_TYPE_Q5_0 = 6,
++    GGML_TYPE_Q5_1 = 7,
++    GGML_TYPE_Q8_0 = 8,
++    GGML_TYPE_Q8_1 = 9,
++    GGML_TYPE_Q2_K = 10,
++    GGML_TYPE_Q3_K = 11,
++    GGML_TYPE_Q4_K = 12,
++    GGML_TYPE_Q5_K = 13,
++    GGML_TYPE_Q6_K = 14,
++    GGML_TYPE_Q8_K = 15,
++    GGML_TYPE_IQ2_XXS = 16,
++    GGML_TYPE_IQ2_XS = 17,
++    GGML_TYPE_IQ3_XXS = 18,
++    GGML_TYPE_IQ1_S = 19,
++    GGML_TYPE_IQ4_NL = 20,
++    GGML_TYPE_IQ3_S = 21,
++    GGML_TYPE_IQ2_S = 22,
++    GGML_TYPE_IQ4_XS = 23,
++    GGML_TYPE_I8 = 24,
++    GGML_TYPE_I16 = 25,
++    GGML_TYPE_I32 = 26,
++    GGML_TYPE_I64 = 27,
++    GGML_TYPE_F64 = 28,
++    GGML_TYPE_IQ1_M = 29,
++    GGML_TYPE_COUNT,
++};
++
++// work-division descriptor: maps OpenMP threads onto NUMA nodes
++typedef struct WorkDivider {
++    int num_threads;
++    int tid;
++    int num_numas;
++    int threads_per_numa;
++    int my_numa;
++    int tid_in_numa;
++} WorkDivider;
++
++// work-range structs: single-NUMA and multi-NUMA variants
++typedef struct SingleNumaWorkRange {
++    int begin_thread;
++    int end_thread;
++    int work_per_thread;
++} SingleNumaWorkRange;
++
++typedef struct MultiNumaWorkRange {
++    int begin_numa;
++    int end_numa;
++    int work_per_numa;
++    int begin_thread;
++    int end_thread;
++    int work_per_thread;
++} MultiNumaWorkRange;
++
++
++typedef struct {
++    const char *pcTypeName;
++    UINT32 uiblkSize;
++    UINT32 uiTypeSize;
++    ggml_from_float_t quantize;
++    ggml_to_float_t dequantize;
++    enum ggml_type VecDotType; /* quantization type used for the matrix dot product */
++    ggml_vec_dot_t VecDotFunc; /* dot-product kernel for this type */
++} BLOCK_DATA_INFO;
++
++const char *g_ModelArch = "qwen2"; /* model architecture */
++
++BLOCK_DATA_INFO g_BlockDataInfo[] = {
++    {"f32", 1, sizeof(float), NULL, NULL, GGML_TYPE_F32, NULL},
++    {"f16", 1, sizeof(uint16_t), (ggml_from_float_t)ggml_fp32_to_fp16_row, (ggml_to_float_t)ggml_fp16_to_fp32_row, GGML_TYPE_F16, (ggml_vec_dot_t)ggml_vec_dot_f16},
++    {"q4_0", QK4_0, sizeof(block_q4_0), (ggml_from_float_t)quantize_row_q4_0, (ggml_to_float_t)dequantize_row_q4_0, GGML_TYPE_Q8_0, (ggml_vec_dot_t)ggml_vec_dot_q4_0_q8_0},
++    {"q4_1", QK4_1, sizeof(block_q4_1), (ggml_from_float_t)quantize_row_q4_1, (ggml_to_float_t)dequantize_row_q4_1, GGML_TYPE_Q8_1, (ggml_vec_dot_t)ggml_vec_dot_q4_1_q8_1},
++    {"", 0, 0, NULL},
++    {"", 0, 0, NULL},
++    {"", 0, 0, NULL},
++    {"", 0, 0, NULL},
++    {"q8_0", QK8_0, sizeof(block_q8_0), (ggml_from_float_t)quantize_row_q8_0, (ggml_to_float_t)dequantize_row_q8_0, GGML_TYPE_Q8_0, (ggml_vec_dot_t)ggml_vec_dot_q8_0_q8_0},
++    // {"q8_1", QK8_1, sizeof(block_q8_1), (ggml_from_float_t)quantize_row_q8_1},
++    // {"q2_K", QK_K, sizeof(block_q2_K), (ggml_from_float_t)quantize_row_q2_K, (ggml_to_float_t)dequantize_row_q2_K, GGML_TYPE_Q8_K, (ggml_vec_dot_t)ggml_vec_dot_q2_K_q8_K}, //10
++    // {"q3_K", QK_K, sizeof(block_q3_K), (ggml_from_float_t)quantize_row_q3_K, NULL, GGML_TYPE_Q8_K, (ggml_vec_dot_t)ggml_vec_dot_q3_K_q8_K},
++    // {"", 0, 0, NULL},
++    // {"", 0, 0, NULL}, //13
++    // {"q6_K", QK_K, sizeof(block_q6_K), (ggml_from_float_t)quantize_row_q6_K, NULL, GGML_TYPE_Q8_K, (ggml_vec_dot_t)ggml_vec_dot_q6_K_q8_K},
++    // {"q8_K", QK_K, sizeof(block_q8_K), (ggml_from_float_t)quantize_row_q8_K},
++};
++
++typedef struct {
++    enum ggml_type DataType; /* data type of this tensor */
++    union {
++        void *tensor1;
++        void **tensor2;
++        void ***tensor3;
++    } Data;
++} TENSOR_INFO;
++
++/* holds the pointers to the loaded model weights */
++typedef struct weight {
++    TENSOR_INFO token_embedding;
++    TENSOR_INFO rms_att_norm; // (layer, dim) rmsnorm weights
++    TENSOR_INFO rms_ffn_norm; // (layer, dim)
++    TENSOR_INFO Wqkv;
++    TENSOR_INFO wo; // (layer, n_head * head_size, dim)
++    // weights for bias
++    TENSOR_INFO qkv_bias;
++    // weights for ffn
++    TENSOR_INFO w1w3;
++    TENSOR_INFO ffn_down; // (layer, dim, hidden_dim)
++    TENSOR_INFO output; //output linear
++    TENSOR_INFO output_norm; //output RMS norm
++} WEIGHT;
++
++
++typedef struct {
++    void *Token_Ori; /* original token embedding */
++    void *Token_Norm; /* token after normalization */
++    float *K; /* K matrix */
++    float *Q; /* Q matrix */
++    float *V; /* V matrix */
++    float *QK; /* Q*K^T result */
++    float *Attn_out; /* attention output */
++    f16 *Attn_out_f16;
++    float *ffn_Gate; /* ffn gate output */
++    float *ffn_up; /* ffn_up */
++    float *logits; /* output logits for sampling */
++    float **key_cache; /* K cache */
++    float **value_cache; /* V cache */
++    void **temp_output_vec_numa;
++    void **tmp_vec_numa;
++    f16 *seq_qkv; /* new: qkv output for the whole sequence */
++    float *add_weight; /* new: additional weight buffer */
++    float *output_f32;
++} MODEL_RUN_STATE;
++
++typedef struct {
++    int dim; /* embedding dimension */
++    int n_head; /* number of attention heads */
++    int n_kv_heads; /* number of KV heads */
++    int hidden_dim; /* FFN hidden dimension */
++    int n_layers; /* number of layers */
++    int context_length; /* context length */
++    float norm_rms_eps; /* eps */
++    int n_vocab; /* vocabulary size */
++    float rope_freq_base; /* RoPE frequency base */
++    f16 *cos_sin_cache; /* precomputed RoPE cos/sin values */
++    int n_rotary; /* rotary dimension */
++    bool is_neox_style; /* RoPE style */
++    double attn_scale; /* attention scale coefficient */
++} MODEL_HYPE_PARA;
++
++__thread f16 qk_tmp_storage[131072];
++
++int g_numas = numa_num_configured_nodes();
++WEIGHT g_pstWeight;
++MODEL_RUN_STATE g_stRunState;
++MODEL_HYPE_PARA g_pstModelHypePara;
++
++float f16_to_f32(f16 h){return h;}
++f16 f32_to_f16(float h){return h;}
++
++__attribute__((noinline))
++f16 DOTPRODUCT_vv_f16(int M, const f16 *src0_ptr, const f16 *src1_ptr)
++{
++    __builtin_prefetch(&src0_ptr[0], 0 , 0);
++    __builtin_prefetch(&src1_ptr[0], 0 , 0);
++    if (M >= 128) {
++        __builtin_prefetch(&src0_ptr[32], 0 , 0);
++        __builtin_prefetch(&src1_ptr[32], 0 , 0);
++        __builtin_prefetch(&src0_ptr[64], 0 , 0);
++        __builtin_prefetch(&src1_ptr[64], 0 , 0);
++        __builtin_prefetch(&src0_ptr[96], 0 , 0);
++        __builtin_prefetch(&src1_ptr[96], 0 , 0);
++    }
++    float sumf = 0.0f;
++    int j = 0;
++#ifdef __ARM_NEON
++    const int M_UNROLL = 8;
++    const int M_SIMD = 8;
++    float16x8_t sum[M_UNROLL] = {vdupq_n_f16(0.0f)};
++    for (; j <= M - M_UNROLL * M_SIMD; j += M_UNROLL * M_SIMD) {
++        __builtin_prefetch(&src0_ptr[j + 192], 0 , 0);
++ __builtin_prefetch(&src1_ptr[j + 192], 0 , 0); ++ __builtin_prefetch(&src0_ptr[j + 224], 0 , 0); ++ __builtin_prefetch(&src1_ptr[j + 224], 0 , 0); ++ __builtin_prefetch(&src0_ptr[j + 256], 0 , 0); ++ __builtin_prefetch(&src1_ptr[j + 256], 0 , 0); ++ __builtin_prefetch(&src0_ptr[j + 288], 0 , 0); ++ __builtin_prefetch(&src1_ptr[j + 288], 0 , 0); ++ __builtin_prefetch(&src0_ptr[j + 320], 0 , 0); ++ __builtin_prefetch(&src1_ptr[j + 320], 0 , 0); ++ for (int ss = 0; ss < M_UNROLL; ss++) { ++ sum[ss] = vfmaq_f16(sum[ss], vld1q_f16(&src0_ptr[j + ss * M_SIMD]), vld1q_f16(&src1_ptr[j + ss * M_SIMD])); ++ } ++ } ++ ++ for (; j <= M - 8; j += 8) { ++ sum[0] = vfmaq_f16(sum[0], vld1q_f16(&src0_ptr[j]), vld1q_f16(&src1_ptr[j])); ++ } ++ sum[0] = vaddq_f16(vaddq_f16(sum[0], sum[2]), vaddq_f16(sum[1], sum[3])); ++ if (M_UNROLL > 4) { ++ sum[4] = vaddq_f16(vaddq_f16(sum[4], sum[6]), vaddq_f16(sum[5], sum[7])); ++ sum[0] = vaddq_f16(sum[0], sum[4]); ++ } ++ ++ float32x4_t t0 = vcvt_f32_f16(vget_low_f16(sum[0])); ++ float32x4_t t1 = vcvt_f32_f16(vget_high_f16(sum[0])); ++ sumf = vaddvq_f32(vaddq_f32(t0, t1)); ++#endif ++ for (; j < M; j++) { ++ sumf += (f16_to_f32(src0_ptr[j]) * f16_to_f32(src1_ptr[j])); ++ } ++ ++ return sumf; ++} ++ ++void transpose_v(f16 *vt, const f16 *v, int n_tokens, int dim, int qkv_dim) ++{ ++ for (int i = 0; i < n_tokens; i++) { ++ int j = 0; ++ for (int j = 0; j < dim; j++) { ++ vt[j * n_tokens + i] = v[i * qkv_dim + j]; ++ } ++ } ++} ++ ++void prefill_attention(f16 *out_ptr, const f16 *qkv_ptr, const f16 *vt_ptr, int N_tokens, int N_seqs, const int *seq_lens) ++{ ++ int N_gqa = g_pstModelHypePara.n_head / g_pstModelHypePara.n_kv_heads, dim_gqa = g_pstModelHypePara.dim / N_gqa; ++ int qkv_stride = g_pstModelHypePara.dim + 2 * dim_gqa, head_size = g_pstModelHypePara.dim / g_pstModelHypePara.n_head; ++ const f16 *q_ptr = qkv_ptr, *k_ptr = q_ptr + g_pstModelHypePara.dim; ++ ++ std::vector seqlen_prefix_sum; ++ seqlen_prefix_sum.push_back(0); ++ for (int i = 0, sum_seq_lens = 0; i < N_seqs; i++) { ++ sum_seq_lens += seq_lens[i]; ++ seqlen_prefix_sum.push_back(sum_seq_lens); ++ } ++ ++ int num_threads = omp_get_num_threads(); ++ num_threads = num_threads / 4; ++ ++ #pragma omp parallel for collapse(2) schedule(dynamic, 1) num_threads(num_threads) ++ for (int seq = 0; seq < N_seqs; seq++) { ++ for (int h_q = 0; h_q < g_pstModelHypePara.n_head; h_q++) { ++ f16 *qk_tmp = (f16 *)qk_tmp_storage; ++ int seq_t_begin = seqlen_prefix_sum[seq], seq_t_end = seqlen_prefix_sum[seq + 1]; ++ int h_kv = h_q / N_gqa; ++ ++ for (int t = seq_t_begin; t < seq_t_end; t++) { ++ const f16 *q_head_ptr = q_ptr + t * qkv_stride + h_q * head_size; ++ int token_idx_in_seq = t - seq_t_begin; ++ f16 row_max = -INFINITY; ++ for (int i = 0; i <= token_idx_in_seq; i++) { ++ const f16 *k_head_ptr = k_ptr + (seq_t_begin + i) *qkv_stride + h_kv *head_size; ++ qk_tmp[i] = DOTPRODUCT_vv_f16(head_size, q_head_ptr, k_head_ptr); ++ row_max = qk_tmp[i] > row_max ? 
qk_tmp[i] : row_max; ++ } ++ f32 sumexp = 0.0f; ++ for (int i = 0; i <= token_idx_in_seq; i++) { ++ f16 diff = qk_tmp[i] - row_max; ++ f32 exp_result = expf_f16_table[*(uint16_t *)&diff]; ++ qk_tmp[i] = exp_result; ++ sumexp += exp_result; ++ } ++ ++ for (int i = 0; i <= token_idx_in_seq; i++) { ++ qk_tmp[i] /= sumexp; ++ } ++ ++ for (int iv = 0; iv < head_size; iv++) { ++ const f16 *vt_seq_ptr = vt_ptr + h_kv * head_size * N_tokens + iv *N_tokens + seq_t_begin; ++ out_ptr[t * g_pstModelHypePara.dim + h_q * head_size + iv] = DOTPRODUCT_vv_f16(token_idx_in_seq + 1, vt_seq_ptr, qk_tmp); ++ } ++ } ++ } ++ } ++} ++ ++template ++void paged_attention_v1_impl( scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] ++ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] ++ const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] ++ const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] ++ const int num_kv_heads, ++ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] ++ const int* __restrict__ seq_lens, // [num_seqs] ++ const int max_num_blocks_per_seq, ++ const int q_stride, const int kv_block_stride, const int kv_head_stride, ++ const int num_seqs, const int num_heads, const int HEAD_SIZE) ++{ ++ using q_load_vec_t = typename KernelVecType::q_load_vec_t; ++ using k_load_vec_t = typename KernelVecType::k_load_vec_t; ++ using v_load_vec_t = typename KernelVecType::v_load_vec_t; ++ using q_k_v_vec_t = typename KernelVecType::q_k_v_vec_t; ++ using accum_vec_t = typename KernelVecType::accum_vec_t; ++ using accum_scalar_t = scalar_t; ++ ++ constexpr int BLOCK_SIZE = 16; ++ constexpr int x = BLOCK_SIZE / sizeof(scalar_t); ++ static_assert(k_load_vec_t::get_elem_num() % x == 0); ++ static_assert(q_load_vec_t::get_elem_num() * sizeof(scalar_t) == 16); ++ ++ constexpr int TOKEN_PER_GROUP = k_load_vec_t::get_elem_num() / x; ++ constexpr int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP; ++ static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4); ++ ++ const int N_gqa = num_heads / num_kv_heads; ++ ++ int num_threads = omp_get_num_threads(); ++ num_threads = num_threads / 4; ++ ++#pragma omp parallel for collapse(2) schedule(dynamic, 1) num_threads(num_threads) ++ for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { ++ for (int head_idx = 0; head_idx < num_heads; ++head_idx) { ++ accum_scalar_t *qk_tmp = (accum_scalar_t *)qk_tmp_storage; ++ int seq_len = seq_lens[seq_idx]; ++ const int* seq_block_table = block_tables + max_num_blocks_per_seq * seq_idx; ++ const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; ++ const int64_t kv_head_idx = head_idx / N_gqa; ++ const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE; ++ const scalar_t* __restrict__ q_vec_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ++ ++ // Compute logits ++ for (int block_idx = 0; block_idx < block_num; ++block_idx) { ++ const int64_t physical_block_idx = seq_block_table[block_idx]; ++ const scalar_t* __restrict__ k_block_cache_ptr = ++ k_cache + physical_block_idx * kv_block_stride + ++ kv_head_idx * kv_head_stride; ++ const int token_num = (block_idx == block_num - 1) ? 
last_block_token_num : BLOCK_SIZE; ++ const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP; ++ accum_vec_t group_accums[MAX_GROUP_NUM]; ++ for (int q_offset = 0; q_offset < HEAD_SIZE; q_offset +=x, k_block_cache_ptr += x * BLOCK_SIZE) { ++ q_load_vec_t q_load_group_vec(q_vec_ptr + q_offset); ++ q_k_v_vec_t q_group_vec(q_load_group_vec); ++ ++ for (int token_group_idx = 0; token_group_idx < group_num; token_group_idx++) { ++ k_load_vec_t k_load_group_vec(k_block_cache_ptr + token_group_idx * x * TOKEN_PER_GROUP); ++ q_k_v_vec_t k_group_vec(k_load_group_vec); ++ vec_op::fma(group_accums[token_group_idx], q_group_vec, k_group_vec); ++ vec_op::prefetch(k_block_cache_ptr + x *BLOCK_SIZE + token_group_idx * x *TOKEN_PER_GROUP); ++ } ++ } ++ for (int token_group_idx = 0; token_group_idx < group_num; token_group_idx++) { ++ for (int token_idx = 0; token_idx < TOKEN_PER_GROUP; token_idx++) { ++ accum_scalar_t dot_v = ++ group_accums[token_group_idx]. ++ template reduce_sub_sum(token_idx); ++ qk_tmp[block_idx * BLOCK_SIZE + token_group_idx * TOKEN_PER_GROUP + token_idx] = dot_v; ++ } ++ } ++ } ++ ++ f32 max = qk_tmp[0], sum = 0.0; ++ for (int i = 1; i < seq_len; i++) { ++ max = max >= qk_tmp[i] ? max : qk_tmp[i]; ++ } ++ ++ for (int i = 0; i < seq_len; i++) { ++ f16 diff = qk_tmp[i] - max; ++ qk_tmp[i] = expf_f16_table[*(uint16_t *)&diff]; ++ sum += qk_tmp[i]; ++ } ++ int i = 0; ++ for (; i < seq_len; i++) { ++ qk_tmp[i] /= sum; ++ } ++ for (; i < block_num * BLOCK_SIZE; i++) { ++ qk_tmp[i] = 0; ++ } ++ ++ constexpr int head_elem_num_per_partition = 16; ++ assert(HEAD_SIZE % head_elem_num_per_partition == 0); ++ int head_partition_num = HEAD_SIZE / head_elem_num_per_partition; ++ for (int head_part_idx = 0; head_part_idx < head_partition_num; ++head_part_idx) { ++ accum_vec_t accums[head_elem_num_per_partition]; ++ scalar_t* __restrict__ out_ptr = ++ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + ++ head_part_idx * head_elem_num_per_partition; ++ for (int block_idx = 0; block_idx < block_num; ++block_idx) { ++ const int64_t physical_block_idx = seq_block_table[block_idx]; ++ const scalar_t* __restrict__ v_block_cache_ptr = ++ v_cache + physical_block_idx * kv_block_stride + ++ kv_head_idx * kv_head_stride + BLOCK_SIZE * head_part_idx * ++ head_elem_num_per_partition; ++ ++ accum_vec_t qk_vec(qk_tmp + block_idx * BLOCK_SIZE); ++ for (int head_elem_idx = 0; head_elem_idx < head_elem_num_per_partition; head_elem_idx++) { ++ v_load_vec_t v_load_vec(v_block_cache_ptr + BLOCK_SIZE * head_elem_idx); ++ accum_vec_t v_vec(v_load_vec); ++ vec_op::fma(accums[head_elem_idx], qk_vec, v_vec); ++ } ++ ++ if (block_idx != block_num - 1) { ++ const int64_t next_physical_block_idx = ++ seq_block_table[block_idx + 1]; ++ const scalar_t* __restrict__ next_v_block_cache_ptr = ++ v_cache + next_physical_block_idx * kv_block_stride + ++ kv_head_idx * kv_head_stride + ++ BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; ++ ++ for (int head_elem_idx = 0; head_elem_idx < head_elem_num_per_partition; head_elem_idx += 2) { ++ vec_op::prefetch(next_v_block_cache_ptr + BLOCK_SIZE * head_elem_idx); ++ } ++ } ++ } ++ ++ for (int head_elem_idx = 0; head_elem_idx < head_elem_num_per_partition; head_elem_idx++) { ++ float value = accums[head_elem_idx].reduce_sum(); ++ vec_op::storeFP32(value, out_ptr + head_elem_idx); ++ } ++ } ++ } ++ } ++} ++ ++void Quantize(void *Dst, float *src, enum ggml_type DataType, int size) ++{ ++ g_BlockDataInfo[DataType].quantize(src, Dst, size); ++} ++ ++// 
Initialize the work-division struct (map OpenMP threads onto NUMA nodes)
++void init_work_divider(WorkDivider *divider, int numas) {
++    divider->num_numas = numas;
++    divider->num_threads = omp_get_num_threads();
++    if (divider->num_threads % divider->num_numas != 0) {
++        fprintf(stderr, "nthreads (%d) %% numas (%d) != 0\n", divider->num_threads, divider->num_numas);
++        exit(1);
++    }
++    divider->tid = omp_get_thread_num();
++    // printf("tid:%d, num_threads:%d \n", divider->tid, divider->num_threads);
++    divider->threads_per_numa = divider->num_threads / divider->num_numas;
++    divider->my_numa = divider->tid / divider->threads_per_numa;
++    divider->tid_in_numa = divider->tid % divider->threads_per_numa;
++}
++
++void RmsNorm(float *DstData, float *SrcData, float *SrcWeight, float eps, int dataNum)
++{
++    float ss = 0.0f;
++    for (int j = 0; j < dataNum; j++) {
++        ss += SrcData[j] * SrcData[j];
++    }
++    ss /= dataNum;
++    ss += eps;
++    ss = 1.0f / sqrtf(ss);
++    for (int j = 0; j < dataNum; j++) {
++        DstData[j] = SrcWeight[j] * (ss * SrcData[j]);
++    }
++}
++
++void divide_all_work(const WorkDivider *divider, int total_workitems, SingleNumaWorkRange *pstSingleRange)
++{
++    int work_per_thread = total_workitems / divider->num_threads;
++    int work_remaining = total_workitems % divider->num_threads;
++    if (work_remaining == 0) {
++        pstSingleRange->begin_thread = divider->tid * work_per_thread;
++        pstSingleRange->end_thread = divider->tid * work_per_thread + work_per_thread;
++        pstSingleRange->work_per_thread = work_per_thread;
++    } else if (divider->tid < work_remaining) {
++        pstSingleRange->begin_thread = divider->tid * work_per_thread + divider->tid;
++        pstSingleRange->end_thread = (divider->tid + 1) * work_per_thread + (divider->tid + 1);
++        pstSingleRange->work_per_thread = work_per_thread + 1;
++    } else {
++        pstSingleRange->begin_thread = divider->tid * work_per_thread + work_remaining;
++        pstSingleRange->end_thread = (divider->tid + 1) * work_per_thread + work_remaining;
++        pstSingleRange->work_per_thread = work_per_thread;
++    }
++    return;
++}
++
++// Divide work among the threads of the first NUMA node only
++void divide_work_first_numa(const WorkDivider *divider, int total_workitems, SingleNumaWorkRange *pstSingleRange)
++{
++    if (divider->my_numa == 0) {
++        int work_per_thread = total_workitems / divider->threads_per_numa;
++        int work_remaining = total_workitems % divider->threads_per_numa;
++        if (work_remaining == 0) {
++            pstSingleRange->begin_thread = divider->tid * work_per_thread;
++            pstSingleRange->end_thread = divider->tid * work_per_thread + work_per_thread;
++            pstSingleRange->work_per_thread = work_per_thread;
++        } else if (divider->tid < work_remaining) {
++            pstSingleRange->begin_thread = divider->tid * work_per_thread + divider->tid;
++            pstSingleRange->end_thread = (divider->tid + 1) * work_per_thread + (divider->tid + 1);
++            pstSingleRange->work_per_thread = work_per_thread + 1;
++        } else {
++            pstSingleRange->begin_thread = divider->tid * work_per_thread + work_remaining;
++            pstSingleRange->end_thread = (divider->tid + 1) * work_per_thread + work_remaining;
++            pstSingleRange->work_per_thread = work_per_thread;
++        }
++        return;
++    }
++
++    pstSingleRange->begin_thread = 0;
++    pstSingleRange->end_thread = 0;
++    pstSingleRange->work_per_thread = 0;
++}
++
++// Divide work across all NUMA nodes
++void divide_work_all_numas(const WorkDivider *divider, int total_workitems, MultiNumaWorkRange *pstNulRange)
++{
++    int max_workitems_per_numa = (total_workitems - 1) / divider->num_numas + 1;
++    int workitem_numa_begin = divider->my_numa * max_workitems_per_numa;
++    int workitem_numa_end = workitem_numa_begin + max_workitems_per_numa;
++    if (workitem_numa_end > total_workitems) {
++        workitem_numa_end = total_workitems;
++    }
++    int workitems_my_numa = workitem_numa_end - workitem_numa_begin;
++    int max_workitems_per_thread = (workitems_my_numa - 1) / divider->threads_per_numa + 1;
++    int begin = divider->tid_in_numa * max_workitems_per_thread;
++    int end = begin + max_workitems_per_thread;
++    if (end > workitems_my_numa) {
++        end = workitems_my_numa;
++    }
++
++    pstNulRange->begin_numa = workitem_numa_begin;
++    pstNulRange->end_numa = workitem_numa_end;
++    pstNulRange->work_per_numa = max_workitems_per_numa;
++    pstNulRange->begin_thread = begin;
++    pstNulRange->end_thread = end;
++    pstNulRange->work_per_thread = end - begin;
++}
++
++/* Dequantization */
++void Dequantize(void *DstData, void *SrcData, WEIGHT *pstWeight, int dataNum)
++{
++    enum ggml_type TokenType = pstWeight->token_embedding.DataType;
++
++    /* case where dequantization is actually required */
++    if (TokenType != GGML_TYPE_F32 && pstWeight->rms_att_norm.DataType == GGML_TYPE_F32) {
++        g_BlockDataInfo[TokenType].dequantize(SrcData, static_cast<float *>(DstData), dataNum);
++    }
++}
++
++void divide_kv_cache_numa(const WorkDivider * divider, int total_workitems,
++    SingleNumaWorkRange * pstSingleRange)
++{
++    int work_per_thread;
++    int work_remaining;
++    int work_per_numa;
++    int NumaNum = divider->num_numas;
++    int threads_per_numa = divider->threads_per_numa;
++    int head_per_numa;
++    int tid_group, tid_use, head_add;
++
++    if (total_workitems % NumaNum != 0) {
++        fprintf(stderr, "kv cache: heads (%d) %% numas (%d) != 0\n", total_workitems, NumaNum);
++        exit(1);
++    }
++
++    work_per_numa = total_workitems / NumaNum;
++    if (total_workitems <= divider->num_threads) {
++        /* park the threads that are not needed */
++        if (divider->tid % threads_per_numa >= work_per_numa) {
++            pstSingleRange->begin_thread = 0;
++            pstSingleRange->end_thread = 0;
++            return;
++        }
++
++        /* kv-cache assignment: one head per thread */
++        pstSingleRange->begin_thread = divider->tid_in_numa + divider->my_numa * work_per_numa;
++        pstSingleRange->end_thread = pstSingleRange->begin_thread + 1;
++    } else {
++        tid_group = divider->tid / threads_per_numa;
++        tid_use = divider->tid % threads_per_numa;
++        head_per_numa = total_workitems / NumaNum;
++        work_per_thread = head_per_numa / threads_per_numa;
++        work_remaining = head_per_numa % threads_per_numa;
++        head_add = head_per_numa * tid_group;
++        if (work_remaining == 0) {
++            pstSingleRange->begin_thread = tid_use * work_per_thread + head_add;
++            pstSingleRange->end_thread = tid_use * work_per_thread + work_per_thread + head_add;
++        } else if (tid_use < work_remaining) {
++            pstSingleRange->begin_thread = tid_use * work_per_thread + tid_use + head_add;
++            pstSingleRange->end_thread = (tid_use + 1) * work_per_thread + (tid_use + 1) + head_add;
++        } else {
++            pstSingleRange->begin_thread = tid_use * work_per_thread + work_remaining + head_add;
++            pstSingleRange->end_thread = (tid_use + 1) * work_per_thread + work_remaining + head_add;
++        }
++    }
++    return;
++}
++
++void Rope_embedding(MODEL_RUN_STATE *pstRunState, MODEL_HYPE_PARA *pstModelPara, int pos, int n_tokens)
++{
++    int dim = pstModelPara->dim;
++    int n_kv_heads = pstModelPara->n_kv_heads;
++    int n_head = pstModelPara->n_head;
++    long kv_dim = (dim * n_kv_heads) / n_head;
++    int head_size = dim / n_head;
++    float rope_freq_base = (fabsf(pstModelPara->rope_freq_base - 0.0f) < EPSILON)
++        ?
10000.0f ++ : pstModelPara->rope_freq_base; ++ ++ bool ropetype = strstr(g_ModelArch, "qwen") != NULL; ++ for (int k = 0; k < n_tokens; k++, pos++) { ++ if (!ropetype){ ++ for (int i = 0; i < dim; i += 2) { ++ int head_dim = i % head_size; ++ float freq = 1.0f / powf(rope_freq_base, head_dim / (float)head_size); ++ float val = pos * freq; ++ float fcr = cosf(val); ++ float fci = sinf(val); ++ int rotn = i < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only ++ for (int v = 0; v < rotn; v++) { ++ float *vec = v == 0 ? (float *)pstRunState->Q + k * dim : (float *)pstRunState->K + k * kv_dim; // the vector to rotate (query or key) ++ float v0 = vec[i]; ++ float v1 = vec[i+1]; ++ vec[i] = v0 * fcr - v1 * fci; ++ vec[i+1] = v0 * fci + v1 * fcr; ++ } ++ } ++ }else{ ++ for (int j = 0; j < dim / head_size; j++){ ++ for (int i = 0; i < head_size; i += 2) { ++ int I = i / 2; ++ float freq = 1.0f / powf(rope_freq_base, i / (float)head_size); ++ float val = pos * freq; ++ float fcr = cosf(val); ++ float fci = sinf(val); ++ int rotn = i + j * head_size < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only ++ for (int v = 0; v < rotn; v++) { ++ float *vec = v == 0 ? (float *)pstRunState->Q + k * dim : (float *)pstRunState->K + k * kv_dim; // the vector to rotate (query or key) ++ float v0 = vec[I + j * head_size]; ++ float v1 = vec[I + j * head_size + head_size / 2]; ++ vec[I + j * head_size] = v0 * fcr - v1 * fci; ++ vec[I + j * head_size + head_size / 2] = v0 * fci + v1 * fcr; ++ } ++ } ++ } ++ } ++ } ++} ++ ++void Active_Silu(f16 *dst, f16 *w1w3, int hidden_dim, int n_tokens) ++{ ++ int total_hidden_dim = hidden_dim << 1; ++ ++ for (int j = 0; j < n_tokens; j++) { ++ for (int i = 0; i < hidden_dim; i++) { ++ float val = w1w3[i + j * total_hidden_dim]; ++ val *= (1.0f / (1.0f + expf(-val))); ++ val *= w1w3[i + j * total_hidden_dim + hidden_dim]; ++ dst[i + j * hidden_dim] = (f16)val; ++ } ++ } ++} ++ ++__attribute__((noinline)) ++void Rope_embedding_impl(bool rope_type, int n_rotary, f16 *head_ptr, const f16 *cos_sin_cache, int position) ++{ ++ const f16 *cos_sin_ptr = cos_sin_cache + position * n_rotary; ++ int embed_dim = n_rotary >> 1; ++ ++ /* rope_neox */ ++ if (rope_type == true) { ++ int xx = 0, yy = embed_dim; ++ for (; xx <= embed_dim - 8; xx += 8, yy += 8) { ++ __builtin_prefetch(&head_ptr[xx + 32], 1, 2); ++ __builtin_prefetch(&head_ptr[yy + 32], 1, 2); ++ const float16x8_t qx = vld1q_f16(&head_ptr[xx]), qy = vld1q_f16(&head_ptr[yy]); ++ const float16x8_t csx = vld1q_f16(&cos_sin_ptr[xx]), csy = vld1q_f16(&cos_sin_ptr[yy]); ++ vst1q_f16(&head_ptr[xx], vfmaq_f16(vmulq_f16(qx, csx), vnegq_f16(qy), csy)); ++ vst1q_f16(&head_ptr[yy], vfmaq_f16(vmulq_f16(qy, csx), qx, csy)); ++ } ++ for (; xx < embed_dim; xx++, yy++) { ++ const f16 qx = head_ptr[xx], qy = head_ptr[yy]; ++ head_ptr[xx] = qx * cos_sin_ptr[xx] - qy * cos_sin_ptr[yy]; ++ head_ptr[yy] = qy * cos_sin_ptr[xx] + qx * cos_sin_ptr[yy]; ++ } ++ } else { /* rope_gptj */ ++ for (int j = 0; j < embed_dim; j++) { ++ const f16 qx = head_ptr[2 * j], qy = head_ptr[2 * j + 1]; ++ const f16 cos = cos_sin_ptr[j], sin = cos_sin_ptr[embed_dim + j]; ++ head_ptr[2 * j] = qx * cos - qy * sin; ++ head_ptr[2 * j + 1] = qy * cos + qx * sin; ++ } ++ } ++} ++ ++void quantization_weight_strategy(void *dst, void *src, int64_t quantization_bit_code, size_t Size) ++{ ++ int kv_dim = g_pstModelHypePara.dim * g_pstModelHypePara.n_kv_heads / g_pstModelHypePara.n_head; ++ float Buffer[kv_dim]; ++ int block_num = Size / kv_dim; ++ ++ /* 反量化 */ ++ if 
(quantization_bit_code == GGML_TYPE_F32) {
++        g_BlockDataInfo[GGML_TYPE_F16].dequantize(src, dst, Size);
++    } else {
++        if (quantization_bit_code != GGML_TYPE_F16) {
++            int offset = kv_dim / g_BlockDataInfo[quantization_bit_code].uiblkSize * g_BlockDataInfo[quantization_bit_code].uiTypeSize;
++            for (int i = 0; i < block_num; i++) {
++                g_BlockDataInfo[GGML_TYPE_F16].dequantize((char *)src + i * kv_dim * sizeof(f16), Buffer, kv_dim);
++                g_BlockDataInfo[quantization_bit_code].quantize(Buffer, (char *)dst + i * offset, kv_dim);
++            }
++        } else { /* copy the fp16 weights directly */
++            memcpy(dst, src, Size * sizeof(f16));
++        }
++    }
++}
++
++void load_weight_and_malloc_active_tensor(
++    int64_t dim, // MODEL_HYPE_PARA.dim embedding dimension
++    int64_t hidden_dim, // MODEL_HYPE_PARA.hidden_dim FFN hidden dimension
++    int64_t n_layers, // MODEL_HYPE_PARA.n_layers number of layers
++    int64_t n_vocab, // MODEL_HYPE_PARA.n_vocab vocabulary size
++    int64_t n_head, // MODEL_HYPE_PARA.n_head number of attention heads
++    int64_t n_kv_heads, // MODEL_HYPE_PARA.n_kv_heads number of KV heads
++    int64_t context_length, // MODEL_HYPE_PARA.context_length context length
++    double norm_rms_eps, // MODEL_HYPE_PARA.norm_rms_eps eps
++    double rope_freq_base, // MODEL_HYPE_PARA.rope_freq_base RoPE frequency base
++    double attn_scale,
++    int64_t is_neox_style,
++    int64_t quantization_bit_code,
++
++    torch::Tensor const &cos_sin_cache,
++    torch::Tensor token_embedding, // WEIGHT.token_embedding
++    torch::Tensor rms_att_norm, // WEIGHT.rms_att_norm
++    torch::Tensor rms_ffn_norm, // WEIGHT.rms_ffn_norm
++    torch::Tensor wqkv,
++    torch::Tensor wo, // WEIGHT.wo
++    torch::Tensor qkv_bias,
++    torch::Tensor w1w3,
++    torch::Tensor ffn_down, // WEIGHT.ffn_down
++    torch::Tensor output_norm, // WEIGHT.output_norm
++    torch::Tensor lm_head // WEIGHT.output
++){
++    g_pstModelHypePara.dim = dim; /* embedding dimension */
++    g_pstModelHypePara.n_head = n_head; /* number of attention heads */
++    g_pstModelHypePara.n_kv_heads = n_kv_heads; /* number of KV heads */
++    g_pstModelHypePara.hidden_dim = hidden_dim; /* FFN hidden dimension */
++    g_pstModelHypePara.n_layers = n_layers; /* number of layers */
++    g_pstModelHypePara.context_length = context_length; /* context length */
++    g_pstModelHypePara.norm_rms_eps = norm_rms_eps; /* eps */
++    g_pstModelHypePara.n_vocab = n_vocab; /* vocabulary size */
++    g_pstModelHypePara.rope_freq_base = rope_freq_base; /* RoPE frequency base */
++    g_pstModelHypePara.cos_sin_cache = (f16 *)cos_sin_cache.data_ptr();
++    g_pstModelHypePara.n_rotary = cos_sin_cache.size(1);
++    g_pstModelHypePara.is_neox_style = is_neox_style;
++    g_pstModelHypePara.attn_scale = attn_scale;
++
++    weight_types.token_embd_weight = GGML_TYPE_F32;
++    weight_types.attn_k_weight = quantization_bit_code;
++    weight_types.attn_k_bias = GGML_TYPE_F32;
++    weight_types.attn_norm_weight = GGML_TYPE_F32;
++    weight_types.attn_q_weight = quantization_bit_code;
++    weight_types.attn_q_bias = GGML_TYPE_F32;
++    weight_types.attn_v_weight = quantization_bit_code;
++    weight_types.attn_v_bias = GGML_TYPE_F32;
++    weight_types.ffn_down_weight = quantization_bit_code;
++    weight_types.ffn_gate_weight = quantization_bit_code;
++    weight_types.ffn_norm_weight = GGML_TYPE_F32;
++    weight_types.ffn_up_weight = quantization_bit_code;
++    weight_types.attn_output_weight = quantization_bit_code;
++    weight_types.output_weight = quantization_bit_code;
++    weight_types.output_norm_weight = GGML_TYPE_F32;
++
++    // build the expf() lookup table for every fp16 bit pattern
++    for(int i = 0; i < (1 << 16); ++i) {
++        float f = f16_to_f32(*(f16*)(&i));
++        expf_f16_table[i] = f32_to_f16(expf(f));
++    }
++
++    assert(wqkv.dtype() == torch::kFloat16);
++    int N_gqa = n_head / n_kv_heads;
++    int kv_dim = dim / N_gqa;
++
++    size_t
tokens_embedding_weight_size = (size_t)dim * n_vocab / g_BlockDataInfo[weight_types.token_embd_weight].uiblkSize * g_BlockDataInfo[weight_types.token_embd_weight].uiTypeSize; ++ ++ size_t attention_q_size_per_layer = (size_t)dim * dim / g_BlockDataInfo[weight_types.attn_q_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_q_weight].uiTypeSize; ++ size_t attention_k_size_per_layer = (size_t)dim * kv_dim / g_BlockDataInfo[weight_types.attn_k_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_k_weight].uiTypeSize; ++ size_t attention_v_size_per_layer = (size_t)dim * kv_dim / g_BlockDataInfo[weight_types.attn_v_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_v_weight].uiTypeSize; ++ size_t attention_size_per_layer = attention_q_size_per_layer + attention_k_size_per_layer + attention_v_size_per_layer; ++ ++ size_t bias_q_size_per_layer = (size_t)dim / g_BlockDataInfo[weight_types.attn_q_bias].uiblkSize * g_BlockDataInfo[weight_types.attn_q_bias].uiTypeSize; ++ size_t bias_k_size_per_layer = (size_t)kv_dim / g_BlockDataInfo[weight_types.attn_k_bias].uiblkSize * g_BlockDataInfo[weight_types.attn_k_bias].uiTypeSize; ++ size_t bias_v_size_per_layer = (size_t)kv_dim / g_BlockDataInfo[weight_types.attn_v_bias].uiblkSize * g_BlockDataInfo[weight_types.attn_v_bias].uiTypeSize; ++ size_t bias_qkv_size_per_layer = bias_q_size_per_layer + bias_k_size_per_layer + bias_v_size_per_layer; ++ ++ size_t attention_norm_size_per_layer = (size_t)dim / g_BlockDataInfo[weight_types.attn_norm_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_norm_weight].uiTypeSize; ++ ++ size_t ffn_down_size_per_layer = (size_t)dim * hidden_dim / g_BlockDataInfo[weight_types.ffn_down_weight].uiblkSize * g_BlockDataInfo[weight_types.ffn_down_weight].uiTypeSize; ++ size_t ffn_gate_size_per_layer = (size_t)dim * hidden_dim / g_BlockDataInfo[weight_types.ffn_gate_weight].uiblkSize * g_BlockDataInfo[weight_types.ffn_gate_weight].uiTypeSize; ++ size_t ffn_norm_size_per_layer = (size_t)dim / g_BlockDataInfo[weight_types.ffn_norm_weight].uiblkSize * g_BlockDataInfo[weight_types.ffn_norm_weight].uiTypeSize; ++ size_t ffn_up_size_per_layer = (size_t)dim * hidden_dim / g_BlockDataInfo[weight_types.ffn_up_weight].uiblkSize * g_BlockDataInfo[weight_types.ffn_up_weight].uiTypeSize; ++ size_t w1w3_size_per_layer = ffn_gate_size_per_layer + ffn_up_size_per_layer; ++ ++ size_t attention_output_size_per_layer = (size_t)dim * dim / g_BlockDataInfo[weight_types.attn_output_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_output_weight].uiTypeSize; ++ size_t output_size = (size_t)dim * n_vocab / g_BlockDataInfo[weight_types.output_weight].uiblkSize * g_BlockDataInfo[weight_types.output_weight].uiTypeSize; ++ size_t output_norm_size = (size_t)dim / g_BlockDataInfo[weight_types.output_norm_weight].uiblkSize * g_BlockDataInfo[weight_types.output_norm_weight].uiTypeSize; ++ ++ g_pstWeight.rms_att_norm.Data.tensor2 = static_cast(numa_alloc_onnode(n_layers * sizeof(float *), 0)); ++ g_pstWeight.rms_ffn_norm.Data.tensor2 = static_cast(numa_alloc_onnode(n_layers * sizeof(float *), 0)); ++ g_pstWeight.qkv_bias.Data.tensor2 = static_cast(numa_alloc_onnode(n_layers * sizeof(float *), 0)); ++ ++ for (int i = 0; i < n_layers; i++) { ++ g_pstWeight.rms_att_norm.Data.tensor2[i] = numa_alloc_onnode(attention_norm_size_per_layer, 0); ++ g_pstWeight.rms_ffn_norm.Data.tensor2[i] = numa_alloc_onnode(ffn_norm_size_per_layer, 0); ++ g_pstWeight.qkv_bias.Data.tensor2[i] = numa_alloc_onnode(bias_qkv_size_per_layer, 0); ++ } ++ 
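++    /*
++     * NUMA placement note: the small per-layer tensors above (RMSNorm weights and
++     * the fused QKV bias) stay on node 0, together with the token embedding and
++     * the final norm, while the large projection matrices allocated below
++     * (Wqkv, wo, w1w3, ffn_down) and the LM head are split row-wise across
++     * g_numas nodes so each node multiplies against its locally resident slice.
++     */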
g_pstWeight.output_norm.Data.tensor1 = numa_alloc_onnode(output_norm_size, 0); ++ ++ g_pstWeight.Wqkv.Data.tensor3 = static_cast(numa_alloc_onnode(g_numas * sizeof(void **), 0)); ++ g_pstWeight.wo.Data.tensor3 = static_cast(numa_alloc_onnode(g_numas * sizeof(void **), 0)); ++ g_pstWeight.w1w3.Data.tensor3 = static_cast(numa_alloc_onnode(g_numas * sizeof(void **), 0)); ++ g_pstWeight.ffn_down.Data.tensor3 = static_cast(numa_alloc_onnode(g_numas * sizeof(void **), 0)); ++ g_pstWeight.output.Data.tensor2 = static_cast(numa_alloc_onnode(g_numas * sizeof(void *), 0)); ++ g_pstWeight.token_embedding.Data.tensor1 = numa_alloc_onnode(tokens_embedding_weight_size, 0); ++ ++ for (int i = 0; i < g_numas; i++) { ++ g_pstWeight.Wqkv.Data.tensor3[i] = static_cast(numa_alloc_onnode(n_layers * sizeof(void *), i)); ++ g_pstWeight.wo.Data.tensor3[i] = static_cast(numa_alloc_onnode(n_layers * sizeof(void *), i)); ++ g_pstWeight.w1w3.Data.tensor3[i] = static_cast(numa_alloc_onnode(n_layers * sizeof(void *), i)); ++ g_pstWeight.ffn_down.Data.tensor3[i] = static_cast(numa_alloc_onnode(n_layers * sizeof(void *), i)); ++ g_pstWeight.output.Data.tensor2[i] = (void *)numa_alloc_onnode(output_size / g_numas, i); ++ ++ for (int j = 0; j < n_layers; j++) { ++ g_pstWeight.Wqkv.Data.tensor3[i][j] = numa_alloc_onnode(attention_size_per_layer / g_numas, i); ++ g_pstWeight.wo.Data.tensor3[i][j] = numa_alloc_onnode(attention_output_size_per_layer / g_numas, i); ++ g_pstWeight.w1w3.Data.tensor3[i][j] = numa_alloc_onnode(w1w3_size_per_layer / g_numas, i); ++ g_pstWeight.ffn_down.Data.tensor3[i][j] = numa_alloc_onnode(ffn_down_size_per_layer / g_numas, i); ++ } ++ } ++ ++ std::cout << "load_weight start ..." << std::endl; ++ ++ /* 量化权重 */ ++ for (int layerNum = 0; layerNum < n_layers; layerNum++) { ++ quantization_weight_strategy(g_pstWeight.rms_att_norm.Data.tensor2[layerNum], rms_att_norm.index(torch::indexing::TensorIndex(layerNum)).data_ptr(), ++ weight_types.attn_norm_weight, dim); ++ quantization_weight_strategy(g_pstWeight.rms_ffn_norm.Data.tensor2[layerNum], rms_ffn_norm.index(torch::indexing::TensorIndex(layerNum)).data_ptr(), ++ weight_types.ffn_norm_weight, dim); ++ int qkv_dim = dim + 2 * kv_dim; ++ for (int j = 0; j < g_numas; ++j) { ++ f16 *qkv_pointer = (f16 *)wqkv.index(torch::indexing::TensorIndex(layerNum)).data_ptr() + qkv_dim / g_numas * dim * j; ++ quantization_weight_strategy(g_pstWeight.Wqkv.Data.tensor3[j][layerNum], (char *)qkv_pointer, weight_types.attn_k_weight, ++ qkv_dim / g_numas * dim); ++ ++ f16 *wo_pointer = (f16 *)wo.index(torch::indexing::TensorIndex(layerNum)).data_ptr() + dim / g_numas * dim * j; ++ quantization_weight_strategy(g_pstWeight.wo.Data.tensor3[j][layerNum], (char *)wo_pointer, ++ weight_types.attn_output_weight, dim * dim / g_numas); ++ ++ f16 *w1w3_pointer = (f16 *)w1w3.index(torch::indexing::TensorIndex(layerNum)).data_ptr() + 2 * hidden_dim / g_numas * dim * j; ++ quantization_weight_strategy(g_pstWeight.w1w3.Data.tensor3[j][layerNum], (char *)w1w3_pointer, ++ weight_types.ffn_up_weight, 2 * hidden_dim / g_numas * dim); ++ ++ f16 *ffn_down_pointer = (f16 *)ffn_down.index(torch::indexing::TensorIndex(layerNum)).data_ptr() + hidden_dim / g_numas * dim * j; ++ quantization_weight_strategy(g_pstWeight.ffn_down.Data.tensor3[j][layerNum], (char *)ffn_down_pointer, ++ weight_types.ffn_down_weight, dim * hidden_dim / g_numas); ++ } ++ ++ quantization_weight_strategy(g_pstWeight.qkv_bias.Data.tensor2[layerNum], qkv_bias.index(torch::indexing::TensorIndex(layerNum)).data_ptr(), ++ 
weight_types.attn_q_bias, qkv_dim);
++    }
++
++    for (int i = 0; i < g_numas; i++) {
++        f16 *output_pointer = (f16 *)lm_head.data_ptr() + n_vocab / g_numas * dim * i;
++        quantization_weight_strategy(g_pstWeight.output.Data.tensor2[i], output_pointer,
++                                     weight_types.output_weight, dim * n_vocab / g_numas);
++    }
++
++    quantization_weight_strategy(g_pstWeight.token_embedding.Data.tensor1, token_embedding.data_ptr(), weight_types.token_embd_weight, dim * n_vocab);
++    quantization_weight_strategy(g_pstWeight.output_norm.Data.tensor1, output_norm.data_ptr(), weight_types.output_norm_weight, dim);
++
++    /* normalization results */
++    g_stRunState.Token_Ori = (f32*)numa_alloc_onnode((size_t)dim * context_length * sizeof(f32), 0);
++    g_stRunState.Token_Norm = (f32*)numa_alloc_onnode((size_t)dim * context_length * sizeof(f32), 0);
++
++    /* intermediate result of the residual add */
++    g_stRunState.add_weight = (f32 *)numa_alloc_onnode((size_t)dim * context_length * sizeof(f32), 0);
++    memset(g_stRunState.add_weight, 0, (size_t)dim * context_length * sizeof(f32));
++
++    /* per-NUMA-node buffers holding the (re)quantized activations */
++    g_stRunState.temp_output_vec_numa = (void **)numa_alloc_onnode(g_numas * sizeof(void *), 0);
++    g_stRunState.tmp_vec_numa = (void **)numa_alloc_onnode(g_numas * sizeof(void *), 0);
++    for (int i = 0; i < g_numas; i++) {
++        g_stRunState.temp_output_vec_numa[i] = (void *)numa_alloc_onnode((size_t)dim * context_length * sizeof(f16), i);
++        g_stRunState.tmp_vec_numa[i] = (void *)numa_alloc_onnode((size_t)hidden_dim * context_length * sizeof(f16), i);
++    }
++
++    /* attention output (also reused for the QKV and gate/up projections, hence the max-sized allocation) */
++    g_stRunState.Attn_out = (f32*)numa_alloc_onnode(
++        (size_t)(dim + kv_dim + kv_dim >= 2 * hidden_dim ? dim + kv_dim + kv_dim : 2 * hidden_dim)
++        * context_length * sizeof(f32), 0);
++
++    g_stRunState.Attn_out_f16 = (f16*)numa_alloc_onnode(
++        (size_t)(dim + kv_dim + kv_dim >= 2 * hidden_dim ? dim + kv_dim + kv_dim : 2 * hidden_dim)
++        * context_length * sizeof(f16), 0);
++
++    /* FFN hidden-layer buffer */
++    g_stRunState.ffn_Gate = (f32 *)numa_alloc_onnode((size_t)hidden_dim * context_length * sizeof(f32), 0);
++
++    if (!g_stRunState.Token_Ori || !g_stRunState.Token_Norm || !g_stRunState.Attn_out || !g_stRunState.ffn_Gate ||
++        !g_stRunState.temp_output_vec_numa || !g_stRunState.tmp_vec_numa) {
++        fprintf(stderr, "Error: numa_alloc_onnode failed! (File: %s, Line: %d)\n", __FILE__, __LINE__);
++    }
++    std::cout << "load_weight end."
<< std::endl; ++} ++ ++#define DEBUG_TIME 1 ++static inline uint64_t get_time_ns(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_REALTIME, &ts); ++ return ts.tv_sec * 1000 * 1000 * 1000 + ts.tv_nsec; ++} ++ ++void get_next_token(void* output, MODEL_HYPE_PARA *pstModelHypePara, WEIGHT *pstLlama, MODEL_RUN_STATE *pstRunState, ++ bool is_prompt, ++ torch::Tensor& block_tables, ++ torch::Tensor& seq_lens, ++ torch::Tensor& slot_mapping, ++ void *hidden_state, int64_t *pos, ++ std::vector& kv_caches, ++ int64_t block_size, ++ int n_tokens) ++{ ++ int dim = pstModelHypePara->dim; ++ int n_kv_heads = pstModelHypePara->n_kv_heads; ++ int n_head = pstModelHypePara->n_head; ++ int kv_dim = (dim * n_kv_heads) / n_head; ++ int hidden_dim = pstModelHypePara->hidden_dim; ++ int layers = pstModelHypePara->n_layers; ++ int n_vocab = pstModelHypePara->n_vocab; ++ float eps = pstModelHypePara->norm_rms_eps; ++ UINT32 srcBlockNum, srcBlocksize; ++ UINT32 dstBlockNum, dstBlocksize; ++ int srcType; ++ enum ggml_type dstType; ++ ++ int qkv_dim = dim + kv_dim + kv_dim; ++ int head_size = dim / n_head; ++ int kv_head_dim = block_size * head_size; ++ f16 *VT = (f16 *)numa_alloc_onnode(n_tokens * dim * sizeof(f16), 0); ++ ++ quantization_weight_strategy(pstRunState->Token_Norm, (char *)hidden_state, weight_types.token_embd_weight, n_tokens * dim); ++ ++#ifdef DEBUG_TIME ++ uint64_t t0 = get_time_ns(); ++ uint64_t time[25] = {0}; ++ uint64_t tt1 = get_time_ns(); ++#endif ++ ++ for(int L = 0; L < layers; L++) { ++#pragma omp parallel ++{ ++ WorkDivider work; ++ init_work_divider(&work, g_numas); ++ SingleNumaWorkRange srange; ++ MultiNumaWorkRange mrange; ++ ++ divide_work_first_numa(&work, n_tokens, &srange); ++ if (work.my_numa == 0) { ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ for (int j= 0; j < dim; j++) { ++ ((f32 *)pstRunState->add_weight)[i * dim + j] += ((f32 *)pstRunState->Token_Norm)[i * dim + j]; ++ } ++ RmsNorm((f32 *)pstRunState->Token_Ori + i * dim, (f32 *)pstRunState->add_weight + i * dim, ++ (f32 *)pstLlama->rms_att_norm.Data.tensor2[L], eps, dim); ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[0] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ srcType = weight_types.attn_k_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, n_tokens * dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->Token_Ori + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i = 1; i < g_numas; i++) { ++ memcpy((char *)pstRunState->tmp_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++ ++#ifdef DEBUG_TIME ++if (work.tid == 0) { ++ time[1] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++} ++#endif ++ ++#pragma omp barrier ++ /* 计算qkv */ ++ divide_work_all_numas(&work, qkv_dim, &mrange); ++ for (int i = mrange.begin_thread; i < mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 
2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < n_tokens; k += 2) { ++ int nrc_k = ((n_tokens - k) >= 2) ? 2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, qkv_dim, ++ (char *)g_pstWeight.Wqkv.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, qkv_dim, ++ (char *)g_pstWeight.Wqkv.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i+1, qkv_dim, ++ (char *)g_pstWeight.Wqkv.Data.tensor3[work.my_numa][L] + (i + 1) * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < n_tokens; k += 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, qkv_dim, ++ (char *)g_pstWeight.Wqkv.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[2] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ if (work.tid == 1) { ++ for (int k = 0; k < n_tokens; k++) { ++ for (int i = 0; i < qkv_dim; i++) { ++ pstRunState->Attn_out[k * qkv_dim + i] += ((f32 *)g_pstWeight.qkv_bias.Data.tensor2[L])[i]; ++ } ++ } ++ g_BlockDataInfo[GGML_TYPE_F16].quantize(pstRunState->Attn_out, pstRunState->Attn_out_f16, n_tokens * qkv_dim); ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[3] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++// 中间数据f32 -> f16, 减少attention修改 ++#pragma omp barrier ++ f16 *q_ptr = pstRunState->Attn_out_f16, *k_ptr = pstRunState->Attn_out_f16 + dim, *v_ptr = pstRunState->Attn_out_f16 + dim + kv_dim; ++ f16 *kcache_ptr = (f16 *)kv_caches[L][0].data_ptr(), *vcache_ptr = (f16 *)kv_caches[L][1].data_ptr(); ++ int64_t *slot_mapping_ptr = (int64_t *)slot_mapping.data_ptr(); ++ int kv_cache_block_elem_num = n_kv_heads * head_size * block_size; ++ ++ divide_all_work(&work, n_tokens * n_kv_heads, &srange); ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ int t = i / n_kv_heads; ++ int h = i % n_kv_heads; ++ const int64_t slot = slot_mapping_ptr[t]; ++ if (slot < 0) { ++ continue; ++ } ++ int64_t block_idx = slot / block_size, block_offset = slot % block_size; ++ f16 *k_head_ptr = k_ptr + t * qkv_dim + h * head_size; ++ f16 
*kcache_head_ptr = kcache_ptr + kv_cache_block_elem_num * block_idx + h * block_size * head_size; ++ const f16 *v_head_ptr = v_ptr + t * qkv_dim + h * head_size; ++ f16 *vcache_head_ptr = vcache_ptr + kv_cache_block_elem_num * block_idx + h * block_size * head_size; ++ Rope_embedding_impl(g_pstModelHypePara.is_neox_style, g_pstModelHypePara.n_rotary, k_head_ptr, g_pstModelHypePara.cos_sin_cache, ++ pos[t]); ++ for (int idx = 0; idx < head_size; idx += 8) { //8 = 16 / sizeof(f16) ++ for (int vidx = idx; vidx < idx + 8; vidx++) { ++ vcache_head_ptr[vidx * block_size + block_offset] = v_head_ptr[vidx]; ++ } ++ std::copy_n(k_head_ptr + idx, 8, kcache_head_ptr + idx * block_size + block_offset * 8); ++ } ++ } ++ divide_all_work(&work, n_tokens * n_head, &srange); ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ int t = i / n_head; ++ int h = i % n_head; ++ Rope_embedding_impl(g_pstModelHypePara.is_neox_style, g_pstModelHypePara.n_rotary, q_ptr + t *qkv_dim + h * head_size, ++ g_pstModelHypePara.cos_sin_cache, pos[t]); ++ for (int j = 0; j < head_size; j++) { ++ q_ptr[t * qkv_dim + h * head_size + j] *= g_pstModelHypePara.attn_scale; ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[4] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++} ++ //divide_kv_cache_numa(&work, n_head, &srange); ++ if (is_prompt == true) { ++ f16 *v_ptr = (f16 *)pstRunState->Attn_out_f16 + dim + kv_dim; ++ transpose_v(VT, v_ptr, n_tokens, kv_dim, qkv_dim); ++ prefill_attention(pstRunState->seq_qkv, pstRunState->Attn_out_f16, VT, n_tokens, seq_lens.size(0), ++ (int *)seq_lens.data_ptr()); ++ } else { ++ int kv_block_row = kv_caches[L][0].stride(0); ++ paged_attention_v1_impl(pstRunState->seq_qkv, pstRunState->Attn_out_f16, (f16 *)kv_caches[L][0].data_ptr(), ++ (f16 *)kv_caches[L][1].data_ptr(), n_kv_heads, (int *)block_tables.data_ptr(), ++ (int *)seq_lens.data_ptr(), block_tables.size(1), qkv_dim, kv_block_row, ++ kv_head_dim, n_tokens, n_head, head_size); ++ } ++ ++#ifdef DEBUG_TIME ++ time[5] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++#endif ++ ++// 数据转换f16 —> f32 ++#pragma omp parallel ++{ ++ WorkDivider work; ++ init_work_divider(&work, g_numas); ++ SingleNumaWorkRange srange; ++ MultiNumaWorkRange mrange; ++ ++ if (work.tid == 1) { ++ g_BlockDataInfo[GGML_TYPE_F16].dequantize(pstRunState->seq_qkv, pstRunState->Attn_out, n_tokens * dim); ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[6] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ ++ srcType = weight_types.attn_output_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, n_tokens * dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->Attn_out + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i= 1; i < g_numas; i++) { ++ memcpy((char *)pstRunState->tmp_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++#pragma omp barrier ++ divide_work_all_numas(&work, dim, &mrange); ++ for (int i = mrange.begin_thread; i < 
mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < n_tokens; k += 2) { ++ int nrc_k = ((n_tokens - k) >= 2) ? 2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.wo.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.wo.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * dim + i + 1, dim, ++ (char *)g_pstWeight.wo.Data.tensor3[work.my_numa][L] + (i + 1) * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < n_tokens; k += 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.wo.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[7] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ divide_work_first_numa(&work, n_tokens, &srange); ++ if (work.my_numa == 0) { ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ for (int j= 0; j < dim; j++) { ++ ((f32 *)pstRunState->add_weight)[i * dim + j] += ((f32 *)pstRunState->Attn_out)[i * dim + j]; ++ } ++ RmsNorm((f32 *)pstRunState->Attn_out + i * dim, (f32 *)pstRunState->add_weight + i * dim, ++ (f32 *)pstLlama->rms_ffn_norm.Data.tensor2[L], eps, dim); ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[8] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ srcType = weight_types.ffn_up_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, n_tokens * dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->Attn_out + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i = 1; i < g_numas; i++) { ++ memcpy((char 
*)pstRunState->tmp_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++#pragma omp barrier ++ /* w1/w3 数据类型一样 */ ++ int total_hidden_dim = hidden_dim * 2; ++ divide_work_all_numas(&work, total_hidden_dim, &mrange); ++ for (int i = mrange.begin_thread; i < mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < n_tokens; k += 2) { ++ int nrc_k = ((n_tokens - k) >= 2) ? 2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, 1, 2); ++ /* 单线程的结果 */ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, total_hidden_dim, ++ (char *)g_pstWeight.w1w3.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, 1, 2); ++ /* 单线程的结果 */ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, total_hidden_dim, ++ (char *)g_pstWeight.w1w3.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i + 1, total_hidden_dim, ++ (char *)g_pstWeight.w1w3.Data.tensor3[work.my_numa][L] + (i + 1) * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < n_tokens; k += 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, 1, 2); ++ /* 单线程的结果 */ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, total_hidden_dim, ++ (char *)g_pstWeight.w1w3.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[9] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ /* silu激活函数 */ ++ divide_work_first_numa(&work, n_tokens * hidden_dim, &srange); ++ if (work.my_numa == 0) { ++ for (int item = srange.begin_thread; item < srange.end_thread; item++) { ++ int i = item / hidden_dim, j = item % hidden_dim; ++ f32 *w1 = pstRunState->Attn_out + i * hidden_dim * 2, *w3 = pstRunState->Attn_out + i * hidden_dim * 2 + hidden_dim; ++ f32 *result = pstRunState->ffn_Gate + i * hidden_dim; ++ f16 neg_w1 = -w1[j]; ++ f32 silu_f32 = w1[j] / (1.0 + expf_f16_table[*(uint16_t *)&neg_w1]); ++ result[j] = silu_f32 * w3[j]; ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[10] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp 
barrier ++ /* w2 */ ++ srcType = weight_types.ffn_down_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, n_tokens * hidden_dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->ffn_Gate + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i = 1; i < g_numas; i++) { ++ memcpy((char *)pstRunState->tmp_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++#pragma omp barrier ++ divide_work_all_numas(&work, dim, &mrange); ++ for (int i = mrange.begin_thread; i < mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < n_tokens; k += 2) { ++ int nrc_k = ((n_tokens - k) >= 2) ? 2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch((f16 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(hidden_dim, ++ (f32 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.ffn_down.Data.tensor3[work.my_numa][L] + i * hidden_dim / srcBlockNum * srcBlocksize, hidden_dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * hidden_dim / dstBlockNum * dstBlocksize, hidden_dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch((f16 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(hidden_dim, ++ (f32 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.ffn_down.Data.tensor3[work.my_numa][L] + i * hidden_dim / srcBlockNum * srcBlocksize, hidden_dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * hidden_dim / dstBlockNum * dstBlocksize, hidden_dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(hidden_dim, ++ (f32 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i + 1, dim, ++ (char *)g_pstWeight.ffn_down.Data.tensor3[work.my_numa][L] + (i + 1) * hidden_dim / srcBlockNum * srcBlocksize, hidden_dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * hidden_dim / dstBlockNum * dstBlocksize, hidden_dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < n_tokens; k += 1) { ++ __builtin_prefetch((f16 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(hidden_dim, ++ (f32 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.ffn_down.Data.tensor3[work.my_numa][L] + i * hidden_dim / srcBlockNum * srcBlocksize, hidden_dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * hidden_dim / dstBlockNum * dstBlocksize, hidden_dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[11] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++} //end omp ++ } ++ ++ std::vector last_token_indices; ++ if (is_prompt == true) { ++ 
for (int i = 0, sum_seq_lens = 0; i < seq_lens.size(0); i++) { ++ sum_seq_lens += ((int *)seq_lens.data_ptr())[i]; ++ last_token_indices.push_back(sum_seq_lens - 1); ++ } ++ } else { ++ for (int i = 0; i < n_tokens; i++) { ++ last_token_indices.push_back(i); ++ } ++ } ++ ++#pragma omp parallel ++{ ++ WorkDivider work; ++ init_work_divider(&work, g_numas); ++ SingleNumaWorkRange srange; ++ MultiNumaWorkRange mrange; ++ ++ divide_work_first_numa(&work, last_token_indices.size(), &srange); ++ if (work.my_numa == 0) { ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ int last_token = last_token_indices[i]; ++ for (int j = 0; j < dim; j++) { ++ ((f32 *)pstRunState->Token_Norm)[last_token * dim + j] += ((f32 *)pstRunState->add_weight)[last_token * dim + j]; ++ } ++ RmsNorm((f32 *)pstRunState->Token_Ori + i * dim, (f32 *)pstRunState->Token_Norm + last_token * dim, ++ (f32 *)pstLlama->output_norm.Data.tensor1, eps, dim); ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[12] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ /* 外层linear */ ++ srcType = weight_types.output_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, last_token_indices.size() * dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->temp_output_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->Token_Ori + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i = 1; i < g_numas; i++) { ++ memcpy((char *)pstRunState->temp_output_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++#pragma omp barrier ++ divide_work_all_numas(&work, n_vocab, &mrange); ++ for (int i = mrange.begin_thread; i < mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < last_token_indices.size(); k += 2) { ++ int nrc_k = ((last_token_indices.size() - k) >= 2) ? 
2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch((f16 *)output + mrange.begin_numa + k * n_vocab + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->output_f32 + mrange.begin_numa + k * n_vocab + i, n_vocab, ++ (char *)g_pstWeight.output.Data.tensor2[work.my_numa] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch((f16 *)output + mrange.begin_numa + k * n_vocab + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->output_f32 + mrange.begin_numa + k * n_vocab + i, n_vocab, ++ (char *)g_pstWeight.output.Data.tensor2[work.my_numa] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->output_f32 + mrange.begin_numa + k * n_vocab + i + 1, n_vocab, ++ (char *)g_pstWeight.output.Data.tensor2[work.my_numa] + (i + 1) * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < last_token_indices.size(); k += 1) { ++ __builtin_prefetch((f16 *)output + mrange.begin_numa + k * n_vocab + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->output_f32 + mrange.begin_numa + k * n_vocab + i, n_vocab, ++ (char *)g_pstWeight.output.Data.tensor2[work.my_numa] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[13] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++} ++ g_BlockDataInfo[GGML_TYPE_F16].quantize(pstRunState->output_f32, output, last_token_indices.size() * n_vocab); ++ ++#ifdef DEBUG_TIME ++ time[14] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++#endif ++ ++#ifdef DEBUG_TIME ++ uint64_t t1 = get_time_ns(); ++ if (is_prompt == true) { ++ fprintf(stderr, " bs=%d prefill=%.3f ms, %.3f token/s\n\n", n_tokens, (t1 - t0) / 1000000.0, 1.0 * n_tokens / ((t1 - t0) / 1000000000.0)); ++ } else { ++ fprintf(stderr, " bs=%d decode=%.3f ms, %.3f token/s\n\n", n_tokens, (t1 - t0) / 1000000.0, 1.0 * n_tokens / ((t1 - t0) / 1000000000.0)); ++ } ++ ++ fprintf(stderr, "[0] first rms_norm ——> %8.3lf ms\n", time[0] / 1000000.0); ++ fprintf(stderr, "[1] qkv quantize and memcpy ——> %8.3lf ms\n", time[1] / 1000000.0); ++ fprintf(stderr, "[2] qkv matmul ——> %8.3lf ms\n", time[2] / 1000000.0); ++ fprintf(stderr, "[3] qkv add and quantize f16 ——> %8.3lf ms\n", time[3] / 1000000.0); ++ fprintf(stderr, "[4] rope operator ——> %8.3lf ms\n", time[4] / 1000000.0); ++ fprintf(stderr, "[5] page attention operator ——> %8.3lf ms\n", time[5] / 1000000.0); ++ fprintf(stderr, "[6] dequantize f32 ——> %8.3lf ms\n", time[6] / 1000000.0); ++ fprintf(stderr, "[7] (wo)quantize-memcpy-matmul ——> %8.3lf ms\n", time[7] / 1000000.0); ++ fprintf(stderr, "[8] ffn add and rmsnorm ——> %8.3lf ms\n", time[8] / 1000000.0); ++ fprintf(stderr, "[9] (w1w3)quantize-memcpy-matmul ——> 
%8.3lf ms\n", time[9] / 1000000.0); ++ fprintf(stderr, "[10] silu activation function ——> %8.3lf ms\n", time[10] / 1000000.0); ++ fprintf(stderr, "[11] (w2)quantize-memcpy-matmul ——> %8.3lf ms\n", time[11] / 1000000.0); ++ fprintf(stderr, "[12] output_norm add and rmsnorm ——> %8.3lf ms\n", time[12] / 1000000.0); ++ fprintf(stderr, "[13] (output)quantize-memcpy-matmul ——> %8.3lf ms\n", time[13] / 1000000.0); ++ fprintf(stderr, "[14] output quantize f16 ——> %8.3lf ms\n\n", time[14] / 1000000.0); ++#endif ++ ++ numa_free(VT, n_tokens * dim); ++ memset(pstRunState->add_weight, 0, (size_t)dim * g_pstModelHypePara.context_length * sizeof(f32)); ++} ++ ++void get_next_token_for_torch( ++ torch::Tensor model_output, // WEIGHT.token_embedding ++ torch::Tensor hidden_stats, ++ ++ bool is_prompt, ++ torch::Tensor block_tables, ++ torch::Tensor seq_lens, ++ torch::Tensor& slot_mapping, ++ torch::Tensor positions, ++ std::vector kv_caches, ++ int64_t block_size, ++ int64_t N_tokens) ++{ ++ void* hd = static_cast(hidden_stats.data_ptr()); ++ int64_t *pos = static_cast(positions.data_ptr()); ++ void* output = static_cast(model_output.data_ptr()); ++ ++ int seq_num = seq_lens.size(0); ++ ++ static int flag = 0; ++ if (flag == 0) { ++ g_stRunState.seq_qkv = (f16*)numa_alloc_onnode((size_t)seq_num * g_pstModelHypePara.context_length * g_pstModelHypePara.dim * sizeof(f16), 0); ++ ++ int output_tmp_size = (is_prompt == true ? seq_num : N_tokens) * g_pstModelHypePara.n_vocab; ++ g_stRunState.output_f32 = (f32*)numa_alloc_onnode(output_tmp_size * sizeof(f32), 0); ++ ++ flag = 1; ++ } ++ ++ get_next_token(output, ++ &g_pstModelHypePara, &g_pstWeight, &g_stRunState, ++ is_prompt, ++ block_tables, ++ seq_lens, ++ slot_mapping, ++ hd, ++ pos, ++ kv_caches, ++ block_size, ++ N_tokens ++ ); ++} ++ +diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp +index 5d1c5f4c8..eaeed6bc8 100644 +--- a/csrc/cpu/torch_bindings.cpp ++++ b/csrc/cpu/torch_bindings.cpp +@@ -97,6 +97,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + " Tensor cos_sin_cache, bool is_neox) -> ()"); + ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); + ++ ops.def( ++ "load_weight_and_malloc_active_tensor(" ++ "int dim, int hidden_dim, int n_layers, int n_vocab, int n_heads, int n_kv_heads, int context_length," ++ "float norm_rms_eps, float rope_freq_base, float attn_scale, int is_neox_style, int quantization_bit_code, Tensor cos_sin_cache," ++ "Tensor token_embedding, Tensor rms_att_norm, Tensor rms_ffn_norm," ++ "Tensor wqkv, Tensor wo, Tensor bqkv_bias, Tensor w1w3," ++ "Tensor ffn_down, Tensor output_norm, Tensor lm_head) -> ()"); ++ ops.impl("load_weight_and_malloc_active_tensor", torch::kCPU, &load_weight_and_malloc_active_tensor); ++ ++ ops.def( ++ "get_next_token_for_torch(" ++ "Tensor model_output, Tensor hidden_stats, bool is_prompt, " ++ "Tensor block_tables, Tensor seq_lens, Tensor slot_mapping, Tensor positions, Tensor[]! kv_caches, int block_size," ++ "int N_tokens) -> ()"); ++ ops.impl("get_next_token_for_torch", torch::kCPU, &get_next_token_for_torch); ++ + // Quantization + #ifdef __AVX512F__ + // Compute int8 quantized tensor for given scaling factor. 
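+// Note on the two registrations above: the schema strings mirror the C++
+// prototypes declared in csrc/ops.h below. kv_caches is declared as a mutable
+// tensor list ("Tensor[]!") because get_next_token() writes the freshly
+// computed key/value blocks into the paged KV cache in place.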
+diff --git a/csrc/ops.h b/csrc/ops.h +index 52ccf3b51..0f81f8ae7 100644 +--- a/csrc/ops.h ++++ b/csrc/ops.h +@@ -251,3 +251,45 @@ void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, + const std::vector>& offsets); + #endif ++ ++void load_weight_and_malloc_active_tensor( ++ int64_t dim, // MODEL_HYPE_PARA.dim embedding 维度 ++ int64_t hidden_dim, // MODEL_HYPE_PARA.hidden_dim ffn 隐藏层维度 ++ int64_t n_layers, // MODEL_HYPE_PARA.n_layers 模型层数 ++ int64_t n_vocab, // MODEL_HYPE_PARA.n_vocab 词汇数量 ++ int64_t n_heads, // MODEL_HYPE_PARA.n_heads 注意力头个数 ++ int64_t n_kv_heads, // MODEL_HYPE_PARA.n_kv_heads kv的对数 ++ int64_t context_length, // MODEL_HYPE_PARA.context_length 上下文长度 ++ double norm_rms_eps, // MODEL_HYPE_PARA.norm_rms_eps eps ++ double rope_freq_base, // MODEL_HYPE_PARA.rope_freq_base rope频率 ++ double attn_scale, ++ int64_t is_neox_style, ++ int64_t quantization_bit_code, ++ ++ torch::Tensor const& cos_sin_cache, ++ torch::Tensor token_embedding, // WEIGHT.token_embedding ++ torch::Tensor rms_att_norm, // WEIGHT.rms_att_norm ++ torch::Tensor rms_ffn_norm, // WEIGHT.rms_ffn_norm ++ torch::Tensor wqkv, ++ torch::Tensor wo, // WEIGHT.wo ++ torch::Tensor qkv_bias, ++ torch::Tensor w1w3, ++ torch::Tensor ffn_down, // WEIGHT.ffn_down ++ torch::Tensor output_norm, // WEIGHT.output_norm ++ torch::Tensor lm_head // WEIGHT.output ++); ++ ++void get_next_token_for_torch( ++ torch::Tensor model_output, // WEIGHT.token_embedding ++ torch::Tensor hidden_stats, ++ ++ bool is_prompt, ++ torch::Tensor block_tables, ++ torch::Tensor seq_lens, ++ torch::Tensor& slot_mapping, ++ torch::Tensor positions, ++ std::vector kv_caches, ++ int64_t block_size, ++ int64_t N_tokens ++); ++ +diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py +index a6e96c0bb..56831ed57 100644 +--- a/examples/offline_inference/basic/basic.py ++++ b/examples/offline_inference/basic/basic.py +@@ -13,7 +13,7 @@ prompts = [ + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM. +-llm = LLM(model="facebook/opt-125m") ++llm = LLM(model="/home/s30058176/DeepSeek-R1-Distill-Qwen-7B", max_model_len=8192) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) +@@ -21,4 +21,4 @@ outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text +- print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +\ No newline at end of file ++ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +diff --git a/vllm/config.py b/vllm/config.py +index 56315aacb..765fe3748 100644 +--- a/vllm/config.py ++++ b/vllm/config.py +@@ -2393,7 +2393,8 @@ def _get_and_verify_dtype( + # models. 
+                     torch_dtype = torch.float16
+             else:
+-                torch_dtype = config_dtype
++                #torch_dtype = config_dtype
++                torch_dtype = torch.float16
+ 
+     from vllm.platforms import current_platform
+     if (current_platform.is_cpu()
+diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
+index e3de6b64f..d76f31fda 100644
+--- a/vllm/model_executor/models/qwen2.py
++++ b/vllm/model_executor/models/qwen2.py
+@@ -60,6 +60,24 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
+ 
+ logger = init_logger(__name__)
+ 
++inference_fused = False
++import os
++if os.getenv("INFERENCE_OP_MODE") == "fused":
++    inference_fused = True
++    print("run in INFERENCE FUSED MODE")
++
++# quantization settings
++quantization_bit_mode = os.getenv("SYSHAX_QUANTIZE")
++quantization_bit_code = 1  # default is f16
++if quantization_bit_mode:
++    if quantization_bit_mode == "q8_0":
++        quantization_bit_code = 8
++        print("Use q8_0 quantization!")
++    elif quantization_bit_mode == "q4_0":
++        quantization_bit_code = 2
++        print("Use q4_0 quantization!")
++    else:
++        print("Unsupported quantization type!")
+ 
+ class Qwen2MLP(nn.Module):
+ 
+@@ -442,6 +460,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+ 
+     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+         super().__init__()
++        self.apply_memory = True
+         config = vllm_config.model_config.hf_config
+         quant_config = vllm_config.quant_config
+         lora_config = vllm_config.lora_config
+@@ -483,6 +502,90 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+         intermediate_tensors: Optional[IntermediateTensors] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+     ) -> Union[torch.Tensor, IntermediateTensors]:
++        if inference_fused:
++            self.fused_forward = True
++
++            model = self.model
++            qkv_bias, wqkv, wo, w1w3, w2, preattn, postattn = [], [], [], [], [], [], []
++            for i in range(model.start_layer, model.end_layer):
++                layer = model.layers[i]
++                if layer.self_attn.qkv_proj.bias is not None:
++                    qkv_bias.append(layer.self_attn.qkv_proj.bias)
++                wqkv.append(layer.self_attn.qkv_proj.weight)
++                wo.append(layer.self_attn.o_proj.weight)
++                w1w3.append(layer.mlp.gate_up_proj.weight)
++                w2.append(layer.mlp.down_proj.weight)
++                preattn.append(layer.input_layernorm.weight)
++                postattn.append(layer.post_attention_layernorm.weight)
++            assert not qkv_bias or len(qkv_bias) == len(wqkv)
++            first_attn = model.layers[0].self_attn
++            preattn = torch.stack(preattn)
++            postattn = torch.stack(postattn)
++            wqkv = torch.stack(wqkv)
++            w1w3 = torch.stack(w1w3)
++            qkv_bias = torch.stack(qkv_bias)
++            wo = torch.stack(wo)
++            w2 = torch.stack(w2)
++
++            num_group, _ = divmod(self.model.config.num_attention_heads, self.model.config.num_key_value_heads)
++            assert _ == 0
++            kv_size, _ = divmod(self.model.embed_tokens.embedding_dim, num_group)
++            assert _ == 0
++
++            if self.apply_memory:
++                self.apply_memory = False
++                torch.ops._C.load_weight_and_malloc_active_tensor(
++                    self.model.embed_tokens.embedding_dim,   # MODEL_HYPE_PARA.dim embedding dimension
++                    self.model.config.intermediate_size,     # MODEL_HYPE_PARA.hidden_dim FFN hidden dimension
++                    self.model.config.num_hidden_layers,     # MODEL_HYPE_PARA.n_layers number of layers
++                    self.model.embed_tokens.org_vocab_size,  # MODEL_HYPE_PARA.n_vocab vocabulary size
++                    self.model.config.num_attention_heads,   # MODEL_HYPE_PARA.n_heads number of attention heads
++                    self.model.config.num_key_value_heads,   # MODEL_HYPE_PARA.n_kv_heads number of KV heads
++                    self.model.config.sliding_window if self.model.config.use_sliding_window else self.model.config.max_position_embeddings,
++                    
self.model.config.rms_norm_eps, # MODEL_HYPE_PARA.norm_rms_eps eps ++ self.model.config.rope_theta, # MODEL_HYPE_PARA.rope_freq_base rope频率 ++ first_attn.attn.impl.scale, ++ first_attn.rotary_emb.is_neox_style, ++ quantization_bit_code, ++ ++ first_attn.rotary_emb.cos_sin_cache, ++ self.model.embed_tokens.weight, # WEIGHT.token_embedding ++ preattn, # WEIGHT.rms_att_norm ++ postattn, # WEIGHT.rms_ffn_norm ++ wqkv, ++ wo, # WEIGHT.wo ++ qkv_bias, ++ w1w3, ++ w2, # WEIGHT.ffn_down ++ self.model.norm.weight, # WEIGHT.output_norm ++ self.lm_head.weight # WEIGHT.output ++ ) ++ block_size = 16 ++ N_tokens = len(input_ids) ++ hidden_states = model.get_input_embeddings(input_ids) ++ model_output = torch.zeros( ++ (len(attn_metadata.seq_lens) if attn_metadata.prefill_metadata else N_tokens, self.config.vocab_size), ++ dtype=hidden_states.dtype, ++ device=hidden_states.device ++ ) ++ ++ torch.ops._C.get_next_token_for_torch( ++ model_output, ++ hidden_states, ++ ++ attn_metadata.prefill_metadata is not None, ++ attn_metadata.block_tables, ++ attn_metadata.seq_lens_tensor, ++ attn_metadata.slot_mapping.flatten(), ++ positions, ++ kv_caches, ++ block_size, ++ N_tokens ++ ) ++ return model_output ++ else: ++ self.fused_forward = False ++ + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) +@@ -493,6 +596,8 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: ++ if inference_fused and self.fused_forward: ++ return hidden_states + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits diff --git a/vllm.spec b/vllm.spec index b0fec1db5d52f9da51d85115336f14980019ca2b..9b904f4bad0c33985b0099ef2aecc7bec303af71 100644 --- a/vllm.spec +++ b/vllm.spec @@ -3,12 +3,14 @@ Name: vllm Version: 0.7.3 -Release: 1 +Release: 2 Summary: Powerful engine for LLMs License: (Apache-2.0 AND BSD-3-Clause) OR BSD-3-CLause URL: https://github.com/vllm-project/vllm Source0: https://gitee.com/src-openeuler/vllm/raw/master/vllm-%{version}.tar.gz +Patch0001: cpu-fast-inference.patch + BuildArch: noarch %description @@ -18,7 +20,7 @@ BuildArch: noarch Summary: %{summary} Buildrequires: cmake python3-pip python3-devel python3-setuptools python3-pytest Buildrequires: python3-setuptools_scm python3-wheel python3-pytest-asyncio python3-grpcio -Buildrequires: python3-pytorch +Buildrequires: python3-pytorch gcc gcc-c++ numactl-devel kmod %{?python_provide:%python_provide python3-%{_name}} %description -n python3-%{_name} @@ -31,12 +33,12 @@ Buildrequires: python3-pytorch %build export SETUPTOOLS_SCM_PRETEND_VERSION=%{version} -export VLLM_TARGET_DEVICE=empty +export VLLM_TARGET_DEVICE=cpu %py3_build %install export SETUPTOOLS_SCM_PRETEND_VERSION=%{version} -export VLLM_TARGET_DEVICE=empty +export VLLM_TARGET_DEVICE=cpu %py3_install VERSION_FILE=$(find %{buildroot} -name '_version.py') @@ -69,6 +71,9 @@ mv %{buildroot}/filelist.lst . %files -n python3-%{_name} -f filelist.lst %changelog +* Thu May 15 2025 qmzznbxhl - 0.7.3-2 +- Add cpu fast-inference + * Tue Apr 8 2025 renwenjie - 0.7.3-1 - Change the baseline version to 0.7.3
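
Usage sketch (an illustrative assumption, not part of the patch itself): the fused CPU path
added above is gated by environment variables read in vllm/model_executor/models/qwen2.py,
so a run might look like:

    export INFERENCE_OP_MODE=fused
    export SYSHAX_QUANTIZE=q8_0   # or q4_0; leave it unset to keep f16 weights
    python examples/offline_inference/basic/basic.py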