diff --git a/cpu-fast-inference.patch b/cpu-fast-inference.patch new file mode 100644 index 0000000000000000000000000000000000000000..90143ed61cc16e1dc32d6636e4248799674dc0a8 --- /dev/null +++ b/cpu-fast-inference.patch @@ -0,0 +1,5819 @@ +diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake +index 714abca2a..c5113f059 100644 +--- a/cmake/cpu_extension.cmake ++++ b/cmake/cpu_extension.cmake +@@ -72,17 +72,14 @@ endfunction() + + is_avx512_disabled(AVX512_DISABLED) + +-if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") +- set(APPLE_SILICON_FOUND TRUE) +-else() +- find_isa(${CPUINFO} "avx2" AVX2_FOUND) +- find_isa(${CPUINFO} "avx512f" AVX512_FOUND) +- find_isa(${CPUINFO} "POWER10" POWER10_FOUND) +- find_isa(${CPUINFO} "POWER9" POWER9_FOUND) +- find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support +- find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support +-endif() ++find_isa(${CPUINFO} "avx2" AVX2_FOUND) ++find_isa(${CPUINFO} "avx512f" AVX512_FOUND) ++find_isa(${CPUINFO} "POWER10" POWER10_FOUND) ++find_isa(${CPUINFO} "POWER9" POWER9_FOUND) ++find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support ++find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support + ++add_compile_definitions(__ARM_FEATURE_MATMUL_INT8) + + if (AVX512_FOUND AND NOT AVX512_DISABLED) + list(APPEND CXX_COMPILE_FLAGS +@@ -119,16 +116,19 @@ elseif (ASIMD_FOUND) + message(STATUS "ARMv8 or later architecture detected") + if(ARM_BF16_FOUND) + message(STATUS "BF16 extension detected") +- set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16") ++ set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16+i8mm") ++ # set(MARCH_FLAGS "-march=armv8.6-a+bf16+dotprod+fp16+sve+i8mm") + add_compile_definitions(ARM_BF16_SUPPORT) + else() + message(WARNING "BF16 functionality is not available") +- set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") ++ set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16+i8mm") ++ # set(MARCH_FLAGS "-march=armv8.6-a+bf16+dotprod+fp16+sve+i8mm") + endif() +- list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) +-elseif(APPLE_SILICON_FOUND) +- message(STATUS "Apple Silicon Detected") +- set(ENABLE_NUMA OFF) ++ list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS} "-fpermissive" ++ "-O3" "-funroll-loops" "-fomit-frame-pointer" ++ "-ffast-math" "-finline-functions" ++ "-flto" "-ftree-vectorize" ++ "-falign-functions=16" "-falign-loops=16" "-fno-unwind-tables") + else() + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") + endif() +@@ -182,7 +182,9 @@ set(VLLM_EXT_SRC + "csrc/cpu/utils.cpp" + "csrc/cpu/layernorm.cpp" + "csrc/cpu/pos_encoding.cpp" +- "csrc/cpu/torch_bindings.cpp") ++ "csrc/cpu/sysHAX_ops.cpp" ++ "csrc/cpu/torch_bindings.cpp" ++ "csrc/cpu/quantize.cpp") + + if (AVX512_FOUND AND NOT AVX512_DISABLED) + set(VLLM_EXT_SRC +diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp +index 990e99f2f..db71196a4 100644 +--- a/csrc/cpu/cpu_types_arm.hpp ++++ b/csrc/cpu/cpu_types_arm.hpp +@@ -2,6 +2,10 @@ + #include + #include + ++typedef float16_t f16; ++extern float f16_to_f32(f16 h); ++extern f16 f32_to_f16(float h); ++ + namespace vec_op { + + #ifdef ARM_BF16_SUPPORT +@@ -65,96 +69,71 @@ struct FP16Vec8 : public Vec { + }; + + struct FP16Vec16 : public Vec { +- constexpr static int VEC_ELEM_NUM = 16; +- +- float16x8x2_t reg; +- +- explicit FP16Vec16(const void* ptr) { +- reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); +- reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); +- } +- +- explicit 
FP16Vec16(const FP32Vec16& vec); +- +- void save(void* ptr) const { +- vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); +- vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); +- } ++ constexpr static int VEC_ELEM_NUM = 16; ++ ++ union { ++ float16x8x2_t reg; ++ f16 s[VEC_ELEM_NUM]; ++ }; ++ ++ explicit FP16Vec16() { ++ reg.val[0] = reg.val[1] = vdupq_n_f16(0.0f); ++ } ++ ++ explicit FP16Vec16(const void *ptr) { ++ reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); ++ reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); ++ } ++ ++ explicit FP16Vec16(const FP32Vec16& vec); + +- void save(void* ptr, const int elem_num) const { +- int full_blocks = elem_num / 8; +- int remainder = elem_num % 8; ++ explicit FP16Vec16(const FP16Vec8& vec) { ++ reg.val[0] = reg.val[1] = vec.reg; ++ } ++ ++ void save(void *ptr) const { ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); ++ } ++ ++ void save(void *ptr, const int elem_num) const { ++ int full_blocks = elem_num / 8; ++ int remainder = elem_num % 8; ++ ++ if (full_blocks > 0) { ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); ++ if (full_blocks > 1) { ++ vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); ++ } ++ } ++ ++ if (remainder > 0) { ++ float16x8_t temp = reg.val[full_blocks]; ++ for (int i = 0; i < remainder; ++i) { ++ reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); ++ } ++ } ++ } + +- if (full_blocks > 0) { +- vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); +- if (full_blocks > 1) { +- vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); +- } ++ f16 reduce_sum() const { ++ float16x8_t sum = vaddq_f16(reg.val[0], reg.val[1]); ++ float32x4_t t0 = vcvt_f32_f16(vget_low_f16(sum)); ++ float32x4_t t1 = vcvt_f32_f16(vget_high_f16(sum)); ++ return f32_to_f16(vaddvq_f32(vaddq_f32(t0, t1))); + } + +- // Note: below is the unrolled version of the following code: +- // +- // for (int i = 0; i < remainder; ++i) { +- // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = +- // vgetq_lane_f16(temp, i); +- // } +- // +- // For macOS build (Clang), the arm/neon intrinsics function +- // `vgetq_lane_f16` needs the parameter `i` to be constant at compile +- // time. 
+- +- if (remainder > 0) { +- float16x8_t temp = reg.val[full_blocks]; +- __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); +- switch (remainder) { +- case 1: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- break; +- case 2: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- break; +- case 3: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- break; +- case 4: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); +- break; +- case 5: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); +- fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); +- break; +- case 6: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); +- fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); +- fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); +- break; +- case 7: +- fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); +- fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); +- fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); +- fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); +- fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); +- fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); +- fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); +- break; +- +- default: +- break; +- } ++ template ++ float reduce_sub_sum(int idx) { ++ f16 sum = 0.0; ++ constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); ++ uint32_t mask = base_mask << (idx * group_size); ++ unroll_loop([&sum, &mask, this](int i){ ++ int flag = mask & 0x1; ++ mask = mask >> 1; ++ if (flag != 0) sum += s[i]; ++ }); ++ return sum; + } +- } + }; + + #ifdef ARM_BF16_SUPPORT +@@ -550,6 +529,11 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]); + }; + ++inline void fma(FP16Vec16 &acc, FP16Vec16 &a, FP16Vec16 &b) { ++ acc.reg.val[0] = vfmaq_f16(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); ++ acc.reg.val[1] = vfmaq_f16(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); ++} ++ + #ifdef ARM_BF16_SUPPORT + inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { + float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); +diff --git a/csrc/cpu/instruct.h b/csrc/cpu/instruct.h +new file mode 100644 +index 000000000..07eac058b +--- /dev/null ++++ b/csrc/cpu/instruct.h +@@ -0,0 +1,81 @@ ++#pragma once ++ ++#include ++ ++#define MIN(a, b) ((a) < (b) ? (a) : (b)) ++#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) ++ ++#ifdef __ARM_NEON ++ ++typedef struct ggml_int16x8x2_t { ++ int16x8_t val[2]; ++} ggml_int16x8x2_t; ++ ++inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { ++ ggml_int16x8x2_t res; ++ ++ res.val[0] = vld1q_s16(ptr + 0); ++ res.val[1] = vld1q_s16(ptr + 8); ++ ++ return res; ++} ++ ++typedef struct ggml_uint8x16x2_t { ++ uint8x16_t val[2]; ++} ggml_uint8x16x2_t; ++ ++inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { ++ ggml_uint8x16x2_t res; ++ ++ res.val[0] = vld1q_u8(ptr + 0); ++ res.val[1] = vld1q_u8(ptr + 16); ++ ++ return res; ++} ++ ++typedef struct ggml_uint8x16x4_t { ++ uint8x16_t val[4]; ++} ggml_uint8x16x4_t; ++ ++inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { ++ ggml_uint8x16x4_t res; ++ ++ res.val[0] = vld1q_u8(ptr + 0); ++ res.val[1] = vld1q_u8(ptr + 16); ++ res.val[2] = vld1q_u8(ptr + 32); ++ res.val[3] = vld1q_u8(ptr + 48); ++ ++ return res; ++} ++ ++typedef struct ggml_int8x16x2_t { ++ int8x16_t val[2]; ++} ggml_int8x16x2_t; ++ ++inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { ++ ggml_int8x16x2_t res; ++ ++ res.val[0] = vld1q_s8(ptr + 0); ++ res.val[1] = vld1q_s8(ptr + 16); ++ ++ return res; ++} ++ ++typedef struct ggml_int8x16x4_t { ++ int8x16_t val[4]; ++} ggml_int8x16x4_t; ++ ++inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { ++ ggml_int8x16x4_t res; ++ ++ res.val[0] = vld1q_s8(ptr + 0); ++ res.val[1] = vld1q_s8(ptr + 16); ++ res.val[2] = vld1q_s8(ptr + 32); ++ res.val[3] = vld1q_s8(ptr + 48); ++ ++ return res; ++} ++ ++#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) ++ ++#endif +\ No newline at end of file +diff --git a/csrc/cpu/quantize.cpp b/csrc/cpu/quantize.cpp +new file mode 100644 +index 000000000..2194f8fdf +--- /dev/null ++++ b/csrc/cpu/quantize.cpp +@@ -0,0 +1,3311 @@ ++#include ++#include ++#if __AVX__ || __AVX2__ || __AVX512F__ ++#include ++#include ++#endif ++#ifdef __ARM_FEATURE_SVE ++#include ++#endif ++#include "quantize.h" ++#include "instruct.h" ++ ++static inline int nearest_int(float fval) ++{ ++ float val = fval + 12582912.f; ++ int i; memcpy(&i, &val, sizeof(int)); ++ return (i & 0x007fffff) - 0x00400000; ++} ++ ++#if __AVX__ || __AVX2__ || __AVX512F__ ++static inline float hsum_float_8(const __m256 x) ++{ ++ __m128 res = _mm256_extractf128_ps(x, 1); ++ res = _mm_add_ps(res, _mm256_castps256_ps128(x)); ++ res = _mm_add_ps(res, _mm_movehl_ps(res, res)); ++ res = _mm_add_ss(res, _mm_movehdup_ps(res)); ++ return _mm_cvtss_f32(res); ++} ++#endif ++ ++void dequantize_row_q2_K(const block_q2_K *__restrict__ src, float *__restrict__ dst, int64_t k) ++{ ++ const int nb = k / QK_K; ++ for (int i = 0; i < nb; i++) { ++ const float d = GGML_FP16_TO_FP32(src[i].GGML_COMMON_AGGR.d); ++ const float min = GGML_FP16_TO_FP32(src[i].GGML_COMMON_AGGR.dmin); ++ const uint8_t *q = src[i].qs; ++#if QK_K == 256 ++ int is = 0; ++ float dl, ml; ++ for (int n = 0; n < QK_K; n += 128) { ++ int shift = 0; ++ for (int j = 0; j < 4; ++j) { ++ uint8_t sc = src[i].scales[is++]; ++ dl = d * (sc & 0xF); ++ ml = min * (sc >> 4); ++ for (int l = 0; l < 16; ++l) { ++ *dst++ = dl * ((int8_t)((q[l] >> shift) & 3)) - ml; ++ } ++ sc = src[i].scales[is++]; ++ dl = d * (sc & 0xF); ++ ml = min * (sc >> 4); ++ for (int l = 0; l < 16; ++l) { ++ *dst++ = dl * ((int8_t)((q[l+16] >> shift) & 3)) - ml; ++ } ++ shift += 2; ++ } ++ q += 32; ++ } ++#else ++ float dl1 = d * (src[i].scales[0] & 0xF), ml1 = min * (src[i].scales[0] >> 4); ++ float dl2 = d * (src[i].scales[1] & 
0xF), ml2 = min * (src[i].scales[1] >> 4); ++ float dl3 = d * (src[i].scales[2] & 0xF), ml3 = min * (src[i].scales[2] >> 4); ++ float dl4 = d * (src[i].scales[3] & 0xF), ml4 = min * (src[i].scales[3] >> 4); ++ for (int l = 0; l < 16; ++l) { ++ dst[l+ 0] = dl1 * ((int8_t)((q[l] >> 0) & 3)) - ml1; ++ dst[l+16] = dl2 * ((int8_t)((q[l] >> 2) & 3)) - ml2; ++ dst[l+32] = dl3 * ((int8_t)((q[l] >> 4) & 3)) - ml3; ++ dst[l+48] = dl4 * ((int8_t)((q[l] >> 6) & 3)) - ml4; ++ } ++ dst += QK_K; ++#endif ++ } ++} ++ ++void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) ++{ ++ int64_t i = 0; ++#if defined(__F16C__) ++ //if (ggml_cpu_has_f16c()) { ++ for (; i + 7 < n; i += 8) { ++ __m256 x_vec = _mm256_loadu_ps(x + i); ++ __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); ++ _mm_storeu_si128((__m128i *)(y + i), y_vec); ++ } ++ for(; i + 3 < n; i += 4) { ++ __m128 x_vec = _mm_loadu_ps(x + i); ++ __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); ++ _mm_storel_epi64((__m128i *)(y + i), y_vec); ++ } ++ //} ++#endif ++ for (; i < n; i++) { ++ y[i] = GGML_FP32_TO_FP16(x[i]); ++ } ++} ++ ++ ++void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) ++{ ++ for (int64_t i = 0; i < n; i++) { ++ y[i] = GGML_FP16_TO_FP32(x[i]); ++ } ++} ++ ++#define GGML_F16_SVE_STEP 64 ++#define GGML_F16_SVE_ARR 4 ++#define GGML_F16_SVE_EPR 16 ++ ++void ggml_vec_dot_f16(int n, float * __restrict__ s, size_t bs, ggml_fp16_t * __restrict__ x, size_t bx, ggml_fp16_t * __restrict__ y, size_t by, int nrc) { ++ double sumf = 0.0; ++ ++#if defined(__ARM_FEATURE_SVE) ++ svbool_t pre = svptrue_b16(); ++ ++ const int np = (n & ~(GGML_F16_SVE_STEP - 1)); ++ ++ svfloat16_t sum = {svdup_f16(0.0f)}; ++ ++ svfloat16_t ax; //需要4个256,即4个ax ++ svfloat16_t ay; ++ ++ for (int i = 0; i < np; i += GGML_F16_SVE_STEP) { ++ for (int j = 0; j < GGML_F16_SVE_ARR; j++) { //4个256,即64个f16 ++ ax = svld1_f16(pre, x + i + j * GGML_F16_SVE_EPR); ++ ay = svld1_f16(pre, y + i + j * GGML_F16_SVE_EPR); ++ ++ sum = svmla_f16_z(pre, sum, ax, ay); ++ } ++ } ++ ++ /* 合并 */ ++ sumf = svaddv_f16(pre, sum); ++ /* 剩余的维度 */ ++ for (int i = np; i < n; ++i) { ++ sumf += x[i] * y[i]; ++ } ++ ++#elif defined(GGML_SIMD) ++ const int np = (n & ~(GGML_F16_STEP - 1)); ++ ++ GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; ++ ++ GGML_F16_VEC ax[GGML_F16_ARR]; ++ GGML_F16_VEC ay[GGML_F16_ARR]; ++ ++ for (int i = 0; i < np; i += GGML_F16_STEP) { ++ for (int j = 0; j < GGML_F16_ARR; j++) { ++ ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); ++ ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); ++ ++ sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); ++ } ++ } ++ ++ // reduce sum0..sum3 to sum0 ++ GGML_F16_VEC_REDUCE(sumf, sum); ++ ++ // leftovers ++ for (int i = np; i < n; ++i) { ++ sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); ++ //sumf += x[i] * y[i]; ++ } ++ ++#else ++ for (int i = 0; i < n; ++i) { ++ sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); ++ } ++#endif ++ ++ *s = sumf; ++} ++ ++void dequantize_row_q4_0(const block_q4_0 * __restrict__ src, float * __restrict__ dst, int64_t k) ++{ ++ static const int qk = QK4_0; ++ const int nb = k / qk; ++ ++ for (int i = 0; i < nb; i++) { ++ const float d = GGML_FP16_TO_FP32(src[i].d); ++ ++ for (int j = 0; j < qk / 2; ++j) { ++ const int x0 = (src[i].qs[j] & 0x0F) - 8; ++ const int x1 = (src[i].qs[j] >> 4) - 8; ++ ++ dst[i*qk + j + 0 ] = x0*d; ++ dst[i*qk + j + qk/2] = x1*d; ++ } ++ } ++} ++ ++void 
dequantize_row_q4_1(const block_q4_1 * __restrict__ src, float * __restrict__ dst, int64_t k) ++{ ++ static const int qk = QK4_1; ++ ++ const int nb = k / qk; ++ ++ for (int i = 0; i < nb; i++) { ++ const float d = GGML_FP16_TO_FP32(src[i].d); ++ const float m = GGML_FP16_TO_FP32(src[i].m); ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const int x0 = (src[i].qs[j] & 0x0F); ++ const int x1 = (src[i].qs[j] >> 4); ++ ++ dst[i*qk + j + 0 ] = x0*d + m; ++ dst[i*qk + j + qk/2] = x1*d + m; ++ } ++ } ++} ++ ++void dequantize_row_q8_0(const block_q8_0 *__restrict__ x, float *__restrict__ y, int64_t k) ++{ ++ static const int qk = QK8_0; ++ ++ const int nb = k / qk; ++ for (int i = 0; i < nb; i++) { ++ const float d = GGML_FP16_TO_FP32(x[i].d); ++ ++ for (int j = 0; j < qk; ++j) { ++ y[i*qk + j] = x[i].qs[j]*d; ++ } ++ } ++} ++ ++/* QK_K == 256 */ ++void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ (void)(nrc); ++ (void)(bx); ++ (void)(by); ++ (void)(bs); ++ ++ const block_q2_K * __restrict__ x = vx; ++ const block_q8_K * __restrict__ y = vy; ++ const int nb = n / QK_K; ++ ++#ifdef __ARM_NEON ++ const uint8x16_t m3 = vdupq_n_u8(0x3); ++ const uint8x16_t m4 = vdupq_n_u8(0xF); ++ ++ const int32x4_t vzero = vdupq_n_s32(0); ++ ++ ggml_int8x16x2_t q2bytes; ++ uint8_t aux[16]; ++ ++ float sum = 0; ++ ++ for (int i = 0; i < nb; ++i) { ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.d); ++ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.dmin); ++ ++ const uint8_t * __restrict__ q2 = x[i].qs; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ const uint8_t * __restrict__ sc = x[i].scales; ++ ++ const uint8x16_t mins_and_scales = vld1q_u8(sc); ++ const uint8x16_t scales = vandq_u8(mins_and_scales, m4); ++ vst1q_u8(aux, scales); ++ ++ const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4); ++ const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); ++ const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}}; ++ const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])), ++ vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0]))); ++ const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])), ++ vmull_s16(vget_high_s16(mins16.val[1]), vget_high_s16(q8sums.val[1]))); ++ sum += dmin * vaddvq_s32(vaddq_s32(s0, s1)); ++ ++ int isum = 0; ++ int is = 0; ++ ++// We use this macro instead of a function call because for some reason ++// the code runs 2-3% slower, even if the function is declared inline ++#define MULTIPLY_ACCUM_WITH_SCALE(index)\ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)]; ++ ++#define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\ ++ q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\ ++ q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[0], (shift)), m3));\ ++ q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\ ++ MULTIPLY_ACCUM_WITH_SCALE((index)); ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32; ++ ++ ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32; ++ q2bytes.val[0] = 
vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3)); ++ q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3)); ++ ++ MULTIPLY_ACCUM_WITH_SCALE(0); ++ ++ SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2); ++ SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4); ++ SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6); ++ ++ is += 8; ++ } ++ ++ sum += d * isum; ++ } ++ ++ *s = sum; ++ ++#elif defined __AVX2__ ++ ++ const __m256i m3 = _mm256_set1_epi8(3); ++ const __m128i m4 = _mm_set1_epi8(0xF); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); ++ ++ const uint8_t * __restrict__ q2 = x[i].qs; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); ++ const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); ++ const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); ++ const __m256i mins = _mm256_cvtepi8_epi16(mins8); ++ const __m256i prod = _mm256_madd_epi16(mins, _mm256_loadu_si256((const __m256i*)y[i].bsums)); ++ ++ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(prod), acc); ++ ++ const __m256i all_scales = _mm256_cvtepi8_epi16(scales8); ++ const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); ++ const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1); ++ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; ++ ++ __m256i sumi = _mm256_setzero_si256(); ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ const __m256i q2bits = _mm256_loadu_si256((const __m256i*)q2); q2 += 32; ++ ++ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ ++ const __m256i q2_0 = _mm256_and_si256(q2bits, m3); ++ const __m256i q2_1 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), m3); ++ const __m256i q2_2 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), m3); ++ const __m256i q2_3 = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), m3); ++ ++ __m256i p0 = _mm256_maddubs_epi16(q2_0, q8_0); ++ __m256i p1 = _mm256_maddubs_epi16(q2_1, q8_1); ++ __m256i p2 = _mm256_maddubs_epi16(q2_2, q8_2); ++ __m256i p3 = _mm256_maddubs_epi16(q2_3, q8_3); ++ ++ p0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(0)), p0); ++ p1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(1)), p1); ++ p2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(2)), p2); ++ p3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(3)), p3); ++ ++ p0 = _mm256_add_epi32(p0, p1); ++ p2 = _mm256_add_epi32(p2, p3); ++ ++ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p0, p2)); ++ } ++ ++ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); ++ ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __AVX__ ++ ++ const __m128i m3 = _mm_set1_epi8(0x3); ++ const __m128i m4 = _mm_set1_epi8(0xF); ++ const __m128i m2 = _mm_set1_epi8(0x2); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.d); ++ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.dmin); ++ ++ const uint8_t * __restrict__ q2 = x[i].qs; ++ const 
int8_t * __restrict__ q8 = y[i].qs; ++ ++ // load mins and scales from block_q2_K.scales[QK_K/16] ++ const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); ++ const __m128i scales16 = _mm_and_si128(mins_and_scales, m4); ++ const __m128i mins16 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4); ++ const __m128i mins_0 = _mm_cvtepi8_epi16(mins16); ++ const __m128i mins_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(mins16, mins16)); ++ ++ // summs = y[i].bsums * (x[i].scales >> 4) in 16bits*8*2 to 32bits*4*2 ++ const __m128i summs_0 = _mm_madd_epi16(mins_0, _mm_loadu_si128((const __m128i*)&y[i].bsums[0])); ++ const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8])); ++ ++ // sumf += -dmin * summs in 32bits*8 ++ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc); ++ ++ const __m128i scales_0 = _mm_cvtepi8_epi16(scales16); ++ const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16)); ++ const __m128i scales[2] = { scales_0, scales_1 }; ++ ++ __m128i sumi_0 = _mm_setzero_si128(); ++ __m128i sumi_1 = _mm_setzero_si128(); ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ // load Q8 quants int8*16*8 from block_q8_K.qs[QK_K] ++ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ ++ // load 2bits*16*8 from block_q2_K.qs[QK_K/4] ++ __m128i q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; ++ const __m128i q2_0 = _mm_and_si128(q2bits, m3); ++ const __m128i q2_2 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); ++ const __m128i q2_4 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); ++ const __m128i q2_6 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); ++ q2bits = _mm_loadu_si128((const __m128i*)q2); q2 += 16; ++ const __m128i q2_1 = _mm_and_si128(q2bits, m3); ++ const __m128i q2_3 = _mm_and_si128(_mm_srli_epi16(q2bits, 2), m3); ++ const __m128i q2_5 = _mm_and_si128(_mm_srli_epi16(q2bits, 4), m3); ++ const __m128i q2_7 = _mm_and_si128(_mm_srli_epi16(q2bits, 6), m3); ++ ++ // isuml = q8[l] * ((q2[l] >> shift) & 3) in 8bits*16*8 to 16bits*8*8 ++ __m128i p0 = _mm_maddubs_epi16(q2_0, q8_0); ++ __m128i p1 = _mm_maddubs_epi16(q2_1, q8_1); ++ __m128i p2 = _mm_maddubs_epi16(q2_2, q8_2); ++ __m128i p3 = _mm_maddubs_epi16(q2_3, q8_3); ++ __m128i p4 = _mm_maddubs_epi16(q2_4, q8_4); ++ __m128i p5 = _mm_maddubs_epi16(q2_5, q8_5); ++ __m128i p6 = _mm_maddubs_epi16(q2_6, q8_6); ++ __m128i p7 = _mm_maddubs_epi16(q2_7, q8_7); ++ ++ // isum += (x[i].scales[is++] & 0xF) * isuml in 16bits*8*8 to 32bits*4*8 ++ __m128i shuffle = _mm_set1_epi16(0x0100); ++ p0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p0); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p1); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p2); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p3); ++ shuffle = 
_mm_add_epi16(shuffle, m2); ++ p4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p4); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p5); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p6); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p7); ++ ++ p0 = _mm_add_epi32(p0, p1); ++ p2 = _mm_add_epi32(p2, p3); ++ p4 = _mm_add_epi32(p4, p5); ++ p6 = _mm_add_epi32(p6, p7); ++ ++ // isum in 32bits*4*2 ++ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p0, p2)); ++ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p4, p6)); ++ } ++ ++ // sumf += dall * isum - dmin * summs in 32bits ++ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); ++ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc); ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __riscv_v_intrinsic ++ ++ float sumf = 0; ++ uint8_t temp_01[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const uint8_t * q2 = x[i].qs; ++ const int8_t * q8 = y[i].qs; ++ const uint8_t * sc = x[i].scales; ++ ++ const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); ++ ++ size_t vl = 16; ++ ++ vuint8m1_t scales = __riscv_vle8_v_u8m1(sc, vl); ++ vuint8m1_t aux = __riscv_vand_vx_u8m1(scales, 0x0F, vl); ++ ++ vint16m1_t q8sums = __riscv_vle16_v_i16m1(y[i].bsums, vl); ++ ++ vuint8mf2_t scales_2 = __riscv_vle8_v_u8mf2(sc, vl); ++ vuint8mf2_t mins8 = __riscv_vsrl_vx_u8mf2(scales_2, 0x4, vl); ++ vint16m1_t mins = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vzext_vf2_u16m1(mins8, vl)); ++ vint32m2_t prod = __riscv_vwmul_vv_i32m2(q8sums, mins, vl); ++ vint32m1_t vsums = __riscv_vredsum_vs_i32m2_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); ++ ++ sumf += dmin * __riscv_vmv_x_s_i32m1_i32(vsums); ++ ++ vl = 32; ++ ++ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); ++ vuint8m1_t v_b = __riscv_vle8_v_u8m1(temp_01, vl); ++ ++ uint8_t is=0; ++ int isum=0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ // load Q2 ++ vuint8m1_t q2_x = __riscv_vle8_v_u8m1(q2, vl); ++ ++ vuint8m1_t q2_0 = __riscv_vand_vx_u8m1(q2_x, 0x03, vl); ++ vuint8m1_t q2_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x2, vl), 0x03 , vl); ++ vuint8m1_t q2_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x4, vl), 0x03 , vl); ++ vuint8m1_t q2_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q2_x, 0x6, vl), 0x03 , vl); ++ ++ // duplicate scale elements for product ++ vuint8m1_t sc0 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 0+is, vl), vl); ++ vuint8m1_t sc1 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 2+is, vl), vl); ++ vuint8m1_t sc2 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 4+is, vl), vl); ++ vuint8m1_t sc3 = __riscv_vrgather_vv_u8m1(aux, __riscv_vadd_vx_u8m1(v_b, 6+is, vl), vl); ++ ++ vint16m2_t p0 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_0, sc0, vl)); ++ vint16m2_t p1 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_1, sc1, vl)); ++ vint16m2_t p2 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_2, sc2, vl)); ++ vint16m2_t p3 = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vwmulu_vv_u16m2(q2_3, sc3, vl)); ++ ++ // load Q8 ++ vint8m1_t q8_0 = __riscv_vle8_v_i8m1(q8, vl); ++ vint8m1_t q8_1 = __riscv_vle8_v_i8m1(q8+32, vl); ++ vint8m1_t q8_2 = 
__riscv_vle8_v_i8m1(q8+64, vl);
++            vint8m1_t q8_3 = __riscv_vle8_v_i8m1(q8+96, vl);
++
++            vint32m4_t s0 = __riscv_vwmul_vv_i32m4(p0, __riscv_vwcvt_x_x_v_i16m2(q8_0, vl), vl);
++            vint32m4_t s1 = __riscv_vwmul_vv_i32m4(p1, __riscv_vwcvt_x_x_v_i16m2(q8_1, vl), vl);
++            vint32m4_t s2 = __riscv_vwmul_vv_i32m4(p2, __riscv_vwcvt_x_x_v_i16m2(q8_2, vl), vl);
++            vint32m4_t s3 = __riscv_vwmul_vv_i32m4(p3, __riscv_vwcvt_x_x_v_i16m2(q8_3, vl), vl);
++
++            vint32m1_t isum0 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s0, s1, vl), vzero, vl);
++            vint32m1_t isum1 = __riscv_vredsum_vs_i32m4_i32m1(__riscv_vadd_vv_i32m4(s2, s3, vl), isum0, vl);
++
++            isum += __riscv_vmv_x_s_i32m1_i32(isum1);
++
++            q2+=32; q8+=128; is=8;
++
++        }
++
++        sumf += dall * isum;
++
++    }
++
++    *s = sumf;
++
++#else
++    float sumf = 0;
++    for (int i = 0; i < nb; ++i) {
++        const uint8_t * q2 = x[i].qs;
++        const int8_t * q8 = y[i].qs;
++        const uint8_t * sc = x[i].scales;
++
++        int summs = 0;
++        for (int j = 0; j < 16; ++j) {
++            summs += y[i].bsums[j] * (sc[j] >> 4);
++        }
++
++        const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.d);
++        const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].GGML_COMMON_AGGR.dmin);
++
++        int isum = 0;
++        int is = 0;
++        int d;
++        for (int k = 0; k < QK_K/128; ++k) {
++            int shift = 0;
++            for (int j = 0; j < 4; ++j) {
++                d = sc[is++] & 0xF;
++                int isuml = 0;
++                for (int l = 0; l < 16; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
++                isum += d * isuml;
++                d = sc[is++] & 0xF;
++                isuml = 0;
++                for (int l = 16; l < 32; ++l) isuml += q8[l] * ((q2[l] >> shift) & 3);
++                isum += d * isuml;
++                shift += 2;
++                q8 += 32;
++            }
++            q2 += 32;
++        }
++        sumf += dall * isum - dmin * summs;
++    }
++    *s = sumf;
++#endif
++}
++
++static float make_qkx2_quants(int n, int nmax, const float *__restrict__ x, const float * __restrict__ weights,
++        uint8_t * __restrict__ L, float * __restrict__ the_min, uint8_t *__restrict__ Laux, // Laux holds the candidate quants of each fine-tuning step
++        float rmin, float rdelta, int nstep, int use_mad)
++{
++    float min = x[0];
++    float max = x[0];
++    float sum_w = weights[0];
++    float sum_x = sum_w * x[0];
++#ifdef HAVE_BUGGY_APPLE_LINKER
++    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
++    for (volatile int i = 1; i < n; ++i) {
++#else
++    for (int i = 1; i < n; ++i) { // find the min and max of the data
++#endif
++        if (x[i] < min) min = x[i];
++        if (x[i] > max) max = x[i];
++        float w = weights[i];
++        sum_w += w; // accumulated weight (sum of |x|)
++        sum_x += w * x[i]; // weighted sum of x (signed, since w = |x|)
++    }
++    if (min > 0) min = 0; // if even the minimum is positive, clamp min to 0
++    if (max == min) { // all elements are identical
++        for (int i = 0; i < n; ++i) L[i] = 0;
++        *the_min = -min;
++        return 0.f; // scale is 0 in this case
++    }
++    float iscale = nmax/(max - min); // quantization step: 2 bits cover a range of 3
++    float scale = 1/iscale; // scale factor
++    float best_mad = 0;
++    for (int i = 0; i < n; ++i) {
++        int l = nearest_int(iscale*(x[i] - min)); // integer this element maps to
++        L[i] = MAX(0, MIN(nmax, l)); // clamp to the representable range, always 0-3
++        float diff = scale * L[i] + min - x[i]; // dequantize to measure the error
++        diff = use_mad ? fabsf(diff) : diff * diff; // 1: use absolute error, otherwise squared error
++        float w = weights[i];
++        best_mad += w * diff; // accumulate error * |element|
++    }
++    if (nstep < 1) { // 0 means no fine-tuning
++        *the_min = -min;
++        return scale;
++    }
++    for (int is = 0; is <= nstep; ++is) { // 15 + 1 iterations
++        iscale = (rmin + rdelta*is + nmax)/(max - min); // -0.5 + 0.1*is + 3: fine-tune the representable range on each step
++        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
++        for (int i = 0; i < n; ++i) {
++            int l = nearest_int(iscale*(x[i] - min));
++            l = MAX(0, MIN(nmax, l));
++            Laux[i] = l;
++            float w = weights[i];
++            sum_l += w*l;
++            sum_l2 += w*l*l;
++            sum_xl += w*l*x[i];
++        }
++        float D = sum_w * sum_l2 - sum_l * sum_l;
++        if (D > 0) {
++            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
++            float this_min = (sum_l2 * sum_x - sum_l * sum_xl)/D;
++            if (this_min > 0) {
++                this_min = 0;
++                this_scale = sum_xl / sum_l2;
++            }
++            float mad = 0;
++            for (int i = 0; i < n; ++i) {
++                float diff = this_scale * Laux[i] + this_min - x[i];
++                diff = use_mad ? fabsf(diff) : diff * diff;
++                float w = weights[i];
++                mad += w * diff;
++            }
++            if (mad < best_mad) { // smaller mad means a better fit
++                for (int i = 0; i < n; ++i) {
++                    L[i] = Laux[i];
++                }
++                best_mad = mad;
++                scale = this_scale; // best scale for this block
++                min = this_min;
++            }
++        }
++    }
++    *the_min = -min; // return the best min of the block (negated)
++    return scale;
++}
++
++void quantize_row_q2_K(const float *__restrict__ x, block_q2_K *__restrict__ y, int64_t k)
++{
++    const int nb = k / QK_K; // number of super-blocks
++    uint8_t L[QK_K];
++    uint8_t Laux[16];
++    float weights[16];
++    float mins[QK_K/16];
++    float scales[QK_K/16];
++    const float q4scale = 15.f;
++
++    for (int i = 0; i < nb; i++) {
++        float max_scale = 0; // as we are deducting the min, scales are always positive
++        float max_min = 0;
++        for (int j = 0; j < QK_K/16; ++j) { // a super-block has 16 sub-blocks of 16 elements each
++            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]); // use |x| as the weights
++            // find the scale that minimizes the quantization error
++            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, 1);
++            float scale = scales[j];
++            if (scale > max_scale) { // track the largest scale in the super-block
++                max_scale = scale;
++            }
++            float min = mins[j];
++            if (min > max_min) { // make_qkx2_quants returns -min, so this tracks the most negative min in the super-block
++                max_min = min;
++            }
++        }
++
++        if (max_scale > 0) {
++            float iscale = q4scale/max_scale; // re-quantize the scales to 4 bits; the smallest scale is 0, so no min_scale is needed
++            for (int j = 0; j < QK_K/16; ++j) {
++                int l = nearest_int(iscale*scales[j]);
++                y[i].scales[j] = l; // the super-block scales are the re-quantized sub-block scales, stored in the low 4 bits
++            }
++            y[i].GGML_COMMON_AGGR.d = GGML_FP32_TO_FP16(max_scale/q4scale); // d is the super-block scale of the quantized scales
++        } else {
++            for (int j = 0; j < QK_K/16; ++j) y[i].scales[j] = 0;
++            y[i].GGML_COMMON_AGGR.d = GGML_FP32_TO_FP16(0.f);
++        }
++        if (max_min > 0) { // sub-block mins are also re-quantized to 4 bits
++            float iscale = q4scale/max_min; // baseline is 0 as well
++            for (int j = 0; j < QK_K/16; ++j) {
++                int l = nearest_int(iscale*mins[j]);
++                y[i].scales[j] |= (l << 4); // packed into the high 4 bits
++            }
++            y[i].GGML_COMMON_AGGR.dmin = GGML_FP32_TO_FP16(max_min/q4scale); // quantization factor for the mins
++        } else {
++            y[i].GGML_COMMON_AGGR.dmin = GGML_FP32_TO_FP16(0.f);
++        }
++        for (int j = 0; j < QK_K/16; ++j) {
++            const float d = GGML_FP16_TO_FP32(y[i].GGML_COMMON_AGGR.d) * (y[i].scales[j] & 0xF); // recover the sub-block scale
++            if (!d) continue;
++            const float dm = GGML_FP16_TO_FP32(y[i].GGML_COMMON_AGGR.dmin) * (y[i].scales[j] >> 4); // recover the sub-block min
++            for (int ii = 0; ii < 16; ++ii) {
++                int l = nearest_int((x[16*j + ii] + dm)/d); // q = (x + dm) / d
++                l = MAX(0, MIN(3, l));
++                L[16*j + ii] = l; // range 0-3, so L needs only 2 bits; each element is re-quantized with the sub-block scale and dmin
++            }
++        }
++
++#if QK_K == 256
++        for (int j = 0; j < QK_K; j += 128) {
++            for (int l = 0; l < 32; ++l) { // 32 bytes per pass, packing the values at offsets 0, 32, 64, 96 (128 values per pass)
++                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
++            }
++        }
++#else
++        for (int l = 0; l < 16; ++l) {
++            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
++        }
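++        // Both packing variants store four 2-bit quants per qs byte at shifts 0/2/4/6;
++        // dequantize_row_q2_K above unpacks them with the matching shifts.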
++#endif ++ ++ x += QK_K; ++ ++ } ++} ++ ++void quantize_row_q4_0(const float *__restrict__ x, block_q4_0 *__restrict__ y, int64_t k) ++{ ++ static const int qk = QK4_0; ++ const int nb = k / qk; ++ ++ for (int i = 0; i < nb; i++) { ++ float amax = 0.0f; // absolute max ++ float max = 0.0f; ++ ++ for (int j = 0; j < qk; j++) { ++ const float v = x[i*qk + j]; ++ if (amax < fabsf(v)) { ++ amax = fabsf(v); ++ max = v; ++ } ++ } ++ ++ const float d = max / -8; ++ const float id = d ? 1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const float x0 = x[i*qk + 0 + j]*id; ++ const float x1 = x[i*qk + qk/2 + j]*id; ++ ++ const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f)); ++ const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f)); ++ ++ y[i].qs[j] = xi0; ++ y[i].qs[j] |= xi1 << 4; ++ } ++ } ++} ++ ++void quantize_row_q4_1(const float *__restrict__ x, block_q4_1 *__restrict__ y, int64_t k) ++{ ++ const int qk = QK4_1; ++ ++ const int nb = k / qk; ++ ++ for (int i = 0; i < nb; i++) { ++ float min = FLT_MAX; ++ float max = -FLT_MAX; ++ ++ for (int j = 0; j < qk; j++) { ++ const float v = x[i*qk + j]; ++ ++ if (v < min) min = v; ++ if (v > max) max = v; ++ } ++ ++ const float d = (max - min) / ((1 << 4) - 1); ++ const float id = d ? 1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ y[i].m = GGML_FP32_TO_FP16(min); ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const float x0 = (x[i*qk + 0 + j] - min)*id; ++ const float x1 = (x[i*qk + qk/2 + j] - min)*id; ++ ++ const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f)); ++ const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f)); ++ ++ y[i].qs[j] = xi0; ++ y[i].qs[j] |= xi1 << 4; ++ } ++ } ++} ++ ++void quantize_row_q8_0(const float *__restrict__ x, block_q8_0 *__restrict__ vy, int64_t k) ++{ ++ const int nb = k / QK8_0; ++ block_q8_0 *__restrict__ y = vy; ++ ++#if defined(__ARM_NEON) ++ for (int i = 0; i < nb; i++) { ++ float32x4_t srcv [8]; ++ float32x4_t asrcv[8]; ++ float32x4_t amaxv[8]; ++ ++ for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j); ++ for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]); ++ ++ for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]); ++ for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]); ++ for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]); ++ ++ const float amax = vmaxvq_f32(amaxv[0]); ++ ++ const float d = amax / ((1 << 7) - 1); ++ const float id = d ? 
1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ ++ for (int j = 0; j < 8; j++) { ++ const float32x4_t v = vmulq_n_f32(srcv[j], id); ++ const int32x4_t vi = vcvtnq_s32_f32(v); ++ ++ y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0); ++ y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1); ++ y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2); ++ y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3); ++ } ++ } ++#elif defined(__wasm_simd128__) ++ for (int i = 0; i < nb; i++) { ++ v128_t srcv [8]; ++ v128_t asrcv[8]; ++ v128_t amaxv[8]; ++ ++ for (int j = 0; j < 8; j++) srcv[j] = wasm_v128_load(x + i*32 + 4*j); ++ for (int j = 0; j < 8; j++) asrcv[j] = wasm_f32x4_abs(srcv[j]); ++ ++ for (int j = 0; j < 4; j++) amaxv[2*j] = wasm_f32x4_max(asrcv[2*j], asrcv[2*j+1]); ++ for (int j = 0; j < 2; j++) amaxv[4*j] = wasm_f32x4_max(amaxv[4*j], amaxv[4*j+2]); ++ for (int j = 0; j < 1; j++) amaxv[8*j] = wasm_f32x4_max(amaxv[8*j], amaxv[8*j+4]); ++ ++ const float amax = MAX(MAX(wasm_f32x4_extract_lane(amaxv[0], 0), ++ wasm_f32x4_extract_lane(amaxv[0], 1)), ++ MAX(wasm_f32x4_extract_lane(amaxv[0], 2), ++ wasm_f32x4_extract_lane(amaxv[0], 3))); ++ ++ const float d = amax / ((1 << 7) - 1); ++ const float id = d ? 1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ ++ for (int j = 0; j < 8; j++) { ++ const v128_t v = wasm_f32x4_mul(srcv[j], wasm_f32x4_splat(id)); ++ const v128_t vi = wasm_i32x4_trunc_sat_f32x4(v); ++ ++ y[i].qs[4*j + 0] = wasm_i32x4_extract_lane(vi, 0); ++ y[i].qs[4*j + 1] = wasm_i32x4_extract_lane(vi, 1); ++ y[i].qs[4*j + 2] = wasm_i32x4_extract_lane(vi, 2); ++ y[i].qs[4*j + 3] = wasm_i32x4_extract_lane(vi, 3); ++ } ++ } ++#elif defined(__AVX2__) || defined(__AVX__) ++ for (int i = 0; i < nb; i++) { ++ // Load elements into 4 AVX vectors ++ __m256 v0 = _mm256_loadu_ps( x ); ++ __m256 v1 = _mm256_loadu_ps( x + 8 ); ++ __m256 v2 = _mm256_loadu_ps( x + 16 ); ++ __m256 v3 = _mm256_loadu_ps( x + 24 ); ++ x += 32; ++ ++ // Compute max(abs(e)) for the block ++ const __m256 signBit = _mm256_set1_ps( -0.0f ); ++ __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); ++ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); ++ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); ++ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); ++ ++ __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); ++ max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); ++ max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); ++ const float maxScalar = _mm_cvtss_f32( max4 ); ++ ++ // Quantize these floats ++ const float d = maxScalar / 127.f; ++ y[i].d = GGML_FP32_TO_FP16(d); ++ const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; ++ const __m256 mul = _mm256_set1_ps( id ); ++ ++ // Apply the multiplier ++ v0 = _mm256_mul_ps( v0, mul ); ++ v1 = _mm256_mul_ps( v1, mul ); ++ v2 = _mm256_mul_ps( v2, mul ); ++ v3 = _mm256_mul_ps( v3, mul ); ++ ++ // Round to nearest integer ++ v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); ++ v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); ++ v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); ++ v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); ++ ++ // Convert floats to integers ++ __m256i i0 = _mm256_cvtps_epi32( v0 ); ++ __m256i i1 = _mm256_cvtps_epi32( v1 ); ++ __m256i i2 = _mm256_cvtps_epi32( v2 ); ++ __m256i i3 = _mm256_cvtps_epi32( v3 ); ++ ++#if defined(__AVX2__) ++ // Convert int32 to int16 ++ i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 ++ i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 ++ // Convert int16 to int8 ++ i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 ++ ++ // We got our precious signed bytes, but the order is now wrong ++ // These AVX2 pack instructions process 16-byte pieces independently ++ // The following instruction is fixing the order ++ const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); ++ i0 = _mm256_permutevar8x32_epi32( i0, perm ); ++ ++ _mm256_storeu_si256((__m256i *)y[i].qs, i0); ++#else ++ // Since we don't have in AVX some necessary functions, ++ // we split the registers in half and call AVX2 analogs from SSE ++ __m128i ni0 = _mm256_castsi256_si128( i0 ); ++ __m128i ni1 = _mm256_extractf128_si256( i0, 1); ++ __m128i ni2 = _mm256_castsi256_si128( i1 ); ++ __m128i ni3 = _mm256_extractf128_si256( i1, 1); ++ __m128i ni4 = _mm256_castsi256_si128( i2 ); ++ __m128i ni5 = _mm256_extractf128_si256( i2, 1); ++ __m128i ni6 = _mm256_castsi256_si128( i3 ); ++ __m128i ni7 = _mm256_extractf128_si256( i3, 1); ++ ++ // Convert int32 to int16 ++ ni0 = _mm_packs_epi32( ni0, ni1 ); ++ ni2 = _mm_packs_epi32( ni2, ni3 ); ++ ni4 = _mm_packs_epi32( ni4, ni5 ); ++ ni6 = _mm_packs_epi32( ni6, ni7 ); ++ // Convert int16 to int8 ++ ni0 = _mm_packs_epi16( ni0, ni2 ); ++ ni4 = _mm_packs_epi16( ni4, ni6 ); ++ ++ _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); ++ _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); ++#endif ++ } ++#elif defined(__riscv_v_intrinsic) ++ ++ size_t vl = __riscv_vsetvl_e32m4(QK8_0); ++ ++ for (int i = 0; i < nb; i++) { ++ // load elements ++ vfloat32m4_t v_x = __riscv_vle32_v_f32m4(x+i*QK8_0, vl); ++ ++ vfloat32m4_t vfabs = __riscv_vfabs_v_f32m4(v_x, vl); ++ vfloat32m1_t tmp = __riscv_vfmv_v_f_f32m1(0.0f, vl); ++ vfloat32m1_t vmax = __riscv_vfredmax_vs_f32m4_f32m1(vfabs, tmp, vl); ++ float amax = __riscv_vfmv_f_s_f32m1_f32(vmax); ++ ++ const float d = amax / ((1 << 7) - 1); ++ const float id = d ? 
1.0f/d : 0.0f; ++ ++ y[i].d = GGML_FP32_TO_FP16(d); ++ ++ vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl); ++ ++ // convert to integer ++ vint16m2_t vi = __riscv_vfncvt_x_f_w_i16m2(x0, vl); ++ vint8m1_t vs = __riscv_vncvt_x_x_w_i8m1(vi, vl); ++ ++ // store result ++ __riscv_vse8_v_i8m1(y[i].qs , vs, vl); ++ } ++#else ++ GGML_UNUSED(nb); ++ // scalar ++ quantize_row_q8_0_reference(x, y, k); ++#endif ++} ++ ++void quantize_row_q8_1(const float * __restrict__ x, block_q8_1 * __restrict__ y, int64_t k) ++{ ++ const int nb = k / QK8_1; ++ ++ for (int i = 0; i < nb; i++) { ++ float amax = 0.0f; // absolute max ++ ++ for (int j = 0; j < QK8_1; j++) { ++ const float v = x[i*QK8_1 + j]; ++ amax = MAX(amax, fabsf(v)); ++ } ++ ++ const float d = amax / ((1 << 7) - 1); ++ const float id = d ? 1.0f/d : 0.0f; ++ ++ y[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d = GGML_FP32_TO_FP16(d); ++ ++ int sum = 0; ++ ++ for (int j = 0; j < QK8_1/2; ++j) { ++ const float v0 = x[i*QK8_1 + j]*id; ++ const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id; ++ ++ y[i].qs[ j] = roundf(v0); ++ y[i].qs[QK8_1/2 + j] = roundf(v1); ++ ++ sum += y[i].qs[ j]; ++ sum += y[i].qs[QK8_1/2 + j]; ++ } ++ ++ y[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s = GGML_FP32_TO_FP16(sum*d); ++ } ++} ++ ++static float make_qx_quants(int n, int nmax, const float *__restrict__ x, int8_t * __restrict__ L, int rmse_type, ++ const float *__restrict__ qw) ++{ ++ float max = 0; ++ float amax = 0; ++ for (int i = 0; i < n; ++i) { ++ float ax = fabsf(x[i]); ++ if (ax > amax) { amax = ax; max = x[i]; } ++ } ++ if (amax < 1e-30f) { // all zero ++ for (int i = 0; i < n; ++i) { ++ L[i] = 0; ++ } ++ return 0.f; ++ } ++ float iscale = -nmax / max; ++ if (rmse_type == 0) { ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); ++ } ++ return 1/iscale; ++ } ++ int return_early = 0; ++ if (rmse_type < 0) { ++ rmse_type = -rmse_type; ++ return_early = 1; ++ } ++ float sumlx = 0; ++ float suml2 = 0; ++#ifdef HAVE_BUGGY_APPLE_LINKER ++ // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 ++ for (volatile int i = 0; i < n; ++i) { ++#else ++ for (int i = 0; i < n; ++i) { ++#endif ++ int l = nearest_int(iscale * x[i]); ++ l = MAX(-nmax, MIN(nmax-1, l)); ++ L[i] = l + nmax; ++ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); ++ sumlx += w*x[i]*l; ++ suml2 += w*l*l; ++ } ++ float scale = sumlx/suml2; ++ if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; ++ float best = scale * sumlx; ++ for (int is = -9; is <= 9; ++is) { ++ if (is == 0) { ++ continue; ++ } ++ iscale = -(nmax + 0.1f*is) / max; ++ sumlx = suml2 = 0; ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ l = MAX(-nmax, MIN(nmax-1, l)); ++ float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? 
fabsf(x[i]) : sqrtf(fabsf(x[i])); ++ sumlx += w*x[i]*l; ++ suml2 += w*l*l; ++ } ++ if (suml2 > 0 && sumlx*sumlx > best*suml2) { ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); ++ } ++ scale = sumlx/suml2; best = scale*sumlx; ++ } ++ } ++ return scale; ++} ++ ++void quantize_row_q6_K(const float * __restrict__ x, block_q6_K *__restrict__ y, int64_t k) ++{ ++ const int64_t nb = k / QK_K; ++ ++ int8_t L[QK_K]; ++ float scales[QK_K/16]; ++ ++ for (int i = 0; i < nb; i++) { ++ float max_scale = 0; ++ float max_abs_scale = 0; ++ ++ for (int ib = 0; ib < QK_K/16; ++ib) { ++ ++ const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL); ++ scales[ib] = scale; ++ ++ const float abs_scale = fabsf(scale); ++ if (abs_scale > max_abs_scale) { ++ max_abs_scale = abs_scale; ++ max_scale = scale; ++ } ++ ++ } ++ ++ if (!max_abs_scale) { ++ memset(&y[i], 0, sizeof(block_q6_K)); ++ y[i].d = GGML_FP32_TO_FP16(0.f); ++ x += QK_K; ++ continue; ++ } ++ ++ float iscale = -128.f/max_scale; ++ y[i].d = GGML_FP32_TO_FP16(1/iscale); ++ for (int ib = 0; ib < QK_K/16; ++ib) { ++ y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); ++ } ++ ++ for (int j = 0; j < QK_K/16; ++j) { ++ float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; ++ if (!d) { ++ continue; ++ } ++ for (int ii = 0; ii < 16; ++ii) { ++ int l = nearest_int(x[16*j + ii]/d); ++ l = MAX(-32, MIN(31, l)); ++ L[16*j + ii] = l + 32; ++ } ++ } ++ ++ uint8_t * __restrict__ ql = y[i].ql; ++ uint8_t * __restrict__ qh = y[i].qh; ++#if QK_K == 256 ++ for (int j = 0; j < QK_K; j += 128) { ++ for (int l = 0; l < 32; ++l) { ++ const uint8_t q1 = L[j + l + 0] & 0xF; ++ const uint8_t q2 = L[j + l + 32] & 0xF; ++ const uint8_t q3 = L[j + l + 64] & 0xF; ++ const uint8_t q4 = L[j + l + 96] & 0xF; ++ ql[l+ 0] = q1 | (q3 << 4); ++ ql[l+32] = q2 | (q4 << 4); ++ qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); ++ } ++ ql += 64; ++ qh += 32; ++ } ++#else ++ for (int l = 0; l < 32; ++l) { ++ const uint8_t q1 = L[l + 0] & 0xF; ++ const uint8_t q2 = L[l + 32] & 0xF; ++ ql[l] = q1 | (q2 << 4); ++ } ++ for (int l = 0; l < 16; ++l) { ++ qh[l] = (L[l] >> 4) | ((L[l + 16] >> 4) << 2) | ((L[l + 32] >> 4) << 4) | ((L[l + 48] >> 4) << 6); ++ } ++#endif ++ ++ x += QK_K; ++ ++ } ++} ++ ++void quantize_row_q8_K(const float *__restrict__ x, block_q8_K *__restrict__ y, int64_t k) ++{ ++ const int64_t nb = k / QK_K; ++ ++ for (int i = 0; i < nb; i++) { ++ float max = 0; ++ float amax = 0; ++ for (int j = 0; j < QK_K; ++j) { ++ float ax = fabsf(x[j]); ++ if (ax > amax) { ++ amax = ax; max = x[j]; ++ } ++ } ++ if (!amax) { ++ y[i].d = 0; ++ memset(y[i].qs, 0, QK_K); ++ x += QK_K; ++ continue; ++ } ++ //const float iscale = -128.f/max; ++ // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward ++ const float iscale = -127.f/max; ++ for (int j = 0; j < QK_K; ++j) { ++ int v = nearest_int(iscale*x[j]); ++ y[i].qs[j] = MIN(127, v); ++ } ++ for (int j = 0; j < QK_K/16; ++j) { ++ int sum = 0; ++ for (int ii = 0; ii < 16; ++ii) { ++ sum += y[i].qs[j*16 + ii]; ++ } ++ y[i].bsums[j] = sum; ++ } ++ y[i].d = 1 / iscale; ++ x += QK_K; ++ } ++} ++ ++static float make_q3_quants(int n, int nmax, const float *__restrict__ x, int8_t * __restrict__ L, int do_rmse) ++{ ++ float max = 0; ++ float amax = 0; ++ for (int i = 0; i < n; ++i) { ++ float ax = fabsf(x[i]); ++ if (ax > amax) { amax = ax; max = x[i]; } ++ } ++ if (!amax) { // 
all zero ++ for (int i = 0; i < n; ++i) { L[i] = 0; } ++ return 0.f; ++ } ++ float iscale = -nmax / max; ++ if (do_rmse) { ++ float sumlx = 0; ++ float suml2 = 0; ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ l = MAX(-nmax, MIN(nmax-1, l)); ++ L[i] = l; ++ float w = x[i]*x[i]; ++ sumlx += w*x[i]*l; ++ suml2 += w*l*l; ++ } ++ for (int itry = 0; itry < 5; ++itry) { ++ int n_changed = 0; ++ for (int i = 0; i < n; ++i) { ++ float w = x[i]*x[i]; ++ float slx = sumlx - w*x[i]*L[i]; ++ if (slx > 0) { ++ float sl2 = suml2 - w*L[i]*L[i]; ++ int new_l = nearest_int(x[i] * sl2 / slx); ++ new_l = MAX(-nmax, MIN(nmax-1, new_l)); ++ if (new_l != L[i]) { ++ slx += w*x[i]*new_l; ++ sl2 += w*new_l*new_l; ++ if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) { ++ L[i] = new_l; sumlx = slx; suml2 = sl2; ++ ++n_changed; ++ } ++ } ++ } ++ } ++ if (!n_changed) { ++ break; ++ } ++ } ++ for (int i = 0; i < n; ++i) { ++ L[i] += nmax; ++ } ++ return sumlx / suml2; ++ } ++ for (int i = 0; i < n; ++i) { ++ int l = nearest_int(iscale * x[i]); ++ l = MAX(-nmax, MIN(nmax-1, l)); ++ L[i] = l + nmax; ++ } ++ return 1/iscale; ++} ++ ++void quantize_row_q3_K(const float *__restrict__ x, block_q3_K *__restrict__ y, int64_t k) ++{ ++ const int nb = k / QK_K; ++ int8_t L[QK_K]; ++ float scales[QK_K / 16]; ++ ++ for (int i = 0; i < nb; i++) { ++ float max_scale = 0; ++ float amax = 0; ++ for (int j = 0; j < QK_K / 16; ++j) { ++ scales[j] = make_q3_quants(16, 4, x + 16 * j, L + 16 * j, 1); ++ float scale = fabsf(scales[j]); ++ if (scale > amax) { ++ amax = scale; max_scale = scales[j]; ++ } ++ } ++ ++#if QK_K == 256 ++ memset(y[i].scales, 0, 12); ++ if (max_scale) { ++ float iscale = -32.f/max_scale; ++ for (int j = 0; j < QK_K/16; ++j) { ++ int8_t l = nearest_int(iscale*scales[j]); ++ l = MAX(-32, MIN(31, l)) + 32; ++ if (j < 8) { ++ y[i].scales[j] = l & 0xF; ++ } else { ++ y[i].scales[j-8] |= ((l & 0xF) << 4); ++ } ++ l >>= 4; ++ y[i].scales[j%4 + 8] |= (l << (2*(j/4))); ++ } ++ y[i].d = GGML_FP32_TO_FP16(1/iscale); ++ } else { ++ y[i].d = GGML_FP32_TO_FP16(0.f); ++ } ++ ++ int8_t sc; ++ for (int j = 0; j < QK_K/16; ++j) { ++ sc = j < 8 ? y[i].scales[j] & 0xF : y[i].scales[j-8] >> 4; ++ sc = (sc | (((y[i].scales[8 + j%4] >> (2*(j/4))) & 3) << 4)) - 32; ++ float d = GGML_FP16_TO_FP32(y[i].d) * sc; ++ if (!d) { ++ continue; ++ } ++ for (int ii = 0; ii < 16; ++ii) { ++ int l = nearest_int(x[16*j + ii]/d); ++ l = MAX(-4, MIN(3, l)); ++ L[16*j + ii] = l + 4; ++ } ++ } ++#else ++ if (max_scale) { ++ float iscale = -8.f/max_scale; ++ for (int j = 0; j < QK_K/16; j+=2) { ++ int l1 = nearest_int(iscale*scales[j]); ++ l1 = 8 + MAX(-8, MIN(7, l1)); ++ int l2 = nearest_int(iscale*scales[j+1]); ++ l2 = 8 + MAX(-8, MIN(7, l2)); ++ y[i].scales[j/2] = l1 | (l2 << 4); ++ } ++ y[i].d = GGML_FP32_TO_FP16(1/iscale); ++ } else { ++ for (int j = 0; j < QK_K/16; j+=2) { ++ y[i].scales[j/2] = 0; ++ } ++ y[i].d = GGML_FP32_TO_FP16(0.f); ++ } ++ for (int j = 0; j < QK_K/16; ++j) { ++ int s = j%2 == 0 ? y[i].scales[j/2] & 0xF : y[i].scales[j/2] >> 4; ++ float d = GGML_FP16_TO_FP32(y[i].d) * (s - 8); ++ if (!d) { ++ continue; ++ } ++ for (int ii = 0; ii < 16; ++ii) { ++ int l = nearest_int(x[16*j + ii]/d); ++ l = MAX(-4, MIN(3, l)); ++ L[16*j + ii] = l + 4; ++ } ++ } ++#endif ++ ++ memset(y[i].hmask, 0, QK_K/8); ++ // We put the high-bit for the 1st 8 quants into bit 0, the next 8 into bit 1, etc. 
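++        // With QK_K == 256 the loop below cycles m through QK_K/8 = 32 bytes, so the high bit
++        // of quant j lands in hmask[j % 32] at bit position j / 32 (e.g. j = 70 -> hmask[6], bit 2);
++        // L[j] keeps only its low 2 bits afterwards.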
++ int m = 0; ++ uint8_t hm = 1; ++ for (int j = 0; j < QK_K; ++j) { ++ if (L[j] > 3) { ++ y[i].hmask[m] |= hm; ++ L[j] -= 4; ++ } ++ if (++m == QK_K/8) { ++ m = 0; hm <<= 1; ++ } ++ } ++#if QK_K == 256 ++ for (int j = 0; j < QK_K; j += 128) { ++ for (int l = 0; l < 32; ++l) { ++ y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6); ++ } ++ } ++#else ++ for (int l = 0; l < 16; ++l) { ++ y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6); ++ } ++#endif ++ ++ x += QK_K; ++ } ++} ++ ++void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ const int qk = QK8_0; ++ const int nb = n / qk; ++ ++ (void)(nrc); ++ (void)(bx); ++ (void)(by); ++ (void)(bs); ++ ++ const block_q4_0 * __restrict__ x = vx; ++ const block_q8_0 * __restrict__ y = vy; ++ ++#if defined(__ARM_FEATURE_MATMUL_INT8) ++ if (nrc == 2) { ++ const block_q4_0 * __restrict__ vx0 = vx; ++ const block_q4_0 * __restrict__ vx1 = (const block_q4_0 *) ((const uint8_t*)vx + bx); ++ ++ const block_q8_0 * __restrict__ vy0 = vy; ++ const block_q8_0 * __restrict__ vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); ++ ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ ++ for (int i = 0; i < nb; i++) { ++ const block_q4_0 * __restrict__ b_x0 = &vx0[i]; ++ const block_q4_0 * __restrict__ b_x1 = &vx1[i]; ++ const block_q8_0 * __restrict__ b_y0 = &vy0[i]; ++ const block_q8_0 * __restrict__ b_y1 = &vy1[i]; ++ ++ const uint8x16_t m4b = vdupq_n_u8(0x0F); ++ const int8x16_t s8b = vdupq_n_s8(0x8); ++ ++ const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); ++ const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); ++ ++ // 4-bit -> 8-bit ++ const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); ++ const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); ++ const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); ++ const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); ++ ++ // sub 8 ++ const int8x16_t x0_l = vsubq_s8(v0_0l, s8b); ++ const int8x16_t x0_h = vsubq_s8(v0_0h, s8b); ++ const int8x16_t x1_l = vsubq_s8(v0_1l, s8b); ++ const int8x16_t x1_h = vsubq_s8(v0_1h, s8b); ++ ++ // load y ++ const int8x16_t y0_l = vld1q_s8(b_y0->qs); ++ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); ++ const int8x16_t y1_l = vld1q_s8(b_y1->qs); ++ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); ++ ++ float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; ++ ++ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ ++ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ ++ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ ++ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), 
vreinterpretq_s64_s8(y1_h)));
++
++            sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)),
++                            l1, r1)), l2, r2)), l3, r3))), scale);
++        }
++        float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2);
++        float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1);
++
++        vst1_f32(s, vget_low_f32(sumv2));
++        vst1_f32(s + bs, vget_high_f32(sumv2));
++        return;
++    }
++#endif
++
++#if defined(__ARM_FEATURE_SVE)
++    float sum[4] = {0.0};
++    int64_t x0_sum, x1_sum, x0_sum_1, x1_sum_1;
++    svbool_t pre = svptrue_b8();
++    svbool_t pre32 = svptrue_b32();
++    svint32_t x0_32_0 = svdup_s32(0);
++    svint32_t x1_32_0 = svdup_s32(0);
++
++    for (int i = 0; i < nb; i += 4) {
++        __builtin_prefetch((char *)&x[i], 0, 3);
++        __builtin_prefetch((char *)&y[i], 0, 3);
++        const block_q4_0 *__restrict__ x0 = &x[i + 0];
++        const block_q4_0 *__restrict__ x1 = &x[i + 1];
++        const block_q4_0 *__restrict__ x2 = &x[i + 2];
++        const block_q4_0 *__restrict__ x3 = &x[i + 3];
++
++        const block_q8_0 *__restrict__ y0 = &y[i + 0];
++        const block_q8_0 *__restrict__ y1 = &y[i + 1];
++        const block_q8_0 *__restrict__ y2 = &y[i + 2];
++        const block_q8_0 *__restrict__ y3 = &y[i + 3];
++
++        svuint8_t x_merge = svld1(pre, x0->qs);
++        memcpy((char *)&x_merge + 16, x1->qs, 16);
++        /* low 4 bits - 8 */
++        svint8_t x_low4_0 = svsub_z(pre, svreinterpret_s8(svand_z(pre, x_merge, 0xf)), 8);
++        svint8_t x_high4_0 = svsub_z(pre, svreinterpret_s8(svlsr_z(pre, x_merge, 4)), 8);
++        /* interleaved storage */
++        svint8_t x_tmp = svext(x_high4_0, x_low4_0, 16);
++        svint8_t x_low4 = svext(x_tmp, x_high4_0, 16);
++        svint8_t x_high4 = svext(x_low4_0, x_tmp, 16);
++        /* load y */
++        svint8_t y0_merge = svld1(pre, y0->qs);
++        svint8_t y1_merge = svld1(pre, y1->qs);
++
++        svuint8_t x_merge1 = svld1(pre, x2->qs);
++        memcpy((char *)&x_merge1 + 16, x3->qs, 16);
++        /* low 4 bits - 8 */
++        svint8_t x_low4_2 = svsub_z(pre, svreinterpret_s8(svand_z(pre, x_merge1, 0xf)), 8);
++        svint8_t x_high4_2 = svsub_z(pre, svreinterpret_s8(svlsr_z(pre, x_merge1, 4)), 8);
++        x_tmp = svext(x_high4_2, x_low4_2, 16);
++        svint8_t x_low4_1 = svext(x_tmp, x_high4_2, 16);
++        svint8_t x_high4_1 = svext(x_low4_2, x_tmp, 16);
++        /* load y */
++        svint8_t y0_merge_1 = svld1(pre, y2->qs);
++        svint8_t y1_merge_1 = svld1(pre, y3->qs);
++
++        svint32_t x0_mul32 = svdot(x0_32_0, x_low4, y0_merge);
++        svint32_t x1_mul32 = svdot(x1_32_0, x_high4, y1_merge);
++        svint32_t x0_mul32_1 = svdot(x0_32_0, x_low4_1, y0_merge_1);
++        svint32_t x1_mul32_1 = svdot(x1_32_0, x_high4_1, y1_merge_1);
++
++        x0_sum = svaddv(pre32, x0_mul32);
++        x1_sum = svaddv(pre32, x1_mul32);
++        x0_sum_1 = svaddv(pre32, x0_mul32_1);
++        x1_sum_1 = svaddv(pre32, x1_mul32_1);
++
++        sum[0] += x0_sum * x0->d * y0->d;
++        sum[1] += x1_sum * x1->d * y1->d;
++        sum[2] += x0_sum_1 * x2->d * y2->d;
++        sum[3] += x1_sum_1 * x3->d * y3->d;
++    }
++    *s = sum[0] + sum[1] + sum[2] + sum[3];
++#elif defined(__ARM_NEON)
++    float32x4_t sumv0 = vdupq_n_f32(0.0f);
++    float32x4_t sumv1 = vdupq_n_f32(0.0f);
++
++    for (int i = 0; i < nb; i += 2) {
++        const block_q4_0 * __restrict__ x0 = &x[i + 0];
++        const block_q4_0 * __restrict__ x1 = &x[i + 1];
++        const block_q8_0 * __restrict__ y0 = &y[i + 0];
++        const block_q8_0 * __restrict__ y1 = &y[i + 1];
++
++        const uint8x16_t m4b = vdupq_n_u8(0x0F);
++        const int8x16_t s8b = vdupq_n_s8(0x8);
++
++        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
++        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
++
++        // 4-bit -> 8-bit
++        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
++        const int8x16_t v0_0h = 
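++        // The NEON fallback handles two blocks per iteration: ggml_vdotq_s32
++        // (vdotq_s32 when the dotprod extension is available, otherwise a
++        // widening multiply-add) reduces the int8 products into int32 lanes,
++        // and vmlaq_n_f32 applies the combined per-block scale d_x * d_y.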
vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); ++ const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); ++ const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); ++ ++ // sub 8 ++ const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); ++ const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); ++ const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); ++ const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); ++ ++ // load y ++ const int8x16_t v1_0l = vld1q_s8(y0->qs); ++ const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); ++ const int8x16_t v1_1l = vld1q_s8(y1->qs); ++ const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); ++ ++ // dot product into int32x4_t ++ const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h); ++ const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h); ++ ++ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ++ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ++ } ++ ++ *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); ++#elif defined(__AVX2__) ++ // Initialize accumulator with zeros ++ __m256 acc = _mm256_setzero_ps(); ++ ++ // Main loop ++ for (int i = 0; i < nb; ++i) { ++ /* Compute combined scale for the block */ ++ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); ++ ++ __m256i qx = bytes_from_nibbles_32(x[i].qs); ++ ++ // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. ++ const __m256i off = _mm256_set1_epi8( 8 ); ++ qx = _mm256_sub_epi8( qx, off ); ++ ++ __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs); ++ ++ const __m256 q = mul_sum_i8_pairs_float(qx, qy); ++ ++ /* Multiply q with scale and accumulate */ ++ acc = _mm256_fmadd_ps( d, q, acc ); ++ } ++ ++ *s = hsum_float_8(acc); ++#elif defined(__AVX__) ++ // Initialize accumulator with zeros ++ __m256 acc = _mm256_setzero_ps(); ++ ++ // Main loop ++ for (int i = 0; i < nb; ++i) { ++ // Compute combined scale for the block ++ const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); ++ ++ const __m128i lowMask = _mm_set1_epi8(0xF); ++ const __m128i off = _mm_set1_epi8(8); ++ ++ const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs); ++ ++ __m128i bx_0 = _mm_and_si128(lowMask, tmp); ++ __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); ++ bx_0 = _mm_sub_epi8(bx_0, off); ++ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); ++ ++ bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4)); ++ by_0 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); ++ bx_0 = _mm_sub_epi8(bx_0, off); ++ const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0); ++ ++ // Convert int32_t to float ++ __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1)); ++ ++ // Apply the scale, and accumulate ++ acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); ++ } ++ ++ *s = hsum_float_8(acc); ++#elif defined(__SSSE3__) ++ // set constants ++ const __m128i lowMask = _mm_set1_epi8(0xF); ++ const __m128i off = _mm_set1_epi8(8); ++ ++ // Initialize accumulator with zeros ++ __m128 acc_0 = _mm_setzero_ps(); ++ __m128 acc_1 = _mm_setzero_ps(); ++ __m128 acc_2 = _mm_setzero_ps(); ++ __m128 acc_3 = _mm_setzero_ps(); ++ ++ // First round without accumulation ++ { ++ _mm_prefetch(&x[0] + sizeof(block_q4_0), _MM_HINT_T0); ++ _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0); ++ ++ // Compute combined scale for the block 0 and 1 ++ const __m128 d_0_1 = _mm_set1_ps( 
GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) ); ++ ++ const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs); ++ ++ __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); ++ __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs); ++ bx_0 = _mm_sub_epi8(bx_0, off); ++ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); ++ ++ __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); ++ __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16)); ++ bx_1 = _mm_sub_epi8(bx_1, off); ++ const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); ++ ++ _mm_prefetch(&x[1] + sizeof(block_q4_0), _MM_HINT_T0); ++ _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0); ++ ++ // Compute combined scale for the block 2 and 3 ++ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) ); ++ ++ const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs); ++ ++ __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); ++ __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs); ++ bx_2 = _mm_sub_epi8(bx_2, off); ++ const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); ++ ++ __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); ++ __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16)); ++ bx_3 = _mm_sub_epi8(bx_3, off); ++ const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); ++ ++ // Convert int32_t to float ++ __m128 p0 = _mm_cvtepi32_ps(i32_0); ++ __m128 p1 = _mm_cvtepi32_ps(i32_1); ++ __m128 p2 = _mm_cvtepi32_ps(i32_2); ++ __m128 p3 = _mm_cvtepi32_ps(i32_3); ++ ++ // Apply the scale ++ acc_0 = _mm_mul_ps( d_0_1, p0 ); ++ acc_1 = _mm_mul_ps( d_0_1, p1 ); ++ acc_2 = _mm_mul_ps( d_2_3, p2 ); ++ acc_3 = _mm_mul_ps( d_2_3, p3 ); ++ } ++ ++ assert(nb % 2 == 0); // TODO: handle odd nb ++ ++ // Main loop ++ for (int i = 2; i < nb; i+=2) { ++ _mm_prefetch(&x[i] + sizeof(block_q4_0), _MM_HINT_T0); ++ _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0); ++ ++ // Compute combined scale for the block 0 and 1 ++ const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) ); ++ ++ const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs); ++ ++ __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1); ++ __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs); ++ bx_0 = _mm_sub_epi8(bx_0, off); ++ const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0); ++ ++ __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4)); ++ __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16)); ++ bx_1 = _mm_sub_epi8(bx_1, off); ++ const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1); ++ ++ _mm_prefetch(&x[i] + 2 * sizeof(block_q4_0), _MM_HINT_T0); ++ _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0); ++ ++ // Compute combined scale for the block 2 and 3 ++ const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) ); ++ ++ const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs); ++ ++ __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3); ++ __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs); ++ bx_2 = _mm_sub_epi8(bx_2, off); ++ const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2); ++ ++ __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4)); ++ __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16)); ++ bx_3 = _mm_sub_epi8(bx_3, off); ++ const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3); ++ ++ // Convert int32_t to float ++ __m128 p0 = _mm_cvtepi32_ps(i32_0); ++ __m128 p1 = _mm_cvtepi32_ps(i32_1); ++ __m128 p2 = _mm_cvtepi32_ps(i32_2); ++ __m128 p3 = 
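++        // The partial sums for the first two blocks are converted to float and
++        // scaled with _mm_mul_ps rather than added into the accumulators, which
++        // initializes acc_0..acc_3 and lets the main loop below start at i = 2.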
_mm_cvtepi32_ps(i32_3); ++ ++ // Apply the scale ++ __m128 p0_d = _mm_mul_ps( d_0_1, p0 ); ++ __m128 p1_d = _mm_mul_ps( d_0_1, p1 ); ++ __m128 p2_d = _mm_mul_ps( d_2_3, p2 ); ++ __m128 p3_d = _mm_mul_ps( d_2_3, p3 ); ++ ++ // Acummulate ++ acc_0 = _mm_add_ps(p0_d, acc_0); ++ acc_1 = _mm_add_ps(p1_d, acc_1); ++ acc_2 = _mm_add_ps(p2_d, acc_2); ++ acc_3 = _mm_add_ps(p3_d, acc_3); ++ } ++ ++ *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); ++#elif defined(__riscv_v_intrinsic) ++ float sumf = 0.0; ++ ++ size_t vl = __riscv_vsetvl_e8m1(qk/2); ++ ++ for (int i = 0; i < nb; i++) { ++ // load elements ++ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[i].qs, vl); ++ ++ vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[i].qs, vl); ++ vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[i].qs+16, vl); ++ ++ // mask and store lower part of x, and then upper part ++ vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); ++ vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); ++ ++ vint8mf2_t x_ai = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); ++ vint8mf2_t x_li = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); ++ ++ // subtract offset ++ vint8mf2_t v0 = __riscv_vsub_vx_i8mf2(x_ai, 8, vl); ++ vint8mf2_t v1 = __riscv_vsub_vx_i8mf2(x_li, 8, vl); ++ ++ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); ++ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); ++ ++ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); ++ ++ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); ++ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); ++ ++ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); ++ ++ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); ++ } ++ ++ *s = sumf; ++#else ++ // scalar ++ float sumf = 0.0; ++ ++ for (int i = 0; i < nb; i++) { ++ int sumi = 0; ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const int v0 = (x[i].qs[j] & 0x0F) - 8; ++ const int v1 = (x[i].qs[j] >> 4) - 8; ++ ++ sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]); ++ } ++ ++ sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); ++ } ++ ++ *s = sumf; ++#endif ++} ++ ++void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ const int qk = QK8_1; ++ const int nb = n / qk; ++ ++ const block_q4_1 * __restrict__ x = vx; ++ const block_q8_1 * __restrict__ y = vy; ++ ++#if defined(__ARM_FEATURE_MATMUL_INT8_0) ++ if (nrc == 2) { ++ const block_q4_1 * __restrict__ vx0 = vx; ++ const block_q4_1 * __restrict__ vx1 = (const block_q4_1 *) ((const uint8_t*)vx + bx); ++ const block_q8_1 * __restrict__ vy0 = vy; ++ const block_q8_1 * __restrict__ vy1 = (const block_q8_1 *) ((const uint8_t*)vy + by); ++ ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ float32x4_t summs0 = vdupq_n_f32(0.0f); ++ ++ for (int i = 0; i < nb; i++) { ++ const block_q4_1 * __restrict__ b_x0 = &vx0[i]; ++ const block_q4_1 * __restrict__ b_x1 = &vx1[i]; ++ const block_q8_1 * __restrict__ b_y0 = &vy0[i]; ++ const block_q8_1 * __restrict__ b_y1 = &vy1[i]; ++ ++ float32_t summs_t[4] = { ++ GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s), ++ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s), ++ GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s), ++ GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s) ++ }; ++ summs0 = vaddq_f32(summs0, vld1q_f32(summs_t)); ++ ++ const uint8x16_t m4b = vdupq_n_u8(0x0F); ++ ++ const uint8x16_t v0_0 = vld1q_u8(b_x0->qs); ++ const uint8x16_t v0_1 = vld1q_u8(b_x1->qs); ++ ++ // 4-bit -> 8-bit ++ const 
int8x16_t x0_l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); ++ const int8x16_t x0_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); ++ const int8x16_t x1_l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); ++ const int8x16_t x1_h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); ++ ++ // load y ++ const int8x16_t y0_l = vld1q_s8(b_y0->qs); ++ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); ++ const int8x16_t y1_l = vld1q_s8(b_y1->qs); ++ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); ++ ++ // mmla into int32x4_t ++ float32_t _scale[4] = { ++ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d) ++ }; ++ float32x4_t scale = vld1q_f32(_scale); ++ ++ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ ++ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ ++ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ ++ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), ++ l1, r1)), l2, r2)), l3, r3))), scale); ++ } ++ ++ float32x4_t sumv1 = vextq_f32 (sumv0, sumv0, 2); ++ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); ++ ++ sumv2 = vaddq_f32(sumv2, summs0); ++ ++ vst1_f32(s, vget_low_f32 (sumv2)); ++ vst1_f32(s + bs, vget_high_f32(sumv2)); ++ ++ return; ++ } ++#endif ++ ++ int ib = 0; ++ float sumf = 0; ++ ++ // TODO: add WASM SIMD ++#if defined(__ARM_NEON) ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ float32x4_t sumv1 = vdupq_n_f32(0.0f); ++ ++ float summs = 0; ++ ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q4_1 * __restrict__ x0 = &x[ib + 0]; ++ const block_q4_1 * __restrict__ x1 = &x[ib + 1]; ++ const block_q8_1 * __restrict__ y0 = &y[ib + 0]; ++ const block_q8_1 * __restrict__ y1 = &y[ib + 1]; ++ ++ summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s); ++ ++ const uint8x16_t m4b = vdupq_n_u8(0x0F); ++ ++ const uint8x16_t v0_0 = vld1q_u8(x0->qs); ++ const uint8x16_t v0_1 = vld1q_u8(x1->qs); ++ ++ // 4-bit -> 8-bit ++ const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); ++ const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); ++ const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); ++ const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); ++ ++ // load y ++ const int8x16_t v1_0l = vld1q_s8(y0->qs); ++ const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); ++ const int8x16_t v1_1l = vld1q_s8(y1->qs); ++ const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); ++ ++ // dot product into int32x4_t ++ const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h); ++ const int32x4_t p_1 
= ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h); ++ ++ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d)); ++ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d)); ++ } ++ ++ sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; ++#elif defined(__AVX2__) || defined(__AVX__) ++ // Initialize accumulator with zeros ++ __m256 acc = _mm256_setzero_ps(); ++ ++ float summs = 0; ++ ++ // Main loop ++ for (; ib < nb; ++ib) { ++ const float d0 = GGML_FP16_TO_FP32(x[ib].d); ++ const float d1 = GGML_FP16_TO_FP32(y[ib].d); ++ ++ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); ++ ++ const __m256 d0v = _mm256_set1_ps( d0 ); ++ const __m256 d1v = _mm256_set1_ps( d1 ); ++ ++ // Compute combined scales ++ const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); ++ ++ // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes ++ const __m256i qx = bytes_from_nibbles_32(x[ib].qs); ++ const __m256i qy = _mm256_loadu_si256( (const __m256i *)y[ib].qs ); ++ ++ const __m256 xy = mul_sum_us8_pairs_float(qx, qy); ++ ++ // Accumulate d0*d1*x*y ++#if defined(__AVX2__) ++ acc = _mm256_fmadd_ps( d0d1, xy, acc ); ++#else ++ acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc ); ++#endif ++ } ++ ++ sumf = hsum_float_8(acc) + summs; ++#elif defined(__riscv_v_intrinsic) ++ size_t vl = __riscv_vsetvl_e8m1(qk/2); ++ ++ for (; ib < nb; ++ib) { ++ // load elements ++ vuint8mf2_t tx = __riscv_vle8_v_u8mf2(x[ib].qs, vl); ++ ++ vint8mf2_t y0 = __riscv_vle8_v_i8mf2(y[ib].qs, vl); ++ vint8mf2_t y1 = __riscv_vle8_v_i8mf2(y[ib].qs+16, vl); ++ ++ // mask and store lower part of x, and then upper part ++ vuint8mf2_t x_a = __riscv_vand_vx_u8mf2(tx, 0x0F, vl); ++ vuint8mf2_t x_l = __riscv_vsrl_vx_u8mf2(tx, 0x04, vl); ++ ++ vint8mf2_t v0 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_a); ++ vint8mf2_t v1 = __riscv_vreinterpret_v_u8mf2_i8mf2(x_l); ++ ++ vint16m1_t vec_mul1 = __riscv_vwmul_vv_i16m1(v0, y0, vl); ++ vint16m1_t vec_mul2 = __riscv_vwmul_vv_i16m1(v1, y1, vl); ++ ++ vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); ++ ++ vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl); ++ vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl); ++ ++ int sumi = __riscv_vmv_x_s_i32m1_i32(vs2); ++ ++ sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); ++ } ++ ++#elif defined(__POWER9_VECTOR__) ++ const vector signed char lowMask = vec_splats((signed char)0xF); ++ const vector signed int v0 = vec_splats((int32_t)0); ++ const vector unsigned char v4 = vec_splats((unsigned char)0x4); ++ ++ vector float vsumf0 = vec_splats(0.0f); ++ ++#pragma GCC unroll 4 ++ for (; ib < nb; ++ib) { ++ __builtin_prefetch(x[ib].qs, 0, 1); ++ __builtin_prefetch(y[ib].qs, 0, 1); ++ ++ vector float vxd = vec_splats(GGML_FP16_TO_FP32(x[ib].d)); ++ vector float vyd = vec_splats(GGML_FP16_TO_FP32(y[ib].d)); ++ vector float vd = vec_mul(vxd, vyd); ++ ++ vector float vxmin = vec_splats(GGML_FP16_TO_FP32(x[ib].m)); ++ vector float vys = {GGML_FP16_TO_FP32(y[ib].s), 0.0f, 0.0f, 0.0f}; ++ vsumf0 = vec_madd(vxmin, vys, vsumf0); ++ ++ vector signed char qxs = (vector signed char)vec_xl( 0, x[ib].qs); ++ vector signed char q8y0 = vec_xl( 0, y[ib].qs); ++ vector signed char q8y1 = vec_xl(16, y[ib].qs); ++ ++ vector unsigned char q4x0 = (vector unsigned char)vec_and(qxs, 
lowMask); ++ vector unsigned char q4x1 = (vector unsigned char)vec_sr(qxs, v4); ++ ++ vector signed int vsumi0 = v0; ++ ++ vsumi0 = vec_msum(q8y0, q4x0, vsumi0); ++ vsumi0 = vec_msum(q8y1, q4x1, vsumi0); ++ ++ vsumf0 = vec_madd(vec_ctf(vsumi0, 0), vd, vsumf0); ++ } ++ ++ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 4)); ++ vsumf0 = vec_add(vsumf0, vec_sld(vsumf0, vsumf0, 8)); ++ ++ sumf = vec_extract(vsumf0, 0); ++ ++#elif defined(__loongarch_asx) ++ // Initialize accumulator with zeros ++ __m256 acc = (__m256)__lasx_xvldi(0); ++ ++ float summs = 0; ++ ++ // Main loop ++ for (; ib < nb; ++ib) { ++ const float d0 = GGML_FP16_TO_FP32(x[ib].d); ++ const float d1 = GGML_FP16_TO_FP32(y[ib].d); ++ ++ summs += GGML_FP16_TO_FP32(x[ib].m) * GGML_FP16_TO_FP32(y[ib].s); ++ ++ const __m256 d0v = __lasx_xvreplfr2vr_s( d0 ); ++ const __m256 d1v = __lasx_xvreplfr2vr_s( d1 ); ++ ++ // Compute combined scales ++ const __m256 d0d1 = __lasx_xvfmul_s( d0v, d1v ); ++ ++ // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes ++ const __m256i qx = bytes_from_nibbles_32(x[ib].qs); ++ const __m256i qy = __lasx_xvld( (const __m256i *)y[ib].qs, 0); ++ ++ const __m256 xy = mul_sum_us8_pairs_float(qx, qy); ++ ++ // Accumulate d0*d1*x*y ++ acc = __lasx_xvfmadd_s( d0d1, xy, acc ); ++ } ++ ++ sumf = hsum_float_8(acc) + summs; ++#endif ++ for (; ib < nb; ++ib) { ++ int sumi0 = 0; ++ int sumi1 = 0; ++ ++ for (int j = 0; j < qk/2; ++j) { ++ const int v0 = (x[ib].qs[j] & 0x0F); ++ const int v1 = (x[ib].qs[j] >> 4); ++ ++ sumi0 += (v0 * y[ib].qs[j]); ++ sumi1 += (v1 * y[ib].qs[j + qk/2]); ++ } ++ ++ int sumi = sumi0 + sumi1; ++ sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.s); ++ } ++ ++ *s = sumf; ++} ++ ++void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ (void)(nrc); ++ (void)(bx); ++ (void)(by); ++ (void)(bs); ++ ++ const uint32_t kmask1 = 0x03030303; ++ const uint32_t kmask2 = 0x0f0f0f0f; ++ ++ const block_q3_K *__restrict__ x = vx; ++ const block_q8_K *__restrict__ y = vy; ++ const int nb = n / QK_K; ++ ++#ifdef __ARM_NEON ++ uint32_t aux[3]; ++ uint32_t utmp[4]; ++ ++ const uint8x16_t m3b = vdupq_n_u8(0x3); ++ const int32x4_t vzero = vdupq_n_s32(0); ++ ++ const uint8x16_t m0 = vdupq_n_u8(1); ++ const uint8x16_t m1 = vshlq_n_u8(m0, 1); ++ const uint8x16_t m2 = vshlq_n_u8(m0, 2); ++ const uint8x16_t m3 = vshlq_n_u8(m0, 3); ++ const int8_t m32 = 32; ++ ++ ggml_int8x16x4_t q3bytes; ++ ++ float sum = 0; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const uint8_t * __restrict__ qh = x[i].hmask; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); ++ ++ ggml_uint8x16x4_t q3h; ++ ++ int32_t isum = 0; ++ ++ // Set up scales ++ memcpy(aux, x[i].scales, 12); ++ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); ++ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); ++ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); ++ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); ++ ++ int8_t * scale = (int8_t *)utmp; ++ for (int j = 0; j < 16; ++j) scale[j] -= m32; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ const ggml_uint8x16x2_t q3bits = 
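++            // Q3_K super-blocks hold 256 quants: the low 2 bits live in qs, the
++            // third (high) bit in hmask, and 16 6-bit scales are packed into the
++            // 12-byte scales field (unpacked above via the kmask1/kmask2 shifts,
++            // then re-centered by subtracting 32). When the high bit is clear,
++            // 4 is subtracted from the 2-bit value, so the quants end up in [-4, 3].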
ggml_vld1q_u8_x2(q3); q3 += 32; ++ const ggml_int8x16x4_t q8bytes_1 = ggml_vld1q_s8_x4(q8); q8 += 64; ++ const ggml_int8x16x4_t q8bytes_2 = ggml_vld1q_s8_x4(q8); q8 += 64; ++ ++ q3h.val[0] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[0]), 2); ++ q3h.val[1] = vshlq_n_u8(vbicq_u8(m0, qhbits.val[1]), 2); ++ q3h.val[2] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[0]), 1); ++ q3h.val[3] = vshlq_n_u8(vbicq_u8(m1, qhbits.val[1]), 1); ++ ++ q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[0], m3b)), vreinterpretq_s8_u8(q3h.val[0])); ++ q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(q3bits.val[1], m3b)), vreinterpretq_s8_u8(q3h.val[1])); ++ q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2])); ++ q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3])); ++ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3]; ++ ++ scale += 4; ++ ++ q3h.val[0] = vbicq_u8(m2, qhbits.val[0]); ++ q3h.val[1] = vbicq_u8(m2, qhbits.val[1]); ++ q3h.val[2] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[0]), 1); ++ q3h.val[3] = vshrq_n_u8(vbicq_u8(m3, qhbits.val[1]), 1); ++ ++ q3bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 4), m3b)), vreinterpretq_s8_u8(q3h.val[0])); ++ q3bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 4), m3b)), vreinterpretq_s8_u8(q3h.val[1])); ++ q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2])); ++ q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3])); ++ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2]; ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3]; ++ ++ scale += 4; ++ ++ if (j == 0) { ++ qhbits.val[0] = vshrq_n_u8(qhbits.val[0], 4); ++ qhbits.val[1] = vshrq_n_u8(qhbits.val[1], 4); ++ } ++ ++ } ++ sum += d * isum; ++ ++ } ++ ++ *s = sum; ++ ++#elif defined __AVX2__ ++ ++ const __m256i m3 = _mm256_set1_epi8(3); ++ const __m256i mone = _mm256_set1_epi8(1); ++ const __m128i m32 = _mm_set1_epi8(32); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ uint32_t aux[3]; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ // Set up scales ++ memcpy(aux, x[i].scales, 12); ++ __m128i scales128 = _mm_set_epi32( ++ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ++ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), ++ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), ++ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); ++ scales128 = _mm_sub_epi8(scales128, m32); ++ const __m256i all_scales = _mm256_cvtepi8_epi16(scales128); ++ const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0); ++ const __m128i h_scales = 
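++        // The 16 re-centered scales are widened to 16-bit and duplicated into
++        // both 128-bit halves so that _mm256_shuffle_epi8 with
++        // get_scale_shuffle_q3k() can broadcast the right scale to each group
++        // of q3/q8 products in the inner loop.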
_mm256_extracti128_si256(all_scales, 1); ++ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)}; ++ ++ // high bit ++ const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask); ++ ++ // integer accumulator ++ __m256i sumi = _mm256_setzero_si256(); ++ ++ int bit = 0; ++ int is = 0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ // load low 2 bits ++ const __m256i q3bits = _mm256_loadu_si256((const __m256i*)q3); q3 += 32; ++ ++ // prepare low and high bits ++ const __m256i q3l_0 = _mm256_and_si256(q3bits, m3); ++ const __m256i q3h_0 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++ ++bit; ++ ++ const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 2), m3); ++ const __m256i q3h_1 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++ ++bit; ++ ++ const __m256i q3l_2 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 4), m3); ++ const __m256i q3h_2 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++ ++bit; ++ ++ const __m256i q3l_3 = _mm256_and_si256(_mm256_srli_epi16(q3bits, 6), m3); ++ const __m256i q3h_3 = _mm256_slli_epi16(_mm256_srli_epi16(_mm256_andnot_si256(hbits, _mm256_slli_epi16(mone, bit)), bit), 2); ++ ++bit; ++ ++ // load Q8 quants ++ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ ++ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, ++ // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, ++ // and 2 if the high bit was set) ++ __m256i q8s_0 = _mm256_maddubs_epi16(q3h_0, q8_0); ++ __m256i q8s_1 = _mm256_maddubs_epi16(q3h_1, q8_1); ++ __m256i q8s_2 = _mm256_maddubs_epi16(q3h_2, q8_2); ++ __m256i q8s_3 = _mm256_maddubs_epi16(q3h_3, q8_3); ++ ++ __m256i p16_0 = _mm256_maddubs_epi16(q3l_0, q8_0); ++ __m256i p16_1 = _mm256_maddubs_epi16(q3l_1, q8_1); ++ __m256i p16_2 = _mm256_maddubs_epi16(q3l_2, q8_2); ++ __m256i p16_3 = _mm256_maddubs_epi16(q3l_3, q8_3); ++ ++ p16_0 = _mm256_sub_epi16(p16_0, q8s_0); ++ p16_1 = _mm256_sub_epi16(p16_1, q8s_1); ++ p16_2 = _mm256_sub_epi16(p16_2, q8s_2); ++ p16_3 = _mm256_sub_epi16(p16_3, q8s_3); ++ ++ // multiply with scales ++ p16_0 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 0)), p16_0); ++ p16_1 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 1)), p16_1); ++ p16_2 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 2)), p16_2); ++ p16_3 = _mm256_madd_epi16(_mm256_shuffle_epi8(scales[j], get_scale_shuffle_q3k(is + 3)), p16_3); ++ ++ // accumulate ++ p16_0 = _mm256_add_epi32(p16_0, p16_1); ++ p16_2 = _mm256_add_epi32(p16_2, p16_3); ++ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_2)); ++ ++ } ++ ++ // multiply with block scale and accumulate ++ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); ++ ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __AVX__ ++ ++ const __m128i m3 = _mm_set1_epi8(3); ++ const __m128i mone = _mm_set1_epi8(1); ++ const __m128i m32 = _mm_set1_epi8(32); ++ const __m128i m2 = _mm_set1_epi8(2); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ const uint32_t *aux; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ // Set up scales ++ aux = (const uint32_t *)x[i].scales; ++ __m128i scales128 = _mm_set_epi32( ++ ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ++ ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), ++ (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4), ++ (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4)); ++ scales128 = _mm_sub_epi8(scales128, m32); ++ const __m128i scales_0 = _mm_cvtepi8_epi16(scales128); ++ const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales128, scales128)); ++ const __m128i scales[2] = { scales_0, scales_1 }; ++ ++ // high bit *128*2 from block_q3_K.hmask[QK_K/8] ++ const __m128i hbits_0 = _mm_loadu_si128((const __m128i*)&x[i].hmask[0]); ++ const __m128i hbits_1 = _mm_loadu_si128((const __m128i*)&x[i].hmask[16]); ++ ++ // integer accumulator ++ __m128i sumi_0 = _mm_setzero_si128(); ++ __m128i sumi_1 = _mm_setzero_si128(); ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ // load low 2 bits *64*2 from block_q3_K.qs[QK_K/4] ++ const __m128i q3bits_0 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; ++ const __m128i q3bits_1 = _mm_loadu_si128((const __m128i*)q3); q3 += 16; ++ ++ // prepare low and high bits ++ const int bit = j << 2; ++ ++ const __m128i q3l_0 = _mm_and_si128(q3bits_0, m3); ++ const __m128i q3l_1 = _mm_and_si128(q3bits_1, m3); ++ const __m128i q3h_0 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit)), bit), 2); ++ const __m128i q3h_1 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit)), bit), 2); ++ ++ const 
__m128i q3l_2 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 2), m3); ++ const __m128i q3l_3 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 2), m3); ++ const __m128i q3h_2 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+1)), bit+1), 2); ++ const __m128i q3h_3 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+1)), bit+1), 2); ++ ++ const __m128i q3l_4 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 4), m3); ++ const __m128i q3l_5 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 4), m3); ++ const __m128i q3h_4 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+2)), bit+2), 2); ++ const __m128i q3h_5 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+2)), bit+2), 2); ++ ++ const __m128i q3l_6 = _mm_and_si128(_mm_srli_epi16(q3bits_0, 6), m3); ++ const __m128i q3l_7 = _mm_and_si128(_mm_srli_epi16(q3bits_1, 6), m3); ++ const __m128i q3h_6 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_0, _mm_slli_epi16(mone, bit+3)), bit+3), 2); ++ const __m128i q3h_7 = _mm_slli_epi16(_mm_srli_epi16(_mm_andnot_si128(hbits_1, _mm_slli_epi16(mone, bit+3)), bit+3), 2); ++ ++ // load Q8 quants from block_q8_K.qs[QK_K] ++ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ ++ // Dot product: we multiply the 2 low bits and 1 high bit part separately, so we can use _mm256_maddubs_epi16, ++ // and then subtract. 
The high bit part has the 2 already subtracted (and so, it is zero if the high bit was not set, ++ // and 2 if the high bit was set) ++ __m128i q8s_0 = _mm_maddubs_epi16(q3h_0, q8_0); ++ __m128i q8s_1 = _mm_maddubs_epi16(q3h_1, q8_1); ++ __m128i q8s_2 = _mm_maddubs_epi16(q3h_2, q8_2); ++ __m128i q8s_3 = _mm_maddubs_epi16(q3h_3, q8_3); ++ __m128i q8s_4 = _mm_maddubs_epi16(q3h_4, q8_4); ++ __m128i q8s_5 = _mm_maddubs_epi16(q3h_5, q8_5); ++ __m128i q8s_6 = _mm_maddubs_epi16(q3h_6, q8_6); ++ __m128i q8s_7 = _mm_maddubs_epi16(q3h_7, q8_7); ++ ++ __m128i p16_0 = _mm_maddubs_epi16(q3l_0, q8_0); ++ __m128i p16_1 = _mm_maddubs_epi16(q3l_1, q8_1); ++ __m128i p16_2 = _mm_maddubs_epi16(q3l_2, q8_2); ++ __m128i p16_3 = _mm_maddubs_epi16(q3l_3, q8_3); ++ __m128i p16_4 = _mm_maddubs_epi16(q3l_4, q8_4); ++ __m128i p16_5 = _mm_maddubs_epi16(q3l_5, q8_5); ++ __m128i p16_6 = _mm_maddubs_epi16(q3l_6, q8_6); ++ __m128i p16_7 = _mm_maddubs_epi16(q3l_7, q8_7); ++ ++ p16_0 = _mm_sub_epi16(p16_0, q8s_0); ++ p16_1 = _mm_sub_epi16(p16_1, q8s_1); ++ p16_2 = _mm_sub_epi16(p16_2, q8s_2); ++ p16_3 = _mm_sub_epi16(p16_3, q8s_3); ++ p16_4 = _mm_sub_epi16(p16_4, q8s_4); ++ p16_5 = _mm_sub_epi16(p16_5, q8s_5); ++ p16_6 = _mm_sub_epi16(p16_6, q8s_6); ++ p16_7 = _mm_sub_epi16(p16_7, q8s_7); ++ ++ // multiply with scales ++ __m128i shuffle = _mm_set1_epi16(0x0100); ++ p16_0 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_0); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_1 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_1); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_2 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_2); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_3 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_3); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_4 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_4); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_5 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_5); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_6 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_6); ++ shuffle = _mm_add_epi16(shuffle, m2); ++ p16_7 = _mm_madd_epi16(_mm_shuffle_epi8(scales[j], shuffle), p16_7); ++ ++ // accumulate ++ p16_0 = _mm_add_epi32(p16_0, p16_1); ++ p16_2 = _mm_add_epi32(p16_2, p16_3); ++ p16_4 = _mm_add_epi32(p16_4, p16_5); ++ p16_6 = _mm_add_epi32(p16_6, p16_7); ++ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); ++ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_6)); ++ ++ } ++ ++ // multiply with block scale and accumulate ++ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); ++ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); ++ ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __riscv_v_intrinsic ++ ++ uint32_t aux[3]; ++ uint32_t utmp[4]; ++ ++ float sumf = 0; ++ for (int i = 0; i < nb; ++i) { ++ ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const uint8_t * __restrict__ qh = x[i].hmask; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ memcpy(aux, x[i].scales, 12); ++ utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); ++ utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4); ++ utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4); ++ utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4); ++ ++ int8_t * scale = (int8_t *)utmp; ++ for (int j = 0; j < 16; ++j) scale[j] -= 32; ++ ++ ++ size_t vl = 32; ++ uint8_t m = 1; ++ ++ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); ++ 
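++        // The RVV path loads all 32 hmask bytes once and walks a moving bit m
++        // through them; __riscv_vsub_vx_i8m1_m subtracts 4 only in the lanes
++        // whose high bit is clear (vmseq against 0), mirroring the vbicq trick
++        // used by the NEON path above.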
vuint8m1_t vqh = __riscv_vle8_v_u8m1(qh, vl); ++ ++ int sum_t = 0; ++ ++ for (int j = 0; j < QK_K; j += 128) { ++ ++ vl = 32; ++ ++ // load Q3 ++ vuint8m1_t q3_x = __riscv_vle8_v_u8m1(q3, vl); ++ ++ vint8m1_t q3_0 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q3_x, 0x03, vl)); ++ vint8m1_t q3_1 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x2, vl), 0x03 , vl)); ++ vint8m1_t q3_2 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x4, vl), 0x03 , vl)); ++ vint8m1_t q3_3 = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(q3_x, 0x6, vl), 0x03 , vl)); ++ ++ // compute mask for subtraction ++ vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl); ++ vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl); ++ vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl); ++ m <<= 1; ++ ++ vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl); ++ vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl); ++ vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl); ++ m <<= 1; ++ ++ vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl); ++ vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl); ++ vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl); ++ m <<= 1; ++ ++ vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl); ++ vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl); ++ vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl); ++ m <<= 1; ++ ++ // load Q8 and take product with Q3 ++ vint16m2_t a0 = __riscv_vwmul_vv_i16m2(q3_m0, __riscv_vle8_v_i8m1(q8, vl), vl); ++ vint16m2_t a1 = __riscv_vwmul_vv_i16m2(q3_m1, __riscv_vle8_v_i8m1(q8+32, vl), vl); ++ vint16m2_t a2 = __riscv_vwmul_vv_i16m2(q3_m2, __riscv_vle8_v_i8m1(q8+64, vl), vl); ++ vint16m2_t a3 = __riscv_vwmul_vv_i16m2(q3_m3, __riscv_vle8_v_i8m1(q8+96, vl), vl); ++ ++ vl = 16; ++ ++ // retrieve lane to multiply with scale ++ vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl); ++ vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl); ++ vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl); ++ vint32m2_t aux1_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 1), (scale[3]), vl); ++ vint32m2_t aux2_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 0), (scale[4]), vl); ++ vint32m2_t aux2_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a2, 1), (scale[5]), vl); ++ vint32m2_t aux3_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 0), (scale[6]), vl); ++ vint32m2_t aux3_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a3, 1), (scale[7]), vl); ++ ++ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux0_0, aux0_1, vl), vzero, vl); ++ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux1_0, aux1_1, vl), isum0, vl); ++ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux2_0, aux2_1, vl), isum1, vl); ++ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(aux3_0, aux3_1, vl), isum2, vl); ++ ++ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); ++ ++ q3 += 32; q8 += 128; scale += 8; ++ ++ } ++ ++ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; ++ ++ sumf += d*sum_t; ++ ++ } ++ ++ *s = sumf; ++ ++#else ++ // scalar version ++ // This function is written like this so the compiler can manage to vectorize most of it ++ // Using -Ofast, GCC and clang manage to produce code that 
is within a factor of 2 or so from the ++ // manually vectorized version above. Every other version I tried would run at least 4 times slower. ++ // The ideal situation would be if we could just write the code once, and the compiler would ++ // automatically produce the best possible set of machine instructions, instead of us having to manually ++ // write vectorized versions for AVX, ARM_NEON, etc. ++ ++ int8_t aux8[QK_K]; ++ int16_t aux16[8]; ++ float sums [8]; ++ int32_t aux32[8]; ++ memset(sums, 0, 8*sizeof(float)); ++ ++ uint32_t auxs[4]; ++ const int8_t * scales = (const int8_t*)auxs; ++ ++ float sumf = 0; ++ for (int i = 0; i < nb; ++i) { ++ const uint8_t * __restrict__ q3 = x[i].qs; ++ const uint8_t * __restrict__ hm = x[i].hmask; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ memset(aux32, 0, 8*sizeof(int32_t)); ++ int8_t * __restrict__ a = aux8; ++ uint8_t m = 1; ++ for (int j = 0; j < QK_K; j += 128) { ++ for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; ++ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); ++ a += 32; m <<= 1; ++ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 2) & 3; ++ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); ++ a += 32; m <<= 1; ++ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 4) & 3; ++ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); ++ a += 32; m <<= 1; ++ for (int l = 0; l < 32; ++l) a[l] = (q3[l] >> 6) & 3; ++ for (int l = 0; l < 32; ++l) a[l] -= (hm[l] & m ? 0 : 4); ++ a += 32; m <<= 1; ++ q3 += 32; ++ } ++ a = aux8; ++ ++ memcpy(auxs, x[i].scales, 12); ++ uint32_t tmp = auxs[2]; ++ auxs[2] = ((auxs[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); ++ auxs[3] = ((auxs[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); ++ auxs[0] = (auxs[0] & kmask2) | (((tmp >> 0) & kmask1) << 4); ++ auxs[1] = (auxs[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); ++ for (int j = 0; j < QK_K/16; ++j) { ++ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; ++ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; ++ q8 += 8; a += 8; ++ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; ++ for (int l = 0; l < 8; ++l) aux32[l] += (scales[j] - 32) * aux16[l]; ++ q8 += 8; a += 8; ++ } ++ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; ++ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; ++ } ++ for (int l = 0; l < 8; ++l) sumf += sums[l]; ++ *s = sumf; ++ ++#endif ++ ++} ++ ++void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, size_t bx, const void * __restrict__ vy, size_t by, int nrc) ++{ ++ (void)(nrc); ++ (void)(bx); ++ (void)(by); ++ (void)(bs); ++ ++ const block_q6_K * __restrict__ x = vx; ++ const block_q8_K * __restrict__ y = vy; ++ ++ const int nb = n / QK_K; ++ ++#ifdef __ARM_NEON ++ float sum = 0; ++ ++ const uint8x16_t m4b = vdupq_n_u8(0xF); ++ const int32x4_t vzero = vdupq_n_s32(0); ++ //const int8x16_t m32s = vdupq_n_s8(32); ++ ++ const uint8x16_t mone = vdupq_n_u8(3); ++ ++ ggml_int8x16x4_t q6bytes; ++ ggml_uint8x16x4_t q6h; ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d_all = GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q6 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const int8_t * __restrict__ scale = x[i].scales; ++ ++ const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); ++ const int8x16_t scales = vld1q_s8(scale); ++ const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}}; ++ ++ const int32x4_t prod = 
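++        // Q6_K stores 6-bit quants with an implicit -32 offset. Rather than
++        // subtracting 32 from every byte (the commented-out m32s variant below),
++        // this path dots the per-16 bsums of the q8_K block with the scales and
++        // subtracts 32 * isum_mins once per super-block at the end.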
vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])), ++ vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))), ++ vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])), ++ vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1])))); ++ int32_t isum_mins = vaddvq_s32(prod); ++ ++ int32_t isum = 0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32; ++ ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64; ++ ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; ++ ++ q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4); ++ q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4); ++ uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2); ++ q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ shifted = vshrq_n_u8(qhbits.val[1], 2); ++ q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ ++ //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])), m32s); ++ //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])), m32s); ++ //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])), m32s); ++ //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])), m32s); ++ q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0])); ++ q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1])); ++ q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2])); ++ q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3])); ++ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; ++ ++ scale += 4; ++ ++ q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64; ++ ++ shifted = vshrq_n_u8(qhbits.val[0], 4); ++ q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ shifted = vshrq_n_u8(qhbits.val[1], 4); ++ q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ shifted = vshrq_n_u8(qhbits.val[0], 6); ++ q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ shifted = vshrq_n_u8(qhbits.val[1], 6); ++ q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4); ++ ++ //q6bytes.val[0] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])), m32s); ++ //q6bytes.val[1] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])), m32s); ++ //q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])), m32s); ++ //q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])), m32s); ++ q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0])); ++ q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1])); ++ q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2])); ++ q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3])); ++ ++ isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], 
q8bytes.val[1])) * scale[1] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] + ++ vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3]; ++ scale += 4; ++ } ++ //sum += isum * d_all * y[i].d; ++ sum += d_all * y[i].d * (isum - 32 * isum_mins); ++ ++ } ++ *s = sum; ++ ++#elif defined __AVX2__ ++ ++ const __m256i m4 = _mm256_set1_epi8(0xF); ++ const __m256i m2 = _mm256_set1_epi8(3); ++ const __m256i m32s = _mm256_set1_epi8(32); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q4 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); ++ ++ __m256i sumi = _mm256_setzero_si256(); ++ ++ int is = 0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0)); ++ const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1)); ++ const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2)); ++ const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3)); ++ is += 4; ++ ++ const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; ++ const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32; ++ const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32; ++ ++ const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4); ++ const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4); ++ const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4); ++ const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4); ++ ++ const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0); ++ const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1); ++ const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2); ++ const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3); ++ ++ const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; ++ ++ __m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0); ++ __m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1); ++ __m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2); ++ __m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3); ++ ++ __m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0); ++ __m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1); ++ __m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2); ++ __m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3); ++ ++ p16_0 = _mm256_sub_epi16(p16_0, q8s_0); ++ p16_1 = _mm256_sub_epi16(p16_1, q8s_1); ++ p16_2 = _mm256_sub_epi16(p16_2, q8s_2); ++ p16_3 = _mm256_sub_epi16(p16_3, q8s_3); ++ ++ p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0); ++ p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1); ++ p16_2 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_2), p16_2); ++ p16_3 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_3), p16_3); ++ ++ sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p16_0, p16_1)); ++ sumi = _mm256_add_epi32(sumi, 
_mm256_add_epi32(p16_2, p16_3)); ++ ++ } ++ ++ acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc); ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __AVX__ ++ ++ const __m128i m4 = _mm_set1_epi8(0xF); ++ const __m128i m3 = _mm_set1_epi8(3); ++ const __m128i m32s = _mm_set1_epi8(32); ++ const __m128i m2 = _mm_set1_epi8(2); ++ ++ __m256 acc = _mm256_setzero_ps(); ++ ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); ++ ++ const uint8_t * __restrict__ q4 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); ++ ++ __m128i sumi_0 = _mm_setzero_si128(); ++ __m128i sumi_1 = _mm_setzero_si128(); ++ ++ __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000); ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16; ++ const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16; ++ ++ const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4); ++ const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4); ++ const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4); ++ const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4); ++ const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4); ++ const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4); ++ const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4); ++ const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4); ++ ++ const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; ++ const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; ++ const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; ++ const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16; ++ ++ const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0); ++ const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1); ++ const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2); ++ const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3); ++ const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4); ++ const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5); ++ const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6); ++ const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7); ++ ++ const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16; ++ ++ __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0); ++ __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1); ++ __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2); ++ __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3); ++ __m128i q8s_4 = 
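++            // _mm_maddubs_epi16 treats its first operand as unsigned, so the
++            // reconstructed 6-bit values (0..63) are multiplied as-is and the
++            // constant 32 * q8 term (q8s_*) is computed separately and subtracted,
++            // which is equivalent to using signed quants in [-32, 31].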
_mm_maddubs_epi16(m32s, q8_4); ++ __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5); ++ __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6); ++ __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7); ++ ++ __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0); ++ __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1); ++ __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2); ++ __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3); ++ __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4); ++ __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5); ++ __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6); ++ __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7); ++ ++ p16_0 = _mm_sub_epi16(p16_0, q8s_0); ++ p16_1 = _mm_sub_epi16(p16_1, q8s_1); ++ p16_2 = _mm_sub_epi16(p16_2, q8s_2); ++ p16_3 = _mm_sub_epi16(p16_3, q8s_3); ++ p16_4 = _mm_sub_epi16(p16_4, q8s_4); ++ p16_5 = _mm_sub_epi16(p16_5, q8s_5); ++ p16_6 = _mm_sub_epi16(p16_6, q8s_6); ++ p16_7 = _mm_sub_epi16(p16_7, q8s_7); ++ ++ const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle); ++ shuffle = _mm_add_epi8(shuffle, m2); ++ const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle); ++ shuffle = _mm_add_epi8(shuffle, m2); ++ const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle); ++ shuffle = _mm_add_epi8(shuffle, m2); ++ const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle); ++ shuffle = _mm_add_epi8(shuffle, m2); ++ ++ p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0); ++ p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1); ++ p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2); ++ p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3); ++ p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4); ++ p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5); ++ p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6); ++ p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7); ++ ++ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2)); ++ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3)); ++ sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6)); ++ sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7)); ++ ++ } ++ ++ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0); ++ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc); ++ } ++ ++ *s = hsum_float_8(acc); ++ ++#elif defined __riscv_v_intrinsic ++ ++ float sumf = 0; ++ for (int i = 0; i < nb; ++i) { ++ ++ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; ++ ++ const uint8_t * __restrict__ q6 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ ++ const int8_t * __restrict__ scale = x[i].scales; ++ ++ size_t vl; ++ ++ vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); ++ ++ int sum_t = 0; ++ int is = 0; ++ ++ for (int j = 0; j < QK_K/128; ++j) { ++ ++ vl = 32; ++ ++ // load qh ++ vuint8m1_t qh_x = __riscv_vle8_v_u8m1(qh, vl); ++ ++ // load Q6 ++ vuint8m1_t q6_0 = __riscv_vle8_v_u8m1(q6, vl); ++ vuint8m1_t q6_1 = __riscv_vle8_v_u8m1(q6+32, vl); ++ ++ vuint8m1_t q6a_0 = __riscv_vand_vx_u8m1(q6_0, 0x0F, vl); ++ vuint8m1_t q6a_1 = __riscv_vand_vx_u8m1(q6_1, 0x0F, vl); ++ vuint8m1_t q6s_0 = __riscv_vsrl_vx_u8m1(q6_0, 0x04, vl); ++ vuint8m1_t q6s_1 = __riscv_vsrl_vx_u8m1(q6_1, 0x04, vl); ++ ++ vuint8m1_t qh_0 = __riscv_vand_vx_u8m1(qh_x, 0x03, vl); ++ vuint8m1_t qh_1 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x2, vl), 0x03 , vl); ++ vuint8m1_t qh_2 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x4, 
vl), 0x03 , vl); ++ vuint8m1_t qh_3 = __riscv_vand_vx_u8m1(__riscv_vsrl_vx_u8m1(qh_x, 0x6, vl), 0x03 , vl); ++ ++ vuint8m1_t qhi_0 = __riscv_vor_vv_u8m1(q6a_0, __riscv_vsll_vx_u8m1(qh_0, 0x04, vl), vl); ++ vuint8m1_t qhi_1 = __riscv_vor_vv_u8m1(q6a_1, __riscv_vsll_vx_u8m1(qh_1, 0x04, vl), vl); ++ vuint8m1_t qhi_2 = __riscv_vor_vv_u8m1(q6s_0, __riscv_vsll_vx_u8m1(qh_2, 0x04, vl), vl); ++ vuint8m1_t qhi_3 = __riscv_vor_vv_u8m1(q6s_1, __riscv_vsll_vx_u8m1(qh_3, 0x04, vl), vl); ++ ++ vint8m1_t a_0 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_0), 32, vl); ++ vint8m1_t a_1 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_1), 32, vl); ++ vint8m1_t a_2 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_2), 32, vl); ++ vint8m1_t a_3 = __riscv_vsub_vx_i8m1(__riscv_vreinterpret_v_u8m1_i8m1(qhi_3), 32, vl); ++ ++ // load Q8 and take product ++ vint16m2_t va_q_0 = __riscv_vwmul_vv_i16m2(a_0, __riscv_vle8_v_i8m1(q8, vl), vl); ++ vint16m2_t va_q_1 = __riscv_vwmul_vv_i16m2(a_1, __riscv_vle8_v_i8m1(q8+32, vl), vl); ++ vint16m2_t va_q_2 = __riscv_vwmul_vv_i16m2(a_2, __riscv_vle8_v_i8m1(q8+64, vl), vl); ++ vint16m2_t va_q_3 = __riscv_vwmul_vv_i16m2(a_3, __riscv_vle8_v_i8m1(q8+96, vl), vl); ++ ++ vl = 16; ++ ++ vint32m2_t vaux_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 0), scale[is+0], vl); ++ vint32m2_t vaux_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_0, 1), scale[is+1], vl); ++ vint32m2_t vaux_2 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 0), scale[is+2], vl); ++ vint32m2_t vaux_3 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_1, 1), scale[is+3], vl); ++ vint32m2_t vaux_4 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 0), scale[is+4], vl); ++ vint32m2_t vaux_5 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_2, 1), scale[is+5], vl); ++ vint32m2_t vaux_6 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 0), scale[is+6], vl); ++ vint32m2_t vaux_7 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(va_q_3, 1), scale[is+7], vl); ++ ++ vint32m1_t isum0 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_0, vaux_1, vl), vzero, vl); ++ vint32m1_t isum1 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_2, vaux_3, vl), isum0, vl); ++ vint32m1_t isum2 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_4, vaux_5, vl), isum1, vl); ++ vint32m1_t isum3 = __riscv_vredsum_vs_i32m2_i32m1(__riscv_vadd_vv_i32m2(vaux_6, vaux_7, vl), isum2, vl); ++ ++ sum_t += __riscv_vmv_x_s_i32m1_i32(isum3); ++ ++ q6 += 64; qh += 32; q8 += 128; is=8; ++ ++ } ++ ++ sumf += d * sum_t; ++ ++ } ++ ++ *s = sumf; ++ ++#else ++ ++ int8_t aux8[QK_K]; ++ int16_t aux16[8]; ++ float sums [8]; ++ int32_t aux32[8]; ++ memset(sums, 0, 8*sizeof(float)); ++ ++ float sumf = 0; ++ for (int i = 0; i < nb; ++i) { ++ const uint8_t * __restrict__ q4 = x[i].ql; ++ const uint8_t * __restrict__ qh = x[i].qh; ++ const int8_t * __restrict__ q8 = y[i].qs; ++ memset(aux32, 0, 8*sizeof(int32_t)); ++ int8_t * __restrict__ a = aux8; ++ for (int j = 0; j < QK_K; j += 128) { ++ for (int l = 0; l < 32; ++l) { ++ a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; ++ a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; ++ a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; ++ a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; ++ } ++ a += 128; ++ q4 += 64; ++ qh += 32; ++ } ++ a = aux8; ++ int is = 0; ++ for (int j = 0; j < QK_K/16; ++j) { ++ 
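++            // Scalar q6_K fallback: a[] holds the 6-bit quants already widened
++            // to int8 (the +32 bias removed). Each group of 16 values shares one
++            // signed 8-bit scale; the per-group products with q8 are accumulated
++            // in aux32 and multiplied by the super-block scale d = x[i].d * y[i].d
++            // at the end of the block.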
int scale = x[i].scales[is++]; ++ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; ++ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; ++ q8 += 8; a += 8; ++ for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; ++ for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; ++ q8 += 8; a += 8; ++ } ++ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; ++ for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; ++ } ++ for (int l = 0; l < 8; ++l) sumf += sums[l]; ++ *s = sumf; ++#endif ++} ++ ++void ggml_vec_dot_q8_0_q8_0(int n, float *__restrict__ s, size_t bs, const void *__restrict__ vx, size_t bx, const void *__restrict__ vy, size_t by, int nrc) ++{ ++ const int qk = QK8_0; ++ const int nb = n / qk; ++ ++#if defined(__ARM_FEATURE_MATMUL_INT8) ++ assert((nrc == 2) || (nrc == 1) || (nrc == 16)); ++#else ++ assert(nrc == 1); ++#endif ++ ++ const block_q8_0 *__restrict__ x = vx; ++ const block_q8_0 *__restrict__ y = vy; ++ ++#if defined(__ARM_FEATURE_MATMUL_INT8) ++ if (nrc == 2) { ++ const block_q8_0 * __restrict__ vx0 = vx; ++ const block_q8_0 * __restrict__ vx1 = (const block_q8_0 *) ((const uint8_t*)vx + bx); ++ const block_q8_0 * __restrict__ vy0 = vy; ++ const block_q8_0 * __restrict__ vy1 = (const block_q8_0 *) ((const uint8_t*)vy + by); ++ ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ ++ for (int i = 0; i < nb; i++) { ++ const block_q8_0 * __restrict__ b_x0 = &vx0[i]; ++ const block_q8_0 * __restrict__ b_y0 = &vy0[i]; ++ ++ const block_q8_0 * __restrict__ b_x1 = &vx1[i]; ++ const block_q8_0 * __restrict__ b_y1 = &vy1[i]; ++ ++ const int8x16_t x0_l = vld1q_s8(b_x0->qs); ++ const int8x16_t x0_h = vld1q_s8(b_x0->qs + 16); ++ const int8x16_t x1_l = vld1q_s8(b_x1->qs); ++ const int8x16_t x1_h = vld1q_s8(b_x1->qs + 16); ++ ++ // load y ++ const int8x16_t y0_l = vld1q_s8(b_y0->qs); ++ const int8x16_t y0_h = vld1q_s8(b_y0->qs + 16); ++ const int8x16_t y1_l = vld1q_s8(b_y1->qs); ++ const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); ++ ++ float32_t _scale[4] = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), ++ GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; ++ float32x4_t scale = vld1q_f32(_scale); ++ ++ int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); ++ ++ int8x16_t l2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ int8x16_t l3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_h), vreinterpretq_s64_s8(x1_h))); ++ ++ int8x16_t r0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ int8x16_t r1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_l), vreinterpretq_s64_s8(y1_l))); ++ ++ int8x16_t r2 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ int8x16_t r3 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(y0_h), vreinterpretq_s64_s8(y1_h))); ++ ++ sumv0 = vmlaq_f32(sumv0,(vcvtq_f32_s32(vmmlaq_s32((vmmlaq_s32((vmmlaq_s32((vmmlaq_s32(vdupq_n_s32(0), l0, r0)), ++ l1, r1)), l2, r2)), l3, r3))), scale); ++ } ++ float32x4_t sumv1 = vextq_f32(sumv0, sumv0, 2); ++ float32x4_t sumv2 = vzip1q_f32(sumv0, sumv1); ++ ++ vst1_f32(s, vget_low_f32(sumv2)); ++ vst1_f32(s + bs, vget_high_f32(sumv2)); ++ return; ++ } ++#endif ++ ++ int ib = 0; ++ float sumf = 0; 
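++    // Main q8_0 x q8_0 path: the SVE branch below is vector-length agnostic
++    // (128/256/512-bit), the NEON branch uses ggml_vdotq_s32, and the plain C
++    // loop is the portable fallback. All of them compute, per 32-element block,
++    //   sumf += d_x * d_y * sum_j(x.qs[j] * y.qs[j]).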
++ ++#if defined(__ARM_FEATURE_SVE) ++ svfloat32_t sumv0 = svdup_n_f32(0.0f); ++ svfloat32_t sumv1 = svdup_n_f32(0.0f); ++ ++ const int vector_length = ggml_cpu_get_sve_cnt() * 8; ++ ++ //VLA Implemenation for SVE ++ switch (vector_length) { ++ case 128: ++ { ++ // predicate for activating lanes for 16 Int8 elements ++ const svbool_t ph16 = svptrue_pat_b8 (SV_VL16); ++ const svbool_t pl16 = svptrue_pat_b32(SV_VL4); ++ ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q8_0 *__restrict__ x0 = &x[ib + 0]; ++ const block_q8_0 *__restrict__ x1 = &x[ib + 1]; ++ const block_q8_0 *__restrict__ y0 = &y[ib + 0]; ++ const block_q8_0 *__restrict__ y1 = &y[ib + 1]; ++ ++ // load x ++ const svint8_t qx0_0 = svld1_s8(ph16, x0->qs); ++ const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16); ++ const svint8_t qx1_0 = svld1_s8(ph16, x1->qs); ++ const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16); ++ ++ // load y ++ const svint8_t qy0_0 = svld1_s8(ph16, y0->qs); ++ const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16); ++ const svint8_t qy1_0 = svld1_s8(ph16, y1->qs); ++ const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16); ++ ++ sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16, ++ svdot_s32(svdup_n_s32(0), qx0_0, qy0_0), ++ svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ++ sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16, ++ svdot_s32(svdup_n_s32(0), qx1_0, qy1_0), ++ svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ++ } ++ ++ sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1)); ++ } break; ++ case 256: ++ { ++ //printf("sve256"); ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q8_0 *__restrict__ x0 = &x[ib + 0]; ++ const block_q8_0 *__restrict__ x1 = &x[ib + 1]; ++ const block_q8_0 *__restrict__ y0 = &y[ib + 0]; ++ const block_q8_0 *__restrict__ y1 = &y[ib + 1]; ++ ++ // load x ++ const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs); ++ const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs); ++ ++ // load y ++ const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs); ++ const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs); ++ ++ sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), ++ svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ++ sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), ++ svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ++ } ++ ++ sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1)); ++ } break; ++ case 512: ++ { ++ // predicate for activating high 256 bit ++ const svbool_t ph32 = svptrue_pat_b8(SV_VL32); ++ // predicate for activating low 256 bit ++ const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32); ++ ++ // predicate for activating high lanes for 8 float32 elements ++ const svbool_t ph8 = svptrue_pat_b32(SV_VL8); ++ // predicate for activating low lanes for 8 float32 elements ++ const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8); ++ ++ svfloat32_t sumv00 = svdup_n_f32(0.0f); ++ ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q8_0 *__restrict__ x0 = &x[ib + 0]; ++ const block_q8_0 *__restrict__ x1 = &x[ib + 1]; ++ const block_q8_0 *__restrict__ y0 = &y[ib + 0]; ++ const block_q8_0 *__restrict__ y1 = &y[ib + 1]; ++ ++ //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits ++ // and add them to make one 64 element vector ++ // load x ++ const svint8_t qx_32 = svld1_s8(ph32, x0->qs); ++ 
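++                    // pl32 activates only lanes 32..63, so this load starts at
++                    // x0->qs + 2 and its active lanes read bytes 34..65, which is
++                    // exactly x1->qs (sizeof(block_q8_0) == 2 + 32). Inactive
++                    // lanes are zeroed, so the svadd below packs x0 and x1 into
++                    // one 64-byte vector (the y loads are merged the same way).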
svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2); ++ ++ qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64); ++ ++ // load y ++ const svint8_t qy_32 = svld1_s8(ph32, y0->qs); ++ svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2); ++ ++ qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64); ++ ++ // scale creation ++ const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d); ++ const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d); ++ ++ // duplicate deq1 in first half of vector and deq2 in second half of vector ++ const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2); ++ ++ const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64)); ++ ++ sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp); ++ } ++ ++ sumf = svaddv_f32(svptrue_b32(), sumv00); ++ break; ++ } ++ default: ++ assert(false && "Unsupported vector length"); ++ break; ++ } ++#elif defined(__ARM_NEON) ++ float32x4_t sumv0 = vdupq_n_f32(0.0f); ++ float32x4_t sumv1 = vdupq_n_f32(0.0f); ++ ++ for (; ib + 1 < nb; ib += 2) { ++ const block_q8_0 *__restrict__ x0 = &x[ib + 0]; ++ const block_q8_0 *__restrict__ x1 = &x[ib + 1]; ++ const block_q8_0 *__restrict__ y0 = &y[ib + 0]; ++ const block_q8_0 *__restrict__ y1 = &y[ib + 1]; ++ ++ const int8x16_t x0_0 = vld1q_s8(x0->qs); ++ const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); ++ const int8x16_t x1_0 = vld1q_s8(x1->qs); ++ const int8x16_t x1_1 = vld1q_s8(x1->qs + 16); ++ ++ // load y ++ const int8x16_t y0_0 = vld1q_s8(y0->qs); ++ const int8x16_t y0_1 = vld1q_s8(y0->qs + 16); ++ const int8x16_t y1_0 = vld1q_s8(y1->qs); ++ const int8x16_t y1_1 = vld1q_s8(y1->qs + 16); ++ ++ sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32( ++ ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0), ++ ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d)); ++ ++ sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32( ++ ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0), ++ ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d)); ++ } ++ ++ sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); ++#else ++ /* 不加速 */ ++ for (; ib < nb; ++ib) { ++ int sumi = 0; ++ ++ for (int j = 0; j < qk; j++) { ++ sumi += x[ib].qs[j] * y[ib].qs[j]; ++ } ++ ++ sumf += sumi * (GGML_FP16_TO_FP32(x[ib].d) * GGML_FP16_TO_FP32(y[ib].d)); ++ } ++#endif ++ *s = sumf; ++} ++ +diff --git a/csrc/cpu/quantize.h b/csrc/cpu/quantize.h +new file mode 100644 +index 000000000..10e988da1 +--- /dev/null ++++ b/csrc/cpu/quantize.h +@@ -0,0 +1,253 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++typedef float16_t ggml_half; ++typedef float32_t ggml_half2; ++typedef float16_t ggml_fp16_t; ++typedef float16_t ggml_float; ++typedef float16_t f16; ++ ++#define QK4_0 32 ++typedef struct { ++ ggml_half d; // delta ++ uint8_t qs[QK4_0 / 2]; // nibbles / quants ggml_half ++} block_q4_0; ++ ++#define QK4_1 32 ++typedef struct { ++ float d; // delta ++ float m; // min ++ uint8_t qs[QK4_1 / 2]; // nibbles / quants ++} block_q4_1; ++ ++#define QK8_0 32 ++typedef struct { ++ ggml_half d; // delta ++ int8_t qs[QK8_0]; // quants ++} block_q8_0; ++ ++#define QK8_1 32 ++typedef struct { ++ union { ++ struct { ++ ggml_half d; // delta ++ ggml_half s; // d * sum(qs[i]) ++ } GGML_COMMON_AGGR_S; ++ ggml_half2 ds; ++ } GGML_COMMON_AGGR_U; ++ int8_t qs[QK8_1]; // quants ++} block_q8_1; ++ ++#define QK_K 256 //目前不支持GGML_QKK_64 ++typedef struct { ++ uint8_t ql[QK_K/2]; // quants, lower 4 bits ++ uint8_t qh[QK_K/4]; // 
quants, upper 2 bits ++ int8_t scales[QK_K/16]; // scales, quantized with 8 bits ++ ggml_half d; // super-block scale ggml_half ++} block_q6_K; ++ ++typedef struct { ++ uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits ++ uint8_t qs[QK_K/4]; ++ union { ++ struct { ++ ggml_half d; // super-block scale for quantized scales ++ ggml_half dmin; // super-block scale for quantized mins ++ } GGML_COMMON_AGGR; ++ ggml_half2 dm; ++ }; ++} block_q2_K; ++ ++typedef struct { ++ uint8_t hmask[QK_K/8]; // quants - high bit ++ uint8_t qs[QK_K/4]; // quants - low 2 bits ++ uint8_t scales[12]; // scales, quantized with 6 bits ++ ggml_half d; // super-block scale ++} block_q3_K; ++ ++typedef struct { ++ float d; // delta ++ int8_t qs[QK_K]; // quants ++ int16_t bsums[QK_K/16]; // sum of quants in groups of 16 ++} block_q8_K; ++ ++#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) ++ ++ ++void dequantize_row_q2_K(const block_q2_K *__restrict__ src, float *__restrict__ dst, int64_t k); ++void dequantize_row_q4_0(const block_q4_0 * __restrict__ src, float * __restrict__ dst, int64_t k); ++void dequantize_row_q4_1(const block_q4_1 * __restrict__ src, float * __restrict__ dst, int64_t k); ++void dequantize_row_q8_0(const block_q8_0 *__restrict__ x, float *__restrict__ y, int64_t k); ++ ++void quantize_row_q8_K(const float *__restrict__ x, block_q8_K *__restrict__ y, int64_t k); ++void quantize_row_q6_K(const float *__restrict__ x, block_q6_K *__restrict__ y, int64_t k); ++void quantize_row_q3_K(const float *__restrict__ x, block_q3_K *__restrict__ y, int64_t k); ++void quantize_row_q2_K(const float *__restrict__ x, block_q2_K *__restrict__ y, int64_t k); ++void quantize_row_q4_0(const float *__restrict__ x, block_q4_0 *__restrict__ y, int64_t k); ++void quantize_row_q4_1(const float *__restrict__ x, block_q4_1 *__restrict__ y, int64_t k); ++void quantize_row_q8_0(const float *__restrict__ x, block_q8_0 *__restrict__ y, int64_t k); ++void quantize_row_q8_1(const float * __restrict__ x, block_q8_1 * __restrict__ y, int64_t k); ++void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q4_0_q8_0(int n, float *__restrict__ s, size_t bs, const void *__restrict__ vx, ++ size_t bx, const void *__restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, size_t bs, const void * __restrict__ vx, ++ size_t bx, const void * __restrict__ vy, size_t by, int nrc); ++void ggml_vec_dot_q8_0_q8_0(int n, float *__restrict__ s, size_t bs, const void *__restrict__ vx, ++ size_t bx, const void *__restrict__ vy, size_t by, int nrc); ++ ++void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n); ++void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n); ++ ++void ggml_vec_dot_f16(int n, float * __restrict__ s, size_t bs, ggml_fp16_t * __restrict__ x, size_t bx, ggml_fp16_t * __restrict__ y, size_t by, int nrc); ++typedef void (*ggml_to_float_t)(const void *__restrict__ x, float *__restrict__ y, int64_t k); ++typedef void (*ggml_vec_dot_t)(int n, float 
*__restrict__ s, size_t bs, const void *__restrict__ x, size_t bx, ++ const void *__restrict__ y, size_t by, int nrc); ++typedef void (*ggml_from_float_t)(const float *__restrict__ x, void *__restrict__ y, int64_t k); ++ ++static inline float GGML_FP16_TO_FP32(ggml_half h) { ++ //ggml_half tmp; ++ //memcpy(&tmp, &h, sizeof(ggml_half)); ++ return (float)h; ++} ++ ++static inline float fp32_from_bits(uint32_t w) { ++ union { ++ uint32_t as_bits; ++ float as_value; ++ } fp32; ++ fp32.as_bits = w; ++ return fp32.as_value; ++} ++ ++static inline uint32_t fp32_to_bits(float f) { ++ union { ++ float as_value; ++ uint32_t as_bits; ++ } fp32; ++ fp32.as_value = f; ++ return fp32.as_bits; ++} ++ ++#define GGML_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) ++#if __AVX__ || __AVX2__ || __AVX512F__ ++static inline uint16_t ggml_compute_fp32_to_fp16(float f) { ++#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) ++ const float scale_to_inf = 0x1.0p+112f; ++ const float scale_to_zero = 0x1.0p-110f; ++#else ++ const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); ++ const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); ++#endif ++ float base = (fabsf(f) * scale_to_inf) * scale_to_zero; ++ ++ const uint32_t w = fp32_to_bits(f); ++ const uint32_t shl1_w = w + w; ++ const uint32_t sign = w & UINT32_C(0x80000000); ++ uint32_t bias = shl1_w & UINT32_C(0xFF000000); ++ if (bias < UINT32_C(0x71000000)) { ++ bias = UINT32_C(0x71000000); ++ } ++ ++ base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; ++ const uint32_t bits = fp32_to_bits(base); ++ const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); ++ const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); ++ const uint32_t nonsign = exp_bits + mantissa_bits; ++ return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); ++} ++#elif defined(__ARM_NEON) ++static inline ggml_half ggml_compute_fp32_to_fp16(float f) { ++ ggml_half res; ++ __fp16 tmp = f; ++ memcpy(&res, &tmp, sizeof(ggml_half)); ++ return res; ++} ++#endif ++ ++#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) ++ ++#define GGML_SIMD ++ ++// F16 NEON ++ ++#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) ++ #define GGML_F16_STEP 32 ++ #define GGML_F16_EPR 8 ++ ++ #define GGML_F16x8 float16x8_t ++ #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) ++ #define GGML_F16x8_SET1(x) vdupq_n_f16(x) ++ #define GGML_F16x8_LOAD(x) vld1q_f16((const f16 *)(x)) ++ #define GGML_F16x8_STORE vst1q_f16 ++ #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) ++ #define GGML_F16x8_ADD vaddq_f16 ++ #define GGML_F16x8_MUL vmulq_f16 ++ #define GGML_F16x8_REDUCE(res, x) \ ++ do { \ ++ int offset = GGML_F16_ARR >> 1; \ ++ for (int i = 0; i < offset; ++i) { \ ++ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ ++ } \ ++ offset >>= 1; \ ++ for (int i = 0; i < offset; ++i) { \ ++ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ ++ } \ ++ offset >>= 1; \ ++ for (int i = 0; i < offset; ++i) { \ ++ (x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \ ++ } \ ++ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \ ++ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \ ++ (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ ++ } while (0) ++ ++ #define GGML_F16_VEC GGML_F16x8 ++ #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO ++ #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 ++ #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) ++ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((f16 *)(p), (r)[i]) ++ #define GGML_F16_VEC_FMA GGML_F16x8_FMA ++ #define GGML_F16_VEC_ADD GGML_F16x8_ADD ++ #define GGML_F16_VEC_MUL GGML_F16x8_MUL ++ #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE ++#else ++ // if FP16 vector arithmetic is not supported, we use FP32 instead ++ // and take advantage of the vcvt_ functions to convert to/from FP16 ++ ++ #define GGML_F16_STEP 16 ++ #define GGML_F16_EPR 4 ++ ++ #define GGML_F32Cx4 float32x4_t ++ #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) ++ #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) ++ #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const f16 *)(x))) ++ #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) ++ #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) ++ #define GGML_F32Cx4_ADD vaddq_f32 ++ #define GGML_F32Cx4_MUL vmulq_f32 ++ #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE ++ ++ #define GGML_F16_VEC GGML_F32Cx4 ++ #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO ++ #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 ++ #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) ++ #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((f16 *)(p), r[i]) ++ #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA ++ #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD ++ #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL ++ #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE ++#endif ++ ++ #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) ++#endif ++ +diff --git a/csrc/cpu/sysHAX_ops.cpp b/csrc/cpu/sysHAX_ops.cpp +new file mode 100644 +index 000000000..3410388dc +--- /dev/null ++++ b/csrc/cpu/sysHAX_ops.cpp +@@ -0,0 +1,1651 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include"cpu_types.hpp" ++// #include "tensor.h" ++#include "quantize.h" ++#include // Linux ++#include ++ ++// #include "decode.h" ++ ++typedef unsigned int UINT32; ++typedef unsigned long long UINT64; ++typedef float f32; ++// typedef unsigned char bool; ++#define GENERAL_ARCH_BAICHUAN "baichuan" 
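++// sysHAX_ops.cpp implements the CPU-side decode kernels (fp16 dot products,
++// prefill/paged attention, RoPE, SiLU, RMSNorm) plus NUMA-aware weight
++// placement. The 65536-entry expf_f16_table declared below caches expf() for
++// every possible fp16 bit pattern, so the attention softmax can replace a
++// per-element expf() call with a table lookup, e.g. (illustrative):
++//   f16 diff = score - row_max;
++//   float e  = expf_f16_table[*(uint16_t *)&diff];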
++#define EPSILON 1e-6f /* float a1, float a2, if |a1 - a2| < EPSILON, then a1 = a2 */
++
++extern void transpose_v(f16 *vt, const f16 *v, int n_tokens, int dim, int qkv_dim);
++extern void prefill_attention(f16 *out_ptr, const f16 *qkv_ptr, const f16 *vt_ptr, int N_tokens, int N_seqs, const int *seq_lens);
++
++float expf_f16_table[65536];
++
++
++template 
++struct KernelVecType{
++    using q_load_vec_t = void;
++    using k_load_vec_t = void;
++    using v_load_vec_t = void;
++    using q_k_v_vec_t = void;
++    using accum_vec_t = void;
++};
++
++template<>
++struct KernelVecType{
++    using q_load_vec_t = vec_op::FP16Vec8;
++    using k_load_vec_t = vec_op::FP16Vec16;
++    using v_load_vec_t = vec_op::FP16Vec16;
++    using q_k_v_vec_t = vec_op::FP16Vec16;
++    using accum_vec_t = vec_op::FP16Vec16;
++};
++
++/* records the data type of every weight tensor */
++typedef struct {
++    int token_embd_weight;
++    int attn_k_weight;
++    int attn_k_bias;
++    int attn_norm_weight;
++    int attn_q_weight;
++    int attn_q_bias;
++    int attn_v_weight;
++    int attn_v_bias;
++    int ffn_down_weight;
++    int ffn_gate_weight;
++    int ffn_norm_weight;
++    int ffn_up_weight;
++    int attn_output_weight;
++    int output_weight;
++    int output_norm_weight;
++} WeightTypes;
++
++WeightTypes weight_types;
++
++
++enum ggml_type {
++    GGML_TYPE_F32 = 0,
++    GGML_TYPE_F16 = 1,
++    GGML_TYPE_Q4_0 = 2,
++    GGML_TYPE_Q4_1 = 3,
++    GGML_TYPE_Q5_0 = 6,
++    GGML_TYPE_Q5_1 = 7,
++    GGML_TYPE_Q8_0 = 8,
++    GGML_TYPE_Q8_1 = 9,
++    GGML_TYPE_Q2_K = 10,
++    GGML_TYPE_Q3_K = 11,
++    GGML_TYPE_Q4_K = 12,
++    GGML_TYPE_Q5_K = 13,
++    GGML_TYPE_Q6_K = 14,
++    GGML_TYPE_Q8_K = 15,
++    GGML_TYPE_IQ2_XXS = 16,
++    GGML_TYPE_IQ2_XS = 17,
++    GGML_TYPE_IQ3_XXS = 18,
++    GGML_TYPE_IQ1_S = 19,
++    GGML_TYPE_IQ4_NL = 20,
++    GGML_TYPE_IQ3_S = 21,
++    GGML_TYPE_IQ2_S = 22,
++    GGML_TYPE_IQ4_XS = 23,
++    GGML_TYPE_I8 = 24,
++    GGML_TYPE_I16 = 25,
++    GGML_TYPE_I32 = 26,
++    GGML_TYPE_I64 = 27,
++    GGML_TYPE_F64 = 28,
++    GGML_TYPE_IQ1_M = 29,
++    GGML_TYPE_COUNT,
++};
++
++// work-division descriptor: maps OpenMP threads onto NUMA nodes
++typedef struct WorkDivider {
++    int num_threads;
++    int tid;
++    int num_numas;
++    int threads_per_numa;
++    int my_numa;
++    int tid_in_numa;
++} WorkDivider;
++
++// work-range structs: single-NUMA and multi-NUMA variants
++typedef struct SingleNumaWorkRange {
++    int begin_thread;
++    int end_thread;
++    int work_per_thread;
++} SingleNumaWorkRange;
++
++typedef struct MultiNumaWorkRange {
++    int begin_numa;
++    int end_numa;
++    int work_per_numa;
++    int begin_thread;
++    int end_thread;
++    int work_per_thread;
++} MultiNumaWorkRange;
++
++
++typedef struct {
++    const char *pcTypeName;
++    UINT32 uiblkSize;
++    UINT32 uiTypeSize;
++    ggml_from_float_t quantize;
++    ggml_to_float_t dequantize;
++    enum ggml_type VecDotType; /* quantization type used for the matrix dot product */
++    ggml_vec_dot_t VecDotFunc; /* dot-product kernel for this type */
++} BLOCK_DATA_INFO;
++
++const char *g_ModelArch = "qwen2"; /* model architecture */
++
++BLOCK_DATA_INFO g_BlockDataInfo[] = {
++    {"f32", 1, sizeof(float), NULL, NULL, GGML_TYPE_F32, NULL},
++    {"f16", 1, sizeof(uint16_t), (ggml_from_float_t)ggml_fp32_to_fp16_row, (ggml_to_float_t)ggml_fp16_to_fp32_row, GGML_TYPE_F16, (ggml_vec_dot_t)ggml_vec_dot_f16},
++    {"q4_0", QK4_0, sizeof(block_q4_0), (ggml_from_float_t)quantize_row_q4_0, (ggml_to_float_t)dequantize_row_q4_0, GGML_TYPE_Q8_0, (ggml_vec_dot_t)ggml_vec_dot_q4_0_q8_0},
++    {"q4_1", QK4_1, sizeof(block_q4_1), (ggml_from_float_t)quantize_row_q4_1, (ggml_to_float_t)dequantize_row_q4_1, GGML_TYPE_Q8_1, (ggml_vec_dot_t)ggml_vec_dot_q4_1_q8_1},
++    {"", 0, 0, NULL},
++    {"", 0, 0, NULL},
++    {"", 0, 0, NULL},
++    {"", 0, 0, NULL},
++    {"q8_0", QK8_0, sizeof(block_q8_0), (ggml_from_float_t)quantize_row_q8_0, (ggml_to_float_t)dequantize_row_q8_0, GGML_TYPE_Q8_0, (ggml_vec_dot_t)ggml_vec_dot_q8_0_q8_0},
++    // {"q8_1", QK8_1, sizeof(block_q8_1), (ggml_from_float_t)quantize_row_q8_1},
++    // {"q2_K", QK_K, sizeof(block_q2_K), (ggml_from_float_t)quantize_row_q2_K, (ggml_to_float_t)dequantize_row_q2_K, GGML_TYPE_Q8_K, (ggml_vec_dot_t)ggml_vec_dot_q2_K_q8_K}, //10
++    // {"q3_K", QK_K, sizeof(block_q3_K), (ggml_from_float_t)quantize_row_q3_K, NULL, GGML_TYPE_Q8_K, (ggml_vec_dot_t)ggml_vec_dot_q3_K_q8_K},
++    // {"", 0, 0, NULL},
++    // {"", 0, 0, NULL}, //13
++    // {"q6_K", QK_K, sizeof(block_q6_K), (ggml_from_float_t)quantize_row_q6_K, NULL, GGML_TYPE_Q8_K, (ggml_vec_dot_t)ggml_vec_dot_q6_K_q8_K},
++    // {"q8_K", QK_K, sizeof(block_q8_K), (ggml_from_float_t)quantize_row_q8_K},
++};
++
++typedef struct {
++    enum ggml_type DataType; /* data type of this tensor */
++    union {
++        void *tensor1;
++        void **tensor2;
++        void ***tensor3;
++    } Data;
++} TENSOR_INFO;
++
++/* holds the pointers to the loaded model weights */
++typedef struct weight {
++    TENSOR_INFO token_embedding;
++    TENSOR_INFO rms_att_norm; // (layer, dim) rmsnorm weights
++    TENSOR_INFO rms_ffn_norm; // (layer, dim)
++    TENSOR_INFO Wqkv;
++    TENSOR_INFO wo; // (layer, n_head * head_size, dim)
++    // weights for bias
++    TENSOR_INFO qkv_bias;
++    // weights for ffn
++    TENSOR_INFO w1w3;
++    TENSOR_INFO ffn_down; // (layer, dim, hidden_dim)
++    TENSOR_INFO output; //output linear
++    TENSOR_INFO output_norm; //output RMS norm
++} WEIGHT;
++
++
++typedef struct {
++    void *Token_Ori; /* original token embedding */
++    void *Token_Norm; /* token after normalization */
++    float *K; /* K matrix */
++    float *Q; /* Q matrix */
++    float *V; /* V matrix */
++    float *QK; /* Q*K^T result */
++    float *Attn_out; /* attention output */
++    f16 *Attn_out_f16;
++    float *ffn_Gate; /* ffn gate output */
++    float *ffn_up; /* ffn_up */
++    float *logits; /* output logits for sampling */
++    float **key_cache; /* K cache */
++    float **value_cache; /* V cache */
++    void **temp_output_vec_numa;
++    void **tmp_vec_numa;
++    f16 *seq_qkv; /* new: qkv output for the whole sequence */
++    float *add_weight; /* new: additional weight buffer */
++    float *output_f32;
++} MODEL_RUN_STATE;
++
++typedef struct {
++    int dim; /* embedding dimension */
++    int n_head; /* number of attention heads */
++    int n_kv_heads; /* number of KV heads */
++    int hidden_dim; /* FFN hidden dimension */
++    int n_layers; /* number of layers */
++    int context_length; /* context length */
++    float norm_rms_eps; /* eps */
++    int n_vocab; /* vocabulary size */
++    float rope_freq_base; /* RoPE frequency base */
++    f16 *cos_sin_cache; /* precomputed RoPE cos/sin values */
++    int n_rotary; /* rotary dimension */
++    bool is_neox_style; /* RoPE style */
++    double attn_scale; /* attention scale coefficient */
++} MODEL_HYPE_PARA;
++
++__thread f16 qk_tmp_storage[131072];
++
++int g_numas = numa_num_configured_nodes();
++WEIGHT g_pstWeight;
++MODEL_RUN_STATE g_stRunState;
++MODEL_HYPE_PARA g_pstModelHypePara;
++
++float f16_to_f32(f16 h){return h;}
++f16 f32_to_f16(float h){return h;}
++
++__attribute__((noinline))
++f16 DOTPRODUCT_vv_f16(int M, const f16 *src0_ptr, const f16 *src1_ptr)
++{
++    __builtin_prefetch(&src0_ptr[0], 0 , 0);
++    __builtin_prefetch(&src1_ptr[0], 0 , 0);
++    if (M >= 128) {
++        __builtin_prefetch(&src0_ptr[32], 0 , 0);
++        __builtin_prefetch(&src1_ptr[32], 0 , 0);
++        __builtin_prefetch(&src0_ptr[64], 0 , 0);
++        __builtin_prefetch(&src1_ptr[64], 0 , 0);
++        __builtin_prefetch(&src0_ptr[96], 0 , 0);
++        __builtin_prefetch(&src1_ptr[96], 0 , 0);
++    }
++    float sumf = 0.0f;
++    int j = 0;
++#ifdef __ARM_NEON
++    const int M_UNROLL = 8;
++    const int M_SIMD = 8;
++    float16x8_t sum[M_UNROLL] = {vdupq_n_f16(0.0f)};
++    for (; j <= M - M_UNROLL * M_SIMD; j += M_UNROLL * M_SIMD) {
++        __builtin_prefetch(&src0_ptr[j + 192], 0 , 0);
++ __builtin_prefetch(&src1_ptr[j + 192], 0 , 0); ++ __builtin_prefetch(&src0_ptr[j + 224], 0 , 0); ++ __builtin_prefetch(&src1_ptr[j + 224], 0 , 0); ++ __builtin_prefetch(&src0_ptr[j + 256], 0 , 0); ++ __builtin_prefetch(&src1_ptr[j + 256], 0 , 0); ++ __builtin_prefetch(&src0_ptr[j + 288], 0 , 0); ++ __builtin_prefetch(&src1_ptr[j + 288], 0 , 0); ++ __builtin_prefetch(&src0_ptr[j + 320], 0 , 0); ++ __builtin_prefetch(&src1_ptr[j + 320], 0 , 0); ++ for (int ss = 0; ss < M_UNROLL; ss++) { ++ sum[ss] = vfmaq_f16(sum[ss], vld1q_f16(&src0_ptr[j + ss * M_SIMD]), vld1q_f16(&src1_ptr[j + ss * M_SIMD])); ++ } ++ } ++ ++ for (; j <= M - 8; j += 8) { ++ sum[0] = vfmaq_f16(sum[0], vld1q_f16(&src0_ptr[j]), vld1q_f16(&src1_ptr[j])); ++ } ++ sum[0] = vaddq_f16(vaddq_f16(sum[0], sum[2]), vaddq_f16(sum[1], sum[3])); ++ if (M_UNROLL > 4) { ++ sum[4] = vaddq_f16(vaddq_f16(sum[4], sum[6]), vaddq_f16(sum[5], sum[7])); ++ sum[0] = vaddq_f16(sum[0], sum[4]); ++ } ++ ++ float32x4_t t0 = vcvt_f32_f16(vget_low_f16(sum[0])); ++ float32x4_t t1 = vcvt_f32_f16(vget_high_f16(sum[0])); ++ sumf = vaddvq_f32(vaddq_f32(t0, t1)); ++#endif ++ for (; j < M; j++) { ++ sumf += (f16_to_f32(src0_ptr[j]) * f16_to_f32(src1_ptr[j])); ++ } ++ ++ return sumf; ++} ++ ++void transpose_v(f16 *vt, const f16 *v, int n_tokens, int dim, int qkv_dim) ++{ ++ for (int i = 0; i < n_tokens; i++) { ++ int j = 0; ++ for (int j = 0; j < dim; j++) { ++ vt[j * n_tokens + i] = v[i * qkv_dim + j]; ++ } ++ } ++} ++ ++void prefill_attention(f16 *out_ptr, const f16 *qkv_ptr, const f16 *vt_ptr, int N_tokens, int N_seqs, const int *seq_lens) ++{ ++ int N_gqa = g_pstModelHypePara.n_head / g_pstModelHypePara.n_kv_heads, dim_gqa = g_pstModelHypePara.dim / N_gqa; ++ int qkv_stride = g_pstModelHypePara.dim + 2 * dim_gqa, head_size = g_pstModelHypePara.dim / g_pstModelHypePara.n_head; ++ const f16 *q_ptr = qkv_ptr, *k_ptr = q_ptr + g_pstModelHypePara.dim; ++ ++ std::vector seqlen_prefix_sum; ++ seqlen_prefix_sum.push_back(0); ++ for (int i = 0, sum_seq_lens = 0; i < N_seqs; i++) { ++ sum_seq_lens += seq_lens[i]; ++ seqlen_prefix_sum.push_back(sum_seq_lens); ++ } ++ ++ int num_threads = omp_get_num_threads(); ++ num_threads = num_threads / 4; ++ ++ #pragma omp parallel for collapse(2) schedule(dynamic, 1) num_threads(num_threads) ++ for (int seq = 0; seq < N_seqs; seq++) { ++ for (int h_q = 0; h_q < g_pstModelHypePara.n_head; h_q++) { ++ f16 *qk_tmp = (f16 *)qk_tmp_storage; ++ int seq_t_begin = seqlen_prefix_sum[seq], seq_t_end = seqlen_prefix_sum[seq + 1]; ++ int h_kv = h_q / N_gqa; ++ ++ for (int t = seq_t_begin; t < seq_t_end; t++) { ++ const f16 *q_head_ptr = q_ptr + t * qkv_stride + h_q * head_size; ++ int token_idx_in_seq = t - seq_t_begin; ++ f16 row_max = -INFINITY; ++ for (int i = 0; i <= token_idx_in_seq; i++) { ++ const f16 *k_head_ptr = k_ptr + (seq_t_begin + i) *qkv_stride + h_kv *head_size; ++ qk_tmp[i] = DOTPRODUCT_vv_f16(head_size, q_head_ptr, k_head_ptr); ++ row_max = qk_tmp[i] > row_max ? 
qk_tmp[i] : row_max; ++ } ++ f32 sumexp = 0.0f; ++ for (int i = 0; i <= token_idx_in_seq; i++) { ++ f16 diff = qk_tmp[i] - row_max; ++ f32 exp_result = expf_f16_table[*(uint16_t *)&diff]; ++ qk_tmp[i] = exp_result; ++ sumexp += exp_result; ++ } ++ ++ for (int i = 0; i <= token_idx_in_seq; i++) { ++ qk_tmp[i] /= sumexp; ++ } ++ ++ for (int iv = 0; iv < head_size; iv++) { ++ const f16 *vt_seq_ptr = vt_ptr + h_kv * head_size * N_tokens + iv *N_tokens + seq_t_begin; ++ out_ptr[t * g_pstModelHypePara.dim + h_q * head_size + iv] = DOTPRODUCT_vv_f16(token_idx_in_seq + 1, vt_seq_ptr, qk_tmp); ++ } ++ } ++ } ++ } ++} ++ ++template ++void paged_attention_v1_impl( scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size] ++ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size] ++ const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x] ++ const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size] ++ const int num_kv_heads, ++ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq] ++ const int* __restrict__ seq_lens, // [num_seqs] ++ const int max_num_blocks_per_seq, ++ const int q_stride, const int kv_block_stride, const int kv_head_stride, ++ const int num_seqs, const int num_heads, const int HEAD_SIZE) ++{ ++ using q_load_vec_t = typename KernelVecType::q_load_vec_t; ++ using k_load_vec_t = typename KernelVecType::k_load_vec_t; ++ using v_load_vec_t = typename KernelVecType::v_load_vec_t; ++ using q_k_v_vec_t = typename KernelVecType::q_k_v_vec_t; ++ using accum_vec_t = typename KernelVecType::accum_vec_t; ++ using accum_scalar_t = scalar_t; ++ ++ constexpr int BLOCK_SIZE = 16; ++ constexpr int x = BLOCK_SIZE / sizeof(scalar_t); ++ static_assert(k_load_vec_t::get_elem_num() % x == 0); ++ static_assert(q_load_vec_t::get_elem_num() * sizeof(scalar_t) == 16); ++ ++ constexpr int TOKEN_PER_GROUP = k_load_vec_t::get_elem_num() / x; ++ constexpr int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP; ++ static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4); ++ ++ const int N_gqa = num_heads / num_kv_heads; ++ ++ int num_threads = omp_get_num_threads(); ++ num_threads = num_threads / 4; ++ ++#pragma omp parallel for collapse(2) schedule(dynamic, 1) num_threads(num_threads) ++ for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) { ++ for (int head_idx = 0; head_idx < num_heads; ++head_idx) { ++ accum_scalar_t *qk_tmp = (accum_scalar_t *)qk_tmp_storage; ++ int seq_len = seq_lens[seq_idx]; ++ const int* seq_block_table = block_tables + max_num_blocks_per_seq * seq_idx; ++ const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE; ++ const int64_t kv_head_idx = head_idx / N_gqa; ++ const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE; ++ const scalar_t* __restrict__ q_vec_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ++ ++ // Compute logits ++ for (int block_idx = 0; block_idx < block_num; ++block_idx) { ++ const int64_t physical_block_idx = seq_block_table[block_idx]; ++ const scalar_t* __restrict__ k_block_cache_ptr = ++ k_cache + physical_block_idx * kv_block_stride + ++ kv_head_idx * kv_head_stride; ++ const int token_num = (block_idx == block_num - 1) ? 
last_block_token_num : BLOCK_SIZE; ++ const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP; ++ accum_vec_t group_accums[MAX_GROUP_NUM]; ++ for (int q_offset = 0; q_offset < HEAD_SIZE; q_offset +=x, k_block_cache_ptr += x * BLOCK_SIZE) { ++ q_load_vec_t q_load_group_vec(q_vec_ptr + q_offset); ++ q_k_v_vec_t q_group_vec(q_load_group_vec); ++ ++ for (int token_group_idx = 0; token_group_idx < group_num; token_group_idx++) { ++ k_load_vec_t k_load_group_vec(k_block_cache_ptr + token_group_idx * x * TOKEN_PER_GROUP); ++ q_k_v_vec_t k_group_vec(k_load_group_vec); ++ vec_op::fma(group_accums[token_group_idx], q_group_vec, k_group_vec); ++ vec_op::prefetch(k_block_cache_ptr + x *BLOCK_SIZE + token_group_idx * x *TOKEN_PER_GROUP); ++ } ++ } ++ for (int token_group_idx = 0; token_group_idx < group_num; token_group_idx++) { ++ for (int token_idx = 0; token_idx < TOKEN_PER_GROUP; token_idx++) { ++ accum_scalar_t dot_v = ++ group_accums[token_group_idx]. ++ template reduce_sub_sum(token_idx); ++ qk_tmp[block_idx * BLOCK_SIZE + token_group_idx * TOKEN_PER_GROUP + token_idx] = dot_v; ++ } ++ } ++ } ++ ++ f32 max = qk_tmp[0], sum = 0.0; ++ for (int i = 1; i < seq_len; i++) { ++ max = max >= qk_tmp[i] ? max : qk_tmp[i]; ++ } ++ ++ for (int i = 0; i < seq_len; i++) { ++ f16 diff = qk_tmp[i] - max; ++ qk_tmp[i] = expf_f16_table[*(uint16_t *)&diff]; ++ sum += qk_tmp[i]; ++ } ++ int i = 0; ++ for (; i < seq_len; i++) { ++ qk_tmp[i] /= sum; ++ } ++ for (; i < block_num * BLOCK_SIZE; i++) { ++ qk_tmp[i] = 0; ++ } ++ ++ constexpr int head_elem_num_per_partition = 16; ++ assert(HEAD_SIZE % head_elem_num_per_partition == 0); ++ int head_partition_num = HEAD_SIZE / head_elem_num_per_partition; ++ for (int head_part_idx = 0; head_part_idx < head_partition_num; ++head_part_idx) { ++ accum_vec_t accums[head_elem_num_per_partition]; ++ scalar_t* __restrict__ out_ptr = ++ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE + ++ head_part_idx * head_elem_num_per_partition; ++ for (int block_idx = 0; block_idx < block_num; ++block_idx) { ++ const int64_t physical_block_idx = seq_block_table[block_idx]; ++ const scalar_t* __restrict__ v_block_cache_ptr = ++ v_cache + physical_block_idx * kv_block_stride + ++ kv_head_idx * kv_head_stride + BLOCK_SIZE * head_part_idx * ++ head_elem_num_per_partition; ++ ++ accum_vec_t qk_vec(qk_tmp + block_idx * BLOCK_SIZE); ++ for (int head_elem_idx = 0; head_elem_idx < head_elem_num_per_partition; head_elem_idx++) { ++ v_load_vec_t v_load_vec(v_block_cache_ptr + BLOCK_SIZE * head_elem_idx); ++ accum_vec_t v_vec(v_load_vec); ++ vec_op::fma(accums[head_elem_idx], qk_vec, v_vec); ++ } ++ ++ if (block_idx != block_num - 1) { ++ const int64_t next_physical_block_idx = ++ seq_block_table[block_idx + 1]; ++ const scalar_t* __restrict__ next_v_block_cache_ptr = ++ v_cache + next_physical_block_idx * kv_block_stride + ++ kv_head_idx * kv_head_stride + ++ BLOCK_SIZE * head_part_idx * head_elem_num_per_partition; ++ ++ for (int head_elem_idx = 0; head_elem_idx < head_elem_num_per_partition; head_elem_idx += 2) { ++ vec_op::prefetch(next_v_block_cache_ptr + BLOCK_SIZE * head_elem_idx); ++ } ++ } ++ } ++ ++ for (int head_elem_idx = 0; head_elem_idx < head_elem_num_per_partition; head_elem_idx++) { ++ float value = accums[head_elem_idx].reduce_sum(); ++ vec_op::storeFP32(value, out_ptr + head_elem_idx); ++ } ++ } ++ } ++ } ++} ++ ++void Quantize(void *Dst, float *src, enum ggml_type DataType, int size) ++{ ++ g_BlockDataInfo[DataType].quantize(src, Dst, size); ++} ++ ++// 
Initialize the work-division struct (map OpenMP threads onto NUMA nodes)
++void init_work_divider(WorkDivider *divider, int numas) {
++    divider->num_numas = numas;
++    divider->num_threads = omp_get_num_threads();
++    if (divider->num_threads % divider->num_numas != 0) {
++        fprintf(stderr, "nthreads (%d) %% numas (%d) != 0\n", divider->num_threads, divider->num_numas);
++        exit(1);
++    }
++    divider->tid = omp_get_thread_num();
++    // printf("tid:%d, num_threads:%d \n", divider->tid, divider->num_threads);
++    divider->threads_per_numa = divider->num_threads / divider->num_numas;
++    divider->my_numa = divider->tid / divider->threads_per_numa;
++    divider->tid_in_numa = divider->tid % divider->threads_per_numa;
++}
++
++void RmsNorm(float *DstData, float *SrcData, float *SrcWeight, float eps, int dataNum)
++{
++    float ss = 0.0f;
++    for (int j = 0; j < dataNum; j++) {
++        ss += SrcData[j] * SrcData[j];
++    }
++    ss /= dataNum;
++    ss += eps;
++    ss = 1.0f / sqrtf(ss);
++    for (int j = 0; j < dataNum; j++) {
++        DstData[j] = SrcWeight[j] * (ss * SrcData[j]);
++    }
++}
++
++void divide_all_work(const WorkDivider *divider, int total_workitems, SingleNumaWorkRange *pstSingleRange)
++{
++    int work_per_thread = total_workitems / divider->num_threads;
++    int work_remaining = total_workitems % divider->num_threads;
++    if (work_remaining == 0) {
++        pstSingleRange->begin_thread = divider->tid * work_per_thread;
++        pstSingleRange->end_thread = divider->tid * work_per_thread + work_per_thread;
++        pstSingleRange->work_per_thread = work_per_thread;
++    } else if (divider->tid < work_remaining) {
++        pstSingleRange->begin_thread = divider->tid * work_per_thread + divider->tid;
++        pstSingleRange->end_thread = (divider->tid + 1) * work_per_thread + (divider->tid + 1);
++        pstSingleRange->work_per_thread = work_per_thread + 1;
++    } else {
++        pstSingleRange->begin_thread = divider->tid * work_per_thread + work_remaining;
++        pstSingleRange->end_thread = (divider->tid + 1) * work_per_thread + work_remaining;
++        pstSingleRange->work_per_thread = work_per_thread;
++    }
++    return;
++}
++
++// Divide work among the threads of the first NUMA node only
++void divide_work_first_numa(const WorkDivider *divider, int total_workitems, SingleNumaWorkRange *pstSingleRange)
++{
++    if (divider->my_numa == 0) {
++        int work_per_thread = total_workitems / divider->threads_per_numa;
++        int work_remaining = total_workitems % divider->threads_per_numa;
++        if (work_remaining == 0) {
++            pstSingleRange->begin_thread = divider->tid * work_per_thread;
++            pstSingleRange->end_thread = divider->tid * work_per_thread + work_per_thread;
++            pstSingleRange->work_per_thread = work_per_thread;
++        } else if (divider->tid < work_remaining) {
++            pstSingleRange->begin_thread = divider->tid * work_per_thread + divider->tid;
++            pstSingleRange->end_thread = (divider->tid + 1) * work_per_thread + (divider->tid + 1);
++            pstSingleRange->work_per_thread = work_per_thread + 1;
++        } else {
++            pstSingleRange->begin_thread = divider->tid * work_per_thread + work_remaining;
++            pstSingleRange->end_thread = (divider->tid + 1) * work_per_thread + work_remaining;
++            pstSingleRange->work_per_thread = work_per_thread;
++        }
++        return;
++    }
++
++    pstSingleRange->begin_thread = 0;
++    pstSingleRange->end_thread = 0;
++    pstSingleRange->work_per_thread = 0;
++}
++
++// Divide work across all NUMA nodes
++void divide_work_all_numas(const WorkDivider *divider, int total_workitems, MultiNumaWorkRange *pstNulRange)
++{
++    int max_workitems_per_numa = (total_workitems - 1) / divider->num_numas + 1;
++    int workitem_numa_begin = divider->my_numa * max_workitems_per_numa;
++    int workitem_numa_end = workitem_numa_begin + max_workitems_per_numa;
++    if (workitem_numa_end > total_workitems) {
++        workitem_numa_end = total_workitems;
++    }
++    int workitems_my_numa = workitem_numa_end - workitem_numa_begin;
++    int max_workitems_per_thread = (workitems_my_numa - 1) / divider->threads_per_numa + 1;
++    int begin = divider->tid_in_numa * max_workitems_per_thread;
++    int end = begin + max_workitems_per_thread;
++    if (end > workitems_my_numa) {
++        end = workitems_my_numa;
++    }
++
++    pstNulRange->begin_numa = workitem_numa_begin;
++    pstNulRange->end_numa = workitem_numa_end;
++    pstNulRange->work_per_numa = max_workitems_per_numa;
++    pstNulRange->begin_thread = begin;
++    pstNulRange->end_thread = end;
++    pstNulRange->work_per_thread = end - begin;
++}
++
++/* Dequantization */
++void Dequantize(void *DstData, void *SrcData, WEIGHT *pstWeight, int dataNum)
++{
++    enum ggml_type TokenType = pstWeight->token_embedding.DataType;
++
++    /* case where dequantization is actually required */
++    if (TokenType != GGML_TYPE_F32 && pstWeight->rms_att_norm.DataType == GGML_TYPE_F32) {
++        g_BlockDataInfo[TokenType].dequantize(SrcData, static_cast<float *>(DstData), dataNum);
++    }
++}
++
++void divide_kv_cache_numa(const WorkDivider * divider, int total_workitems,
++    SingleNumaWorkRange * pstSingleRange)
++{
++    int work_per_thread;
++    int work_remaining;
++    int work_per_numa;
++    int NumaNum = divider->num_numas;
++    int threads_per_numa = divider->threads_per_numa;
++    int head_per_numa;
++    int tid_group, tid_use, head_add;
++
++    if (total_workitems % NumaNum != 0) {
++        fprintf(stderr, "kv cache: heads (%d) %% numas (%d) != 0\n", total_workitems, NumaNum);
++        exit(1);
++    }
++
++    work_per_numa = total_workitems / NumaNum;
++    if (total_workitems <= divider->num_threads) {
++        /* park the threads that are not needed */
++        if (divider->tid % threads_per_numa >= work_per_numa) {
++            pstSingleRange->begin_thread = 0;
++            pstSingleRange->end_thread = 0;
++            return;
++        }
++
++        /* kv-cache assignment: one head per thread */
++        pstSingleRange->begin_thread = divider->tid_in_numa + divider->my_numa * work_per_numa;
++        pstSingleRange->end_thread = pstSingleRange->begin_thread + 1;
++    } else {
++        tid_group = divider->tid / threads_per_numa;
++        tid_use = divider->tid % threads_per_numa;
++        head_per_numa = total_workitems / NumaNum;
++        work_per_thread = head_per_numa / threads_per_numa;
++        work_remaining = head_per_numa % threads_per_numa;
++        head_add = head_per_numa * tid_group;
++        if (work_remaining == 0) {
++            pstSingleRange->begin_thread = tid_use * work_per_thread + head_add;
++            pstSingleRange->end_thread = tid_use * work_per_thread + work_per_thread + head_add;
++        } else if (tid_use < work_remaining) {
++            pstSingleRange->begin_thread = tid_use * work_per_thread + tid_use + head_add;
++            pstSingleRange->end_thread = (tid_use + 1) * work_per_thread + (tid_use + 1) + head_add;
++        } else {
++            pstSingleRange->begin_thread = tid_use * work_per_thread + work_remaining + head_add;
++            pstSingleRange->end_thread = (tid_use + 1) * work_per_thread + work_remaining + head_add;
++        }
++    }
++    return;
++}
++
++void Rope_embedding(MODEL_RUN_STATE *pstRunState, MODEL_HYPE_PARA *pstModelPara, int pos, int n_tokens)
++{
++    int dim = pstModelPara->dim;
++    int n_kv_heads = pstModelPara->n_kv_heads;
++    int n_head = pstModelPara->n_head;
++    long kv_dim = (dim * n_kv_heads) / n_head;
++    int head_size = dim / n_head;
++    float rope_freq_base = (fabsf(pstModelPara->rope_freq_base - 0.0f) < EPSILON)
++        ?
10000.0f ++ : pstModelPara->rope_freq_base; ++ ++ bool ropetype = strstr(g_ModelArch, "qwen") != NULL; ++ for (int k = 0; k < n_tokens; k++, pos++) { ++ if (!ropetype){ ++ for (int i = 0; i < dim; i += 2) { ++ int head_dim = i % head_size; ++ float freq = 1.0f / powf(rope_freq_base, head_dim / (float)head_size); ++ float val = pos * freq; ++ float fcr = cosf(val); ++ float fci = sinf(val); ++ int rotn = i < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only ++ for (int v = 0; v < rotn; v++) { ++ float *vec = v == 0 ? (float *)pstRunState->Q + k * dim : (float *)pstRunState->K + k * kv_dim; // the vector to rotate (query or key) ++ float v0 = vec[i]; ++ float v1 = vec[i+1]; ++ vec[i] = v0 * fcr - v1 * fci; ++ vec[i+1] = v0 * fci + v1 * fcr; ++ } ++ } ++ }else{ ++ for (int j = 0; j < dim / head_size; j++){ ++ for (int i = 0; i < head_size; i += 2) { ++ int I = i / 2; ++ float freq = 1.0f / powf(rope_freq_base, i / (float)head_size); ++ float val = pos * freq; ++ float fcr = cosf(val); ++ float fci = sinf(val); ++ int rotn = i + j * head_size < kv_dim ? 2 : 1; // how many vectors? 2 = q & k, 1 = q only ++ for (int v = 0; v < rotn; v++) { ++ float *vec = v == 0 ? (float *)pstRunState->Q + k * dim : (float *)pstRunState->K + k * kv_dim; // the vector to rotate (query or key) ++ float v0 = vec[I + j * head_size]; ++ float v1 = vec[I + j * head_size + head_size / 2]; ++ vec[I + j * head_size] = v0 * fcr - v1 * fci; ++ vec[I + j * head_size + head_size / 2] = v0 * fci + v1 * fcr; ++ } ++ } ++ } ++ } ++ } ++} ++ ++void Active_Silu(f16 *dst, f16 *w1w3, int hidden_dim, int n_tokens) ++{ ++ int total_hidden_dim = hidden_dim << 1; ++ ++ for (int j = 0; j < n_tokens; j++) { ++ for (int i = 0; i < hidden_dim; i++) { ++ float val = w1w3[i + j * total_hidden_dim]; ++ val *= (1.0f / (1.0f + expf(-val))); ++ val *= w1w3[i + j * total_hidden_dim + hidden_dim]; ++ dst[i + j * hidden_dim] = (f16)val; ++ } ++ } ++} ++ ++__attribute__((noinline)) ++void Rope_embedding_impl(bool rope_type, int n_rotary, f16 *head_ptr, const f16 *cos_sin_cache, int position) ++{ ++ const f16 *cos_sin_ptr = cos_sin_cache + position * n_rotary; ++ int embed_dim = n_rotary >> 1; ++ ++ /* rope_neox */ ++ if (rope_type == true) { ++ int xx = 0, yy = embed_dim; ++ for (; xx <= embed_dim - 8; xx += 8, yy += 8) { ++ __builtin_prefetch(&head_ptr[xx + 32], 1, 2); ++ __builtin_prefetch(&head_ptr[yy + 32], 1, 2); ++ const float16x8_t qx = vld1q_f16(&head_ptr[xx]), qy = vld1q_f16(&head_ptr[yy]); ++ const float16x8_t csx = vld1q_f16(&cos_sin_ptr[xx]), csy = vld1q_f16(&cos_sin_ptr[yy]); ++ vst1q_f16(&head_ptr[xx], vfmaq_f16(vmulq_f16(qx, csx), vnegq_f16(qy), csy)); ++ vst1q_f16(&head_ptr[yy], vfmaq_f16(vmulq_f16(qy, csx), qx, csy)); ++ } ++ for (; xx < embed_dim; xx++, yy++) { ++ const f16 qx = head_ptr[xx], qy = head_ptr[yy]; ++ head_ptr[xx] = qx * cos_sin_ptr[xx] - qy * cos_sin_ptr[yy]; ++ head_ptr[yy] = qy * cos_sin_ptr[xx] + qx * cos_sin_ptr[yy]; ++ } ++ } else { /* rope_gptj */ ++ for (int j = 0; j < embed_dim; j++) { ++ const f16 qx = head_ptr[2 * j], qy = head_ptr[2 * j + 1]; ++ const f16 cos = cos_sin_ptr[j], sin = cos_sin_ptr[embed_dim + j]; ++ head_ptr[2 * j] = qx * cos - qy * sin; ++ head_ptr[2 * j + 1] = qy * cos + qx * sin; ++ } ++ } ++} ++ ++void quantization_weight_strategy(void *dst, void *src, int64_t quantization_bit_code, size_t Size) ++{ ++ int kv_dim = g_pstModelHypePara.dim * g_pstModelHypePara.n_kv_heads / g_pstModelHypePara.n_head; ++ float Buffer[kv_dim]; ++ int block_num = Size / kv_dim; ++ ++ /* 反量化 */ ++ if 
(quantization_bit_code == GGML_TYPE_F32) {
++        g_BlockDataInfo[GGML_TYPE_F16].dequantize(src, dst, Size);
++    } else {
++        if (quantization_bit_code != GGML_TYPE_F16) {
++            int offset = kv_dim / g_BlockDataInfo[quantization_bit_code].uiblkSize * g_BlockDataInfo[quantization_bit_code].uiTypeSize;
++            for (int i = 0; i < block_num; i++) {
++                g_BlockDataInfo[GGML_TYPE_F16].dequantize((char *)src + i * kv_dim * sizeof(f16), Buffer, kv_dim);
++                g_BlockDataInfo[quantization_bit_code].quantize(Buffer, (char *)dst + i * offset, kv_dim);
++            }
++        } else { /* copy the fp16 weights directly */
++            memcpy(dst, src, Size * sizeof(f16));
++        }
++    }
++}
++
++void load_weight_and_malloc_active_tensor(
++    int64_t dim, // MODEL_HYPE_PARA.dim embedding dimension
++    int64_t hidden_dim, // MODEL_HYPE_PARA.hidden_dim FFN hidden dimension
++    int64_t n_layers, // MODEL_HYPE_PARA.n_layers number of layers
++    int64_t n_vocab, // MODEL_HYPE_PARA.n_vocab vocabulary size
++    int64_t n_head, // MODEL_HYPE_PARA.n_head number of attention heads
++    int64_t n_kv_heads, // MODEL_HYPE_PARA.n_kv_heads number of KV heads
++    int64_t context_length, // MODEL_HYPE_PARA.context_length context length
++    double norm_rms_eps, // MODEL_HYPE_PARA.norm_rms_eps eps
++    double rope_freq_base, // MODEL_HYPE_PARA.rope_freq_base RoPE frequency base
++    double attn_scale,
++    int64_t is_neox_style,
++    int64_t quantization_bit_code,
++
++    torch::Tensor const &cos_sin_cache,
++    torch::Tensor token_embedding, // WEIGHT.token_embedding
++    torch::Tensor rms_att_norm, // WEIGHT.rms_att_norm
++    torch::Tensor rms_ffn_norm, // WEIGHT.rms_ffn_norm
++    torch::Tensor wqkv,
++    torch::Tensor wo, // WEIGHT.wo
++    torch::Tensor qkv_bias,
++    torch::Tensor w1w3,
++    torch::Tensor ffn_down, // WEIGHT.ffn_down
++    torch::Tensor output_norm, // WEIGHT.output_norm
++    torch::Tensor lm_head // WEIGHT.output
++){
++    g_pstModelHypePara.dim = dim; /* embedding dimension */
++    g_pstModelHypePara.n_head = n_head; /* number of attention heads */
++    g_pstModelHypePara.n_kv_heads = n_kv_heads; /* number of KV heads */
++    g_pstModelHypePara.hidden_dim = hidden_dim; /* FFN hidden dimension */
++    g_pstModelHypePara.n_layers = n_layers; /* number of layers */
++    g_pstModelHypePara.context_length = context_length; /* context length */
++    g_pstModelHypePara.norm_rms_eps = norm_rms_eps; /* eps */
++    g_pstModelHypePara.n_vocab = n_vocab; /* vocabulary size */
++    g_pstModelHypePara.rope_freq_base = rope_freq_base; /* RoPE frequency base */
++    g_pstModelHypePara.cos_sin_cache = (f16 *)cos_sin_cache.data_ptr();
++    g_pstModelHypePara.n_rotary = cos_sin_cache.size(1);
++    g_pstModelHypePara.is_neox_style = is_neox_style;
++    g_pstModelHypePara.attn_scale = attn_scale;
++
++    weight_types.token_embd_weight = GGML_TYPE_F32;
++    weight_types.attn_k_weight = quantization_bit_code;
++    weight_types.attn_k_bias = GGML_TYPE_F32;
++    weight_types.attn_norm_weight = GGML_TYPE_F32;
++    weight_types.attn_q_weight = quantization_bit_code;
++    weight_types.attn_q_bias = GGML_TYPE_F32;
++    weight_types.attn_v_weight = quantization_bit_code;
++    weight_types.attn_v_bias = GGML_TYPE_F32;
++    weight_types.ffn_down_weight = quantization_bit_code;
++    weight_types.ffn_gate_weight = quantization_bit_code;
++    weight_types.ffn_norm_weight = GGML_TYPE_F32;
++    weight_types.ffn_up_weight = quantization_bit_code;
++    weight_types.attn_output_weight = quantization_bit_code;
++    weight_types.output_weight = quantization_bit_code;
++    weight_types.output_norm_weight = GGML_TYPE_F32;
++
++    // build the expf() lookup table for every fp16 bit pattern
++    for(int i = 0; i < (1 << 16); ++i) {
++        float f = f16_to_f32(*(f16*)(&i));
++        expf_f16_table[i] = f32_to_f16(expf(f));
++    }
++
++    assert(wqkv.dtype() == torch::kFloat16);
++    int N_gqa = n_head / n_kv_heads;
++    int kv_dim = dim / N_gqa;
++
++    size_t
tokens_embedding_weight_size = (size_t)dim * n_vocab / g_BlockDataInfo[weight_types.token_embd_weight].uiblkSize * g_BlockDataInfo[weight_types.token_embd_weight].uiTypeSize; ++ ++ size_t attention_q_size_per_layer = (size_t)dim * dim / g_BlockDataInfo[weight_types.attn_q_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_q_weight].uiTypeSize; ++ size_t attention_k_size_per_layer = (size_t)dim * kv_dim / g_BlockDataInfo[weight_types.attn_k_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_k_weight].uiTypeSize; ++ size_t attention_v_size_per_layer = (size_t)dim * kv_dim / g_BlockDataInfo[weight_types.attn_v_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_v_weight].uiTypeSize; ++ size_t attention_size_per_layer = attention_q_size_per_layer + attention_k_size_per_layer + attention_v_size_per_layer; ++ ++ size_t bias_q_size_per_layer = (size_t)dim / g_BlockDataInfo[weight_types.attn_q_bias].uiblkSize * g_BlockDataInfo[weight_types.attn_q_bias].uiTypeSize; ++ size_t bias_k_size_per_layer = (size_t)kv_dim / g_BlockDataInfo[weight_types.attn_k_bias].uiblkSize * g_BlockDataInfo[weight_types.attn_k_bias].uiTypeSize; ++ size_t bias_v_size_per_layer = (size_t)kv_dim / g_BlockDataInfo[weight_types.attn_v_bias].uiblkSize * g_BlockDataInfo[weight_types.attn_v_bias].uiTypeSize; ++ size_t bias_qkv_size_per_layer = bias_q_size_per_layer + bias_k_size_per_layer + bias_v_size_per_layer; ++ ++ size_t attention_norm_size_per_layer = (size_t)dim / g_BlockDataInfo[weight_types.attn_norm_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_norm_weight].uiTypeSize; ++ ++ size_t ffn_down_size_per_layer = (size_t)dim * hidden_dim / g_BlockDataInfo[weight_types.ffn_down_weight].uiblkSize * g_BlockDataInfo[weight_types.ffn_down_weight].uiTypeSize; ++ size_t ffn_gate_size_per_layer = (size_t)dim * hidden_dim / g_BlockDataInfo[weight_types.ffn_gate_weight].uiblkSize * g_BlockDataInfo[weight_types.ffn_gate_weight].uiTypeSize; ++ size_t ffn_norm_size_per_layer = (size_t)dim / g_BlockDataInfo[weight_types.ffn_norm_weight].uiblkSize * g_BlockDataInfo[weight_types.ffn_norm_weight].uiTypeSize; ++ size_t ffn_up_size_per_layer = (size_t)dim * hidden_dim / g_BlockDataInfo[weight_types.ffn_up_weight].uiblkSize * g_BlockDataInfo[weight_types.ffn_up_weight].uiTypeSize; ++ size_t w1w3_size_per_layer = ffn_gate_size_per_layer + ffn_up_size_per_layer; ++ ++ size_t attention_output_size_per_layer = (size_t)dim * dim / g_BlockDataInfo[weight_types.attn_output_weight].uiblkSize * g_BlockDataInfo[weight_types.attn_output_weight].uiTypeSize; ++ size_t output_size = (size_t)dim * n_vocab / g_BlockDataInfo[weight_types.output_weight].uiblkSize * g_BlockDataInfo[weight_types.output_weight].uiTypeSize; ++ size_t output_norm_size = (size_t)dim / g_BlockDataInfo[weight_types.output_norm_weight].uiblkSize * g_BlockDataInfo[weight_types.output_norm_weight].uiTypeSize; ++ ++ g_pstWeight.rms_att_norm.Data.tensor2 = static_cast(numa_alloc_onnode(n_layers * sizeof(float *), 0)); ++ g_pstWeight.rms_ffn_norm.Data.tensor2 = static_cast(numa_alloc_onnode(n_layers * sizeof(float *), 0)); ++ g_pstWeight.qkv_bias.Data.tensor2 = static_cast(numa_alloc_onnode(n_layers * sizeof(float *), 0)); ++ ++ for (int i = 0; i < n_layers; i++) { ++ g_pstWeight.rms_att_norm.Data.tensor2[i] = numa_alloc_onnode(attention_norm_size_per_layer, 0); ++ g_pstWeight.rms_ffn_norm.Data.tensor2[i] = numa_alloc_onnode(ffn_norm_size_per_layer, 0); ++ g_pstWeight.qkv_bias.Data.tensor2[i] = numa_alloc_onnode(bias_qkv_size_per_layer, 0); ++ } ++ 
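++    /*
++     * NUMA placement note: the small per-layer tensors above (RMSNorm weights and
++     * the fused QKV bias) stay on node 0, together with the token embedding and
++     * the final norm, while the large projection matrices allocated below
++     * (Wqkv, wo, w1w3, ffn_down) and the LM head are split row-wise across
++     * g_numas nodes so each node multiplies against its locally resident slice.
++     */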
g_pstWeight.output_norm.Data.tensor1 = numa_alloc_onnode(output_norm_size, 0); ++ ++ g_pstWeight.Wqkv.Data.tensor3 = static_cast(numa_alloc_onnode(g_numas * sizeof(void **), 0)); ++ g_pstWeight.wo.Data.tensor3 = static_cast(numa_alloc_onnode(g_numas * sizeof(void **), 0)); ++ g_pstWeight.w1w3.Data.tensor3 = static_cast(numa_alloc_onnode(g_numas * sizeof(void **), 0)); ++ g_pstWeight.ffn_down.Data.tensor3 = static_cast(numa_alloc_onnode(g_numas * sizeof(void **), 0)); ++ g_pstWeight.output.Data.tensor2 = static_cast(numa_alloc_onnode(g_numas * sizeof(void *), 0)); ++ g_pstWeight.token_embedding.Data.tensor1 = numa_alloc_onnode(tokens_embedding_weight_size, 0); ++ ++ for (int i = 0; i < g_numas; i++) { ++ g_pstWeight.Wqkv.Data.tensor3[i] = static_cast(numa_alloc_onnode(n_layers * sizeof(void *), i)); ++ g_pstWeight.wo.Data.tensor3[i] = static_cast(numa_alloc_onnode(n_layers * sizeof(void *), i)); ++ g_pstWeight.w1w3.Data.tensor3[i] = static_cast(numa_alloc_onnode(n_layers * sizeof(void *), i)); ++ g_pstWeight.ffn_down.Data.tensor3[i] = static_cast(numa_alloc_onnode(n_layers * sizeof(void *), i)); ++ g_pstWeight.output.Data.tensor2[i] = (void *)numa_alloc_onnode(output_size / g_numas, i); ++ ++ for (int j = 0; j < n_layers; j++) { ++ g_pstWeight.Wqkv.Data.tensor3[i][j] = numa_alloc_onnode(attention_size_per_layer / g_numas, i); ++ g_pstWeight.wo.Data.tensor3[i][j] = numa_alloc_onnode(attention_output_size_per_layer / g_numas, i); ++ g_pstWeight.w1w3.Data.tensor3[i][j] = numa_alloc_onnode(w1w3_size_per_layer / g_numas, i); ++ g_pstWeight.ffn_down.Data.tensor3[i][j] = numa_alloc_onnode(ffn_down_size_per_layer / g_numas, i); ++ } ++ } ++ ++ std::cout << "load_weight start ..." << std::endl; ++ ++ /* 量化权重 */ ++ for (int layerNum = 0; layerNum < n_layers; layerNum++) { ++ quantization_weight_strategy(g_pstWeight.rms_att_norm.Data.tensor2[layerNum], rms_att_norm.index(torch::indexing::TensorIndex(layerNum)).data_ptr(), ++ weight_types.attn_norm_weight, dim); ++ quantization_weight_strategy(g_pstWeight.rms_ffn_norm.Data.tensor2[layerNum], rms_ffn_norm.index(torch::indexing::TensorIndex(layerNum)).data_ptr(), ++ weight_types.ffn_norm_weight, dim); ++ int qkv_dim = dim + 2 * kv_dim; ++ for (int j = 0; j < g_numas; ++j) { ++ f16 *qkv_pointer = (f16 *)wqkv.index(torch::indexing::TensorIndex(layerNum)).data_ptr() + qkv_dim / g_numas * dim * j; ++ quantization_weight_strategy(g_pstWeight.Wqkv.Data.tensor3[j][layerNum], (char *)qkv_pointer, weight_types.attn_k_weight, ++ qkv_dim / g_numas * dim); ++ ++ f16 *wo_pointer = (f16 *)wo.index(torch::indexing::TensorIndex(layerNum)).data_ptr() + dim / g_numas * dim * j; ++ quantization_weight_strategy(g_pstWeight.wo.Data.tensor3[j][layerNum], (char *)wo_pointer, ++ weight_types.attn_output_weight, dim * dim / g_numas); ++ ++ f16 *w1w3_pointer = (f16 *)w1w3.index(torch::indexing::TensorIndex(layerNum)).data_ptr() + 2 * hidden_dim / g_numas * dim * j; ++ quantization_weight_strategy(g_pstWeight.w1w3.Data.tensor3[j][layerNum], (char *)w1w3_pointer, ++ weight_types.ffn_up_weight, 2 * hidden_dim / g_numas * dim); ++ ++ f16 *ffn_down_pointer = (f16 *)ffn_down.index(torch::indexing::TensorIndex(layerNum)).data_ptr() + hidden_dim / g_numas * dim * j; ++ quantization_weight_strategy(g_pstWeight.ffn_down.Data.tensor3[j][layerNum], (char *)ffn_down_pointer, ++ weight_types.ffn_down_weight, dim * hidden_dim / g_numas); ++ } ++ ++ quantization_weight_strategy(g_pstWeight.qkv_bias.Data.tensor2[layerNum], qkv_bias.index(torch::indexing::TensorIndex(layerNum)).data_ptr(), ++ 
weight_types.attn_q_bias, qkv_dim);
++    }
++
++    for (int i = 0; i < g_numas; i++) {
++        f16 *output_pointer = (f16 *)lm_head.data_ptr() + n_vocab / g_numas * dim * i;
++        quantization_weight_strategy(g_pstWeight.output.Data.tensor2[i], output_pointer,
++                                     weight_types.output_weight, dim * n_vocab / g_numas);
++    }
++
++    quantization_weight_strategy(g_pstWeight.token_embedding.Data.tensor1, token_embedding.data_ptr(), weight_types.token_embd_weight, dim * n_vocab);
++    quantization_weight_strategy(g_pstWeight.output_norm.Data.tensor1, output_norm.data_ptr(), weight_types.output_norm_weight, dim);
++
++    /* normalization results */
++    g_stRunState.Token_Ori = (f32*)numa_alloc_onnode((size_t)dim * context_length * sizeof(f32), 0);
++    g_stRunState.Token_Norm = (f32*)numa_alloc_onnode((size_t)dim * context_length * sizeof(f32), 0);
++
++    /* intermediate result of the residual add */
++    g_stRunState.add_weight = (f32 *)numa_alloc_onnode((size_t)dim * context_length * sizeof(f32), 0);
++    memset(g_stRunState.add_weight, 0, (size_t)dim * context_length * sizeof(f32));
++
++    /* per-NUMA-node buffers holding the (re)quantized activations */
++    g_stRunState.temp_output_vec_numa = (void **)numa_alloc_onnode(g_numas * sizeof(void *), 0);
++    g_stRunState.tmp_vec_numa = (void **)numa_alloc_onnode(g_numas * sizeof(void *), 0);
++    for (int i = 0; i < g_numas; i++) {
++        g_stRunState.temp_output_vec_numa[i] = (void *)numa_alloc_onnode((size_t)dim * context_length * sizeof(f16), i);
++        g_stRunState.tmp_vec_numa[i] = (void *)numa_alloc_onnode((size_t)hidden_dim * context_length * sizeof(f16), i);
++    }
++
++    /* attention output (also reused for the QKV and gate/up projections, hence the max-sized allocation) */
++    g_stRunState.Attn_out = (f32*)numa_alloc_onnode(
++        (size_t)(dim + kv_dim + kv_dim >= 2 * hidden_dim ? dim + kv_dim + kv_dim : 2 * hidden_dim)
++        * context_length * sizeof(f32), 0);
++
++    g_stRunState.Attn_out_f16 = (f16*)numa_alloc_onnode(
++        (size_t)(dim + kv_dim + kv_dim >= 2 * hidden_dim ? dim + kv_dim + kv_dim : 2 * hidden_dim)
++        * context_length * sizeof(f16), 0);
++
++    /* FFN hidden-layer buffer */
++    g_stRunState.ffn_Gate = (f32 *)numa_alloc_onnode((size_t)hidden_dim * context_length * sizeof(f32), 0);
++
++    if (!g_stRunState.Token_Ori || !g_stRunState.Token_Norm || !g_stRunState.Attn_out || !g_stRunState.ffn_Gate ||
++        !g_stRunState.temp_output_vec_numa || !g_stRunState.tmp_vec_numa) {
++        fprintf(stderr, "Error: numa_alloc_onnode failed! (File: %s, Line: %d)\n", __FILE__, __LINE__);
++    }
++    std::cout << "load_weight end."
<< std::endl; ++} ++ ++#define DEBUG_TIME 1 ++static inline uint64_t get_time_ns(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_REALTIME, &ts); ++ return ts.tv_sec * 1000 * 1000 * 1000 + ts.tv_nsec; ++} ++ ++void get_next_token(void* output, MODEL_HYPE_PARA *pstModelHypePara, WEIGHT *pstLlama, MODEL_RUN_STATE *pstRunState, ++ bool is_prompt, ++ torch::Tensor& block_tables, ++ torch::Tensor& seq_lens, ++ torch::Tensor& slot_mapping, ++ void *hidden_state, int64_t *pos, ++ std::vector& kv_caches, ++ int64_t block_size, ++ int n_tokens) ++{ ++ int dim = pstModelHypePara->dim; ++ int n_kv_heads = pstModelHypePara->n_kv_heads; ++ int n_head = pstModelHypePara->n_head; ++ int kv_dim = (dim * n_kv_heads) / n_head; ++ int hidden_dim = pstModelHypePara->hidden_dim; ++ int layers = pstModelHypePara->n_layers; ++ int n_vocab = pstModelHypePara->n_vocab; ++ float eps = pstModelHypePara->norm_rms_eps; ++ UINT32 srcBlockNum, srcBlocksize; ++ UINT32 dstBlockNum, dstBlocksize; ++ int srcType; ++ enum ggml_type dstType; ++ ++ int qkv_dim = dim + kv_dim + kv_dim; ++ int head_size = dim / n_head; ++ int kv_head_dim = block_size * head_size; ++ f16 *VT = (f16 *)numa_alloc_onnode(n_tokens * dim * sizeof(f16), 0); ++ ++ quantization_weight_strategy(pstRunState->Token_Norm, (char *)hidden_state, weight_types.token_embd_weight, n_tokens * dim); ++ ++#ifdef DEBUG_TIME ++ uint64_t t0 = get_time_ns(); ++ uint64_t time[25] = {0}; ++ uint64_t tt1 = get_time_ns(); ++#endif ++ ++ for(int L = 0; L < layers; L++) { ++#pragma omp parallel ++{ ++ WorkDivider work; ++ init_work_divider(&work, g_numas); ++ SingleNumaWorkRange srange; ++ MultiNumaWorkRange mrange; ++ ++ divide_work_first_numa(&work, n_tokens, &srange); ++ if (work.my_numa == 0) { ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ for (int j= 0; j < dim; j++) { ++ ((f32 *)pstRunState->add_weight)[i * dim + j] += ((f32 *)pstRunState->Token_Norm)[i * dim + j]; ++ } ++ RmsNorm((f32 *)pstRunState->Token_Ori + i * dim, (f32 *)pstRunState->add_weight + i * dim, ++ (f32 *)pstLlama->rms_att_norm.Data.tensor2[L], eps, dim); ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[0] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ srcType = weight_types.attn_k_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, n_tokens * dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->Token_Ori + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i = 1; i < g_numas; i++) { ++ memcpy((char *)pstRunState->tmp_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++ ++#ifdef DEBUG_TIME ++if (work.tid == 0) { ++ time[1] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++} ++#endif ++ ++#pragma omp barrier ++ /* 计算qkv */ ++ divide_work_all_numas(&work, qkv_dim, &mrange); ++ for (int i = mrange.begin_thread; i < mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 
2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < n_tokens; k += 2) { ++ int nrc_k = ((n_tokens - k) >= 2) ? 2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, qkv_dim, ++ (char *)g_pstWeight.Wqkv.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, qkv_dim, ++ (char *)g_pstWeight.Wqkv.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i+1, qkv_dim, ++ (char *)g_pstWeight.Wqkv.Data.tensor3[work.my_numa][L] + (i + 1) * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < n_tokens; k += 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * qkv_dim + i, qkv_dim, ++ (char *)g_pstWeight.Wqkv.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[2] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ if (work.tid == 1) { ++ for (int k = 0; k < n_tokens; k++) { ++ for (int i = 0; i < qkv_dim; i++) { ++ pstRunState->Attn_out[k * qkv_dim + i] += ((f32 *)g_pstWeight.qkv_bias.Data.tensor2[L])[i]; ++ } ++ } ++ g_BlockDataInfo[GGML_TYPE_F16].quantize(pstRunState->Attn_out, pstRunState->Attn_out_f16, n_tokens * qkv_dim); ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[3] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++// 中间数据f32 -> f16, 减少attention修改 ++#pragma omp barrier ++ f16 *q_ptr = pstRunState->Attn_out_f16, *k_ptr = pstRunState->Attn_out_f16 + dim, *v_ptr = pstRunState->Attn_out_f16 + dim + kv_dim; ++ f16 *kcache_ptr = (f16 *)kv_caches[L][0].data_ptr(), *vcache_ptr = (f16 *)kv_caches[L][1].data_ptr(); ++ int64_t *slot_mapping_ptr = (int64_t *)slot_mapping.data_ptr(); ++ int kv_cache_block_elem_num = n_kv_heads * head_size * block_size; ++ ++ divide_all_work(&work, n_tokens * n_kv_heads, &srange); ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ int t = i / n_kv_heads; ++ int h = i % n_kv_heads; ++ const int64_t slot = slot_mapping_ptr[t]; ++ if (slot < 0) { ++ continue; ++ } ++ int64_t block_idx = slot / block_size, block_offset = slot % block_size; ++ f16 *k_head_ptr = k_ptr + t * qkv_dim + h * head_size; ++ f16 
*kcache_head_ptr = kcache_ptr + kv_cache_block_elem_num * block_idx + h * block_size * head_size; ++ const f16 *v_head_ptr = v_ptr + t * qkv_dim + h * head_size; ++ f16 *vcache_head_ptr = vcache_ptr + kv_cache_block_elem_num * block_idx + h * block_size * head_size; ++ Rope_embedding_impl(g_pstModelHypePara.is_neox_style, g_pstModelHypePara.n_rotary, k_head_ptr, g_pstModelHypePara.cos_sin_cache, ++ pos[t]); ++ for (int idx = 0; idx < head_size; idx += 8) { //8 = 16 / sizeof(f16) ++ for (int vidx = idx; vidx < idx + 8; vidx++) { ++ vcache_head_ptr[vidx * block_size + block_offset] = v_head_ptr[vidx]; ++ } ++ std::copy_n(k_head_ptr + idx, 8, kcache_head_ptr + idx * block_size + block_offset * 8); ++ } ++ } ++ divide_all_work(&work, n_tokens * n_head, &srange); ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ int t = i / n_head; ++ int h = i % n_head; ++ Rope_embedding_impl(g_pstModelHypePara.is_neox_style, g_pstModelHypePara.n_rotary, q_ptr + t *qkv_dim + h * head_size, ++ g_pstModelHypePara.cos_sin_cache, pos[t]); ++ for (int j = 0; j < head_size; j++) { ++ q_ptr[t * qkv_dim + h * head_size + j] *= g_pstModelHypePara.attn_scale; ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[4] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++} ++ //divide_kv_cache_numa(&work, n_head, &srange); ++ if (is_prompt == true) { ++ f16 *v_ptr = (f16 *)pstRunState->Attn_out_f16 + dim + kv_dim; ++ transpose_v(VT, v_ptr, n_tokens, kv_dim, qkv_dim); ++ prefill_attention(pstRunState->seq_qkv, pstRunState->Attn_out_f16, VT, n_tokens, seq_lens.size(0), ++ (int *)seq_lens.data_ptr()); ++ } else { ++ int kv_block_row = kv_caches[L][0].stride(0); ++ paged_attention_v1_impl(pstRunState->seq_qkv, pstRunState->Attn_out_f16, (f16 *)kv_caches[L][0].data_ptr(), ++ (f16 *)kv_caches[L][1].data_ptr(), n_kv_heads, (int *)block_tables.data_ptr(), ++ (int *)seq_lens.data_ptr(), block_tables.size(1), qkv_dim, kv_block_row, ++ kv_head_dim, n_tokens, n_head, head_size); ++ } ++ ++#ifdef DEBUG_TIME ++ time[5] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++#endif ++ ++// 数据转换f16 —> f32 ++#pragma omp parallel ++{ ++ WorkDivider work; ++ init_work_divider(&work, g_numas); ++ SingleNumaWorkRange srange; ++ MultiNumaWorkRange mrange; ++ ++ if (work.tid == 1) { ++ g_BlockDataInfo[GGML_TYPE_F16].dequantize(pstRunState->seq_qkv, pstRunState->Attn_out, n_tokens * dim); ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[6] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ ++ srcType = weight_types.attn_output_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, n_tokens * dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->Attn_out + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i= 1; i < g_numas; i++) { ++ memcpy((char *)pstRunState->tmp_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++#pragma omp barrier ++ divide_work_all_numas(&work, dim, &mrange); ++ for (int i = mrange.begin_thread; i < 
mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < n_tokens; k += 2) { ++ int nrc_k = ((n_tokens - k) >= 2) ? 2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.wo.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.wo.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * dim + i + 1, dim, ++ (char *)g_pstWeight.wo.Data.tensor3[work.my_numa][L] + (i + 1) * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < n_tokens; k += 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.wo.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[7] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ divide_work_first_numa(&work, n_tokens, &srange); ++ if (work.my_numa == 0) { ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ for (int j= 0; j < dim; j++) { ++ ((f32 *)pstRunState->add_weight)[i * dim + j] += ((f32 *)pstRunState->Attn_out)[i * dim + j]; ++ } ++ RmsNorm((f32 *)pstRunState->Attn_out + i * dim, (f32 *)pstRunState->add_weight + i * dim, ++ (f32 *)pstLlama->rms_ffn_norm.Data.tensor2[L], eps, dim); ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[8] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ srcType = weight_types.ffn_up_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, n_tokens * dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->Attn_out + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i = 1; i < g_numas; i++) { ++ memcpy((char 
*)pstRunState->tmp_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++#pragma omp barrier ++ /* w1/w3 数据类型一样 */ ++ int total_hidden_dim = hidden_dim * 2; ++ divide_work_all_numas(&work, total_hidden_dim, &mrange); ++ for (int i = mrange.begin_thread; i < mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < n_tokens; k += 2) { ++ int nrc_k = ((n_tokens - k) >= 2) ? 2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, 1, 2); ++ /* 单线程的结果 */ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, total_hidden_dim, ++ (char *)g_pstWeight.w1w3.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, 1, 2); ++ /* 单线程的结果 */ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, total_hidden_dim, ++ (char *)g_pstWeight.w1w3.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i + 1, total_hidden_dim, ++ (char *)g_pstWeight.w1w3.Data.tensor3[work.my_numa][L] + (i + 1) * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < n_tokens; k += 1) { ++ __builtin_prefetch(pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, 1, 2); ++ /* 单线程的结果 */ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->Attn_out + mrange.begin_numa + k * total_hidden_dim + i, total_hidden_dim, ++ (char *)g_pstWeight.w1w3.Data.tensor3[work.my_numa][L] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[9] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ /* silu激活函数 */ ++ divide_work_first_numa(&work, n_tokens * hidden_dim, &srange); ++ if (work.my_numa == 0) { ++ for (int item = srange.begin_thread; item < srange.end_thread; item++) { ++ int i = item / hidden_dim, j = item % hidden_dim; ++ f32 *w1 = pstRunState->Attn_out + i * hidden_dim * 2, *w3 = pstRunState->Attn_out + i * hidden_dim * 2 + hidden_dim; ++ f32 *result = pstRunState->ffn_Gate + i * hidden_dim; ++ f16 neg_w1 = -w1[j]; ++ f32 silu_f32 = w1[j] / (1.0 + expf_f16_table[*(uint16_t *)&neg_w1]); ++ result[j] = silu_f32 * w3[j]; ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[10] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp 
barrier ++ /* w2 */ ++ srcType = weight_types.ffn_down_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, n_tokens * hidden_dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->ffn_Gate + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i = 1; i < g_numas; i++) { ++ memcpy((char *)pstRunState->tmp_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->tmp_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++#pragma omp barrier ++ divide_work_all_numas(&work, dim, &mrange); ++ for (int i = mrange.begin_thread; i < mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < n_tokens; k += 2) { ++ int nrc_k = ((n_tokens - k) >= 2) ? 2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch((f16 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(hidden_dim, ++ (f32 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.ffn_down.Data.tensor3[work.my_numa][L] + i * hidden_dim / srcBlockNum * srcBlocksize, hidden_dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * hidden_dim / dstBlockNum * dstBlocksize, hidden_dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch((f16 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(hidden_dim, ++ (f32 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.ffn_down.Data.tensor3[work.my_numa][L] + i * hidden_dim / srcBlockNum * srcBlocksize, hidden_dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * hidden_dim / dstBlockNum * dstBlocksize, hidden_dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(hidden_dim, ++ (f32 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i + 1, dim, ++ (char *)g_pstWeight.ffn_down.Data.tensor3[work.my_numa][L] + (i + 1) * hidden_dim / srcBlockNum * srcBlocksize, hidden_dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * hidden_dim / dstBlockNum * dstBlocksize, hidden_dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < n_tokens; k += 1) { ++ __builtin_prefetch((f16 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(hidden_dim, ++ (f32 *)pstRunState->Token_Norm + mrange.begin_numa + k * dim + i, dim, ++ (char *)g_pstWeight.ffn_down.Data.tensor3[work.my_numa][L] + i * hidden_dim / srcBlockNum * srcBlocksize, hidden_dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->tmp_vec_numa[work.my_numa] + k * hidden_dim / dstBlockNum * dstBlocksize, hidden_dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[11] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++} //end omp ++ } ++ ++ std::vector last_token_indices; ++ if (is_prompt == true) { ++ 
for (int i = 0, sum_seq_lens = 0; i < seq_lens.size(0); i++) { ++ sum_seq_lens += ((int *)seq_lens.data_ptr())[i]; ++ last_token_indices.push_back(sum_seq_lens - 1); ++ } ++ } else { ++ for (int i = 0; i < n_tokens; i++) { ++ last_token_indices.push_back(i); ++ } ++ } ++ ++#pragma omp parallel ++{ ++ WorkDivider work; ++ init_work_divider(&work, g_numas); ++ SingleNumaWorkRange srange; ++ MultiNumaWorkRange mrange; ++ ++ divide_work_first_numa(&work, last_token_indices.size(), &srange); ++ if (work.my_numa == 0) { ++ for (int i = srange.begin_thread; i < srange.end_thread; i++) { ++ int last_token = last_token_indices[i]; ++ for (int j = 0; j < dim; j++) { ++ ((f32 *)pstRunState->Token_Norm)[last_token * dim + j] += ((f32 *)pstRunState->add_weight)[last_token * dim + j]; ++ } ++ RmsNorm((f32 *)pstRunState->Token_Ori + i * dim, (f32 *)pstRunState->Token_Norm + last_token * dim, ++ (f32 *)pstLlama->output_norm.Data.tensor1, eps, dim); ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[12] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++#pragma omp barrier ++ /* 外层linear */ ++ srcType = weight_types.output_weight; ++ dstType = g_BlockDataInfo[srcType].VecDotType; ++ srcBlockNum = g_BlockDataInfo[srcType].uiblkSize; ++ srcBlocksize = g_BlockDataInfo[srcType].uiTypeSize; ++ dstBlockNum = g_BlockDataInfo[dstType].uiblkSize; ++ dstBlocksize = g_BlockDataInfo[dstType].uiTypeSize; ++ ++ divide_work_first_numa(&work, last_token_indices.size() * dim / dstBlockNum, &srange); ++ Quantize((char *)pstRunState->temp_output_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ (f32 *)pstRunState->Token_Ori + srange.begin_thread * dstBlockNum, ++ dstType, srange.work_per_thread * dstBlockNum); ++ ++ if (work.my_numa == 0) { ++ for (int i = 1; i < g_numas; i++) { ++ memcpy((char *)pstRunState->temp_output_vec_numa[i] + srange.begin_thread * dstBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[0] + srange.begin_thread * dstBlocksize, ++ srange.work_per_thread * dstBlocksize); ++ } ++ } ++#pragma omp barrier ++ divide_work_all_numas(&work, n_vocab, &mrange); ++ for (int i = mrange.begin_thread; i < mrange.end_thread; i += 2) { ++ int nrc_i = ((mrange.end_thread - i) >= 2) ? 2 : 1; ++ if (nrc_i == 2) { ++ for (int k = 0; k < last_token_indices.size(); k += 2) { ++ int nrc_k = ((last_token_indices.size() - k) >= 2) ? 
2 : 1; ++ if (nrc_k == 2) { ++ __builtin_prefetch((f16 *)output + mrange.begin_numa + k * n_vocab + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->output_f32 + mrange.begin_numa + k * n_vocab + i, n_vocab, ++ (char *)g_pstWeight.output.Data.tensor2[work.my_numa] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ else if (nrc_k == 1) { ++ __builtin_prefetch((f16 *)output + mrange.begin_numa + k * n_vocab + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->output_f32 + mrange.begin_numa + k * n_vocab + i, n_vocab, ++ (char *)g_pstWeight.output.Data.tensor2[work.my_numa] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->output_f32 + mrange.begin_numa + k * n_vocab + i + 1, n_vocab, ++ (char *)g_pstWeight.output.Data.tensor2[work.my_numa] + (i + 1) * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, nrc_k); ++ } ++ ++ } ++ } ++ else if (nrc_i == 1) { ++ for (int k = 0; k < last_token_indices.size(); k += 1) { ++ __builtin_prefetch((f16 *)output + mrange.begin_numa + k * n_vocab + i, 1, 2); ++ g_BlockDataInfo[srcType].VecDotFunc(dim, ++ (f32 *)pstRunState->output_f32 + mrange.begin_numa + k * n_vocab + i, n_vocab, ++ (char *)g_pstWeight.output.Data.tensor2[work.my_numa] + i * dim / srcBlockNum * srcBlocksize, dim / srcBlockNum * srcBlocksize, ++ (char *)pstRunState->temp_output_vec_numa[work.my_numa] + k * dim / dstBlockNum * dstBlocksize, dim / dstBlockNum * dstBlocksize, 1); ++ } ++ } ++ } ++ ++#ifdef DEBUG_TIME ++ if (work.tid == 0) { ++ time[13] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++ } ++#endif ++ ++} ++ g_BlockDataInfo[GGML_TYPE_F16].quantize(pstRunState->output_f32, output, last_token_indices.size() * n_vocab); ++ ++#ifdef DEBUG_TIME ++ time[14] += get_time_ns() - tt1; ++ tt1 = get_time_ns(); ++#endif ++ ++#ifdef DEBUG_TIME ++ uint64_t t1 = get_time_ns(); ++ if (is_prompt == true) { ++ fprintf(stderr, " bs=%d prefill=%.3f ms, %.3f token/s\n\n", n_tokens, (t1 - t0) / 1000000.0, 1.0 * n_tokens / ((t1 - t0) / 1000000000.0)); ++ } else { ++ fprintf(stderr, " bs=%d decode=%.3f ms, %.3f token/s\n\n", n_tokens, (t1 - t0) / 1000000.0, 1.0 * n_tokens / ((t1 - t0) / 1000000000.0)); ++ } ++ ++ fprintf(stderr, "[0] first rms_norm ——> %8.3lf ms\n", time[0] / 1000000.0); ++ fprintf(stderr, "[1] qkv quantize and memcpy ——> %8.3lf ms\n", time[1] / 1000000.0); ++ fprintf(stderr, "[2] qkv matmul ——> %8.3lf ms\n", time[2] / 1000000.0); ++ fprintf(stderr, "[3] qkv add and quantize f16 ——> %8.3lf ms\n", time[3] / 1000000.0); ++ fprintf(stderr, "[4] rope operator ——> %8.3lf ms\n", time[4] / 1000000.0); ++ fprintf(stderr, "[5] page attention operator ——> %8.3lf ms\n", time[5] / 1000000.0); ++ fprintf(stderr, "[6] dequantize f32 ——> %8.3lf ms\n", time[6] / 1000000.0); ++ fprintf(stderr, "[7] (wo)quantize-memcpy-matmul ——> %8.3lf ms\n", time[7] / 1000000.0); ++ fprintf(stderr, "[8] ffn add and rmsnorm ——> %8.3lf ms\n", time[8] / 1000000.0); ++ fprintf(stderr, "[9] (w1w3)quantize-memcpy-matmul ——> 
%8.3lf ms\n", time[9] / 1000000.0); ++ fprintf(stderr, "[10] silu activation function ——> %8.3lf ms\n", time[10] / 1000000.0); ++ fprintf(stderr, "[11] (w2)quantize-memcpy-matmul ——> %8.3lf ms\n", time[11] / 1000000.0); ++ fprintf(stderr, "[12] output_norm add and rmsnorm ——> %8.3lf ms\n", time[12] / 1000000.0); ++ fprintf(stderr, "[13] (output)quantize-memcpy-matmul ——> %8.3lf ms\n", time[13] / 1000000.0); ++ fprintf(stderr, "[14] output quantize f16 ——> %8.3lf ms\n\n", time[14] / 1000000.0); ++#endif ++ ++ numa_free(VT, n_tokens * dim); ++ memset(pstRunState->add_weight, 0, (size_t)dim * g_pstModelHypePara.context_length * sizeof(f32)); ++} ++ ++void get_next_token_for_torch( ++ torch::Tensor model_output, // WEIGHT.token_embedding ++ torch::Tensor hidden_stats, ++ ++ bool is_prompt, ++ torch::Tensor block_tables, ++ torch::Tensor seq_lens, ++ torch::Tensor& slot_mapping, ++ torch::Tensor positions, ++ std::vector kv_caches, ++ int64_t block_size, ++ int64_t N_tokens) ++{ ++ void* hd = static_cast(hidden_stats.data_ptr()); ++ int64_t *pos = static_cast(positions.data_ptr()); ++ void* output = static_cast(model_output.data_ptr()); ++ ++ int seq_num = seq_lens.size(0); ++ ++ static int flag = 0; ++ if (flag == 0) { ++ g_stRunState.seq_qkv = (f16*)numa_alloc_onnode((size_t)seq_num * g_pstModelHypePara.context_length * g_pstModelHypePara.dim * sizeof(f16), 0); ++ ++ int output_tmp_size = (is_prompt == true ? seq_num : N_tokens) * g_pstModelHypePara.n_vocab; ++ g_stRunState.output_f32 = (f32*)numa_alloc_onnode(output_tmp_size * sizeof(f32), 0); ++ ++ flag = 1; ++ } ++ ++ get_next_token(output, ++ &g_pstModelHypePara, &g_pstWeight, &g_stRunState, ++ is_prompt, ++ block_tables, ++ seq_lens, ++ slot_mapping, ++ hd, ++ pos, ++ kv_caches, ++ block_size, ++ N_tokens ++ ); ++} ++ +diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp +index 5d1c5f4c8..eaeed6bc8 100644 +--- a/csrc/cpu/torch_bindings.cpp ++++ b/csrc/cpu/torch_bindings.cpp +@@ -97,6 +97,22 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { + " Tensor cos_sin_cache, bool is_neox) -> ()"); + ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); + ++ ops.def( ++ "load_weight_and_malloc_active_tensor(" ++ "int dim, int hidden_dim, int n_layers, int n_vocab, int n_heads, int n_kv_heads, int context_length," ++ "float norm_rms_eps, float rope_freq_base, float attn_scale, int is_neox_style, int quantization_bit_code, Tensor cos_sin_cache," ++ "Tensor token_embedding, Tensor rms_att_norm, Tensor rms_ffn_norm," ++ "Tensor wqkv, Tensor wo, Tensor bqkv_bias, Tensor w1w3," ++ "Tensor ffn_down, Tensor output_norm, Tensor lm_head) -> ()"); ++ ops.impl("load_weight_and_malloc_active_tensor", torch::kCPU, &load_weight_and_malloc_active_tensor); ++ ++ ops.def( ++ "get_next_token_for_torch(" ++ "Tensor model_output, Tensor hidden_stats, bool is_prompt, " ++ "Tensor block_tables, Tensor seq_lens, Tensor slot_mapping, Tensor positions, Tensor[]! kv_caches, int block_size," ++ "int N_tokens) -> ()"); ++ ops.impl("get_next_token_for_torch", torch::kCPU, &get_next_token_for_torch); ++ + // Quantization + #ifdef __AVX512F__ + // Compute int8 quantized tensor for given scaling factor. 
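+// Note on the two registrations above: the schema strings mirror the C++
+// prototypes declared in csrc/ops.h below. kv_caches is declared as a mutable
+// tensor list ("Tensor[]!") because get_next_token() writes the freshly
+// computed key/value blocks into the paged KV cache in place.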
+diff --git a/csrc/ops.h b/csrc/ops.h +index 52ccf3b51..0f81f8ae7 100644 +--- a/csrc/ops.h ++++ b/csrc/ops.h +@@ -251,3 +251,45 @@ void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, + const std::vector>& offsets); + #endif ++ ++void load_weight_and_malloc_active_tensor( ++ int64_t dim, // MODEL_HYPE_PARA.dim embedding 维度 ++ int64_t hidden_dim, // MODEL_HYPE_PARA.hidden_dim ffn 隐藏层维度 ++ int64_t n_layers, // MODEL_HYPE_PARA.n_layers 模型层数 ++ int64_t n_vocab, // MODEL_HYPE_PARA.n_vocab 词汇数量 ++ int64_t n_heads, // MODEL_HYPE_PARA.n_heads 注意力头个数 ++ int64_t n_kv_heads, // MODEL_HYPE_PARA.n_kv_heads kv的对数 ++ int64_t context_length, // MODEL_HYPE_PARA.context_length 上下文长度 ++ double norm_rms_eps, // MODEL_HYPE_PARA.norm_rms_eps eps ++ double rope_freq_base, // MODEL_HYPE_PARA.rope_freq_base rope频率 ++ double attn_scale, ++ int64_t is_neox_style, ++ int64_t quantization_bit_code, ++ ++ torch::Tensor const& cos_sin_cache, ++ torch::Tensor token_embedding, // WEIGHT.token_embedding ++ torch::Tensor rms_att_norm, // WEIGHT.rms_att_norm ++ torch::Tensor rms_ffn_norm, // WEIGHT.rms_ffn_norm ++ torch::Tensor wqkv, ++ torch::Tensor wo, // WEIGHT.wo ++ torch::Tensor qkv_bias, ++ torch::Tensor w1w3, ++ torch::Tensor ffn_down, // WEIGHT.ffn_down ++ torch::Tensor output_norm, // WEIGHT.output_norm ++ torch::Tensor lm_head // WEIGHT.output ++); ++ ++void get_next_token_for_torch( ++ torch::Tensor model_output, // WEIGHT.token_embedding ++ torch::Tensor hidden_stats, ++ ++ bool is_prompt, ++ torch::Tensor block_tables, ++ torch::Tensor seq_lens, ++ torch::Tensor& slot_mapping, ++ torch::Tensor positions, ++ std::vector kv_caches, ++ int64_t block_size, ++ int64_t N_tokens ++); ++ +diff --git a/examples/offline_inference/basic/basic.py b/examples/offline_inference/basic/basic.py +index a6e96c0bb..56831ed57 100644 +--- a/examples/offline_inference/basic/basic.py ++++ b/examples/offline_inference/basic/basic.py +@@ -13,7 +13,7 @@ prompts = [ + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Create an LLM. +-llm = LLM(model="facebook/opt-125m") ++llm = LLM(model="/home/s30058176/DeepSeek-R1-Distill-Qwen-7B", max_model_len=8192) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) +@@ -21,4 +21,4 @@ outputs = llm.generate(prompts, sampling_params) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text +- print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +\ No newline at end of file ++ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +diff --git a/vllm/config.py b/vllm/config.py +index 56315aacb..765fe3748 100644 +--- a/vllm/config.py ++++ b/vllm/config.py +@@ -2393,7 +2393,8 @@ def _get_and_verify_dtype( + # models. 
+                     torch_dtype = torch.float16
+             else:
+-                torch_dtype = config_dtype
++                #torch_dtype = config_dtype
++                torch_dtype = torch.float16
+ 
+     from vllm.platforms import current_platform
+     if (current_platform.is_cpu()
+diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
+index e3de6b64f..d76f31fda 100644
+--- a/vllm/model_executor/models/qwen2.py
++++ b/vllm/model_executor/models/qwen2.py
+@@ -60,6 +60,24 @@ from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
+ 
+ logger = init_logger(__name__)
+ 
++inference_fused = False
++import os
++if os.getenv("INFERENCE_OP_MODE") == "fused":
++    inference_fused = True
++    print("run in INFERENCE FUSED MODE")
++
++# quantization settings
++quantization_bit_mode = os.getenv("SYSHAX_QUANTIZE")
++quantization_bit_code = 1  # default is f16
++if quantization_bit_mode:
++    if quantization_bit_mode == "q8_0":
++        quantization_bit_code = 8
++        print("Use q8_0 quantization!")
++    elif quantization_bit_mode == "q4_0":
++        quantization_bit_code = 2
++        print("Use q4_0 quantization!")
++    else:
++        print("Unsupported quantization type!")
+ 
+ class Qwen2MLP(nn.Module):
+ 
+@@ -442,6 +460,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+ 
+     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+         super().__init__()
++        self.apply_memory = True
+         config = vllm_config.model_config.hf_config
+         quant_config = vllm_config.quant_config
+         lora_config = vllm_config.lora_config
+@@ -483,6 +502,90 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
+         intermediate_tensors: Optional[IntermediateTensors] = None,
+         inputs_embeds: Optional[torch.Tensor] = None,
+     ) -> Union[torch.Tensor, IntermediateTensors]:
++        if inference_fused:
++            self.fused_forward = True
++
++            model = self.model
++            qkv_bias, wqkv, wo, w1w3, w2, preattn, postattn = [], [], [], [], [], [], []
++            for i in range(model.start_layer, model.end_layer):
++                layer = model.layers[i]
++                if layer.self_attn.qkv_proj.bias is not None:
++                    qkv_bias.append(layer.self_attn.qkv_proj.bias)
++                wqkv.append(layer.self_attn.qkv_proj.weight)
++                wo.append(layer.self_attn.o_proj.weight)
++                w1w3.append(layer.mlp.gate_up_proj.weight)
++                w2.append(layer.mlp.down_proj.weight)
++                preattn.append(layer.input_layernorm.weight)
++                postattn.append(layer.post_attention_layernorm.weight)
++            assert not qkv_bias or len(qkv_bias) == len(wqkv)
++            first_attn = model.layers[0].self_attn
++            preattn = torch.stack(preattn)
++            postattn = torch.stack(postattn)
++            wqkv = torch.stack(wqkv)
++            w1w3 = torch.stack(w1w3)
++            qkv_bias = torch.stack(qkv_bias)
++            wo = torch.stack(wo)
++            w2 = torch.stack(w2)
++
++            num_group, _ = divmod(self.model.config.num_attention_heads, self.model.config.num_key_value_heads)
++            assert _ == 0
++            kv_size, _ = divmod(self.model.embed_tokens.embedding_dim, num_group)
++            assert _ == 0
++
++            if self.apply_memory:
++                self.apply_memory = False
++                torch.ops._C.load_weight_and_malloc_active_tensor(
++                    self.model.embed_tokens.embedding_dim,   # MODEL_HYPE_PARA.dim embedding dimension
++                    self.model.config.intermediate_size,     # MODEL_HYPE_PARA.hidden_dim FFN hidden dimension
++                    self.model.config.num_hidden_layers,     # MODEL_HYPE_PARA.n_layers number of layers
++                    self.model.embed_tokens.org_vocab_size,  # MODEL_HYPE_PARA.n_vocab vocabulary size
++                    self.model.config.num_attention_heads,   # MODEL_HYPE_PARA.n_heads number of attention heads
++                    self.model.config.num_key_value_heads,   # MODEL_HYPE_PARA.n_kv_heads number of KV heads
++                    self.model.config.sliding_window if self.model.config.use_sliding_window else self.model.config.max_position_embeddings,
++                    
self.model.config.rms_norm_eps, # MODEL_HYPE_PARA.norm_rms_eps eps ++ self.model.config.rope_theta, # MODEL_HYPE_PARA.rope_freq_base rope频率 ++ first_attn.attn.impl.scale, ++ first_attn.rotary_emb.is_neox_style, ++ quantization_bit_code, ++ ++ first_attn.rotary_emb.cos_sin_cache, ++ self.model.embed_tokens.weight, # WEIGHT.token_embedding ++ preattn, # WEIGHT.rms_att_norm ++ postattn, # WEIGHT.rms_ffn_norm ++ wqkv, ++ wo, # WEIGHT.wo ++ qkv_bias, ++ w1w3, ++ w2, # WEIGHT.ffn_down ++ self.model.norm.weight, # WEIGHT.output_norm ++ self.lm_head.weight # WEIGHT.output ++ ) ++ block_size = 16 ++ N_tokens = len(input_ids) ++ hidden_states = model.get_input_embeddings(input_ids) ++ model_output = torch.zeros( ++ (len(attn_metadata.seq_lens) if attn_metadata.prefill_metadata else N_tokens, self.config.vocab_size), ++ dtype=hidden_states.dtype, ++ device=hidden_states.device ++ ) ++ ++ torch.ops._C.get_next_token_for_torch( ++ model_output, ++ hidden_states, ++ ++ attn_metadata.prefill_metadata is not None, ++ attn_metadata.block_tables, ++ attn_metadata.seq_lens_tensor, ++ attn_metadata.slot_mapping.flatten(), ++ positions, ++ kv_caches, ++ block_size, ++ N_tokens ++ ) ++ return model_output ++ else: ++ self.fused_forward = False ++ + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) +@@ -493,6 +596,8 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: ++ if inference_fused and self.fused_forward: ++ return hidden_states + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits diff --git a/vllm.spec b/vllm.spec index b0fec1db5d52f9da51d85115336f14980019ca2b..9b904f4bad0c33985b0099ef2aecc7bec303af71 100644 --- a/vllm.spec +++ b/vllm.spec @@ -3,12 +3,14 @@ Name: vllm Version: 0.7.3 -Release: 1 +Release: 2 Summary: Powerful engine for LLMs License: (Apache-2.0 AND BSD-3-Clause) OR BSD-3-CLause URL: https://github.com/vllm-project/vllm Source0: https://gitee.com/src-openeuler/vllm/raw/master/vllm-%{version}.tar.gz +Patch0001: cpu-fast-inference.patch + BuildArch: noarch %description @@ -18,7 +20,7 @@ BuildArch: noarch Summary: %{summary} Buildrequires: cmake python3-pip python3-devel python3-setuptools python3-pytest Buildrequires: python3-setuptools_scm python3-wheel python3-pytest-asyncio python3-grpcio -Buildrequires: python3-pytorch +Buildrequires: python3-pytorch gcc gcc-c++ numactl-devel kmod %{?python_provide:%python_provide python3-%{_name}} %description -n python3-%{_name} @@ -31,12 +33,12 @@ Buildrequires: python3-pytorch %build export SETUPTOOLS_SCM_PRETEND_VERSION=%{version} -export VLLM_TARGET_DEVICE=empty +export VLLM_TARGET_DEVICE=cpu %py3_build %install export SETUPTOOLS_SCM_PRETEND_VERSION=%{version} -export VLLM_TARGET_DEVICE=empty +export VLLM_TARGET_DEVICE=cpu %py3_install VERSION_FILE=$(find %{buildroot} -name '_version.py') @@ -69,6 +71,9 @@ mv %{buildroot}/filelist.lst . %files -n python3-%{_name} -f filelist.lst %changelog +* Thu May 15 2025 qmzznbxhl - 0.7.3-2 +- Add cpu fast-inference + * Tue Apr 8 2025 renwenjie - 0.7.3-1 - Change the baseline version to 0.7.3
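
Usage sketch (an illustrative assumption, not part of the patch itself): the fused CPU path
added above is gated by environment variables read in vllm/model_executor/models/qwen2.py,
so a run might look like:

    export INFERENCE_OP_MODE=fused
    export SYSHAX_QUANTIZE=q8_0   # or q4_0; leave it unset to keep f16 weights
    python examples/offline_inference/basic/basic.py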