diff --git a/AArch64-enabled-instruction-set.patch b/AArch64-enabled-instruction-set.patch new file mode 100644 index 0000000000000000000000000000000000000000..369e26db23974b7840af1840a6fb8cd58b969fa5 --- /dev/null +++ b/AArch64-enabled-instruction-set.patch @@ -0,0 +1,156 @@ +From df2a8abcd79f90d13e8d1893099cc8a6039576c4 Mon Sep 17 00:00:00 2001 +From: bitianyuan +Date: Tue, 21 Oct 2025 17:06:05 +0800 +Subject: [PATCH] AArch64 enabled instruction set + +--- + .gitignore | 2 ++ + CMakeLists.txt | 21 +++++++++++++ + ggml/src/CMakeLists.txt | 15 +++++++++ + ggml/src/ggml-cpu/CMakeLists.txt | 54 ++++++++++++++++++++++++++++++++ + 4 files changed, 92 insertions(+) + +diff --git a/.gitignore b/.gitignore +index c7d00097..9df14e2c 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -152,3 +152,5 @@ poetry.toml + # IDE + *.code-workspace + .windsurf/ ++ ++cmake-build-debug +\ No newline at end of file +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 4bf8b278..f758fbeb 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -159,6 +159,27 @@ if (NOT MSVC) + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() ++ ++ if (LLAMA_PGO_GEN) ++ message(STATUS "Using -fprofile-generate") ++ ++ add_compile_options(-fprofile-generate) ++ link_libraries (-fprofile-generate) ++ endif() ++ ++ if (LLAMA_PGO_USE) ++ message(STATUS "Using -fprofile-use") ++ ++ add_compile_options(-fprofile-use) ++ link_libraries (-fprofile-use) ++ endif() ++ ++ if (LLAMA_LTO) ++ message(STATUS "Using -flto") ++ ++ add_compile_options(-flto) ++ link_libraries (-flto) ++ endif() + endif() + + # +diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt +index 892c2331..802bc25c 100644 +--- a/ggml/src/CMakeLists.txt ++++ b/ggml/src/CMakeLists.txt +@@ -23,6 +23,21 @@ if (NOT MSVC) + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() ++ ++ if (LLAMA_PGO_GEN) ++ add_compile_options(-fprofile-generate) ++ link_libraries (-fprofile-generate) ++ endif() ++ ++ if (LLAMA_PGO_USE) ++ add_compile_options(-fprofile-use) ++ link_libraries (-fprofile-use) ++ endif() ++ ++ if (LLAMA_LTO) ++ add_compile_options(-flto) ++ link_libraries (-flto) ++ endif() + endif() + + if (GGML_FATAL_WARNINGS) +diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt +index 42041b71..f78e5caf 100644 +--- a/ggml/src/ggml-cpu/CMakeLists.txt ++++ b/ggml/src/ggml-cpu/CMakeLists.txt +@@ -126,6 +126,59 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + ) + if (NOT ARM_MCPU_RESULT) + string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}") ++ set(AUTO_ARM_FEATURES "") ++ set(ARM_MCPU_FLAG "") ++ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|AARCH64" AND CMAKE_HOST_UNIX) ++ if(EXISTS "/proc/cpuinfo") ++ file(READ "/proc/cpuinfo" CPUINFO_CONTENT) ++ ++ if(CPUINFO_CONTENT MATCHES "[Ff]eatures[ \t]*:[ \t]*([^\n]+)") ++ string(STRIP "${CMAKE_MATCH_1}" CPU_FEATURES_STR) ++ if(NOT "${CPU_FEATURES_STR}" STREQUAL "") ++ string(REPLACE " " ";" CPU_FEATURES_LIST "${CPU_FEATURES_STR}") ++ list(REMOVE_DUPLICATES CPU_FEATURES_LIST) ++ ++ if("asimddp" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "dotprod") ++ endif() ++ if("i8mm" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "i8mm") ++ endif() ++ if("sve" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "sve") ++ endif() ++ if("sve2" IN_LIST CPU_FEATURES_LIST OR "svebf16" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "sve2") ++ endif() ++ if("bf16" IN_LIST 
CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "bf16") ++ endif() ++ if("sme" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "sme") ++ endif() ++ if("fp16" IN_LIST CPU_FEATURES_LIST OR "fphp" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "fp16") ++ endif() ++ ++ if(AUTO_ARM_FEATURES) ++ set(BASE_ARCH "armv8.2-a") ++ if("i8mm" IN_LIST AUTO_ARM_FEATURES OR "sve2" IN_LIST AUTO_ARM_FEATURES) ++ set(BASE_ARCH "armv8.6-a") ++ elseif("sme" IN_LIST AUTO_ARM_FEATURES) ++ set(BASE_ARCH "armv9-a") ++ endif() ++ ++ list(JOIN AUTO_ARM_FEATURES "+" FEATURE_STR) ++ set(ARM_MCPU_FLAG "-march=${BASE_ARCH}") ++ message(STATUS "Auto-detected ARM features from /proc/cpuinfo Features: ${FEATURE_STR}") ++ message(STATUS "Using ARM_MCPU_FLAG: ${ARM_MCPU_FLAG}") ++ endif() ++ endif() ++ else() ++ message(STATUS "No 'Features' line found in /proc/cpuinfo") ++ endif() ++ endif() ++ endif() + endif() + if ("${ARM_MCPU_FLAG}" STREQUAL "") + set(ARM_MCPU_FLAG -mcpu=native) +@@ -212,6 +265,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + set(FEAT_INPUT_FILE "/dev/null") + endif() + ++ message(STATUS "arm: CPU-specific optimizations: ${ARCH_FLAGS}") + execute_process( + COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E - + INPUT_FILE ${FEAT_INPUT_FILE} +-- +2.33.0 + diff --git a/Optimization-on-aarch64.patch b/Optimization-on-aarch64.patch new file mode 100644 index 0000000000000000000000000000000000000000..6301c646e895fff677d9fe571c39403fdefc083d --- /dev/null +++ b/Optimization-on-aarch64.patch @@ -0,0 +1,509 @@ +From d21c6f5611b84a0f938c14a472a4340801d4db25 Mon Sep 17 00:00:00 2001 +From: bitianyuan +Date: Wed, 22 Oct 2025 09:19:14 +0800 +Subject: [PATCH] Optimization on aarch64 + +--- + common/arg.cpp | 1 + + ggml/include/ggml-cpu.h | 4 + + ggml/include/ggml.h | 13 ++- + ggml/src/ggml-cpu/ggml-cpu-impl.h | 2 + + ggml/src/ggml-cpu/ggml-cpu.c | 157 +++++++++++++++++++++++++++--- + ggml/src/ggml.c | 4 +- + src/CMakeLists.txt | 2 +- + src/llama-model-loader.cpp | 10 ++ + src/llama-model.cpp | 9 ++ + 9 files changed, 188 insertions(+), 14 deletions(-) + +diff --git a/common/arg.cpp b/common/arg.cpp +index 33ed7ae8..ce52115c 100644 +--- a/common/arg.cpp ++++ b/common/arg.cpp +@@ -2804,6 +2804,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } ++ else if (value == "oto") { params.numa = GGML_NUMA_STRATEGY_OTO_AFF; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_env("LLAMA_ARG_NUMA")); +diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h +index 9edd4851..ee6e705a 100644 +--- a/ggml/include/ggml-cpu.h ++++ b/ggml/include/ggml-cpu.h +@@ -28,6 +28,9 @@ extern "C" { + GGML_NUMA_STRATEGY_ISOLATE = 2, + GGML_NUMA_STRATEGY_NUMACTL = 3, + GGML_NUMA_STRATEGY_MIRROR = 4, ++ GGML_NUMA_STRATEGY_SOCKET1 = 5, ++ GGML_NUMA_STRATEGY_SOCKET2 = 6, ++ GGML_NUMA_STRATEGY_OTO_AFF = 7, + GGML_NUMA_STRATEGY_COUNT + }; + +@@ -103,6 +106,7 @@ extern "C" { + GGML_BACKEND_API int ggml_cpu_has_vxe (void); + GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); + GGML_BACKEND_API int ggml_cpu_has_llamafile (void); ++ GGML_BACKEND_API int ggml_cpu_has_matmul_int16(void); + + // Internal types and functions exposed for tests and benchmarks + +diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h 
+index d948b00c..efb350b9 100644 +--- a/ggml/include/ggml.h ++++ b/ggml/include/ggml.h +@@ -345,6 +345,9 @@ extern "C" { + GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) + GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); + ++ #define GGML_NUMA_MAX_NODES 8 ++ #define GGML_NUMA_MAX_CPUS 512 ++ + enum ggml_status { + GGML_STATUS_ALLOC_FAILED = -2, + GGML_STATUS_FAILED = -1, +@@ -420,6 +423,11 @@ extern "C" { + GGML_TYPE_COUNT = 40, + }; + ++ enum tensor_attr { ++ TENSOR_ATTR_INVALID = 0, ++ TENSOR_ATTR_NUMA_SPLIT = 0x01, ++ }; ++ + // precision + enum ggml_prec { + GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default +@@ -656,9 +664,12 @@ extern "C" { + + char name[GGML_MAX_NAME]; + ++ void * data_numa[GGML_NUMA_MAX_NODES]; // 临时,拆分到所有numa节点后的张量数据 ++ enum tensor_attr attr; // 临时,张量属性 ++ + void * extra; // extra things e.g. for ggml-cuda.cu + +- char padding[8]; ++ char padding[0]; + }; + + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); +diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h +index 713bf85e..f606ec0b 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h ++++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h +@@ -19,6 +19,8 @@ struct ggml_compute_params { + // ith = thread index, nth = number of threads + int ith, nth; + ++ struct ggml_numainfo * numainfo; ++ + // work buffer for all threads + size_t wsize; + void * wdata; +diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c +index 9ec485cf..32a49d23 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu.c ++++ b/ggml/src/ggml-cpu/ggml-cpu.c +@@ -429,6 +429,13 @@ typedef pthread_mutex_t ggml_mutex_t; + #endif + #define ggml_lock_unlock(x) UNUSED(x) + ++#define KUN_PENG_SOCKET_1 0 ++#define KUN_PENG_SOCKET_2 1 ++#define KUN_PENG_SOCKET_NUM 2 ++ ++#define KUN_PENG_SOCKET_THREAD_NUM 64 ++#define KUN_PENG_MAX_THREAD_NUM 128 ++ + #define GGML_LOCK_INITIALIZER 0 + #define ggml_cond_init(c) pthread_cond_init(c, NULL) + #define ggml_cond_destroy(c) pthread_cond_destroy(c) +@@ -448,6 +455,8 @@ struct ggml_threadpool { + struct ggml_cgraph * cgraph; + struct ggml_cplan * cplan; + ++ int threads_per_numa; ++ + // synchronization primitives + atomic_int n_graph; // incremented when there is work to be done (i.e each graph) + atomic_int GGML_CACHE_ALIGN n_barrier; +@@ -469,6 +478,11 @@ struct ggml_threadpool { + enum ggml_status ec; + }; + ++struct ggml_numainfo { ++ int my_numa, ith_in_numa; ++ int max_item_per_thread, max_item_per_numa; ++}; ++ + // Per-thread state + struct ggml_compute_state { + #ifndef GGML_USE_OPENMP +@@ -498,8 +512,8 @@ static inline void ggml_thread_cpu_relax(void) {;} + // NUMA support + // + +-#define GGML_NUMA_MAX_NODES 8 +-#define GGML_NUMA_MAX_CPUS 512 ++// #define GGML_NUMA_MAX_NODES 8 ++// #define GGML_NUMA_MAX_CPUS 512 + + struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node +@@ -626,6 +640,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { + } + + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); ++ fprintf(stderr, "found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); + + // figure out which node we're on + uint current_cpu; +@@ -646,6 +661,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { + } + + GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu); ++ fprintf(stderr, "found our process on numa node %u, CPU %u\n", g_state.numa.current_node, 
current_cpu); + + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { + struct ggml_numa_node * node = &g_state.numa.nodes[n]; +@@ -1117,10 +1133,31 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, + + // ggml_compute_forward_mul_mat + ++static bool is_numa_aware_tensor(enum tensor_attr attr) { ++ return attr == TENSOR_ATTR_NUMA_SPLIT; ++} ++ ++static int64_t ggml_get_num_step( ++ const int64_t num_rows_per_vec_dot, ++ const enum ggml_type type) { ++ if (num_rows_per_vec_dot != 2) { ++ return num_rows_per_vec_dot; ++ } ++ ++ switch (type) { ++ case GGML_TYPE_Q4_K: ++ return num_rows_per_vec_dot; ++ case GGML_TYPE_Q4_0: ++ case GGML_TYPE_Q8_0: ++ return 8; ++ default: ++ return num_rows_per_vec_dot; ++ } ++} ++ + static void ggml_compute_forward_mul_mat_one_chunk( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, +- const enum ggml_type type, + const int64_t num_rows_per_vec_dot, + const int64_t ir0_start, + const int64_t ir0_end, +@@ -1132,6 +1169,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( + + GGML_TENSOR_BINARY_OP_LOCALS + ++ const enum ggml_type type = src0->type; ++ + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; +@@ -1180,6 +1219,11 @@ static void ggml_compute_forward_mul_mat_one_chunk( + const int64_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); ++ int src0_start = 0; ++ if (is_numa_aware_tensor(src0->attr)) { ++ src0_row = (const char *) src0->data_numa[params->numainfo->my_numa]; ++ src0_start = params->numainfo->my_numa * params->numainfo->max_item_per_numa; ++ } + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using +@@ -1195,8 +1239,11 @@ static void ggml_compute_forward_mul_mat_one_chunk( + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + +- for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { +- vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); ++ // 修改multi_load,外部2次,内部循环4次 ++ int64_t row_step = ggml_get_num_step(num_rows_per_vec_dot, type); ++ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += row_step) { ++ vec_dot(ne00, &tmp[ir0 - iir0], (row_step > 1 ? 16 : 0), src0_row + (ir0 - src0_start) * nb01, ++ (row_step > 1 ? nb01 : 0), src1_col, (row_step > 1 ? src1_col_stride : 0), row_step); + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { +@@ -1373,6 +1420,9 @@ UseGgmlGemm2:; + // The first chunk comes from our thread_id, the rest will get auto-assigned. 
+ int current_chunk = ith; + ++ params->numainfo->max_item_per_numa = ne0 / g_state.numa.n_nodes; ++ params->numainfo->max_item_per_thread = dr0; ++ + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; +@@ -1391,7 +1441,7 @@ UseGgmlGemm2:; + if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { + num_rows_per_vec_dot = 1; + } +- ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); ++ ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + + if (nth >= nchunk0 * nchunk1) { + break; +@@ -2056,7 +2106,69 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm + + // Android's libc implementation "bionic" does not support setting affinity + #if defined(__gnu_linux__) +-static void set_numa_thread_affinity(int thread_n) { ++ ++static void set_numa_thread_affinit_one_by_one(size_t setsize) { ++ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); ++ CPU_ZERO_S(setsize, cpus); ++ ++ pid_t cur_thread = gettid(); ++ int cpu_index = cur_thread % KUN_PENG_MAX_THREAD_NUM; ++ int node_num = cpu_index / KUN_PENG_SOCKET_THREAD_NUM; ++ ++ struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; ++ int node_index = cpu_index % node->n_cpus; ++ CPU_SET_S(node->cpus[node_index], setsize, cpus); ++ ++ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); ++ if (rv) { ++ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); ++ } ++ ++ CPU_FREE(cpus); ++} ++ ++static void set_numa_thread_affinit_socket(size_t setsize, int socket) { ++ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); ++ CPU_ZERO_S(setsize, cpus); ++ ++ size_t socket_node_num = g_state.numa.n_nodes / KUN_PENG_SOCKET_NUM; ++ size_t start_node_idx = socket * socket_node_num; ++ for (size_t i = 0; i < socket_node_num; ++i) { ++ int node_idx = start_node_idx + i; ++ struct ggml_numa_node * node = &g_state.numa.nodes[node_idx]; ++ for (size_t j = 0; j < node->n_cpus; ++j) { ++ CPU_SET_S(node->cpus[j], setsize, cpus); ++ } ++ } ++ ++ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); ++ if (rv) { ++ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); ++ } ++ ++ CPU_FREE(cpus); ++} ++ ++static void set_numa_thread_affinit_one_by_one_ex(int thread_n, int threads_per_numa) { ++ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); ++ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); ++ CPU_ZERO_S(setsize, cpus); ++ ++ int node_index = thread_n / threads_per_numa; ++ int cpu_index = (thread_n % threads_per_numa) * 2; ++ ++ struct ggml_numa_node * node = &g_state.numa.nodes[node_index]; ++ CPU_SET_S(node->cpus[cpu_index], setsize, cpus); ++ ++ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); ++ if (rv) { ++ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); ++ } ++ ++ CPU_FREE(cpus); ++} ++ ++static void set_numa_thread_affinity(int thread_n, int threads_per_numa) { + if (!ggml_is_numa()) { + return; + } +@@ -2068,12 +2180,21 @@ static void set_numa_thread_affinity(int thread_n) { + switch(g_state.numa.numa_strategy) { + case GGML_NUMA_STRATEGY_DISTRIBUTE: + // run thread on node_num thread_n / (threads per node) +- node_num = thread_n % g_state.numa.n_nodes; ++ node_num = thread_n / threads_per_numa; + 
break; + case GGML_NUMA_STRATEGY_ISOLATE: + // run thread on current_node + node_num = g_state.numa.current_node; + break; ++ case GGML_NUMA_STRATEGY_OTO_AFF: ++ set_numa_thread_affinit_one_by_one_ex(setsize, threads_per_numa); ++ return; ++ case GGML_NUMA_STRATEGY_SOCKET1: ++ set_numa_thread_affinit_socket(setsize, KUN_PENG_SOCKET_1); ++ return; ++ case GGML_NUMA_STRATEGY_SOCKET2: ++ set_numa_thread_affinit_socket(setsize, KUN_PENG_SOCKET_2); ++ return; + case GGML_NUMA_STRATEGY_NUMACTL: + // use the cpuset that numactl gave us + rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset); +@@ -2124,7 +2245,7 @@ static void clear_numa_thread_affinity(void) { + #else + // TODO: Windows etc. + // (the linux implementation may also work on BSD, someone should test) +-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } ++static void set_numa_thread_affinity(int thread_n, int threads_per_numa) { UNUSED(thread_n); } + static void clear_numa_thread_affinity(void) {} + #endif + +@@ -2872,15 +2993,19 @@ struct ggml_cplan ggml_graph_plan( + static thread_ret_t ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_threadpool * tp = state->threadpool; ++ struct ggml_numainfo ni = { 0 }; + + const struct ggml_cgraph * cgraph = tp->cgraph; + const struct ggml_cplan * cplan = tp->cplan; + +- set_numa_thread_affinity(state->ith); ++ set_numa_thread_affinity(state->ith, state->threadpool->threads_per_numa); ++ ni.my_numa = state->ith / state->threadpool->threads_per_numa; ++ ni.ith_in_numa = state->ith % state->threadpool->threads_per_numa; + + struct ggml_compute_params params = { + /*.ith =*/ state->ith, + /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed), ++ /*.numainfo =*/ &ni, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + /*.threadpool=*/ tp, +@@ -3158,6 +3283,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl + threadpool->current_chunk = 0; + threadpool->abort = -1; + threadpool->ec = GGML_STATUS_SUCCESS; ++ threadpool->threads_per_numa = 0; + } + + #ifdef GGML_USE_OPENMP +@@ -3169,6 +3295,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl + // update the number of threads from the actual number of threads that we got from OpenMP + n_threads = omp_get_num_threads(); + atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); ++ atomic_store_explicit(&threadpool->threads_per_numa, n_threads / g_state.numa.n_nodes, memory_order_relaxed); + } + + // Apply thread CPU mask and priority +@@ -3493,7 +3620,7 @@ int ggml_cpu_has_neon(void) { + } + + int ggml_cpu_has_dotprod(void) { +-#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD) ++#if defined(__ARM_FEATURE_DOTPROD) + return 1; + #else + return 0; +@@ -3516,6 +3643,14 @@ int ggml_cpu_has_matmul_int8(void) { + #endif + } + ++int ggml_cpu_has_matmul_int16(void) { ++#if defined(__ARM_FEATURE_MATMUL_INT16) ++ return 1; ++#else ++ return 0; ++#endif ++} ++ + int ggml_cpu_get_sve_cnt(void) { + #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) + return ggml_arm_arch_features.sve_cnt; +diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c +index 86f1c31a..7b838ddc 100644 +--- a/ggml/src/ggml.c ++++ b/ggml/src/ggml.c +@@ -1680,9 +1680,11 @@ static struct ggml_tensor * ggml_new_tensor_impl( + /*.view_src =*/ view_src, + /*.view_offs =*/ view_offs, + /*.data =*/ obj_alloc_size > 0 ? 
(void *)(result + 1) : data, ++ /*.data_numa =*/ { NULL }, ++ /*.attr =*/ TENSOR_ATTR_INVALID, + /*.name =*/ { 0 }, + /*.extra =*/ NULL, +- /*.padding =*/ { 0 }, ++ /*.padding =*/ // { 0 }, + }; + + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 18cfc765..aa8b9a27 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -41,7 +41,7 @@ target_include_directories(llama PRIVATE .) + target_include_directories(llama PUBLIC ../include) + target_compile_features (llama PRIVATE cxx_std_17) # don't bump + +-target_link_libraries(llama PUBLIC ggml) ++target_link_libraries(llama PUBLIC ggml numa) + + if (BUILD_SHARED_LIBS) + set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) +diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp +index aa3a65f8..b34e7325 100644 +--- a/src/llama-model-loader.cpp ++++ b/src/llama-model-loader.cpp +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + + static const size_t kiB = 1024; + static const size_t MiB = 1024*kiB; +@@ -1067,6 +1068,15 @@ bool llama_model_loader::load_all_data( + if (ggml_backend_buffer_is_host(cur->buffer)) { + file->seek(weight->offs, SEEK_SET); + file->read_raw(cur->data, n_size); ++ if (cur->attr == TENSOR_ATTR_NUMA_SPLIT) { ++ const int spec_numa_num = numa_max_node() + 1; ++ int n_size_per_numa = n_size / spec_numa_num; ++ ++ for (int i = 0; i < spec_numa_num; ++i) { ++ cur->data_numa[i] = numa_alloc_onnode(n_size_per_numa, i); ++ memcpy(cur->data_numa[i], cur->data + n_size_per_numa * i, n_size_per_numa); ++ } ++ } + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); +diff --git a/src/llama-model.cpp b/src/llama-model.cpp +index 522d1f67..f799b939 100644 +--- a/src/llama-model.cpp ++++ b/src/llama-model.cpp +@@ -2410,6 +2410,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + ++ layer.wq->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.wk->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.wv->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.wo->attr = TENSOR_ATTR_NUMA_SPLIT; ++ + // optional bias tensors + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); +@@ -2431,6 +2436,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + ++ layer.ffn_gate->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.ffn_down->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.ffn_up->attr = TENSOR_ATTR_NUMA_SPLIT; ++ + // optional MLP bias + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); +-- +2.33.0 + diff --git a/b4295.tar.gz b/b6602.tar.gz similarity index 59% rename from b4295.tar.gz rename to b6602.tar.gz index 0c324cf412af298ddfaf2bb013185adee3744822..f4ac243156c0f8c792ec58325ef5b09c67323e09 100644 Binary files a/b4295.tar.gz and b/b6602.tar.gz differ 
diff --git a/backport-CVE-2025-49847.patch b/backport-CVE-2025-49847.patch deleted file mode 100644 index f5835a9710c6c4d04704e60d4938081b63f37e27..0000000000000000000000000000000000000000 --- a/backport-CVE-2025-49847.patch +++ /dev/null @@ -1,45 +0,0 @@ -From e6d21d901a0e5aabd08a41d8000c5f4cd80c8b0f Mon Sep 17 00:00:00 2001 -From: Guy Goldenberg -Date: Fri, 13 Jun 2025 19:20:25 +0300 -Subject: [PATCH] Merge commit from fork - -* vocab : prevent integer overflow during load - -* Add static cast and GGML_ABORT - ---------- - -Co-authored-by: Georgi Gerganov ---- - src/llama-vocab.cpp | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 8c9aaf5..6974a33 100644 ---- a/src/llama-vocab.cpp -+++ b/src/llama-vocab.cpp -@@ -11,6 +11,9 @@ - #include - #include - #include -+#include -+#include -+#include - - // - // helpers -@@ -1785,6 +1788,10 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token - // copy piece chars to output text buffer - // skip up to 'lstrip' leading spaces before copying - auto _try_copy = [=] (const char * token, size_t size) -> int32_t { -+ if (size >= static_cast(std::numeric_limits::max())) { -+ GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size); -+ } -+ - for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { - token++; - size--; --- -2.43.0 - - diff --git a/backport-CVE-2025-52566.patch b/backport-CVE-2025-52566.patch deleted file mode 100644 index 6a11a3aa50a6cef81b0ba54bd2a386f2b3eda3c5..0000000000000000000000000000000000000000 --- a/backport-CVE-2025-52566.patch +++ /dev/null @@ -1,61 +0,0 @@ -From 5084d9fc8b876172678ce3d3ba81223e7934be4b Mon Sep 17 00:00:00 2001 -From: Ruikai Peng -Date: Fri, 20 Jun 2025 22:13:06 +0800 -Subject: [PATCH] vocab : prevent tokenizer overflow (#14301) - -* vocab : prevent stack overflow in tokenize - -* vocab : return error instead of aborting on oversized token count - -* vocab : INT32_MIN from llama_tokenize on overflow ---- - common/common.cpp | 3 +++ - include/llama.h | 1 + - src/llama-vocab.cpp | 4 ++++ - 3 files changed, 8 insertions(+) - -diff --git a/common/common.cpp b/common/common.cpp -index 6143516..c139773 100644 ---- a/common/common.cpp -+++ b/common/common.cpp -@@ -1584,6 +1584,9 @@ std::vector common_tokenize( - int n_tokens = text.length() + 2 * add_special; - std::vector result(n_tokens); - n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); -+ if (n_tokens == std::numeric_limits::min()) { -+ throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit"); -+ } - if (n_tokens < 0) { - result.resize(-n_tokens); - int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); -diff --git a/include/llama.h b/include/llama.h -index 36945cd..50a1ca3 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -929,6 +929,7 @@ extern "C" { - /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. - /// @return Returns the number of tokens on success, no more than n_tokens_max - /// @return Returns a negative number on failure - the number of tokens that would have been returned -+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit) - /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so. 
- /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated - /// as plaintext. Does not insert a leading space. -diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 6974a33..497d780 100644 ---- a/src/llama-vocab.cpp -+++ b/src/llama-vocab.cpp -@@ -1744,6 +1744,10 @@ int32_t llama_tokenize_impl( - bool add_special, - bool parse_special) { - auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special); -+ if (res.size() >= static_cast(std::numeric_limits::max())) { -+ LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size()); -+ return std::numeric_limits::min(); -+ } - if (n_tokens_max < (int) res.size()) { - // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); - return -((int) res.size()); --- -2.43.0 - - diff --git a/backport-CVE-2025-53630.patch b/backport-CVE-2025-53630.patch deleted file mode 100644 index b293355131b8549b4c2c9f7c381d8835cff391ad..0000000000000000000000000000000000000000 --- a/backport-CVE-2025-53630.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 7d00e32369b13b1820d4acbf453232cef6de3171 Mon Sep 17 00:00:00 2001 -From: Miaoqian Lin -Date: Wed, 9 Jul 2025 20:33:53 +0800 -Subject: [PATCH] ggml : prevent integer overflow in gguf tensor size - calculation (#14595) - ---- - ggml/src/ggml.c | 9 ++++++++- - 1 file changed, 8 insertions(+), 1 deletion(-) - -diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 058941c..8845215 100644 ---- a/ggml/src/ggml.c -+++ b/ggml/src/ggml.c -@@ -6854,7 +6854,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p - - const size_t size_cur = ggml_row_size(info->type, ne); - -- ctx->size += GGML_PAD(size_cur, ctx->alignment); -+ size_t padded_size = GGML_PAD(size_cur, ctx->alignment); -+ if (SIZE_MAX - ctx->size < padded_size) { -+ GGML_LOG_ERROR("%s: tensor size overflow, cannot accumulate size %zu + %zu\n", -+ __func__, ctx->size, padded_size); -+ gguf_free(ctx); -+ return NULL; -+ } -+ ctx->size += padded_size; - } - } - --- -2.43.0 - - diff --git a/llama.cpp.spec b/llama.cpp.spec index cf9ca8bb6bba36d92a4f430009fc2c405294a8b6..833f820c0625de912bfcb15de83a11c5305976b9 100644 --- a/llama.cpp.spec +++ b/llama.cpp.spec @@ -1,62 +1,69 @@ %define debug_package %{nil} -%global llama_commitid b4295 +%global llama_commitid b6602 -Name: llama.cpp -Version: 20241210 -Release: 4 -License: MIT -Summary: Port of English lagre model LLaMA implemented based on C/C++ +Name: llama.cpp +Version: 20251009 +Release: 2 +License: MIT +Summary: Port of English large model LLaMA implemented in C/C++ -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggerganov/llama.cpp Source0: https://github.com/ggerganov/llama.cpp/archive/refs/tags/%{llama_commitid}.tar.gz -Patch001: backport-CVE-2025-49847.patch -Patch002: backport-CVE-2025-52566.patch -Patch003: backport-CVE-2025-53630.patch +Patch001:AArch64-enabled-instruction-set.patch +Patch002:Optimization-on-aarch64.patch -BuildRequires: gcc,gcc-c++,cmake +BuildRequires: gcc +BuildRequires: gcc-c++ +BuildRequires: cmake libcurl libcurl-devel numactl-devel %description -Port of English lagre model LLaMA implemented based on C/C++, -it can be used for model dialogue based on local laptops. +Port of English large model LLaMA implemented in C/C++, +can be used for local inference on laptops. 
 %package devel
-Summary: Port of Facebook's LLaMA model in C/C++
+Summary: Development headers and libraries for %{name}
 Requires: %{name}%{?_isa} = %{version}-%{release}
 
 %description devel
-Port of English lagre model LLaMA implemented based on C/C++,
-it can be used for model dialogue based on local laptops.
+This package contains the header files, CMake config files and pkg-config
+files needed to develop against %{name}.
 
 %prep
-%autosetup -b 0 -n %{name}-%{llama_commitid} -p1
+%autosetup -n %{name}-%{llama_commitid} -p1
 
 %build
-%cmake -DCMAKE_INSTALL_PREFIX=%{_prefix} \
-       -DCMAKE_INSTALL_LIBDIR=%{_libdir} \
-       -DCMAKE_INSTALL_BINDIR=%{_bindir} \
-       -DCMAKE_INSTALL_INCLUDEDIR=%{_includedir}
-%cmake_build
+mkdir -p build
+cd build
+cmake .. \
+    -DCMAKE_INSTALL_PREFIX=%{_prefix} \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_INSTALL_LIBDIR=%{_libdir}
+%make_build
 
 %install
-%cmake_install
-
+cd build
+%make_install
+# remove empty directories
+find %{buildroot} -type d -empty -delete
 
 %files
 %{_bindir}/*
 %{_libdir}/*.so
 
 %files devel
-%dir %{_libdir}/cmake/llama
-%doc README.md
-%{_includedir}/ggml.h
-%{_includedir}/ggml-*.h
-%{_includedir}/llama.h
-%{_includedir}/llama-*.h
-%{_libdir}/cmake/llama/*.cmake
-%{_exec_prefix}/lib/pkgconfig/llama.pc
+%{_includedir}/*.h
+%{_libdir}/cmake/
+%{_libdir}/pkgconfig/
+%{_libdir}/*.so
 
 %changelog
+* Wed Oct 22 2025 bitianyuan - 20251009-2
+- Optimization on AArch64
+
+* Wed Oct 22 2025 GS_Stephen_Curry - 20251009-1
+- Upgrade to b6602
+
 * Mon Jul 21 2025 PshySimon - 20241210-4
 - fix CVE-2025-53630