diff --git a/AArch64-enabled-instruction-set.patch b/AArch64-enabled-instruction-set.patch new file mode 100644 index 0000000000000000000000000000000000000000..369e26db23974b7840af1840a6fb8cd58b969fa5 --- /dev/null +++ b/AArch64-enabled-instruction-set.patch @@ -0,0 +1,156 @@ +From df2a8abcd79f90d13e8d1893099cc8a6039576c4 Mon Sep 17 00:00:00 2001 +From: bitianyuan +Date: Tue, 21 Oct 2025 17:06:05 +0800 +Subject: [PATCH] AArch64 enabled instruction set + +--- + .gitignore | 2 ++ + CMakeLists.txt | 21 +++++++++++++ + ggml/src/CMakeLists.txt | 15 +++++++++ + ggml/src/ggml-cpu/CMakeLists.txt | 54 ++++++++++++++++++++++++++++++++ + 4 files changed, 92 insertions(+) + +diff --git a/.gitignore b/.gitignore +index c7d00097..9df14e2c 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -152,3 +152,5 @@ poetry.toml + # IDE + *.code-workspace + .windsurf/ ++ ++cmake-build-debug +\ No newline at end of file +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 4bf8b278..f758fbeb 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -159,6 +159,27 @@ if (NOT MSVC) + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() ++ ++ if (LLAMA_PGO_GEN) ++ message(STATUS "Using -fprofile-generate") ++ ++ add_compile_options(-fprofile-generate) ++ link_libraries (-fprofile-generate) ++ endif() ++ ++ if (LLAMA_PGO_USE) ++ message(STATUS "Using -fprofile-use") ++ ++ add_compile_options(-fprofile-use) ++ link_libraries (-fprofile-use) ++ endif() ++ ++ if (LLAMA_LTO) ++ message(STATUS "Using -flto") ++ ++ add_compile_options(-flto) ++ link_libraries (-flto) ++ endif() + endif() + + # +diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt +index 892c2331..802bc25c 100644 +--- a/ggml/src/CMakeLists.txt ++++ b/ggml/src/CMakeLists.txt +@@ -23,6 +23,21 @@ if (NOT MSVC) + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() ++ ++ if (LLAMA_PGO_GEN) ++ add_compile_options(-fprofile-generate) ++ link_libraries (-fprofile-generate) ++ endif() ++ ++ if (LLAMA_PGO_USE) ++ add_compile_options(-fprofile-use) ++ link_libraries (-fprofile-use) ++ endif() ++ ++ if (LLAMA_LTO) ++ add_compile_options(-flto) ++ link_libraries (-flto) ++ endif() + endif() + + if (GGML_FATAL_WARNINGS) +diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt +index 42041b71..f78e5caf 100644 +--- a/ggml/src/ggml-cpu/CMakeLists.txt ++++ b/ggml/src/ggml-cpu/CMakeLists.txt +@@ -126,6 +126,59 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + ) + if (NOT ARM_MCPU_RESULT) + string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}") ++ set(AUTO_ARM_FEATURES "") ++ set(ARM_MCPU_FLAG "") ++ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|AARCH64" AND CMAKE_HOST_UNIX) ++ if(EXISTS "/proc/cpuinfo") ++ file(READ "/proc/cpuinfo" CPUINFO_CONTENT) ++ ++ if(CPUINFO_CONTENT MATCHES "[Ff]eatures[ \t]*:[ \t]*([^\n]+)") ++ string(STRIP "${CMAKE_MATCH_1}" CPU_FEATURES_STR) ++ if(NOT "${CPU_FEATURES_STR}" STREQUAL "") ++ string(REPLACE " " ";" CPU_FEATURES_LIST "${CPU_FEATURES_STR}") ++ list(REMOVE_DUPLICATES CPU_FEATURES_LIST) ++ ++ if("asimddp" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "dotprod") ++ endif() ++ if("i8mm" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "i8mm") ++ endif() ++ if("sve" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "sve") ++ endif() ++ if("sve2" IN_LIST CPU_FEATURES_LIST OR "svebf16" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "sve2") ++ endif() ++ if("bf16" IN_LIST 
CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "bf16") ++ endif() ++ if("sme" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "sme") ++ endif() ++ if("fp16" IN_LIST CPU_FEATURES_LIST OR "fphp" IN_LIST CPU_FEATURES_LIST) ++ list(APPEND AUTO_ARM_FEATURES "fp16") ++ endif() ++ ++ if(AUTO_ARM_FEATURES) ++ set(BASE_ARCH "armv8.2-a") ++ if("i8mm" IN_LIST AUTO_ARM_FEATURES OR "sve2" IN_LIST AUTO_ARM_FEATURES) ++ set(BASE_ARCH "armv8.6-a") ++ elseif("sme" IN_LIST AUTO_ARM_FEATURES) ++ set(BASE_ARCH "armv9-a") ++ endif() ++ ++ list(JOIN AUTO_ARM_FEATURES "+" FEATURE_STR) ++ set(ARM_MCPU_FLAG "-march=${BASE_ARCH}") ++ message(STATUS "Auto-detected ARM features from /proc/cpuinfo Features: ${FEATURE_STR}") ++ message(STATUS "Using ARM_MCPU_FLAG: ${ARM_MCPU_FLAG}") ++ endif() ++ endif() ++ else() ++ message(STATUS "No 'Features' line found in /proc/cpuinfo") ++ endif() ++ endif() ++ endif() + endif() + if ("${ARM_MCPU_FLAG}" STREQUAL "") + set(ARM_MCPU_FLAG -mcpu=native) +@@ -212,6 +265,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) + set(FEAT_INPUT_FILE "/dev/null") + endif() + ++ message(STATUS "arm: CPU-specific optimizations: ${ARCH_FLAGS}") + execute_process( + COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E - + INPUT_FILE ${FEAT_INPUT_FILE} +-- +2.33.0 + diff --git a/Optimization-on-aarch64.patch b/Optimization-on-aarch64.patch new file mode 100644 index 0000000000000000000000000000000000000000..6301c646e895fff677d9fe571c39403fdefc083d --- /dev/null +++ b/Optimization-on-aarch64.patch @@ -0,0 +1,509 @@ +From d21c6f5611b84a0f938c14a472a4340801d4db25 Mon Sep 17 00:00:00 2001 +From: bitianyuan +Date: Wed, 22 Oct 2025 09:19:14 +0800 +Subject: [PATCH] Optimization on aarch64 + +--- + common/arg.cpp | 1 + + ggml/include/ggml-cpu.h | 4 + + ggml/include/ggml.h | 13 ++- + ggml/src/ggml-cpu/ggml-cpu-impl.h | 2 + + ggml/src/ggml-cpu/ggml-cpu.c | 157 +++++++++++++++++++++++++++--- + ggml/src/ggml.c | 4 +- + src/CMakeLists.txt | 2 +- + src/llama-model-loader.cpp | 10 ++ + src/llama-model.cpp | 9 ++ + 9 files changed, 188 insertions(+), 14 deletions(-) + +diff --git a/common/arg.cpp b/common/arg.cpp +index 33ed7ae8..ce52115c 100644 +--- a/common/arg.cpp ++++ b/common/arg.cpp +@@ -2804,6 +2804,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } ++ else if (value == "oto") { params.numa = GGML_NUMA_STRATEGY_OTO_AFF; } + else { throw std::invalid_argument("invalid value"); } + } + ).set_env("LLAMA_ARG_NUMA")); +diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h +index 9edd4851..ee6e705a 100644 +--- a/ggml/include/ggml-cpu.h ++++ b/ggml/include/ggml-cpu.h +@@ -28,6 +28,9 @@ extern "C" { + GGML_NUMA_STRATEGY_ISOLATE = 2, + GGML_NUMA_STRATEGY_NUMACTL = 3, + GGML_NUMA_STRATEGY_MIRROR = 4, ++ GGML_NUMA_STRATEGY_SOCKET1 = 5, ++ GGML_NUMA_STRATEGY_SOCKET2 = 6, ++ GGML_NUMA_STRATEGY_OTO_AFF = 7, + GGML_NUMA_STRATEGY_COUNT + }; + +@@ -103,6 +106,7 @@ extern "C" { + GGML_BACKEND_API int ggml_cpu_has_vxe (void); + GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); + GGML_BACKEND_API int ggml_cpu_has_llamafile (void); ++ GGML_BACKEND_API int ggml_cpu_has_matmul_int16(void); + + // Internal types and functions exposed for tests and benchmarks + +diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h 
+index d948b00c..efb350b9 100644 +--- a/ggml/include/ggml.h ++++ b/ggml/include/ggml.h +@@ -345,6 +345,9 @@ extern "C" { + GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4) + GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); + ++ #define GGML_NUMA_MAX_NODES 8 ++ #define GGML_NUMA_MAX_CPUS 512 ++ + enum ggml_status { + GGML_STATUS_ALLOC_FAILED = -2, + GGML_STATUS_FAILED = -1, +@@ -420,6 +423,11 @@ extern "C" { + GGML_TYPE_COUNT = 40, + }; + ++ enum tensor_attr { ++ TENSOR_ATTR_INVALID = 0, ++ TENSOR_ATTR_NUMA_SPLIT = 0x01, ++ }; ++ + // precision + enum ggml_prec { + GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default +@@ -656,9 +664,12 @@ extern "C" { + + char name[GGML_MAX_NAME]; + ++ void * data_numa[GGML_NUMA_MAX_NODES]; // 临时,拆分到所有numa节点后的张量数据 ++ enum tensor_attr attr; // 临时,张量属性 ++ + void * extra; // extra things e.g. for ggml-cuda.cu + +- char padding[8]; ++ char padding[0]; + }; + + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); +diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h +index 713bf85e..f606ec0b 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h ++++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h +@@ -19,6 +19,8 @@ struct ggml_compute_params { + // ith = thread index, nth = number of threads + int ith, nth; + ++ struct ggml_numainfo * numainfo; ++ + // work buffer for all threads + size_t wsize; + void * wdata; +diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c +index 9ec485cf..32a49d23 100644 +--- a/ggml/src/ggml-cpu/ggml-cpu.c ++++ b/ggml/src/ggml-cpu/ggml-cpu.c +@@ -429,6 +429,13 @@ typedef pthread_mutex_t ggml_mutex_t; + #endif + #define ggml_lock_unlock(x) UNUSED(x) + ++#define KUN_PENG_SOCKET_1 0 ++#define KUN_PENG_SOCKET_2 1 ++#define KUN_PENG_SOCKET_NUM 2 ++ ++#define KUN_PENG_SOCKET_THREAD_NUM 64 ++#define KUN_PENG_MAX_THREAD_NUM 128 ++ + #define GGML_LOCK_INITIALIZER 0 + #define ggml_cond_init(c) pthread_cond_init(c, NULL) + #define ggml_cond_destroy(c) pthread_cond_destroy(c) +@@ -448,6 +455,8 @@ struct ggml_threadpool { + struct ggml_cgraph * cgraph; + struct ggml_cplan * cplan; + ++ int threads_per_numa; ++ + // synchronization primitives + atomic_int n_graph; // incremented when there is work to be done (i.e each graph) + atomic_int GGML_CACHE_ALIGN n_barrier; +@@ -469,6 +478,11 @@ struct ggml_threadpool { + enum ggml_status ec; + }; + ++struct ggml_numainfo { ++ int my_numa, ith_in_numa; ++ int max_item_per_thread, max_item_per_numa; ++}; ++ + // Per-thread state + struct ggml_compute_state { + #ifndef GGML_USE_OPENMP +@@ -498,8 +512,8 @@ static inline void ggml_thread_cpu_relax(void) {;} + // NUMA support + // + +-#define GGML_NUMA_MAX_NODES 8 +-#define GGML_NUMA_MAX_CPUS 512 ++// #define GGML_NUMA_MAX_NODES 8 ++// #define GGML_NUMA_MAX_CPUS 512 + + struct ggml_numa_node { + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node +@@ -626,6 +640,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { + } + + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); ++ fprintf(stderr, "found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); + + // figure out which node we're on + uint current_cpu; +@@ -646,6 +661,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { + } + + GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu); ++ fprintf(stderr, "found our process on numa node %u, CPU %u\n", g_state.numa.current_node, 
current_cpu); + + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { + struct ggml_numa_node * node = &g_state.numa.nodes[n]; +@@ -1117,10 +1133,31 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, + + // ggml_compute_forward_mul_mat + ++static bool is_numa_aware_tensor(enum tensor_attr attr) { ++ return attr == TENSOR_ATTR_NUMA_SPLIT; ++} ++ ++static int64_t ggml_get_num_step( ++ const int64_t num_rows_per_vec_dot, ++ const enum ggml_type type) { ++ if (num_rows_per_vec_dot != 2) { ++ return num_rows_per_vec_dot; ++ } ++ ++ switch (type) { ++ case GGML_TYPE_Q4_K: ++ return num_rows_per_vec_dot; ++ case GGML_TYPE_Q4_0: ++ case GGML_TYPE_Q8_0: ++ return 8; ++ default: ++ return num_rows_per_vec_dot; ++ } ++} ++ + static void ggml_compute_forward_mul_mat_one_chunk( + const struct ggml_compute_params * params, + struct ggml_tensor * dst, +- const enum ggml_type type, + const int64_t num_rows_per_vec_dot, + const int64_t ir0_start, + const int64_t ir0_end, +@@ -1132,6 +1169,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( + + GGML_TENSOR_BINARY_OP_LOCALS + ++ const enum ggml_type type = src0->type; ++ + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; +@@ -1180,6 +1219,11 @@ static void ggml_compute_forward_mul_mat_one_chunk( + const int64_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); ++ int src0_start = 0; ++ if (is_numa_aware_tensor(src0->attr)) { ++ src0_row = (const char *) src0->data_numa[params->numainfo->my_numa]; ++ src0_start = params->numainfo->my_numa * params->numainfo->max_item_per_numa; ++ } + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using +@@ -1195,8 +1239,11 @@ static void ggml_compute_forward_mul_mat_one_chunk( + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + +- for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { +- vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); ++ // 修改multi_load,外部2次,内部循环4次 ++ int64_t row_step = ggml_get_num_step(num_rows_per_vec_dot, type); ++ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += row_step) { ++ vec_dot(ne00, &tmp[ir0 - iir0], (row_step > 1 ? 16 : 0), src0_row + (ir0 - src0_start) * nb01, ++ (row_step > 1 ? nb01 : 0), src1_col, (row_step > 1 ? src1_col_stride : 0), row_step); + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { +@@ -1373,6 +1420,9 @@ UseGgmlGemm2:; + // The first chunk comes from our thread_id, the rest will get auto-assigned. 
+ int current_chunk = ith; + ++ params->numainfo->max_item_per_numa = ne0 / g_state.numa.n_nodes; ++ params->numainfo->max_item_per_thread = dr0; ++ + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; +@@ -1391,7 +1441,7 @@ UseGgmlGemm2:; + if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { + num_rows_per_vec_dot = 1; + } +- ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); ++ ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + + if (nth >= nchunk0 * nchunk1) { + break; +@@ -2056,7 +2106,69 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm + + // Android's libc implementation "bionic" does not support setting affinity + #if defined(__gnu_linux__) +-static void set_numa_thread_affinity(int thread_n) { ++ ++static void set_numa_thread_affinit_one_by_one(size_t setsize) { ++ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); ++ CPU_ZERO_S(setsize, cpus); ++ ++ pid_t cur_thread = gettid(); ++ int cpu_index = cur_thread % KUN_PENG_MAX_THREAD_NUM; ++ int node_num = cpu_index / KUN_PENG_SOCKET_THREAD_NUM; ++ ++ struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; ++ int node_index = cpu_index % node->n_cpus; ++ CPU_SET_S(node->cpus[node_index], setsize, cpus); ++ ++ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); ++ if (rv) { ++ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); ++ } ++ ++ CPU_FREE(cpus); ++} ++ ++static void set_numa_thread_affinit_socket(size_t setsize, int socket) { ++ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); ++ CPU_ZERO_S(setsize, cpus); ++ ++ size_t socket_node_num = g_state.numa.n_nodes / KUN_PENG_SOCKET_NUM; ++ size_t start_node_idx = socket * socket_node_num; ++ for (size_t i = 0; i < socket_node_num; ++i) { ++ int node_idx = start_node_idx + i; ++ struct ggml_numa_node * node = &g_state.numa.nodes[node_idx]; ++ for (size_t j = 0; j < node->n_cpus; ++j) { ++ CPU_SET_S(node->cpus[j], setsize, cpus); ++ } ++ } ++ ++ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); ++ if (rv) { ++ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); ++ } ++ ++ CPU_FREE(cpus); ++} ++ ++static void set_numa_thread_affinit_one_by_one_ex(int thread_n, int threads_per_numa) { ++ size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); ++ cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); ++ CPU_ZERO_S(setsize, cpus); ++ ++ int node_index = thread_n / threads_per_numa; ++ int cpu_index = (thread_n % threads_per_numa) * 2; ++ ++ struct ggml_numa_node * node = &g_state.numa.nodes[node_index]; ++ CPU_SET_S(node->cpus[cpu_index], setsize, cpus); ++ ++ int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); ++ if (rv) { ++ fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); ++ } ++ ++ CPU_FREE(cpus); ++} ++ ++static void set_numa_thread_affinity(int thread_n, int threads_per_numa) { + if (!ggml_is_numa()) { + return; + } +@@ -2068,12 +2180,21 @@ static void set_numa_thread_affinity(int thread_n) { + switch(g_state.numa.numa_strategy) { + case GGML_NUMA_STRATEGY_DISTRIBUTE: + // run thread on node_num thread_n / (threads per node) +- node_num = thread_n % g_state.numa.n_nodes; ++ node_num = thread_n / threads_per_numa; + 
break; + case GGML_NUMA_STRATEGY_ISOLATE: + // run thread on current_node + node_num = g_state.numa.current_node; + break; ++ case GGML_NUMA_STRATEGY_OTO_AFF: ++ set_numa_thread_affinit_one_by_one_ex(setsize, threads_per_numa); ++ return; ++ case GGML_NUMA_STRATEGY_SOCKET1: ++ set_numa_thread_affinit_socket(setsize, KUN_PENG_SOCKET_1); ++ return; ++ case GGML_NUMA_STRATEGY_SOCKET2: ++ set_numa_thread_affinit_socket(setsize, KUN_PENG_SOCKET_2); ++ return; + case GGML_NUMA_STRATEGY_NUMACTL: + // use the cpuset that numactl gave us + rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset); +@@ -2124,7 +2245,7 @@ static void clear_numa_thread_affinity(void) { + #else + // TODO: Windows etc. + // (the linux implementation may also work on BSD, someone should test) +-static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } ++static void set_numa_thread_affinity(int thread_n, int threads_per_numa) { UNUSED(thread_n); } + static void clear_numa_thread_affinity(void) {} + #endif + +@@ -2872,15 +2993,19 @@ struct ggml_cplan ggml_graph_plan( + static thread_ret_t ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + struct ggml_threadpool * tp = state->threadpool; ++ struct ggml_numainfo ni = { 0 }; + + const struct ggml_cgraph * cgraph = tp->cgraph; + const struct ggml_cplan * cplan = tp->cplan; + +- set_numa_thread_affinity(state->ith); ++ set_numa_thread_affinity(state->ith, state->threadpool->threads_per_numa); ++ ni.my_numa = state->ith / state->threadpool->threads_per_numa; ++ ni.ith_in_numa = state->ith % state->threadpool->threads_per_numa; + + struct ggml_compute_params params = { + /*.ith =*/ state->ith, + /*.nth =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed), ++ /*.numainfo =*/ &ni, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + /*.threadpool=*/ tp, +@@ -3158,6 +3283,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl + threadpool->current_chunk = 0; + threadpool->abort = -1; + threadpool->ec = GGML_STATUS_SUCCESS; ++ threadpool->threads_per_numa = 0; + } + + #ifdef GGML_USE_OPENMP +@@ -3169,6 +3295,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl + // update the number of threads from the actual number of threads that we got from OpenMP + n_threads = omp_get_num_threads(); + atomic_store_explicit(&threadpool->n_threads_cur, n_threads, memory_order_relaxed); ++ atomic_store_explicit(&threadpool->threads_per_numa, n_threads / g_state.numa.n_nodes, memory_order_relaxed); + } + + // Apply thread CPU mask and priority +@@ -3493,7 +3620,7 @@ int ggml_cpu_has_neon(void) { + } + + int ggml_cpu_has_dotprod(void) { +-#if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD) ++#if defined(__ARM_FEATURE_DOTPROD) + return 1; + #else + return 0; +@@ -3516,6 +3643,14 @@ int ggml_cpu_has_matmul_int8(void) { + #endif + } + ++int ggml_cpu_has_matmul_int16(void) { ++#if defined(__ARM_FEATURE_MATMUL_INT16) ++ return 1; ++#else ++ return 0; ++#endif ++} ++ + int ggml_cpu_get_sve_cnt(void) { + #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE) + return ggml_arm_arch_features.sve_cnt; +diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c +index 86f1c31a..7b838ddc 100644 +--- a/ggml/src/ggml.c ++++ b/ggml/src/ggml.c +@@ -1680,9 +1680,11 @@ static struct ggml_tensor * ggml_new_tensor_impl( + /*.view_src =*/ view_src, + /*.view_offs =*/ view_offs, + /*.data =*/ obj_alloc_size > 0 ? 
(void *)(result + 1) : data, ++ /*.data_numa =*/ { NULL }, ++ /*.attr =*/ TENSOR_ATTR_INVALID, + /*.name =*/ { 0 }, + /*.extra =*/ NULL, +- /*.padding =*/ { 0 }, ++ /*.padding =*/ // { 0 }, + }; + + // TODO: this should not be needed as long as we don't rely on aligned SIMD loads +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 18cfc765..aa8b9a27 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -41,7 +41,7 @@ target_include_directories(llama PRIVATE .) + target_include_directories(llama PUBLIC ../include) + target_compile_features (llama PRIVATE cxx_std_17) # don't bump + +-target_link_libraries(llama PUBLIC ggml) ++target_link_libraries(llama PUBLIC ggml numa) + + if (BUILD_SHARED_LIBS) + set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON) +diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp +index aa3a65f8..b34e7325 100644 +--- a/src/llama-model-loader.cpp ++++ b/src/llama-model-loader.cpp +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + + static const size_t kiB = 1024; + static const size_t MiB = 1024*kiB; +@@ -1067,6 +1068,15 @@ bool llama_model_loader::load_all_data( + if (ggml_backend_buffer_is_host(cur->buffer)) { + file->seek(weight->offs, SEEK_SET); + file->read_raw(cur->data, n_size); ++ if (cur->attr == TENSOR_ATTR_NUMA_SPLIT) { ++ const int spec_numa_num = numa_max_node() + 1; ++ int n_size_per_numa = n_size / spec_numa_num; ++ ++ for (int i = 0; i < spec_numa_num; ++i) { ++ cur->data_numa[i] = numa_alloc_onnode(n_size_per_numa, i); ++ memcpy(cur->data_numa[i], cur->data + n_size_per_numa * i, n_size_per_numa); ++ } ++ } + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); +diff --git a/src/llama-model.cpp b/src/llama-model.cpp +index 522d1f67..f799b939 100644 +--- a/src/llama-model.cpp ++++ b/src/llama-model.cpp +@@ -2410,6 +2410,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + ++ layer.wq->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.wk->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.wv->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.wo->attr = TENSOR_ATTR_NUMA_SPLIT; ++ + // optional bias tensors + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); +@@ -2431,6 +2436,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + ++ layer.ffn_gate->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.ffn_down->attr = TENSOR_ATTR_NUMA_SPLIT; ++ layer.ffn_up->attr = TENSOR_ATTR_NUMA_SPLIT; ++ + // optional MLP bias + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); +-- +2.33.0 + diff --git a/b4295.tar.gz b/b6602.tar.gz similarity index 59% rename from b4295.tar.gz rename to b6602.tar.gz index 0c324cf412af298ddfaf2bb013185adee3744822..f4ac243156c0f8c792ec58325ef5b09c67323e09 100644 Binary files a/b4295.tar.gz and b/b6602.tar.gz differ 
diff --git a/backport-CVE-2025-49847.patch b/backport-CVE-2025-49847.patch deleted file mode 100644 index f5835a9710c6c4d04704e60d4938081b63f37e27..0000000000000000000000000000000000000000 --- a/backport-CVE-2025-49847.patch +++ /dev/null @@ -1,45 +0,0 @@ -From e6d21d901a0e5aabd08a41d8000c5f4cd80c8b0f Mon Sep 17 00:00:00 2001 -From: Guy Goldenberg -Date: Fri, 13 Jun 2025 19:20:25 +0300 -Subject: [PATCH] Merge commit from fork - -* vocab : prevent integer overflow during load - -* Add static cast and GGML_ABORT - ---------- - -Co-authored-by: Georgi Gerganov ---- - src/llama-vocab.cpp | 7 +++++++ - 1 file changed, 7 insertions(+) - -diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 8c9aaf5..6974a33 100644 ---- a/src/llama-vocab.cpp -+++ b/src/llama-vocab.cpp -@@ -11,6 +11,9 @@ - #include - #include - #include -+#include -+#include -+#include - - // - // helpers -@@ -1785,6 +1788,10 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token - // copy piece chars to output text buffer - // skip up to 'lstrip' leading spaces before copying - auto _try_copy = [=] (const char * token, size_t size) -> int32_t { -+ if (size >= static_cast(std::numeric_limits::max())) { -+ GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size); -+ } -+ - for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) { - token++; - size--; --- -2.43.0 - - diff --git a/backport-CVE-2025-52566.patch b/backport-CVE-2025-52566.patch deleted file mode 100644 index 6a11a3aa50a6cef81b0ba54bd2a386f2b3eda3c5..0000000000000000000000000000000000000000 --- a/backport-CVE-2025-52566.patch +++ /dev/null @@ -1,61 +0,0 @@ -From 5084d9fc8b876172678ce3d3ba81223e7934be4b Mon Sep 17 00:00:00 2001 -From: Ruikai Peng -Date: Fri, 20 Jun 2025 22:13:06 +0800 -Subject: [PATCH] vocab : prevent tokenizer overflow (#14301) - -* vocab : prevent stack overflow in tokenize - -* vocab : return error instead of aborting on oversized token count - -* vocab : INT32_MIN from llama_tokenize on overflow ---- - common/common.cpp | 3 +++ - include/llama.h | 1 + - src/llama-vocab.cpp | 4 ++++ - 3 files changed, 8 insertions(+) - -diff --git a/common/common.cpp b/common/common.cpp -index 6143516..c139773 100644 ---- a/common/common.cpp -+++ b/common/common.cpp -@@ -1584,6 +1584,9 @@ std::vector common_tokenize( - int n_tokens = text.length() + 2 * add_special; - std::vector result(n_tokens); - n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); -+ if (n_tokens == std::numeric_limits::min()) { -+ throw std::runtime_error("Tokenization failed: input text too large, tokenization result exceeds int32_t limit"); -+ } - if (n_tokens < 0) { - result.resize(-n_tokens); - int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); -diff --git a/include/llama.h b/include/llama.h -index 36945cd..50a1ca3 100644 ---- a/include/llama.h -+++ b/include/llama.h -@@ -929,6 +929,7 @@ extern "C" { - /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. - /// @return Returns the number of tokens on success, no more than n_tokens_max - /// @return Returns a negative number on failure - the number of tokens that would have been returned -+ /// @return Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit) - /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so. 
- /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated - /// as plaintext. Does not insert a leading space. -diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp -index 6974a33..497d780 100644 ---- a/src/llama-vocab.cpp -+++ b/src/llama-vocab.cpp -@@ -1744,6 +1744,10 @@ int32_t llama_tokenize_impl( - bool add_special, - bool parse_special) { - auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special); -+ if (res.size() >= static_cast(std::numeric_limits::max())) { -+ LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size()); -+ return std::numeric_limits::min(); -+ } - if (n_tokens_max < (int) res.size()) { - // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); - return -((int) res.size()); --- -2.43.0 - - diff --git a/backport-CVE-2025-53630.patch b/backport-CVE-2025-53630.patch deleted file mode 100644 index b293355131b8549b4c2c9f7c381d8835cff391ad..0000000000000000000000000000000000000000 --- a/backport-CVE-2025-53630.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 7d00e32369b13b1820d4acbf453232cef6de3171 Mon Sep 17 00:00:00 2001 -From: Miaoqian Lin -Date: Wed, 9 Jul 2025 20:33:53 +0800 -Subject: [PATCH] ggml : prevent integer overflow in gguf tensor size - calculation (#14595) - ---- - ggml/src/ggml.c | 9 ++++++++- - 1 file changed, 8 insertions(+), 1 deletion(-) - -diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c -index 058941c..8845215 100644 ---- a/ggml/src/ggml.c -+++ b/ggml/src/ggml.c -@@ -6854,7 +6854,14 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p - - const size_t size_cur = ggml_row_size(info->type, ne); - -- ctx->size += GGML_PAD(size_cur, ctx->alignment); -+ size_t padded_size = GGML_PAD(size_cur, ctx->alignment); -+ if (SIZE_MAX - ctx->size < padded_size) { -+ GGML_LOG_ERROR("%s: tensor size overflow, cannot accumulate size %zu + %zu\n", -+ __func__, ctx->size, padded_size); -+ gguf_free(ctx); -+ return NULL; -+ } -+ ctx->size += padded_size; - } - } - --- -2.43.0 - - diff --git a/llama.cpp.spec b/llama.cpp.spec index cf9ca8bb6bba36d92a4f430009fc2c405294a8b6..833f820c0625de912bfcb15de83a11c5305976b9 100644 --- a/llama.cpp.spec +++ b/llama.cpp.spec @@ -1,62 +1,69 @@ %define debug_package %{nil} -%global llama_commitid b4295 +%global llama_commitid b6602 -Name: llama.cpp -Version: 20241210 -Release: 4 -License: MIT -Summary: Port of English lagre model LLaMA implemented based on C/C++ +Name: llama.cpp +Version: 20251009 +Release: 2 +License: MIT +Summary: Port of English large model LLaMA implemented in C/C++ -URL: https://github.com/ggerganov/llama.cpp +URL: https://github.com/ggerganov/llama.cpp Source0: https://github.com/ggerganov/llama.cpp/archive/refs/tags/%{llama_commitid}.tar.gz -Patch001: backport-CVE-2025-49847.patch -Patch002: backport-CVE-2025-52566.patch -Patch003: backport-CVE-2025-53630.patch +Patch001:AArch64-enabled-instruction-set.patch +Patch002:Optimization-on-aarch64.patch -BuildRequires: gcc,gcc-c++,cmake +BuildRequires: gcc +BuildRequires: gcc-c++ +BuildRequires: cmake libcurl libcurl-devel numactl-devel %description -Port of English lagre model LLaMA implemented based on C/C++, -it can be used for model dialogue based on local laptops. +Port of English large model LLaMA implemented in C/C++, +can be used for local inference on laptops. 
 %package devel
-Summary: Port of Facebook's LLaMA model in C/C++
+Summary: Development headers and libraries for %{name}
 Requires: %{name}%{?_isa} = %{version}-%{release}
 
 %description devel
-Port of English lagre model LLaMA implemented based on C/C++,
-it can be used for model dialogue based on local laptops.
+This package contains the header files, CMake config files and pkg-config
+files needed to develop against %{name}.
 
 %prep
-%autosetup -b 0 -n %{name}-%{llama_commitid} -p1
+%autosetup -n %{name}-%{llama_commitid} -p1
 
 %build
-%cmake -DCMAKE_INSTALL_PREFIX=%{_prefix} \
-       -DCMAKE_INSTALL_LIBDIR=%{_libdir} \
-       -DCMAKE_INSTALL_BINDIR=%{_bindir} \
-       -DCMAKE_INSTALL_INCLUDEDIR=%{_includedir}
-%cmake_build
+mkdir -p build
+cd build
+cmake .. \
+    -DCMAKE_INSTALL_PREFIX=%{_prefix} \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_INSTALL_LIBDIR=%{_libdir}
+%make_build
 
 %install
-%cmake_install
-
+cd build
+%make_install
+# remove empty directories
+find %{buildroot} -type d -empty -delete
 
 %files
 %{_bindir}/*
 %{_libdir}/*.so
 
 %files devel
-%dir %{_libdir}/cmake/llama
-%doc README.md
-%{_includedir}/ggml.h
-%{_includedir}/ggml-*.h
-%{_includedir}/llama.h
-%{_includedir}/llama-*.h
-%{_libdir}/cmake/llama/*.cmake
-%{_exec_prefix}/lib/pkgconfig/llama.pc
+%{_includedir}/*.h
+%{_libdir}/cmake/
+%{_libdir}/pkgconfig/
+%{_libdir}/*.so
 
 %changelog
+* Wed Oct 22 2025 bitianyuan - 20251009-2
+- Optimization on AArch64
+
+* Wed Oct 22 2025 GS_Stephen_Curry - 20251009-1
+- Upgrade to b6602
+
 * Mon Jul 21 2025 PshySimon - 20241210-4
 - fix CVE-2025-53630