From 4d3d95eef0c7b5595e6dc394faf25b5672234e0c Mon Sep 17 00:00:00 2001 From: wangyuhang <524413304@qq.com> Date: Fri, 19 Dec 2025 10:31:01 +0800 Subject: [PATCH 1/2] extract the pressure_test function --- src/performance_optimizer/param_optimizer.py | 26 +++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/performance_optimizer/param_optimizer.py b/src/performance_optimizer/param_optimizer.py index e1ac2b15..c4d19aeb 100644 --- a/src/performance_optimizer/param_optimizer.py +++ b/src/performance_optimizer/param_optimizer.py @@ -77,6 +77,19 @@ class ParamOptimizer: return True return False + def pressure_test(self): + logging.info(f"[ParamOptimizer] waiting for pressure test finished ...") + pressure_test_result = wait_for_pressure_test(timeout=self.benchmark_timeout) + if pressure_test_result.status_code != 0: + raise RuntimeError( + f"[ParamOptimizer] failed to run pressure test, err msg is {pressure_test_result.err_msg}" + ) + baseline = float(pressure_test_result.output) + logging.info( + f"[ParamOptimizer] pressure test finished, baseline is {baseline}" + ) + return baseline + def benchmark(self): logging.info(f"🔄 start to verify benchmark performance of {self.service_name}...") result = self.app_interface.benchmark() @@ -164,18 +177,7 @@ class ParamOptimizer: def run(self): # 运行benchmark,摸底参数性能指标 if self.pressure_test_mode: - logging.info(f"[ParamOptimizer] waiting for pressure test finished ...") - pressure_test_result = wait_for_pressure_test(timeout=self.benchmark_timeout) - - if pressure_test_result.status_code != 0: - raise RuntimeError( - f"[ParamOptimizer] failed to run pressure test, err msg is {pressure_test_result.err_msg}" - ) - - baseline = float(pressure_test_result.output) - logging.info( - f"[ParamOptimizer] pressure test finished, baseline is {baseline}" - ) + baseline = self.pressure_test() else: baseline = self.benchmark() # 保存每轮调优的结果,反思调优目标是否达到 -- Gitee From 1862ea573373464a1bb23a7666cace92f906c11c Mon Sep 17 00:00:00 2001 From: wangyuhang <524413304@qq.com> Date: Sat, 20 Dec 2025 11:50:11 +0800 Subject: [PATCH 2/2] introduce a new feature: mem0 --- config/defaults.yaml | 147 ++++++++++++++--- config/defaults_zh.yaml | 148 +++++++++++++++--- src/check_tests/07-check_prompt_format.py | 54 +++++++ src/memory/base.py | 43 +++++ src/memory/h_mem.py | 64 ++++++++ src/performance_optimizer/param_optimizer.py | 43 +++-- .../param_recommender.py | 70 +++++---- src/utils/prompt_manager.py | 73 +++++---- 8 files changed, 524 insertions(+), 118 deletions(-) create mode 100644 src/check_tests/07-check_prompt_format.py create mode 100644 src/memory/base.py create mode 100644 src/memory/h_mem.py diff --git a/config/defaults.yaml b/config/defaults.yaml index 1ac46a9e..cfc7ac9a 100644 --- a/config/defaults.yaml +++ b/config/defaults.yaml @@ -15,14 +15,14 @@ common: &common recommender: value: | # CONTEXT # - [{self.service_name} Category] The goal of this performance optimization is: - Performance metric is {self.performance_metric.name}, which means: {self.performance_metric.value}, target is to improve by {self.slo_goal:.2%}. + [{service_name} Category] The goal of this performance optimization is: + Performance metric is {performance_metric.name}, which means: {performance_metric.value}, target is to improve by {slo_goal:.2%}. 
Current environment configuration: - {self.static_profile} + {static_profile} Historical tuning information (reference for improvements): {history_result} Current system load and performance analysis: - {self.performance_analysis_report} + {performance_analysis_report} Adjustable parameter set: {params_set_str} # OBJECTIVE # @@ -41,9 +41,9 @@ common: &common value: | # CONTEXT # The goal of this performance optimization is: - Performance metric is {self.performance_metric.name}, which means: {self.performance_metric.value}, target is to improve by {self.slo_goal:.2%} + Performance metric is {performance_metric.name}, which means: {performance_metric.value}, target is to improve by {slo_goal:.2%} Performance analysis report: - {self.performance_analysis_report} + {performance_analysis_report} Parameters you can analyze: {params_set_str} # OBJECTIVE # @@ -54,10 +54,10 @@ common: &common Your answer will be an important reference for other system operations experts, please provide your answer after careful consideration. Please output in English. recommender_positive: value: | - [{self.service_name} Category] You are a professional system operations expert. Current performance metrics do not meet expectations. Please select adjustable parameter values based on the following tuning ideas, current environment configuration, and adjustable parameters. + [{service_name} Category] You are a professional system operations expert. Current performance metrics do not meet expectations. Please select adjustable parameter values based on the following tuning ideas, current environment configuration, and adjustable parameters. Please keep descriptions concise, focus on the key adjustment directions that need output, no need to summarize viewpoints, and do not output anything that has no impact on performance. Current environment configuration: - {self.static_profile} + {static_profile} Historical tuning information, historical tuning modified the following parameters, you can refer to this historical information to reflect on potential improvement points: {history_result} Tuning ideas are: @@ -69,10 +69,10 @@ common: &common Do not provide any response other than JSON, avoid adding comments. Please output in English. recommender_negative: value: | - [{self.service_name} Category] You are a professional system operations expert. Current performance metrics do not meet expectations. Please select adjustable parameter values based on the following tuning ideas, current environment configuration, and adjustable parameters. + [{service_name} Category] You are a professional system operations expert. Current performance metrics do not meet expectations. Please select adjustable parameter values based on the following tuning ideas, current environment configuration, and adjustable parameters. Please keep descriptions concise, focus on the key adjustment directions that need output, no need to summarize viewpoints, and do not output anything that has no impact on performance. 
Current environment configuration: - {self.static_profile} + {static_profile} Historical tuning information, historical tuning modified the following parameters, you can refer to this historical information to reflect on potential improvement points: {history_result} Tuning ideas are: @@ -83,8 +83,63 @@ common: &common value is the recommended value for the adjustable parameter, please provide reasonable specific values based on the above environment configuration information, carefully confirm whether each value can be used, avoid application startup failures after setting. Do not provide any response other than JSON, avoid adding comments. Please output in English. slow: + mem: + value: | + You are a memory summarization system, responsible for recording and preserving the complete interaction history between humans and the AI tuning expert. You will receive the expert’s tuning ideas and actual execution feedback from the past N steps. Your task is to generate a comprehensive summary of the tuning process, covering the expert’s tuning concepts and corresponding feedback details, ensuring there is no ambiguity. **The expert’s tuning ideas do not need to be fully preserved; only the most important core concepts and key feedback should be summarized.** + ### Overall Structure: + - **Overview (global metadata):** + - **Task Objective: The tuning goal the expert is working to achieve. + - **Progress Status: Current completion percentage, along with a summary of completed milestones or steps. + - **Tuning Operation Sequence (numbered steps): Record only once per iteration. + 1. **Iteration i:** + - Tuning Concept: + The main idea of this tuning iteration, including bottleneck analysis, underutilized resources, etc. Must record highly specific optimization strategies for the given scenario. + - Core Parameters: + Core parameters related to the tuning concept. Record the parameters modified during tuning and their specific values. + - Operation Result: + The performance result of this iteration. + - Key Findings: + Important conclusions drawn from feedback, including parameter effectiveness and reflections on the tuning method. For discrete parameters, if their importance is confirmed, they should be fixed in the key findings; for continuous parameters, the direction of adjustment should be recorded. + ### Example Template: + ``` + ## Expert Tuning History Summary + **Task Objective**: Increase MySQL QPS by 10% in the current environment. + **Progress Status**: 70% completed — achieved a 7% performance improvement. + 1. **Iteration 1:** + - Tuning Concept: Memory usage is far from the upper limit; adjust memory-related parameters to improve utilization. + - Core Parameters: {{memory=50GB, xx=true}} + - Operation Result: Performance result of this iteration is 60, improvement: 20.00% + - Key Findings: Increasing memory settings was correct; further increasing memory-related parameters is beneficial. + 2. **Iteration 2:** + - Tuning Concept: Increase shared_buffers to reduce memory cache misses. + - Core Parameters: {{shared_buffers=20GB, maintenance_work_mem=2GB}} + - Operation Result: Performance result of this iteration is 65, improvement: 30.00% + - Key Findings: Increasing `shared_buffers` improved performance, but memory bottleneck has not yet been reached. + ... (subsequent iterations continue to be recorded by number) + ``` recommender: - value: "" + value: | + You are a system application optimization expert. 
Your task is to leverage your extensive experience in optimization to provide relevant recommendations. + ### Overall Target: + - The performance metric for the scenario is **{performance_metric.name}**, which signifies: {performance_metric.value}. The goal of tuning optimization is to achieve {slo_goal:.2%}. + - The system performance analysis report is as follows:{performance_analysis_report} + - The related parameters available for analysis and their descriptions are: {params_set_str} + """ + "{long_mem}" + "{short_mem}" + """ + ### Guidelines: + - "Clear": Ensure that the answers are clear, concise, and directly address the question. + - **Cautious, Lazy-Type Modification**: Only output parameter changes expected to yield good results; for continuous parameters, the adjustment range should be between 0~50%, and for discrete/enumerated parameters, avoid extreme adjustment spans; do not adjust parameters that are analyzed in the context as having "no impact" on performance. + - **Logical**: Parameter adjustments should align with the tuning strategy, with each adjustment having a corresponding logic, and rigorously adjust mutually exclusive parameters simultaneously. + - "Strict Formatting": Must meet type and unit requirements; the default unit for numerical values is "bytes". If a unit follows the numerical value, it should be represented as a string (e.g., "512MB"). Parameter output must be in **JSON format**. + ### Example Template: + ``` + {{ + "tuning_concept": "The main bottleneck in the current scenario is IO, but the system's memory usage is only about 30% of the current capacity. The current memory parameter is set at 20G, which is advisable to appropriately increase to enhance memory resource utilization and thereby accelerate performance ... (Additional tuning_concept)", + "PARAM": {{ + "memory": "30GB", + ... (Additional params) + }} + }} + ``` meta: {lang: en, version: 1} # ==================================================== nginx ===================================================== nginx: @@ -102,14 +157,14 @@ nginx: recommender: value: | # CONTEXT # - [{self.service_name} Category] The goal of this performance optimization is: - Performance metric is {self.performance_metric.name}, which means: {self.performance_metric.value}, target is to improve by {self.slo_goal:.2%}. + [{service_name} Category] The goal of this performance optimization is: + Performance metric is {performance_metric.name}, which means: {performance_metric.value}, target is to improve by {slo_goal:.2%}. 
Current environment configuration: - {self.static_profile} + {static_profile} Historical tuning information (reference for improvements): {history_result} Current system load and performance analysis: - {self.performance_analysis_report} + {performance_analysis_report} **Adjustable parameter set with value ranges and functional information**: {params_set_str} # REQUIREMENTS # @@ -137,9 +192,9 @@ nginx: value: | # CONTEXT # The goal of this performance optimization is: - Performance metric is {self.performance_metric.name}, which means: {self.performance_metric.value}, target is to improve by {self.slo_goal:.2%} + Performance metric is {performance_metric.name}, which means: {performance_metric.value}, target is to improve by {slo_goal:.2%} Performance analysis report: - {self.performance_analysis_report} + {performance_analysis_report} Parameters you can analyze: {params_set_str} # OBJECTIVE # @@ -157,7 +212,7 @@ nginx: Goal: Based on the following information, while maintaining the effective direction from the previous round, summarize parameter adjustment experience, further fine-tune parameters (moderately increase intensity within safe boundaries), only provide parameters that need changes and their recommended new values. Current environment configuration: - {self.static_profile} + {static_profile} Historical tuning information (including modified parameters and results): {history_result} @@ -188,7 +243,7 @@ nginx: Goal: Based on the following information, summarize baseline, best tuning results, worst tuning results, and previous round tuning results and parameter values from historical tuning experience, reversely fine-tune parameters that may have caused degradation in the previous round, and select more conservative and safe values; only provide parameters that need changes and their recommended new values. Current environment configuration: - {self.static_profile} + {static_profile} Historical tuning information (including modified parameters and results): {history_result} @@ -212,8 +267,60 @@ nginx: - Only output one JSON object, key is "adjustable parameter name", value is "recommended value". - Do not output any extra text, explanations, examples, code fences, or comments. slow: + mem: + value: | + - **Overview (Global Metadata):** + - **Task Objective**: The specific performance goal for {application} (e.g., increase QPS, reduce latency, improve concurrency handling). + - **Progress Status**: Current completion percentage, achieved improvements, and milestones reached. + - **Sequential Tuning Actions (Numbered Steps):** Each iteration only record once. + i. **Iteration i**: + - Tuning Concept: + The main concept behind the tuning action, including identified bottlenecks (e.g., connection limits, CPU saturation, keep-alive inefficiency), underutilized resources, or scenario-specific strategies (e.g., static file serving, reverse proxy buffering). + - Core Params: + The key {application} configuration parameters modified in this round. ONLY the parameters that were actually changed in this iteration. Include exact parameter names and values (e.g., for nginx:worker_processes, worker_connections, keepalive_timeout, sendfile, tcp_nopush). + - Operation Result: + Quantitative performance outcome: e.g., QPS, P99 latency, error rate, CPU/memory usage before and after. + - Key Findings: + Insights derived from the results. For discrete parameters (e.g., on/off), fix their optimal value if confirmed. 
For continuous ones (e.g., buffer sizes), indicate the adjustment direction (increase/decrease). Note any side effects (e.g., higher memory usage). + ### Example Template(Key Format Reference Only): + ``` + ## Summary of the expert's tuning history + **Task Objective**: Increase Nginx QPS by 20% under high concurrent load. + **Progress Status**: 70% completed - 7% improvement achieved. + 0. **Iteration 0**: + - Tuning Concept: Not yet executed - initial baseline established + - Core Params: No changes made + - Operation Result: Baseline metric recorded at xxx + - Key Finding: No tuning actions taken; initial performance benchmark established for future comparison + 1. **Iteration 1**: + - Tuning Concept: The memory usage rate is far from reaching its limit, and parameters related to memory utilization can be adjusted to increase it. + - Core Params: {{memory=50GB, xx=true}} + - Operation Result: QPS increased from 70507.0 to 129443.0 (+83.59% vs baseline, +83.59% vs prev). + - Key Finding: Increasing the memory setting was correct, make memory params larger is better. + ... (Additional numbered steps for subsequent actions) recommender: - value: "" + value: | + You are a system application optimization expert. Your task is to leverage your extensive experience in optimization to provide relevant recommendations. + ### Overall Target: + - The performance metric for the scenario is **{performance_metric.name}**, which signifies: {performance_metric.value}. The goal of tuning optimization is to achieve {slo_goal:.2%}. + - The system performance analysis report is as follows:{performance_analysis_report} + - The related parameters available for analysis and their descriptions are: {params_set_str} + """ + "{long_mem}" + "{short_mem}" + """ + ### Guidelines: + - "Clear": Ensure that the answers are clear, concise, and directly address the question. + - **Cautious, Lazy-Type Modification**: Only output parameter changes expected to yield good results; for continuous parameters, the adjustment range should be between 0~50%, and for discrete/enumerated parameters, avoid extreme adjustment spans; do not adjust parameters that are analyzed in the context as having "no impact" on performance. + - **Logical**: Parameter adjustments should align with the tuning strategy, with each adjustment having a corresponding logic, and rigorously adjust mutually exclusive parameters simultaneously. + - "Strict Formatting": Must meet type and unit requirements; the default unit for numerical values is "bytes". If a unit follows the numerical value, it should be represented as a string (e.g., "512MB"). Parameter output must be in **JSON format**. + ### Example Template: + ``` + {{ + "tuning_concept": "The main bottleneck in the current scenario is IO, but the system's memory usage is only about 30% of the current capacity. The current memory parameter is set at 20G, which is advisable to appropriately increase to enhance memory resource utilization and thereby accelerate performance ... (Additional tuning_concept)", + "PARAM": {{ + "memory": "30GB", + ... 
(Additional params) + }} + }} + ``` meta: {lang: en, version: 1} # ==================================================== mysql ===================================================== mysql: diff --git a/config/defaults_zh.yaml b/config/defaults_zh.yaml index 6c1baced..0a164ace 100644 --- a/config/defaults_zh.yaml +++ b/config/defaults_zh.yaml @@ -15,14 +15,14 @@ common: &common recommender: value: | # CONTEXT # - [{self.service_name}类] 本次性能优化的目标为: - 性能指标为 {self.performance_metric.name} ,该指标的含义为:{self.performance_metric.value} ,目标是提升 {self.slo_goal:.2%}。 + [{service_name}类] 本次性能优化的目标为: + 性能指标为 {performance_metric.name} ,该指标的含义为:{performance_metric.value} ,目标是提升 {slo_goal:.2%}。 当前环境的配置信息: - {self.static_profile} + {static_profile} 历史调优信息(可参考改进点): {history_result} 当前环境的系统负载与性能分析: - {self.performance_analysis_report} + {performance_analysis_report} 可调整的参数集合: {params_set_str} # OBJECTIVE # @@ -40,9 +40,9 @@ common: &common value: | # CONTEXT # 本次性能优化的目标为: - 性能指标为{self.performance_metric.name}, 该指标的含义为:{self.performance_metric.value},目标是提升{self.slo_goal:.2%} + 性能指标为{performance_metric.name}, 该指标的含义为:{performance_metric.value},目标是提升{slo_goal:.2%} 性能分析报告: - {self.performance_analysis_report} + {performance_analysis_report} 你可以分析的参数有: {params_set_str} # OBJECTIVE # @@ -53,10 +53,10 @@ common: &common 你的答案将会是其他系统运维专家的重要参考意见,请认真思考后给出你的答案。 recommender_positive: value: | - [{self.service_name}类] 你是一个专业的系统运维专家,当前性能指标未达到预期,请你基于以下调优思路、当前环境的配置信息、可调整参数,选出可调整参数值。 + [{service_name}类] 你是一个专业的系统运维专家,当前性能指标未达到预期,请你基于以下调优思路、当前环境的配置信息、可调整参数,选出可调整参数值。 请尽量精简描述,将终点需要调整的方向输出出来,不需要总结观点,对性能无影响的也不要输出。 当前环境的配置信息有: - {self.static_profile} + {static_profile} 以下是历史调优的信息,历史调优修改了如下参数,你可以参考如下历史调优信息来反思可以可以改进的点: {history_result} 调优思路是: @@ -68,10 +68,10 @@ common: &common 请勿给出除了json以外其他的回复,切勿增加注释。 recommender_negative: value: | - [{self.service_name}类] 你是一个专业的系统运维专家,当前性能指标未达到预期,请你基于以下调优思路、当前环境的配置信息、可调整参数,选出可调整参数值。 + [{service_name}类] 你是一个专业的系统运维专家,当前性能指标未达到预期,请你基于以下调优思路、当前环境的配置信息、可调整参数,选出可调整参数值。 请尽量精简描述,将终点需要调整的方向输出出来,不需要总结观点,对性能无影响的也不要输出。 当前环境的配置信息有: - {self.static_profile} + {static_profile} 以下是历史调优的信息,历史调优修改了如下参数,你可以参考如下历史调优信息来反思可以可以改进的点: {history_result} 调优思路是: @@ -82,8 +82,63 @@ common: &common value是可调参数的推荐取值,请根据上面的环境配置信息给出合理的具体取值,请仔细确认各个值是否可以被使用,避免设置后应用无法启动。 请勿给出除了json以外其他的回复,切勿增加注释。 slow: + mem: + value: | + 你是一个记忆摘要系统,负责记录并保存人类与 AI 调优专家之间的完整交互历史。你将获得专家在过去N个步骤中的调优思路和实际执行反馈。你的任务是生成一份全面的调优过程摘要,涵盖专家的调优理念和对应的反馈细节,确保没有任何歧义。 **专家的调优思路不需要完整保存;只需总结最重要的核心概念和关键反馈。** + ### 总体结构: + - **概览(全局元数据):** + - **任务目标:专家正在努力实现的调优目标。 + - **进展状态:当前完成百分比,以及已完成的具体里程碑或步骤摘要。 + - **调优操作序列(编号步骤): 每次迭代只记录一次。 + 1. **第i次迭代:** + - 调优理念: + 本次调优的主要思路,包括瓶颈分析、资源未充分利用等。需记录针对具体场景的高度特定的优化策略。 + - 核心参数: + 与调优理念相关的核心参数。记录调优过程中修改的核心参数及其具体数值。 + - 操作结果: + 本次迭代的性能结果。 + - 关键发现: + 从反馈中得出的重要结论,包括参数的有效性和对调优方法的反思。 对于离散类型的参数,如果其重要性被确认,应在关键发现中固定输出; 对于连续类型参数,应记录其应调整的方向。 + ### 示例模板: + ``` + ## 专家调优历史摘要 + **任务目标**: 在当前环境下将 MySQL 的 QPS 提高 10%。 + **进展状态**: 已完成 70% —— 已实现 7% 的性能提升。 + 1. **第 1 次迭代:** + - 调优理念:内存使用率远未达到上限,可调整与内存利用相关的参数以提升使用率。 + - 核心参数:{{memory=50GB, xx=true}} + - 操作结果:本次迭代性能结果为 60,提升幅度:20.00% + - 关键发现:增加内存设置是正确的,继续增大内存相关参数更有利。 + 2. **第 2 次迭代:** + - 调优理念:提升 shared_buffers 以减少内存缓存未命中。 + - 核心参数:{{shared_buffers=20GB, maintenance_work_mem=2GB}} + - 操作结果:本次迭代性能结果为 65,提升幅度:30.00% + - 关键发现:提升 `shared_buffers` 带来了性能提升,但尚未达到内存瓶颈。 + ... 
(后续迭代按编号继续记录) + ``` recommender: - value: "" + value: | + 你是一位系统应用优化专家。你的任务是利用你在优化方面的丰富经验,提供相关的建议。 + ### 总体目标 + - 本场景的性能指标是 {performance_metric.name},其含义为:{performance_metric.value}。调优优化的目标是实现 {slo_goal:.2%}。 + - 系统性能分析报告如下: {performance_analysis_report} + - 可供分析的相关参数及其描述为:{params_set_str} + """ + "{long_mem}" + "{short_mem}" + """ + ### 优化指南 + - **清晰**:确保答案清楚、简洁,并直接回应问题。 + - **谨慎、懒惰型修改**:仅输出预期能带来良好效果的参数调整;对于连续型参数,调整范围应在 0~50% 之间;对于离散/枚举型参数,避免极端跨度调整;不要调整在上下文分析中被认定为“对性能无影响”的参数。 + - **逻辑性**:参数调整应符合调优策略,每个调整都应有相应的逻辑,并严格同时调整互斥参数。 + - **严格格式化**:必须符合类型和单位要求;数值的默认单位为 “bytes”。如果数值后跟单位,应以字符串表示(例如 "512MB")。参数输出必须为**JSON 格式**。 + ### 示例模板 + ``` + {{ + "tuning_concept": "当前场景的主要瓶颈在 IO,但系统的内存使用率仅约为当前容量的 30%。当前内存参数设置为 20G,建议适当增加以提升内存资源利用率,从而加速性能 ... (更多调优概念)", + "PARAM": {{ + "memory": "30GB", + ... (更多参数) + }} + }} + ``` meta: {lang: zh, version: 1} # ==================================================== nginx ===================================================== nginx: @@ -101,14 +156,14 @@ nginx: recommender: value: | # CONTEXT # - [{self.service_name}类] 本次性能优化的目标为: - 性能指标为 {self.performance_metric.name} ,该指标的含义为:{self.performance_metric.value} ,目标是提升 {self.slo_goal:.2%}。 + [{service_name}类] 本次性能优化的目标为: + 性能指标为 {performance_metric.name} ,该指标的含义为:{performance_metric.value} ,目标是提升 {slo_goal:.2%}。 当前环境的配置信息: - {self.static_profile} + {static_profile} 历史调优信息(可参考改进点): {history_result} 当前环境的系统负载与性能分析: - {self.performance_analysis_report} + {performance_analysis_report} **可调整的参数集合以及其取值范围和功能信息**: {params_set_str} # 要求如下 # @@ -135,9 +190,9 @@ nginx: value: | # CONTEXT # 本次性能优化的目标为: - 性能指标为{self.performance_metric.name}, 该指标的含义为:{self.performance_metric.value},目标是提升{self.slo_goal:.2%} + 性能指标为{performance_metric.name}, 该指标的含义为:{performance_metric.value},目标是提升{slo_goal:.2%} 性能分析报告: - {self.performance_analysis_report} + {performance_analysis_report} 你可以分析的参数有: {params_set_str} # OBJECTIVE # @@ -155,7 +210,7 @@ nginx: 目标:基于以下信息,在保持上轮有效方向的前提下,总结参数调整经验,进一步微调参数(在安全边界内适度加大力度),仅给出需要变更的参数与推荐新值。 当前环境配置信息: - {self.static_profile} + {static_profile} 历史调优信息(包含已修改参数与结果): {history_result} @@ -185,7 +240,7 @@ nginx: 目标:基于以下信息,总结历史调优经验中的baseline、最佳调优结果、最差调优结果以及上一轮调优结果以及参数取值,反向微调上轮可能导致退化的参数,并选择更保守且安全的值;仅给出需要变更的参数与推荐新值。 当前环境配置信息: - {self.static_profile} + {static_profile} 历史调优信息(包含已修改参数与结果): {history_result} @@ -208,8 +263,61 @@ nginx: - 仅输出一个 JSON 对象,键为“可调参数名称”,值为“推荐取值”。 - 不要输出任何多余文字、说明、示例、代码围栏或注释。 slow: + mem: + value: | + 你是一个记忆摘要系统,负责记录并留存人类与AI调优专家围绕**{application}性能优化**展开的完整交互历史。 + 你会获取专家在过去N轮调优中提出的调优思路,以及每一轮调优的实际执行反馈。你的任务是生成一份调优过程的完整摘要,内容需涵盖专家的核心优化思路及其对应的反馈细节,确保信息无歧义。 + 无需完整存储专家的所有思路,请勿照搬复制全部信息,**仅提炼最重要的核心概念与关键反馈,同时必须保证数据严谨性,严禁伪造任何调优参数、性能指标及相关数据**。 + ### 整体结构 + - **概述(全局元数据)** + - **任务目标**:{application}的具体性能优化目标(例如:提升每秒查询数QPS、降低延迟、优化并发处理能力)。 + - **进度状态**:当前完成进度、已达成的性能提升效果及关键里程碑。 + - **调优动作序列(按轮次编号记录)** + 每一轮调优仅记录一次,格式如下: + i. **第 i 轮调优** + - **调优思路**:本轮调优的核心逻辑,包括识别出的性能瓶颈(例如:连接数限制、CPU资源饱和、长连接复用效率低)、未充分利用的系统资源,或面向特定场景的优化策略(例如:静态文件分发优化、反向代理缓冲配置)。 + - **核心参数**:本轮实际修改的{application}关键配置参数。仅保留当期迭代中变动的参数,需注明参数名称与精确取值(例如:针对Nginx的参数:worker_processes、worker_connections、keepalive_timeout、sendfile、tcp_nopush)。 + - **执行结果**:量化的性能表现数据,例如:每秒查询数(QPS)、P99延迟、错误率、CPU/内存使用率的调优前后对比值。 + - **关键结论**:从本轮结果中提炼的核心洞察。对于开关型离散参数(例如:开启/关闭),若已验证最优值则明确标注;对于缓冲区大小等连续型参数,需指明后续调整方向(增大/减小);同时记录调优带来的副作用(例如:内存占用升高)。 + ### 示例模板(仅作格式参考) + ``` + ## 专家调优历史摘要 + **任务目标:在高并发负载下,将 Nginx 的 QPS 提升 20%。 + **进度状态:已完成 70% 进度,实现 7% 的性能提升。 + 0. **第 0 轮调优**: + - 调优思路:未执行调优操作 —— 建立初始性能基线。 + - 核心参数:无参数变更。 + - 执行结果:记录基线性能指标为 xxx。 + - 关键结论:未执行任何调优动作,建立初始性能基准值,供后续调优对比参考。 + 1. 
**第 1 轮调优**: + - 调优思路:当前内存使用率远未达到阈值,可调整内存相关参数以提升利用率。 + - 核心参数:{{memory=50GB, xx=true}} + - 执行结果:QPS 从 70507.0 提升至 129443.0(相对基线提升 83.59%,相对上一轮提升 83.59%)。 + - 关键结论:提升内存参数配置的方向正确,内存相关参数可进一步增大以获取更佳性能。 + ... (后续调优轮次按上述格式补充) recommender: - value: "" + value: | + 你是一位系统应用优化专家。你的任务是利用你在优化方面的丰富经验,提供相关的建议。 + ### 总体目标 + - 本场景的性能指标是 {performance_metric.name},其含义为:{performance_metric.value}。调优优化的目标是实现 {slo_goal:.2%}。 + - 系统性能分析报告如下: {performance_analysis_report} + - 可供分析的相关参数及其描述为:{params_set_str} + """ + "{long_mem}" + "{short_mem}" + """ + ### 优化指南 + - **清晰**:确保答案清楚、简洁,并直接回应问题。 + - **谨慎、懒惰型修改**:仅输出预期能带来良好效果的参数调整;对于连续型参数,调整范围应在 0~50% 之间;对于离散/枚举型参数,避免极端跨度调整;不要调整在上下文分析中被认定为“对性能无影响”的参数。 + - **逻辑性**:参数调整应符合调优策略,每个调整都应有相应的逻辑,并严格同时调整互斥参数。 + - **严格格式化**:必须符合类型和单位要求;数值的默认单位为 “bytes”。如果数值后跟单位,应以字符串表示(例如 "512MB")。参数输出必须为**JSON 格式**。 + ### 示例模板 + ``` + {{ + "tuning_concept": "当前场景的主要瓶颈在 IO,但系统的内存使用率仅约为当前容量的 30%。当前内存参数设置为 20G,建议适当增加以提升内存资源利用率,从而加速性能 ... (更多调优概念)", + "PARAM": {{ + "memory": "30GB", + ... (更多参数) + }} + }} + ``` meta: {lang: zh, version: 1} # ==================================================== mysql ===================================================== mysql: diff --git a/src/check_tests/07-check_prompt_format.py b/src/check_tests/07-check_prompt_format.py new file mode 100644 index 00000000..4839879d --- /dev/null +++ b/src/check_tests/07-check_prompt_format.py @@ -0,0 +1,54 @@ +from src.utils.prompt_instance import prompt_manager + +common_allowed_set = { + "service_name": "TESTTEST-service_name-TESTTEST", + "performance_metric.name": "TESTTEST-performance_metric.name-TESTTEST", + "performance_metric.value": "TESTTEST-performance_metric.value-TESTTEST", + "slo_goal": "TESTTEST-slo_goal-TESTTEST", + "static_profile": "TESTTEST-static_profile-TESTTEST", + "performance_analysis_report": "TESTTEST-performance_analysis_report-TESTTEST", + "params_set_str": "TESTTEST-params_set_str-TESTTEST" +} + +def test_get_fast_prompt(service_name): + allowed_set = common_allowed_set + allowed_set["history_result"] = "TESTTEST-history_result-TESTTEST" + + prompt_manager.format_prompt(service_name, "fast", "recommender", allowed_set) + +def test_get_idea_prompt(service_name): + allowed_set = common_allowed_set + + prompt_manager.format_prompt(service_name, "normal", "idea", allowed_set) + +def test_get_normal_prompt(service_name): + allowed_set = common_allowed_set + allowed_set["optimization_idea"] = "TESTTEST-optimization_idea-TESTTEST" + allowed_set["history_result"] = "TESTTEST-history_result-TESTTEST" + + prompt_manager.format_prompt(service_name, "normal", "recommender_positive", allowed_set) + prompt_manager.format_prompt(service_name, "normal", "recommender_negative", allowed_set) + +def test_test_mem_mode(service_name): + allowed_set = common_allowed_set + allowed_set["application"] = "TESTTEST-application-TESTTEST" + + mem0_prompt = prompt_manager.format_prompt(service_name, "slow", "mem", allowed_set) + +def test_slow_prompt_mode(service_name): + allowed_set = common_allowed_set + allowed_set["long_mem"] = "TESTTEST-long_mem-TESTTEST" + allowed_set["short_mem"] = "TESTTEST-short_mem-TESTTEST" + + prompt_manager.format_prompt(service_name, "slow", "recommender", allowed_set) + +def main(): + for server_name in ["mysql", "nginx"]: + test_get_fast_prompt(server_name) + test_get_idea_prompt(server_name) + test_get_normal_prompt(server_name) + test_test_mem_mode(server_name) + test_slow_prompt_mode(server_name) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/memory/base.py 
b/src/memory/base.py new file mode 100644 index 00000000..4a34d4a4 --- /dev/null +++ b/src/memory/base.py @@ -0,0 +1,43 @@ +from abc import ABC, abstractmethod + +class MemoryBase(ABC): + @abstractmethod + def get(self, memory_id:int): + """ + Get the latest long-term and short-term memory. + + Args: + memory_id (str): ID of the memory to update. If -1, return the latest meomory. + + Returns: + dict: Retrieved memory. + """ + pass + + @abstractmethod + def update(self, data:str): + """ + Update a memory. + + Args: + data (str): New content to update the memory with. + + Returns: + dict: Success message indicating the memory was updated. + """ + pass + + @abstractmethod + def history(self): + """ + Get the history of changes for a memory by ID. + + Args: + None + + Returns: + list: List of changes for the memory. + """ + pass + + diff --git a/src/memory/h_mem.py b/src/memory/h_mem.py new file mode 100644 index 00000000..736674cb --- /dev/null +++ b/src/memory/h_mem.py @@ -0,0 +1,64 @@ +import logging +import hashlib +from datetime import datetime +import pytz +from copy import deepcopy +from src.memory.base import MemoryBase +from src.utils.llm import get_llm_response +from src.utils.prompt_instance import prompt_manager + + +class HierarchicalMemory(MemoryBase): + def __init__(self, service_name): + self.long_term_memory = "" + self.short_term_meomry = "" + self.history_record = [] + self.service_name = service_name + + def get(self) -> dict: + return self.history_record[-1] + + def update(self, data: str): + self.short_term_meomry = self._pack_short_mem(data) + self.long_term_memory = self._pack_long_mem(data) + self.history_record.append({ + "long": deepcopy(self.long_term_memory), + "short": deepcopy(self.short_term_meomry) + }) + return self.history_record[-1] + + def history(self): + return self.history_record + + def _get_metadata(self, data): + return { + "data": data, + "hash": hashlib.md5(data.encode()).hexdigest(), + "created_at": datetime.now(pytz.timezone("US/Pacific")).isoformat(), + "action": "UPDATE" + } + + def _pack_short_mem(self, data) -> dict | None: + return self._get_metadata(data) + + def _pack_long_mem(self, data, metadata=None) -> dict | None: + metadata = metadata or {} + long_mem = self.long_term_memory['data'] if self.long_term_memory else "" + allowed_set = { + "application": self.service_name + } + mem0_prompt = prompt_manager.format_prompt(self.service_name, "slow", "mem", allowed_set) + + parsed_message = mem0_prompt + "\n" + \ + long_mem + "\n" + \ + data + "\n" + \ + "Create memory summary of the above conversation." + + # because updating the mem requires accessing the LLM, + # so we only update the mem in slow mode to avoid unnecessary overhead. 
+ if prompt_manager.get_mode(self.service_name) != "slow": + memory_content = "" + else: + memory_content = get_llm_response(prompt=parsed_message) + + return self._get_metadata(memory_content) diff --git a/src/performance_optimizer/param_optimizer.py b/src/performance_optimizer/param_optimizer.py index c4d19aeb..01f10a42 100644 --- a/src/performance_optimizer/param_optimizer.py +++ b/src/performance_optimizer/param_optimizer.py @@ -8,6 +8,7 @@ from src.performance_test.pressure_test import wait_for_pressure_test from src.utils.config.app_config import AppInterface from src.utils.shell_execute import SshClient from src.utils.snapshot import load_snapshot, save_snapshot +from src.memory.h_mem import HierarchicalMemory class ParamOptimizer: @@ -77,7 +78,7 @@ class ParamOptimizer: return True return False - def pressure_test(self): + def get_pressure_test_result(self): logging.info(f"[ParamOptimizer] waiting for pressure test finished ...") pressure_test_result = wait_for_pressure_test(timeout=self.benchmark_timeout) if pressure_test_result.status_code != 0: @@ -177,15 +178,10 @@ class ParamOptimizer: def run(self): # 运行benchmark,摸底参数性能指标 if self.pressure_test_mode: - baseline = self.pressure_test() + baseline = self.get_pressure_test_result() else: baseline = self.benchmark() - # 保存每轮调优的结果,反思调优目标是否达到 - historys = { - "best_result": {}, - "worst_result": {}, - "previous_result": {} - } + best_result = baseline worst_result = baseline curr_recommend_params = {} @@ -196,9 +192,30 @@ class ParamOptimizer: f"[{0}/{self.max_iterations}] performance baseline of {self.service_name} is: {baseline}" ) + # normal模式,通过history保存每轮调优的结果,反思调优目标是否达到 + historys = { + "best_result": {}, + "worst_result": {}, + "previous_result": {} + } + + # slow模式,通过mem机制来记忆调优过程,以期达到更优的调优结果 + mem = HierarchicalMemory(self.service_name) + mem.update( + f"**Task Objective**: Optimize {self.service_name} performance by tuning memory-related parameters to increase throughput and reduce latency.\n" + f"**Progress Status**:Baseline performance: {baseline}" + ) + for i in range(self.max_iterations): + current_mem = mem.get() + # 未达成目标的情况下,根据调优结果与历史最优的参数,执行参数调优推荐,给出参数名和参数值 - recommend_params = self.param_recommender.run(history_result=historys, is_positive=is_positive) + recommend_params = self.param_recommender.run( + long_mem = current_mem["long"], + short_mem = current_mem["short"], + history_result = historys, + is_positive = is_positive + ) # 设置参数生效 self.apply_params(recommend_params) @@ -221,6 +238,7 @@ class ParamOptimizer: script_path = '/tmp/euler-copilot-params.sh' self.save_restart_params_to_script(recommend_params, script_path, i + 1) self.recover_cluster() + if performance_result is None: historys["previous_result"] = {"previous_performance": "benchmark failed, because param is invalid.", "recommend_param": recommend_params} self.apply_params(self.current_params) @@ -254,6 +272,13 @@ class ParamOptimizer: ratio = self.calc_improve_rate(baseline, performance_result, symbol) + # mem更新 + performance_test_result = ( + f"Iteration: {i},This iteration performance result: {performance_result}, " + f"improvment: {ratio:.2%}, param change: {recommend_params}" + ) + mem.update(performance_test_result) + logging.info( f"[{i + 1}/{self.max_iterations}] performance baseline of {self.service_name} is {baseline}, best result: {best_result}, this round result: {performance_result if performance_result is not None else '-'}, performance improvement: {ratio:.2%}" ) diff --git a/src/performance_optimizer/param_recommender.py 
b/src/performance_optimizer/param_recommender.py index 17070a55..bd547509 100644 --- a/src/performance_optimizer/param_recommender.py +++ b/src/performance_optimizer/param_recommender.py @@ -85,41 +85,44 @@ class ParamRecommender: ] return filtered_history - def _process_chunk(self, history_result, cur_params_set, is_positive): + def _process_chunk(self, long_mem, short_mem, history_result, cur_params_set, is_positive): history_result = self._get_histort(history_result, cur_params_set) + + allowed_set = { + "service_name": self.service_name, + "performance_metric.name": self.performance_metric.name, + "performance_metric.value": self.performance_metric.value, + "slo_goal": self.slo_goal, + "static_profile": self.static_profile, + "performance_analysis_report": self.performance_analysis_report, + "params_set_str": ",".join(cur_params_set) + } - params_set_str = ",".join(cur_params_set) - allowed_set = set([ - "self.service_name", - "self.performance_metric.name", - "self.performance_metric.value", - "self.slo_goal", - "self.static_profile", - "history_result", - "self.performance_analysis_report", - "params_set_str", - ]) prompt_mode = prompt_manager.get_mode(self.service_name) if prompt_mode == "fast": - recommend_prompt_format = prompt_manager.get(self.service_name, prompt_mode, 'recommender')['value'] - recommend_prompt, extras = prompt_manager.render_by_parse(recommend_prompt_format, allowed_set) - if len(extras) != 0: - logging.warn(f"param not in custom offered param {extras}") - recommended_params = get_llm_response(recommend_prompt) - elif prompt_mode == "normal": - idea_prompt_format = prompt_manager.get(self.service_name, prompt_mode, 'idea')['value'] - idea_prompt, extras = prompt_manager.render_by_parse(idea_prompt_format, allowed_set) - optimization_idea = get_llm_response(idea_prompt) - allowed_set.add("optimization_idea") - recommend_prompt_format = prompt_manager.get(self.service_name, prompt_mode, - 'recommender_positive' if is_positive else 'recommender_negative')['value'] - recommend_prompt, extras = prompt_manager.render_by_parse(recommend_prompt_format, allowed_set) - recommended_params = get_llm_response(recommend_prompt) + allowed_set["history_result"] = history_result + + fast_prompt = prompt_manager.format_prompt(self.service_name, "fast", "recommender", allowed_set) + llm_opt_result = get_llm_response(fast_prompt) + recommended_params_set = json_repair(llm_opt_result) + elif prompt_mode == "slow": + allowed_set["long_mem"] = long_mem + allowed_set["short_mem"] = short_mem + + slow_prompt = prompt_manager.format_prompt(self.service_name, "slow", "recommender", allowed_set) + llm_opt_result = get_llm_response(slow_prompt) + recommended_params_set = json_repair(llm_opt_result)["PARAM"] else: - # todo for slow prompt - recommended_params = get_llm_response(recommend_prompt) + idea_prompt = prompt_manager.format_prompt(self.service_name, "normal", "idea", allowed_set) + optimization_idea = get_llm_response(idea_prompt) + + allowed_set["optimization_idea"] = optimization_idea + allowed_set["history_result"] = history_result + recommender_type = 'recommender_positive' if is_positive else 'recommender_negative' - recommended_params_set = json_repair(recommended_params) + normal_prompt = prompt_manager.format_prompt(self.service_name, "normal", recommender_type, allowed_set) + llm_opt_result = get_llm_response(normal_prompt) + recommended_params_set = json_repair(llm_opt_result) result = {} for param_name, param_value in recommended_params_set.items(): @@ -127,14 +130,19 @@ 
class ParamRecommender: result[param_name] = param_value return result - def run(self, history_result, is_positive=True): + def run(self, long_mem=None, short_mem=None, history_result=None, is_positive=True): resultset = {} for i in range(0, len(self.params_set), self.chunk_size): cur_params_set = self.params_set[i: i + self.chunk_size] # 提交任务给线程池,返回 future-like 对象(你线程池需要支持这个) thread_pool_manager.add_task( - self._process_chunk, history_result, cur_params_set, is_positive + self._process_chunk, + long_mem, + short_mem, + history_result, + cur_params_set, + is_positive ) thread_pool_manager.run_all_tasks() diff --git a/src/utils/prompt_manager.py b/src/utils/prompt_manager.py index bac9ad0e..1eaf0312 100644 --- a/src/utils/prompt_manager.py +++ b/src/utils/prompt_manager.py @@ -23,13 +23,13 @@ import tempfile import threading from typing import Any, Dict, List, Optional import string, inspect - import yaml from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel, Field import uvicorn - +from src.utils.common import language +import logging # ----------------------------- # Pydantic 请求/响应模型 @@ -198,59 +198,56 @@ class StringRepository: self._save() @staticmethod - def render_by_parse(tpl: str, allowed: set, default: str = "未知"): + def render_by_parse(tpl: str, data: dict, default: str = None) -> str: """ - tpl: 模板字符串,含 {field[:fmt][!conv]} - allowed: 允许替换的字段全集(如 {'self.service_name','self.performance_metric.name',...}) - default: 不在 allowed 或取值失败时的占位 - 返回: (渲染后字符串, 额外占位符列表) + 通用字符串替换函数 + tpl: 模板字符串,含 {key} 占位符 + data: 字典,提供替换值 + default: 缺失字段时的默认值 + 返回: 渲染后的字符串 """ + default = default or ("未知" if language() == "zh" else "unknown") fmt = string.Formatter() - f = inspect.currentframe().f_back - # 记录上一个调用栈的所有变量信息 - scope = {**f.f_globals, **f.f_locals} - - def resolve(field: str): - parts = field.split('.') - cur = scope.get(parts[0], None) - for p in parts[1:]: - if cur is None: return None - cur = (cur.get(p) if isinstance(cur, dict) else getattr(cur, p, None)) - return cur - - out, extras = [], [] + out = [] + missing = [] + for literal, field, format_spec, conversion in fmt.parse(tpl): out.append(literal) if not field: continue - # 需要的变量不再提供的可用变量里面,直接改值为 “未知”,同时记录这个额外的 “需求变量” - if field not in allowed: - out.append(default) - extras.append(field) - continue - val = resolve(field) - if val is None: - out.append(default) - extras.append(field) - continue - # 格式化/转换 + # 从字典取值 + if field not in data or data[field] is None: + val = default + missing.append(field) + else: + val = data[field] + # 格式化 try: val = format(val, format_spec) if format_spec else str(val) except Exception: val = str(val) + # 转换符 if conversion == 'r': val = repr(val) elif conversion == 'a': val = ascii(val) out.append(str(val)) - # extras 去重保序 - seen, uniq = set(), [] - for e in extras: - if e not in seen: - seen.add(e) - uniq.append(e) - return ''.join(out), uniq + + return ''.join(out), missing + + def format_prompt(self, service_name, prompt_mode, prompt_type, allowed_set): + prompt_format = self.get( + service_name, + prompt_mode, + prompt_type + )['value'] + + opt_result, extras = self.render_by_parse(prompt_format, allowed_set) + # pay attention to this error message to avoid incorrect escaping when adding or modifying prompts, + if len(extras) != 0: + logging.warn(f"error parm in default.yaml: {service_name}->{prompt_mode}->{prompt_type}. 
placeholders missing from the supplied values: {extras}")
+        return opt_result
 
 
 # ----------------------------------------------------------
 # FastAPI(对前端/其他服务)
-- 
Gitee
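
For reference, the placeholder contract introduced by the prompt_manager changes above: templates in config/defaults.yaml and config/defaults_zh.yaml now carry plain `{key}` placeholders (including dotted keys such as `performance_metric.name`, which are looked up literally in the supplied dict) instead of `{self.xxx}` names resolved through caller frame inspection, and unresolved placeholders fall back to a default value and are reported so that `format_prompt` can log the mismatch. The sketch below is a minimal, self-contained approximation of that contract for illustration only — `demo_template` and `values` are invented examples, and the authoritative code is `StringRepository.render_by_parse` / `format_prompt` in src/utils/prompt_manager.py.

```python
# Standalone sketch (not part of the patch) of the {key} placeholder contract.
import string


def render_by_parse(tpl: str, data: dict, default: str = "unknown"):
    """Fill {key[:fmt]} placeholders from `data`; collect keys that are missing."""
    out, missing = [], []
    for literal, field, format_spec, _conversion in string.Formatter().parse(tpl):
        out.append(literal)
        if not field:
            continue
        if field not in data or data[field] is None:
            missing.append(field)
            val = default
        else:
            val = data[field]
        try:
            # an empty format_spec falls back to plain str(), as in the patch
            val = format(val, format_spec) if format_spec else str(val)
        except Exception:
            val = str(val)
        out.append(str(val))
    return "".join(out), missing


# demo_template mimics a prompt from config/defaults.yaml; dotted keys such as
# "performance_metric.name" are matched literally, not via attribute access.
demo_template = (
    "Performance metric is {performance_metric.name}, "
    "target is to improve by {slo_goal:.2%}.\n"
    "{long_mem}"
)
values = {"performance_metric.name": "QPS", "slo_goal": 0.1}
text, extras = render_by_parse(demo_template, values)
print(text)    # Performance metric is QPS, target is to improve by 10.00%. / unknown
print(extras)  # ['long_mem'] -> format_prompt logs these as a template/values mismatch
```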
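
Similarly, a rough picture of how the new memory module is meant to be driven in slow mode: ParamOptimizer.run records the task objective and baseline, then each iteration reads the latest long/short-term memory, passes it to ParamRecommender.run as long_mem/short_mem, and appends the iteration's outcome after benchmarking. The toy below mirrors that control flow only; it replaces the LLM-backed summarization in HierarchicalMemory._pack_long_mem with plain concatenation, and every name in it (ToyMemory, fake_recommend) is an illustrative stand-in rather than the shipped API.

```python
# Self-contained toy of the slow-mode memory loop; runnable without an LLM.
from copy import deepcopy


class ToyMemory:
    def __init__(self):
        self.long, self.short, self.records = "", "", []

    def update(self, data: str) -> dict:
        self.short = data                                   # latest feedback verbatim
        self.long = (self.long + "\n" + data).strip()       # stand-in for LLM summary
        self.records.append({"long": deepcopy(self.long), "short": deepcopy(self.short)})
        return self.records[-1]

    def get(self) -> dict:
        return self.records[-1]


def fake_recommend(long_mem: str, short_mem: str) -> dict:
    # placeholder for ParamRecommender.run(long_mem=..., short_mem=..., ...)
    print("--- prompt context ---\n", long_mem, "\n", short_mem)
    return {"memory": "30GB"}


mem = ToyMemory()
mem.update("**Task Objective**: raise QPS by 10%. Baseline: 70507")
for i in range(2):
    ctx = mem.get()
    params = fake_recommend(ctx["long"], ctx["short"])
    mem.update(f"Iteration {i}: applied {params}, result pending benchmark")
```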