From 297d960752eb8162341cb3030647f1ca0eaf9429 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 25 Apr 2024 05:00:55 +0100 Subject: [PATCH 01/32] mm: simplify thp_vma_allowable_order ANBZ: #28369 commit e0ffb29bc54d86b9ab10ebafc66eb1b7229e0cd7 upstream. Combine the three boolean arguments into one flags argument for readability. Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Kefeng Wang Cc: Ryan Roberts Signed-off-by: Andrew Morton Signed-off-by: Yuanhe Shu --- fs/proc/task_mmu.c | 4 ++-- include/linux/huge_mm.h | 29 +++++++++++++++-------------- mm/huge_memory.c | 7 +++++-- mm/khugepaged.c | 16 +++++++--------- mm/memory.c | 14 ++++++++------ 5 files changed, 37 insertions(+), 33 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d6786bf83ed2..e75e2153357b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -879,8 +879,8 @@ static int show_smap(struct seq_file *m, void *v) #endif seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false, - true, THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4bfec89eef03..0fa72495cd5d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -92,8 +92,12 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL \ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) -#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order))) +#define TVA_SMAPS (1 << 0) /* Will be used for procfs */ +#define TVA_IN_PF (1 << 1) /* Page fault handler */ +#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */ + +#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ + (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) static inline int lowest_order(unsigned long orders) { @@ -275,17 +279,15 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags - * @smaps: whether answer will be used for smaps file - * @in_pf: whether answer will be used by page fault handler - * @enforce_sysfs: whether sysfs config should be taken into account + * @tva_flags: Which TVA flags to honour * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed @@ -298,12 +300,12 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, */ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { /* Optimization to check if required orders are enabled early. 
*/ - if (enforce_sysfs && vma_is_anonymous(vma)) { + if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) @@ -317,8 +319,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, - enforce_sysfs, orders); + return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } struct thpsize { @@ -494,8 +495,8 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, } static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 965d13983538..89e06a874be2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -89,10 +89,13 @@ static bool anon_orders_configured __initdata; static bool file_orders_configured; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, bool smaps, - bool in_pf, bool enforce_sysfs, + unsigned long vm_flags, + unsigned long tva_flags, unsigned long orders) { + bool smaps = tva_flags & TVA_SMAPS; + bool in_pf = tva_flags & TVA_IN_PF; + bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 78e3126a09c1..62130c810307 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -471,7 +471,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, false, false, true, + if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } @@ -921,6 +921,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; + unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; if (unlikely(hpage_collapse_test_exit(mm))) return SCAN_ANY_PROCESS; @@ -931,8 +932,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - cc->is_khugepaged, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1523,8 +1523,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here. 
*/ - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2413,8 +2412,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, - true, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, + TVA_ENFORCE_SYSFS, PMD_ORDER)) { skip: progress++; continue; @@ -2769,8 +2768,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false, - PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index 9643bdf7c56e..3daf8912cbd0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4473,8 +4473,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true, - BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); @@ -5003,8 +5003,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true, - BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -6283,7 +6283,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -6317,7 +6318,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, + TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- Gitee From a3157af1f373da107241a658eab54efa96de06b8 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 29 Jan 2024 13:45:51 +0800 Subject: [PATCH 02/32] mm/khugepaged: bypassing unnecessary scans with MMF_DISABLE_THP check ANBZ: #28369 commit 879c6000e191b61b97e17bce44c4564ee42eb612 upstream. khugepaged scans the entire address space in the background for each given mm, looking for opportunities to merge sequences of basic pages into huge pages. However, when an mm is inserted to the mm_slots list, and the MMF_DISABLE_THP flag is set later, this scanning process becomes unnecessary for that mm and can be skipped to avoid redundant operations, especially in scenarios with a large address space. 
On an Intel Core i5 CPU, the time taken by khugepaged to scan the address space of the process, which has been set with the MMF_DISABLE_THP flag after being added to the mm_slots list, is as follows (shorter is better): VMA Count | Old | New | Change --------------------------------------- 50 | 23us | 9us | -60.9% 100 | 32us | 9us | -71.9% 200 | 44us | 9us | -79.5% 400 | 75us | 9us | -88.0% 800 | 98us | 9us | -90.8% Once the count of VMAs for the process exceeds page_to_scan, khugepaged needs to wait for scan_sleep_millisecs ms before scanning the next process. IMO, unnecessary scans could actually be skipped with a very inexpensive mm->flags check in this case. This commit introduces a check before each scanning process to test the MMF_DISABLE_THP flag for the given mm; if the flag is set, the scanning process is bypassed, thereby improving the efficiency of khugepaged. This optimization is not a correctness issue but rather an enhancement to save expensive checks on each VMA when userspace cannot prctl itself before spawning into the new process. On some servers within our company, we deploy a daemon responsible for monitoring and updating local applications. Some applications prefer not to use THP, so the daemon calls prctl to disable THP before fork/exec. Conversely, for other applications, the daemon calls prctl to enable THP before fork/exec. Ideally, the daemon should invoke prctl after the fork, but its current implementation follows the described approach. In the Go standard library, there is no direct encapsulation of the fork system call; instead, fork and execve are combined into one through syscall.ForkExec. Link: https://lkml.kernel.org/r/20240129054551.57728-1-ioworker0@gmail.com Signed-off-by: Lance Yang Acked-by: David Hildenbrand Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Zach O'Keefe Signed-off-by: Andrew Morton Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 62130c810307..61929ef8a3f9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -434,6 +434,12 @@ static bool hugepage_pmd_enabled(void) return false; } +static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) +{ + return hpage_collapse_test_exit(mm) || + test_bit(MMF_DISABLE_THP, &mm->flags); +} + void __khugepaged_enter(struct mm_struct *mm) { struct khugepaged_mm_slot *mm_slot; @@ -1440,7 +1446,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (hpage_collapse_test_exit(mm)) { + if (hpage_collapse_test_exit_or_disable(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); @@ -2400,7 +2406,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, goto breakouterloop_mmap_lock; progress++; - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); @@ -2408,7 +2414,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, unsigned long hstart, hend; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { progress++; break; } @@ -2430,7 +2436,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, bool mmap_locked = true; cond_resched(); - if (unlikely(hpage_collapse_test_exit(mm))) + if 
(unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2448,7 +2454,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); - if (hpage_collapse_test_exit(mm)) + if (hpage_collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = collapse_pte_mapped_thp(mm, khugepaged_scan.address, false); @@ -2490,7 +2496,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (hpage_collapse_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit_or_disable(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find -- Gitee From 6305c28e9af26ee0e132c35897d7de1042cfedf4 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Tue, 27 Feb 2024 11:51:35 +0800 Subject: [PATCH 03/32] mm/khugepaged: keep mm in mm_slot without MMF_DISABLE_THP check ANBZ: #28369 commit 5dad604809c5acc546ec74057498db1623f1c408 upstream. Previously, we removed the mm from mm_slot and dropped mm_count if the MMF_THP_DISABLE flag was set. However, we didn't re-add the mm back after clearing the MMF_THP_DISABLE flag. Additionally, We add a check for the MMF_THP_DISABLE flag in hugepage_vma_revalidate(). Link: https://lkml.kernel.org/r/20240227035135.54593-1-ioworker0@gmail.com Fixes: 879c6000e191 ("mm/khugepaged: bypassing unnecessary scans with MMF_DISABLE_THP check") Signed-off-by: Lance Yang Suggested-by: Yang Shi Reviewed-by: Yang Shi Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Minchan Kim Cc: Muchun Song Cc: Peter Xu Cc: Zach O'Keefe Signed-off-by: Andrew Morton Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 61929ef8a3f9..bdc7a664eb58 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -929,7 +929,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct vm_area_struct *vma; unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; - if (unlikely(hpage_collapse_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -1446,7 +1446,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (hpage_collapse_test_exit_or_disable(mm)) { + if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); @@ -2496,7 +2496,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (hpage_collapse_test_exit_or_disable(mm) || !vma) { + if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find -- Gitee From 1358dfff799afa9f374ca0dc1a098858b4b8efa0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:54 +0100 Subject: [PATCH 04/32] mm/huge_memory: convert "tva_flags" to "enum tva_type" ANBZ: #28369 commit 1f1c061089dcd274befa0c76fb9f6e253a8368c0 upstream. 
When determining which THP orders are eligible for a VMA mapping, we have previously specified tva_flags, however it turns out it is really not necessary to treat these as flags. Rather, we distinguish between distinct modes. The only case where we previously combined flags was with TVA_ENFORCE_SYSFS, but we can avoid this by observing that this is the default, except for MADV_COLLAPSE or an edge cases in collapse_pte_mapped_thp() and hugepage_vma_revalidate(), and adding a mode specifically for this case - TVA_FORCED_COLLAPSE. We have: * smaps handling for showing "THPeligible" * Pagefault handling * khugepaged handling * Forced collapse handling: primarily MADV_COLLAPSE, but also for an edge case in collapse_pte_mapped_thp() Disregarding the edge cases, we only want to ignore sysfs settings only when we are forcing a collapse through MADV_COLLAPSE, otherwise we want to enforce it, hence this patch does the following flag to enum conversions: * TVA_SMAPS | TVA_ENFORCE_SYSFS -> TVA_SMAPS * TVA_IN_PF | TVA_ENFORCE_SYSFS -> TVA_PAGEFAULT * TVA_ENFORCE_SYSFS -> TVA_KHUGEPAGED * 0 -> TVA_FORCED_COLLAPSE With this change, we immediately know if we are in the forced collapse case, which will be valuable next. Link: https://lkml.kernel.org/r/20250815135549.130506-3-usamaarif642@gmail.com Signed-off-by: David Hildenbrand Signed-off-by: Usama Arif Acked-by: Usama Arif Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton Signed-off-by: Yuanhe Shu --- fs/proc/task_mmu.c | 4 ++-- include/linux/huge_mm.h | 30 ++++++++++++++++++------------ mm/huge_memory.c | 8 ++++---- mm/khugepaged.c | 15 +++++++-------- mm/memory.c | 14 ++++++-------- 5 files changed, 37 insertions(+), 34 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e75e2153357b..1232ab6e297e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -879,8 +879,8 @@ static int show_smap(struct seq_file *m, void *v) #endif seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS, + THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0fa72495cd5d..c83f64c95e6b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -92,12 +92,15 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL \ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT) -#define TVA_SMAPS (1 << 0) /* Will be used for procfs */ -#define TVA_IN_PF (1 << 1) /* Page fault handler */ -#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */ +enum tva_type { + TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ + TVA_PAGEFAULT, /* Serving a page fault. */ + TVA_KHUGEPAGED, /* Khugepaged collapse. */ + TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). 
*/ +}; -#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) +#define thp_vma_allowable_order(vma, vm_flags, type, order) \ + (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) static inline int lowest_order(unsigned long orders) { @@ -280,14 +283,14 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags - * @tva_flags: Which TVA flags to honour + * @type: TVA type * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed @@ -301,11 +304,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - /* Optimization to check if required orders are enabled early. */ - if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) { + /* + * Optimization to check if required orders are enabled early. Only + * forced collapse ignores sysfs configs. + */ + if (type != TVA_FORCED_COLLAPSE && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) @@ -319,7 +325,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); + return __thp_vma_allowable_orders(vma, vm_flags, type, orders); } struct thpsize { @@ -496,7 +502,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 89e06a874be2..07695d38ddd4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -90,12 +90,12 @@ static bool file_orders_configured; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - bool smaps = tva_flags & TVA_SMAPS; - bool in_pf = tva_flags & TVA_IN_PF; - bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + const bool smaps = type == TVA_SMAPS; + const bool in_pf = type == TVA_PAGEFAULT; + const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; unsigned long supported_orders; /* Check the intersection of requested and supported orders. 
*/ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index bdc7a664eb58..d33f6c5662c9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -477,8 +477,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, - PMD_ORDER)) + if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } @@ -927,7 +926,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; - unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; + enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : + TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; @@ -938,7 +938,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1527,9 +1527,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's - * analogously elide sysfs THP settings here. + * analogously elide sysfs THP settings here and force collapse. */ - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2418,8 +2418,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, - TVA_ENFORCE_SYSFS, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; diff --git a/mm/memory.c b/mm/memory.c index 3daf8912cbd0..552a53ec8179 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4473,8 +4473,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); @@ -5003,8 +5003,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. 
*/ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -6283,8 +6283,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -6318,8 +6317,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- Gitee From 06079e89ae31867d0020a6eff315c61b45f269d3 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:12 -0700 Subject: [PATCH 05/32] khugepaged: rename hpage_collapse_* to collapse_* ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-2-npache@redhat.com/ The hpage_collapse functions describe functions used by madvise_collapse and khugepaged. remove the unnecessary hpage prefix to shorten the function name. Reviewed-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Liam R. Howlett Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 70 ++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d33f6c5662c9..32ff31fd63de 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -405,7 +405,7 @@ void __init khugepaged_destroy(void) kmem_cache_destroy(mm_slot_cache); } -static inline int hpage_collapse_test_exit(struct mm_struct *mm) +static inline int collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } @@ -434,9 +434,9 @@ static bool hugepage_pmd_enabled(void) return false; } -static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) +static inline int collapse_test_exit_or_disable(struct mm_struct *mm) { - return hpage_collapse_test_exit(mm) || + return collapse_test_exit(mm) || test_bit(MMF_DISABLE_THP, &mm->flags); } @@ -447,7 +447,7 @@ void __khugepaged_enter(struct mm_struct *mm) int wakeup; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); + VM_BUG_ON_MM(collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) return; @@ -505,7 +505,7 @@ void __khugepaged_exit(struct mm_struct *mm) } else if (mm_slot) { /* * This is required to serialize against - * hpage_collapse_test_exit() (which is guaranteed to run + * collapse_test_exit() (which is guaranteed to run * under mmap sem read mode). Stop here (after we return all * pagetables will be destroyed) until khugepaged has finished * working on the pagetables under the mmap_lock. 
@@ -854,7 +854,7 @@ struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, }; -static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) +static bool collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -889,7 +889,7 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int hpage_collapse_find_target_node(struct collapse_control *cc) +static int collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; @@ -908,7 +908,7 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) return target_node; } #else -static int hpage_collapse_find_target_node(struct collapse_control *cc) +static int collapse_find_target_node(struct collapse_control *cc) { return 0; } @@ -929,7 +929,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + if (unlikely(collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -996,7 +996,7 @@ static int check_pmd_still_valid(struct mm_struct *mm, /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if hpage_collapse_scan_pmd believes it is worthwhile. + * Only done if khugepaged_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. @@ -1078,7 +1078,7 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, { gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); - int node = hpage_collapse_find_target_node(cc); + int node = collapse_find_target_node(cc); struct folio *folio; folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); @@ -1268,10 +1268,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, return result; } -static int hpage_collapse_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, bool *mmap_locked, - struct collapse_control *cc) +static int collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, bool *mmap_locked, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; @@ -1373,7 +1373,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * hit record. 
*/ node = folio_nid(folio); - if (hpage_collapse_scan_abort(node, cc)) { + if (collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } @@ -1446,7 +1446,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (hpage_collapse_test_exit(mm)) { + if (collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&slot->hash); list_del(&slot->mm_node); @@ -1732,7 +1732,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; - if (hpage_collapse_test_exit(mm)) + if (collapse_test_exit(mm)) continue; /* * When a vma is registered with uffd-wp, we cannot recycle @@ -2262,9 +2262,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, return result; } -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) +static int collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct folio *folio = NULL; struct address_space *mapping = file->f_mapping; @@ -2307,7 +2307,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, } node = folio_nid(folio); - if (hpage_collapse_scan_abort(node, cc)) { + if (collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } @@ -2353,7 +2353,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, return result; } #else -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, +static int collapse_scan_file(struct mm_struct *mm, unsigned long addr, struct file *file, pgoff_t start, struct collapse_control *cc) { @@ -2361,7 +2361,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, } #endif -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, +static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) @@ -2406,7 +2406,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, goto breakouterloop_mmap_lock; progress++; - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + if (unlikely(collapse_test_exit_or_disable(mm))) goto breakouterloop; vma_iter_init(&vmi, mm, khugepaged_scan.address); @@ -2414,7 +2414,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, unsigned long hstart, hend; cond_resched(); - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { + if (unlikely(collapse_test_exit_or_disable(mm))) { progress++; break; } @@ -2435,7 +2435,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, bool mmap_locked = true; cond_resched(); - if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + if (unlikely(collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2448,12 +2448,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, mmap_read_unlock(mm); mmap_locked = false; - *result = hpage_collapse_scan_file(mm, + *result = collapse_scan_file(mm, khugepaged_scan.address, file, pgoff, cc); fput(file); if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { mmap_read_lock(mm); - if (hpage_collapse_test_exit_or_disable(mm)) + if (collapse_test_exit_or_disable(mm)) goto breakouterloop; *result = collapse_pte_mapped_thp(mm, 
khugepaged_scan.address, false); @@ -2462,7 +2462,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, mmap_read_unlock(mm); } } else { - *result = hpage_collapse_scan_pmd(mm, vma, + *result = collapse_scan_pmd(mm, vma, khugepaged_scan.address, &mmap_locked, cc); } @@ -2495,7 +2495,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (hpage_collapse_test_exit(mm) || !vma) { + if (collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find @@ -2549,8 +2549,8 @@ static void khugepaged_do_scan(struct collapse_control *cc) pass_through_head++; if (khugepaged_has_work() && pass_through_head < 2) - progress += khugepaged_scan_mm_slot(pages - progress, - &result, cc); + progress += collapse_scan_mm_slot(pages - progress, + &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); @@ -2812,11 +2812,11 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmap_read_unlock(mm); mmap_locked = false; - result = hpage_collapse_scan_file(mm, addr, file, pgoff, + result = collapse_scan_file(mm, addr, file, pgoff, cc); fput(file); } else { - result = hpage_collapse_scan_pmd(mm, vma, addr, + result = collapse_scan_pmd(mm, vma, addr, &mmap_locked, cc); } if (!mmap_locked) -- Gitee From 04f9642c4f29c1d81b658e1c9e25c57141fdcd90 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:13 -0700 Subject: [PATCH 06/32] introduce collapse_single_pmd to unify khugepaged and madvise_collapse ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-3-npache@redhat.com/ The khugepaged daemon and madvise_collapse have two different implementations that do almost the same thing. Create collapse_single_pmd to increase code reuse and create an entry point to these two users. Refactor madvise_collapse and collapse_scan_mm_slot to use the new collapse_single_pmd function. This introduces a minor behavioral change that is most likely an undiscovered bug. The current implementation of khugepaged tests collapse_test_exit_or_disable before calling collapse_pte_mapped_thp, but we weren't doing it in the madvise_collapse case. By unifying these two callers madvise_collapse now also performs this check. We also modify the return value to be SCAN_ANY_PROCESS which properly indicates that this process is no longer valid to operate on. We also guard the khugepaged_pages_collapsed variable to ensure its only incremented for khugepaged. Reviewed-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 97 ++++++++++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 46 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 32ff31fd63de..f3b607b96db9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2361,6 +2361,53 @@ static int collapse_scan_file(struct mm_struct *mm, unsigned long addr, } #endif +/* + * Try to collapse a single PMD starting at a PMD aligned addr, and return + * the results. 
+ */ +static int collapse_single_pmd(unsigned long addr, + struct vm_area_struct *vma, bool *mmap_locked, + struct collapse_control *cc) +{ + struct mm_struct *mm = vma->vm_mm; + int result; + struct file *file; + pgoff_t pgoff; + + if (vma_is_anonymous(vma)) { + result = collapse_scan_pmd(mm, vma, addr, mmap_locked, cc); + goto end; + } + + file = get_file(vma->vm_file); + pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + *mmap_locked = false; + result = collapse_scan_file(mm, addr, file, pgoff, cc); + fput(file); + if (result != SCAN_PTE_MAPPED_HUGEPAGE) + goto end; + + mmap_read_lock(mm); + *mmap_locked = true; + if (collapse_test_exit_or_disable(mm)) { + mmap_read_unlock(mm); + *mmap_locked = false; + return SCAN_ANY_PROCESS; + } + result = collapse_pte_mapped_thp(mm, addr, !cc->is_khugepaged); + if (result == SCAN_PMD_MAPPED) + result = SCAN_SUCCEED; + mmap_read_unlock(mm); + *mmap_locked = false; + +end: + if (cc->is_khugepaged && result == SCAN_SUCCEED) + ++khugepaged_pages_collapsed; + return result; +} + static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) @@ -2441,33 +2488,9 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result, VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { - struct file *file = get_file(vma->vm_file); - pgoff_t pgoff = linear_page_index(vma, - khugepaged_scan.address); - mmap_read_unlock(mm); - mmap_locked = false; - *result = collapse_scan_file(mm, - khugepaged_scan.address, file, pgoff, cc); - fput(file); - if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { - mmap_read_lock(mm); - if (collapse_test_exit_or_disable(mm)) - goto breakouterloop; - *result = collapse_pte_mapped_thp(mm, - khugepaged_scan.address, false); - if (*result == SCAN_PMD_MAPPED) - *result = SCAN_SUCCEED; - mmap_read_unlock(mm); - } - } else { - *result = collapse_scan_pmd(mm, vma, - khugepaged_scan.address, &mmap_locked, cc); - } - - if (*result == SCAN_SUCCEED) - ++khugepaged_pages_collapsed; + *result = collapse_single_pmd(khugepaged_scan.address, + vma, &mmap_locked, cc); /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; @@ -2806,37 +2829,19 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { - struct file *file = get_file(vma->vm_file); - pgoff_t pgoff = linear_page_index(vma, addr); - mmap_read_unlock(mm); - mmap_locked = false; - result = collapse_scan_file(mm, addr, file, pgoff, - cc); - fput(file); - } else { - result = collapse_scan_pmd(mm, vma, addr, - &mmap_locked, cc); - } + result = collapse_single_pmd(addr, vma, &mmap_locked, cc); if (!mmap_locked) *prev = NULL; /* Tell caller we dropped mmap_lock */ -handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; - case SCAN_PTE_MAPPED_HUGEPAGE: - BUG_ON(mmap_locked); - BUG_ON(*prev); - mmap_read_lock(mm); - result = collapse_pte_mapped_thp(mm, addr, true); - mmap_read_unlock(mm); - goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_PMD_NULL: + case SCAN_PTE_MAPPED_HUGEPAGE: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: case SCAN_PAGE_RO: -- Gitee From 83e33b7ec4a586cbb3b1e748f88bbafb7756a241 Mon Sep 17 00:00:00 2001 From: Nico Pache 
Date: Mon, 1 Dec 2025 10:46:14 -0700 Subject: [PATCH 07/32] khugepaged: generalize hugepage_vma_revalidate for mTHP support ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-4-npache@redhat.com/ For khugepaged to support different mTHP orders, we must generalize this to check if the PMD is not shared by another VMA and that the order is enabled. No functional change in this patch. Also correct a comment about the functionality of the revalidation. Reviewed-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Co-developed-by: Dev Jain Signed-off-by: Dev Jain Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f3b607b96db9..b60d50c47cf1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -916,14 +916,13 @@ static int collapse_find_target_node(struct collapse_control *cc) /* * If mmap_lock temporarily dropped, revalidate vma - * before taking mmap_lock. + * after taking the mmap_lock again. * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - bool expect_anon, - struct vm_area_struct **vmap, - struct collapse_control *cc) + bool expect_anon, struct vm_area_struct **vmap, + struct collapse_control *cc, unsigned int order) { struct vm_area_struct *vma; enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : @@ -936,15 +935,16 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!vma) return SCAN_VMA_NULL; + /* Always check the PMD order to ensure its not shared by another VMA */ if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) + if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order))) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then * remapped to file after khugepaged reaquired the mmap_lock. * - * thp_vma_allowable_order may return true for qualified file + * thp_vma_allowable_orders may return true for qualified file * vmas. */ if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) @@ -1137,7 +1137,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * in case the special case has happened. */ mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc, + HPAGE_PMD_ORDER); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; @@ -1168,7 +1169,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * handled by the anon_vma lock + PG_lock. 
*/ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc, + HPAGE_PMD_ORDER); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ @@ -2818,7 +2820,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmap_read_lock(mm); mmap_locked = true; result = hugepage_vma_revalidate(mm, addr, false, &vma, - cc); + cc, HPAGE_PMD_ORDER); if (result != SCAN_SUCCEED) { last_fail = result; goto out_nolock; -- Gitee From 10fd781c46411c2d958e6b8a25053be2ca9ee336 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Mon, 1 Dec 2025 10:46:15 -0700 Subject: [PATCH 08/32] khugepaged: generalize alloc_charge_folio() ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-5-npache@redhat.com/ Pass order to alloc_charge_folio() and update mTHP statistics. Reviewed-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Co-developed-by: Nico Pache Signed-off-by: Nico Pache Signed-off-by: Dev Jain Signed-off-by: Yuanhe Shu --- Documentation/admin-guide/mm/transhuge.rst | 8 ++++++++ include/linux/huge_mm.h | 2 ++ mm/huge_memory.c | 4 ++++ mm/khugepaged.c | 17 +++++++++++------ 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 0f0b3ff0cb88..7b7fb3f2e208 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -621,6 +621,14 @@ anon_fault_fallback a huge page and instead falls back to using huge pages with lower orders or small pages. +collapse_alloc + is incremented every time a huge page is successfully allocated for a + khugepaged collapse. + +collapse_alloc_failed + is incremented every time a huge page allocation fails during a + khugepaged collapse. 
+ anon_fault_fallback_charge is incremented if a page fault fails to charge a huge page and instead falls back to using huge pages with lower orders or diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c83f64c95e6b..86694639864b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -124,6 +124,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_COLLAPSE_ALLOC, + MTHP_STAT_COLLAPSE_ALLOC_FAILED, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 07695d38ddd4..f637cf659d83 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -654,6 +654,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(collapse_alloc, MTHP_STAT_COLLAPSE_ALLOC); +DEFINE_MTHP_STAT_ATTR(collapse_alloc_failed, MTHP_STAT_COLLAPSE_ALLOC_FAILED); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -711,6 +713,8 @@ static struct attribute *any_stats_attrs[] = { #endif &split_attr.attr, &split_failed_attr.attr, + &collapse_alloc_attr.attr, + &collapse_alloc_failed_attr.attr, NULL, }; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b60d50c47cf1..2ec4a24c5b2d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1074,21 +1074,26 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, } static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, - struct collapse_control *cc) + struct collapse_control *cc, unsigned int order) { gfp_t gfp = (cc->is_khugepaged ? 
alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE); int node = collapse_find_target_node(cc); struct folio *folio; - folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); + folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask); if (!folio) { *foliop = NULL; - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + if (order == HPAGE_PMD_ORDER) + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED); return SCAN_ALLOC_HUGE_PAGE_FAIL; } - count_vm_event(THP_COLLAPSE_ALLOC); + if (order == HPAGE_PMD_ORDER) + count_vm_event(THP_COLLAPSE_ALLOC); + count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC); + if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); *foliop = NULL; @@ -1125,7 +1130,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, */ mmap_read_unlock(mm); - result = alloc_charge_folio(&folio, mm, cc); + result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER); if (result != SCAN_SUCCEED) goto out_nolock; @@ -1841,7 +1846,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - result = alloc_charge_folio(&new_folio, mm, cc); + result = alloc_charge_folio(&new_folio, mm, cc, HPAGE_PMD_ORDER); if (result != SCAN_SUCCEED) goto out; -- Gitee From 51dc119dcf57cabf2ebd3b857bc9aeaf56955390 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:16 -0700 Subject: [PATCH 09/32] khugepaged: introduce is_mthp_order helper ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-6-npache@redhat.com/ In order to add mTHP support, we will often be checking if a given order is a mTHP or PMD order. Lets create a simple helper function to keep the code clean and readable. Suggested-by: Lorenzo Stoakes Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2ec4a24c5b2d..96693390eb73 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -346,6 +346,11 @@ struct attribute_group khugepaged_attr_group = { }; #endif /* CONFIG_SYSFS */ +static bool is_mthp_order(unsigned int order) +{ + return order != HPAGE_PMD_ORDER; +} + int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { @@ -1084,13 +1089,13 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask); if (!folio) { *foliop = NULL; - if (order == HPAGE_PMD_ORDER) + if (!is_mthp_order(order)) count_vm_event(THP_COLLAPSE_ALLOC_FAILED); count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED); return SCAN_ALLOC_HUGE_PAGE_FAIL; } - if (order == HPAGE_PMD_ORDER) + if (!is_mthp_order(order)) count_vm_event(THP_COLLAPSE_ALLOC); count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC); -- Gitee From 50e6c3da8e8c8dbd4c63ff8d6022cd099b960769 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:17 -0700 Subject: [PATCH 10/32] khugepaged: generalize __collapse_huge_page_* for mTHP support ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-7-npache@redhat.com/ generalize the order of the __collapse_huge_page_* functions to support future mTHP collapse. mTHP collapse will not honor the khugepaged_max_ptes_shared or khugepaged_max_ptes_swap parameters, and will fail if it encounters a shared or swapped entry. 
No functional changes in this patch. Reviewed-by: Wei Yang Reviewed-by: Lance Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Co-developed-by: Dev Jain Signed-off-by: Dev Jain Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 76 ++++++++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 96693390eb73..8ec8eca4ee90 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -569,18 +569,18 @@ static bool is_refcount_suitable(struct folio *folio) } static int __collapse_huge_page_isolate(struct vm_area_struct *vma, - unsigned long address, - pte_t *pte, - struct collapse_control *cc, - struct list_head *compound_pagelist) + unsigned long address, pte_t *pte, struct collapse_control *cc, + unsigned int order, struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; + const unsigned long nr_pages = 1UL << order; + int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order); bool writable = false; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; + for (_pte = pte; _pte < pte + nr_pages; _pte++, address += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none(pteval) || (pte_present(pteval) && @@ -588,7 +588,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || - none_or_zero <= khugepaged_max_ptes_none)) { + none_or_zero <= max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -615,8 +615,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (page_mapcount(page) > 1) { ++shared; - if (cc->is_khugepaged && - shared > khugepaged_max_ptes_shared) { + /* + * TODO: Support shared pages without leading to further + * mTHP collapses. Currently bringing in new pages via + * shared may cause a future higher order collapse on a + * rescan of the same range. + */ + if (is_mthp_order(order) || (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared)) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; @@ -714,18 +720,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } static void __collapse_huge_page_copy_succeeded(pte_t *pte, - struct vm_area_struct *vma, - unsigned long address, - spinlock_t *ptl, - struct list_head *compound_pagelist) + struct vm_area_struct *vma, unsigned long address, + spinlock_t *ptl, unsigned int order, + struct list_head *compound_pagelist) { struct folio *src_folio; struct page *src_page; struct page *tmp; pte_t *_pte; pte_t pteval; + const unsigned long nr_pages = 1UL << order; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; + for (_pte = pte; _pte < pte + nr_pages; _pte++, address += PAGE_SIZE) { pteval = ptep_get(_pte); if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { @@ -769,13 +775,11 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, } static void __collapse_huge_page_copy_failed(pte_t *pte, - pmd_t *pmd, - pmd_t orig_pmd, - struct vm_area_struct *vma, - struct list_head *compound_pagelist) + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, + unsigned int order, struct list_head *compound_pagelist) { spinlock_t *pmd_ptl; - + const unsigned long nr_pages = 1UL << order; /* * Re-establish the PMD to point to the original page table * entry. 
Restoring PMD needs to be done prior to releasing @@ -789,7 +793,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte, * Release both raw and compound pages isolated * in __collapse_huge_page_isolate. */ - release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); + release_pte_pages(pte, pte + nr_pages, compound_pagelist); } /* @@ -809,16 +813,16 @@ static void __collapse_huge_page_copy_failed(pte_t *pte, */ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, - unsigned long address, spinlock_t *ptl, + unsigned long address, spinlock_t *ptl, unsigned int order, struct list_head *compound_pagelist) { unsigned int i; int result = SCAN_SUCCEED; - + const unsigned long nr_pages = 1UL << order; /* * Copying pages' contents is subject to memory poison at any iteration. */ - for (i = 0; i < HPAGE_PMD_NR; i++) { + for (i = 0; i < nr_pages; i++) { pte_t pteval = ptep_get(pte + i); struct page *page = folio_page(folio, i); unsigned long src_addr = address + i * PAGE_SIZE; @@ -837,10 +841,10 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, if (likely(result == SCAN_SUCCEED)) __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, - compound_pagelist); + order, compound_pagelist); else __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, - compound_pagelist); + order, compound_pagelist); return result; } @@ -1007,13 +1011,12 @@ static int check_pmd_still_valid(struct mm_struct *mm, * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ static int __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - int referenced) + struct vm_area_struct *vma, unsigned long haddr, + pmd_t *pmd, int referenced, unsigned int order) { int swapped_in = 0; vm_fault_t ret = 0; - unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); + unsigned long address, end = haddr + (PAGE_SIZE << order); int result; pte_t *pte = NULL; spinlock_t *ptl; @@ -1040,6 +1043,19 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, if (!is_swap_pte(vmf.orig_pte)) continue; + /* + * TODO: Support swapin without leading to further mTHP + * collapses. Currently bringing in new pages via swapin may + * cause a future higher order collapse on a rescan of the same + * range. + */ + if (is_mthp_order(order)) { + pte_unmap(pte); + mmap_read_unlock(mm); + result = SCAN_EXCEED_SWAP_PTE; + goto out; + } + vmf.pte = pte; vmf.ptl = ptl; ret = do_swap_page(&vmf); @@ -1167,7 +1183,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * that case. Continuing to collapse causes inconsistency. 
*/ result = __collapse_huge_page_swapin(mm, vma, address, pmd, - referenced); + referenced, HPAGE_PMD_ORDER); if (result != SCAN_SUCCEED) goto out_nolock; } @@ -1212,6 +1228,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); if (pte) { result = __collapse_huge_page_isolate(vma, address, pte, cc, + HPAGE_PMD_ORDER, &compound_pagelist); spin_unlock(pte_ptl); } else { @@ -1242,6 +1259,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, vma, address, pte_ptl, + HPAGE_PMD_ORDER, &compound_pagelist); pte_unmap(pte); if (unlikely(result != SCAN_SUCCEED)) -- Gitee

From 2f5ba4d6d476308e65811035e7847aa8afdb477c Mon Sep 17 00:00:00 2001
From: Nico Pache
Date: Mon, 1 Dec 2025 10:46:18 -0700
Subject: [PATCH 11/32] khugepaged: introduce collapse_max_ptes_none helper function

ANBZ: #28369

cherry-picked from https://lore.kernel.org/all/20251201174627.23295-8-npache@redhat.com/

The current mechanism for determining mTHP collapse scales the
khugepaged_max_ptes_none value based on the target order. This introduces
an undesirable feedback loop, or "creep", when max_ptes_none is set to a
value greater than HPAGE_PMD_NR / 2. With this configuration, a
successful collapse to order N will populate enough pages to satisfy the
collapse condition on order N+1 on the next scan. This leads to
unnecessary work and memory churn.

To fix this issue, introduce a helper function that will limit mTHP
collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
This effectively supports two modes:

- max_ptes_none=0: never introduce new none-pages for mTHP collapse.
- max_ptes_none=511 (on a 4K page size): always collapse to the highest
  available mTHP order.

This removes the possibility of "creep", while not modifying any uAPI
expectations. A warning will be emitted if any non-supported
max_ptes_none value is configured with mTHP enabled.

The limits can be ignored by passing full_scan=true; this is useful for
madvise_collapse (which ignores limits) and, in the case of
collapse_scan_pmd(), allows the full PMD to be scanned when mTHP
collapse is available.

Signed-off-by: Nico Pache
Signed-off-by: Yuanhe Shu
---
 mm/khugepaged.c | 43 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8ec8eca4ee90..b3f129582fcb 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -477,6 +477,44 @@ void __khugepaged_enter(struct mm_struct *mm) wake_up_interruptible(&khugepaged_wait); } +/** + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse + * @order: The folio order being collapsed to + * @full_scan: Whether this is a full scan (ignore limits) + * + * For madvise-triggered collapses (full_scan=true), all limits are bypassed + * and allow up to HPAGE_PMD_NR - 1 empty PTEs. + * + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured + * khugepaged_max_ptes_none value. + * + * For mTHP collapses, we currently only support khugepaged_max_pte_none values + * of 0 or (HPAGE_PMD_NR - 1).
Any other value will emit a warning and no mTHP + * collapse will be attempted + * + * Return: Maximum number of empty PTEs allowed for the collapse operation + */ +static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan) +{ + /* ignore max_ptes_none limits */ + if (full_scan) + return HPAGE_PMD_NR - 1; + + if (!is_mthp_order(order)) + return khugepaged_max_ptes_none; + + /* Zero/non-present collapse disabled. */ + if (!khugepaged_max_ptes_none) + return 0; + + if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1) + return (1 << order) - 1; + + pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %d\n", + HPAGE_PMD_NR - 1); + return -EINVAL; +} + void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { @@ -577,9 +615,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; const unsigned long nr_pages = 1UL << order; - int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order); + int max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged); bool writable = false; + if (max_ptes_none == -EINVAL) + goto out; + for (_pte = pte; _pte < pte + nr_pages; _pte++, address += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); -- Gitee From a16a56188d190486d9522dc6f532f3effa61a830 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:19 -0700 Subject: [PATCH 12/32] khugepaged: generalize collapse_huge_page for mTHP collapse ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-9-npache@redhat.com/ Pass an order and offset to collapse_huge_page to support collapsing anon memory to arbitrary orders within a PMD. order indicates what mTHP size we are attempting to collapse to, and offset indicates were in the PMD to start the collapse attempt. For non-PMD collapse we must leave the anon VMA write locked until after we collapse the mTHP-- in the PMD case all the pages are isolated, but in the mTHP case this is not true, and we must keep the lock to prevent changes to the VMA from occurring. Reviewed-by: Baolin Wang Tested-by: Baolin Wang Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 113 ++++++++++++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 43 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b3f129582fcb..90cb02d656c8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1169,20 +1169,23 @@ static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, } static int collapse_huge_page(struct mm_struct *mm, unsigned long address, - int referenced, int unmapped, - struct collapse_control *cc) + int referenced, int unmapped, struct collapse_control *cc, + bool *mmap_locked, unsigned int order) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; - pte_t *pte; + pte_t *pte = NULL; pgtable_t pgtable; struct folio *folio; spinlock_t *pmd_ptl, *pte_ptl; int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; + bool anon_vma_locked = false; + const unsigned long nr_pages = 1UL << order; + const unsigned long pmd_address = address & HPAGE_PMD_MASK; - VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_WARN_ON_ONCE(pmd_address & ~HPAGE_PMD_MASK); /* * Before allocating the hugepage, release the mmap_lock read lock. @@ -1190,9 +1193,12 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * sync compaction, and we do not need to hold the mmap_lock during * that. 
We will recheck the vma after taking it again in write mode. */ - mmap_read_unlock(mm); + if (*mmap_locked) { + mmap_read_unlock(mm); + *mmap_locked = false; + } - result = alloc_charge_folio(&folio, mm, cc, HPAGE_PMD_ORDER); + result = alloc_charge_folio(&folio, mm, cc, order); if (result != SCAN_SUCCEED) goto out_nolock; @@ -1204,16 +1210,17 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * in case the special case has happened. */ mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, true, &vma, cc, - HPAGE_PMD_ORDER); + *mmap_locked = true; + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - result = find_pmd_or_thp_or_none(mm, address, &pmd); + result = find_pmd_or_thp_or_none(mm, pmd_address, &pmd); if (result != SCAN_SUCCEED || is_async_fork_mm(mm)) { mmap_read_unlock(mm); + *mmap_locked = false; goto out_nolock; } @@ -1224,32 +1231,35 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * that case. Continuing to collapse causes inconsistency. */ result = __collapse_huge_page_swapin(mm, vma, address, pmd, - referenced, HPAGE_PMD_ORDER); - if (result != SCAN_SUCCEED) + referenced, order); + if (result != SCAN_SUCCEED) { + *mmap_locked = false; goto out_nolock; + } } mmap_read_unlock(mm); + *mmap_locked = false; /* * Prevent all access to pagetables with the exception of * gup_fast later handled by the ptep_clear_flush and the VM * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, true, &vma, cc, - HPAGE_PMD_ORDER); + result = hugepage_vma_revalidate(mm, pmd_address, true, &vma, cc, order); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ - result = check_pmd_still_valid(mm, address, pmd); + result = check_pmd_still_valid(mm, pmd_address, pmd); if (result != SCAN_SUCCEED || is_async_fork_mm(mm)) goto out_up_write; vma_start_write(vma); anon_vma_lock_write(vma->anon_vma); + anon_vma_locked = true; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, - address + HPAGE_PMD_SIZE); + address + (PAGE_SIZE << order)); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ @@ -1261,7 +1271,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * Parallel fast GUP is fine since fast GUP will back off when * it detects PMD is changed. */ - _pmd = pmdp_collapse_flush(vma, address, pmd); + _pmd = pmdp_collapse_flush(vma, pmd_address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); tlb_remove_table_sync_one(); @@ -1269,16 +1279,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); if (pte) { result = __collapse_huge_page_isolate(vma, address, pte, cc, - HPAGE_PMD_ORDER, - &compound_pagelist); + order, &compound_pagelist); spin_unlock(pte_ptl); } else { result = SCAN_PMD_NULL; } if (unlikely(result != SCAN_SUCCEED)) { - if (pte) - pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* @@ -1288,21 +1295,21 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); - anon_vma_unlock_write(vma->anon_vma); goto out_up_write; } /* - * All pages are isolated and locked so anon_vma rmap - * can't run anymore. 
+ * For PMD collapse all pages are isolated and locked so anon_vma + * rmap can't run anymore. For mTHP collapse we must hold the lock */ - anon_vma_unlock_write(vma->anon_vma); + if (!is_mthp_order(order)) { + anon_vma_unlock_write(vma->anon_vma); + anon_vma_locked = false; + } result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, vma, address, pte_ptl, - HPAGE_PMD_ORDER, - &compound_pagelist); - pte_unmap(pte); + order, &compound_pagelist); if (unlikely(result != SCAN_SUCCEED)) goto out_up_write; @@ -1312,27 +1319,48 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * write. */ __folio_mark_uptodate(folio); - pgtable = pmd_pgtable(_pmd); - - _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - - spin_lock(pmd_ptl); - BUG_ON(!pmd_none(*pmd)); - folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); - pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache_pmd(vma, address, pmd); - deferred_split_folio(folio, false); + if (is_mthp_order(order)) { /* mTHP collapse */ + pte_t mthp_pte = mk_pte(folio_page(folio, 0), vma->vm_page_prot); + + mthp_pte = maybe_mkwrite(pte_mkdirty(mthp_pte), vma); + spin_lock(pmd_ptl); + WARN_ON_ONCE(!pmd_none(*pmd)); + folio_ref_add(folio, nr_pages - 1); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_ptes(vma->vm_mm, address, pte, mthp_pte, nr_pages); + update_mmu_cache_range(NULL, vma, address, pte, nr_pages); + + smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); + } else { /* PMD collapse */ + pgtable = pmd_pgtable(_pmd); + _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + + spin_lock(pmd_ptl); + BUG_ON(!pmd_none(*pmd)); + folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + pgtable_trans_huge_deposit(mm, pmd, pgtable); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache_pmd(vma, address, pmd); + deferred_split_folio(folio, false); + } spin_unlock(pmd_ptl); folio = NULL; result = SCAN_SUCCEED; out_up_write: + if (anon_vma_locked) + anon_vma_unlock_write(vma->anon_vma); + if (pte) + pte_unmap(pte); mmap_write_unlock(mm); + *mmap_locked = false; out_nolock: + WARN_ON_ONCE(*mmap_locked); if (folio) folio_put(folio); trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); @@ -1500,9 +1528,8 @@ static int collapse_scan_pmd(struct mm_struct *mm, pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { result = collapse_huge_page(mm, address, referenced, - unmapped, cc); - /* collapse_huge_page will return with the mmap_lock released */ - *mmap_locked = false; + unmapped, cc, mmap_locked, + HPAGE_PMD_ORDER); } out: trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, -- Gitee From 31f6a1265d251d90b8082c7ad1b52d7b829cc0f9 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:20 -0700 Subject: [PATCH 13/32] khugepaged: skip collapsing mTHP to smaller orders ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-10-npache@redhat.com/ khugepaged may try to collapse a mTHP to a smaller mTHP, resulting in some pages being unmapped. Skip these cases until we have a way to check if its ok to collapse to a smaller mTHP size (like in the case of a partially mapped folio). 
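To put a number on that scenario, here is a tiny stand-alone sketch (illustrative only; the order values and the 4K base-page assumption are made up for the example, and it relies on both the existing folio and the collapse range being naturally aligned, as they are in this path):

  #include <stdio.h>

  /*
   * Pages of an already-mapped folio that would stay mapped through the
   * old folio if a smaller-order collapse rewrote only part of its range,
   * i.e. the old folio would be left partially mapped instead of freed.
   */
  static unsigned long pages_still_mapped(unsigned int folio_order,
                                          unsigned int collapse_order)
  {
      if (collapse_order >= folio_order)
          return 0;       /* whole folio covered by the collapse, nothing left behind */
      return (1UL << folio_order) - (1UL << collapse_order);
  }

  int main(void)
  {
      /* order-4 folio (64K with 4K pages) hit by an order-2 (16K) attempt */
      printf("old folio keeps %lu of its 16 pages mapped, i.e. partially mapped\n",
             pages_still_mapped(4, 2));
      return 0;
  }

That leftover partial mapping is exactly what the check added in the hunk below refuses to create.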
This patch is inspired by Dev Jain's work on khugepaged mTHP support [1]. [1] https://lore.kernel.org/lkml/20241216165105.56185-11-dev.jain@arm.com/ Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Co-developed-by: Dev Jain Signed-off-by: Dev Jain Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 90cb02d656c8..18e2839ed02e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -669,6 +669,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } } + /* + * TODO: In some cases of partially-mapped folios, we'd actually + * want to collapse. + */ + if (is_mthp_order(order) && folio_order(folio) >= order) { + result = SCAN_PTE_MAPPED_HUGEPAGE; + goto out; + } if (folio_test_large(folio)) { struct folio *f; -- Gitee From 8840b44d0a7ebc9007e2819b2a0403e8ba45c965 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:21 -0700 Subject: [PATCH 14/32] khugepaged: add per-order mTHP collapse failure statistics ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-11-npache@redhat.com/ Add three new mTHP statistics to track collapse failures for different orders when encountering swap PTEs, excessive none PTEs, and shared PTEs: - collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap PTEs - collapse_exceed_none_pte: Counts when mTHP collapse fails due to exceeding the none PTE threshold for the given order - collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared PTEs These statistics complement the existing THP_SCAN_EXCEED_* events by providing per-order granularity for mTHP collapse attempts. The stats are exposed via sysfs under `/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each supported hugepage size. As we currently dont support collapsing mTHPs that contain a swap or shared entry, those statistics keep track of how often we are encountering failed mTHP collapses due to these restrictions. Reviewed-by: Baolin Wang Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- Documentation/admin-guide/mm/transhuge.rst | 24 ++++++++++++++++++++++ include/linux/huge_mm.h | 3 +++ mm/huge_memory.c | 6 ++++++ mm/khugepaged.c | 16 ++++++++++++--- 4 files changed, 46 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 7b7fb3f2e208..ef822314a595 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -683,6 +683,30 @@ nr_anon_partially_mapped an anonymous THP as "partially mapped" and count it here, even though it is not actually partially mapped anymore. +collapse_exceed_none_pte + The number of collapse attempts that failed due to exceeding the + max_ptes_none threshold. For mTHP collapse, Currently only max_ptes_none + values of 0 and (HPAGE_PMD_NR - 1) are supported. Any other value will + emit a warning and no mTHP collapse will be attempted. khugepaged will + try to collapse to the largest enabled (m)THP size, if it fails, it will + try the next lower enabled mTHP size. This counter records the number of + times a collapse attempt was skipped for exceeding the max_ptes_none + threshold, and khugepaged will move on to the next available mTHP size. + +collapse_exceed_swap_pte + The number of anonymous mTHP pte ranges which were unable to collapse due + to containing at least one swap PTE. 
Currently khugepaged does not + support collapsing mTHP regions that contain a swap PTE. This counter can + be used to monitor the number of khugepaged mTHP collapses that failed + due to the presence of a swap PTE. + +collapse_exceed_shared_pte + The number of anonymous mTHP pte ranges which were unable to collapse due + to containing at least one shared PTE. Currently khugepaged does not + support collapsing mTHP pte ranges that contain a shared PTE. This + counter can be used to monitor the number of khugepaged mTHP collapses + that failed due to the presence of a shared PTE. + file_alloc is incremented every time a file huge page is successfully allocated. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 86694639864b..5464672dadc2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -136,6 +136,9 @@ enum mthp_stat_item { MTHP_STAT_SPLIT_DEFERRED, MTHP_STAT_NR_ANON, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, + MTHP_STAT_COLLAPSE_EXCEED_SWAP, + MTHP_STAT_COLLAPSE_EXCEED_NONE, + MTHP_STAT_COLLAPSE_EXCEED_SHARED, MTHP_STAT_FILE_ALLOC, MTHP_STAT_FILE_FALLBACK, __MTHP_STAT_COUNT diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f637cf659d83..d504955dc78f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -668,6 +668,9 @@ DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED); DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED); DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON); DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED); +DEFINE_MTHP_STAT_ATTR(collapse_exceed_swap_pte, MTHP_STAT_COLLAPSE_EXCEED_SWAP); +DEFINE_MTHP_STAT_ATTR(collapse_exceed_none_pte, MTHP_STAT_COLLAPSE_EXCEED_NONE); +DEFINE_MTHP_STAT_ATTR(collapse_exceed_shared_pte, MTHP_STAT_COLLAPSE_EXCEED_SHARED); DEFINE_MTHP_STAT_ATTR(file_alloc, MTHP_STAT_FILE_ALLOC); DEFINE_MTHP_STAT_ATTR(file_fallback, MTHP_STAT_FILE_FALLBACK); @@ -682,6 +685,9 @@ static struct attribute *anon_stats_attrs[] = { &split_deferred_attr.attr, &nr_anon_attr.attr, &nr_anon_partially_mapped_attr.attr, + &collapse_exceed_swap_pte_attr.attr, + &collapse_exceed_none_pte_attr.attr, + &collapse_exceed_shared_pte_attr.attr, NULL, }; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 18e2839ed02e..198fa950f6f8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -633,7 +633,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, continue; } else { result = SCAN_EXCEED_NONE_PTE; - count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + if (!is_mthp_order(order)) + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_NONE); goto out; } } @@ -662,10 +664,17 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * shared may cause a future higher order collapse on a * rescan of the same range. */ - if (is_mthp_order(order) || (cc->is_khugepaged && - shared > khugepaged_max_ptes_shared)) { + if (is_mthp_order(order)) { + result = SCAN_EXCEED_SHARED_PTE; + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED); + goto out; + } + + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SHARED); goto out; } } @@ -1099,6 +1108,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, * range. 
*/ if (is_mthp_order(order)) { + count_mthp_stat(order, MTHP_STAT_COLLAPSE_EXCEED_SWAP); pte_unmap(pte); mmap_read_unlock(mm); result = SCAN_EXCEED_SWAP_PTE; -- Gitee From 5f57826310a69842b8f53fb5e8ba70c00a1d36af Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:22 -0700 Subject: [PATCH 15/32] khugepaged: improve tracepoints for mTHP orders ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-12-npache@redhat.com/ Add the order to the mm_collapse_huge_page<_swapin,_isolate> tracepoints to give better insight into what order is being operated at for. Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- include/trace/events/huge_memory.h | 34 +++++++++++++++++++----------- mm/khugepaged.c | 9 ++++---- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 9277524e84eb..3bc75b0097a5 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -92,34 +92,37 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, TRACE_EVENT(mm_collapse_huge_page, - TP_PROTO(struct mm_struct *mm, int isolated, int status), + TP_PROTO(struct mm_struct *mm, int isolated, int status, unsigned int order), - TP_ARGS(mm, isolated, status), + TP_ARGS(mm, isolated, status, order), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(int, isolated) __field(int, status) + __field(unsigned int, order) ), TP_fast_assign( __entry->mm = mm; __entry->isolated = isolated; __entry->status = status; + __entry->order = order; ), - TP_printk("mm=%p, isolated=%d, status=%s", + TP_printk("mm=%p, isolated=%d, status=%s order=%u", __entry->mm, __entry->isolated, - __print_symbolic(__entry->status, SCAN_STATUS)) + __print_symbolic(__entry->status, SCAN_STATUS), + __entry->order) ); TRACE_EVENT(mm_collapse_huge_page_isolate, TP_PROTO(struct page *page, int none_or_zero, - int referenced, bool writable, int status), + int referenced, bool writable, int status, unsigned int order), - TP_ARGS(page, none_or_zero, referenced, writable, status), + TP_ARGS(page, none_or_zero, referenced, writable, status, order), TP_STRUCT__entry( __field(unsigned long, pfn) @@ -127,6 +130,7 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, __field(int, referenced) __field(bool, writable) __field(int, status) + __field(unsigned int, order) ), TP_fast_assign( @@ -135,27 +139,31 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, __entry->referenced = referenced; __entry->writable = writable; __entry->status = status; + __entry->order = order; ), - TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s", + TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s order=%u", __entry->pfn, __entry->none_or_zero, __entry->referenced, __entry->writable, - __print_symbolic(__entry->status, SCAN_STATUS)) + __print_symbolic(__entry->status, SCAN_STATUS), + __entry->order) ); TRACE_EVENT(mm_collapse_huge_page_swapin, - TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret), + TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret, + unsigned int order), - TP_ARGS(mm, swapped_in, referenced, ret), + TP_ARGS(mm, swapped_in, referenced, ret, order), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(int, swapped_in) __field(int, referenced) __field(int, ret) + __field(unsigned int, order) ), TP_fast_assign( @@ -163,13 +171,15 @@ 
TRACE_EVENT(mm_collapse_huge_page_swapin, __entry->swapped_in = swapped_in; __entry->referenced = referenced; __entry->ret = ret; + __entry->order = order; ), - TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d", + TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d, order=%u", __entry->mm, __entry->swapped_in, __entry->referenced, - __entry->ret) + __entry->ret, + __entry->order) ); TRACE_EVENT(mm_khugepaged_scan_file, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 198fa950f6f8..71a6b7596b3c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -767,13 +767,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, - referenced, writable, result); + referenced, writable, result, order); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, - referenced, writable, result); + referenced, writable, result, order); return result; } @@ -1149,7 +1149,8 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, result = SCAN_SUCCEED; out: - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result, + order); return result; } @@ -1381,7 +1382,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, WARN_ON_ONCE(*mmap_locked); if (folio) folio_put(folio); - trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result, order); return result; } -- Gitee From 11f55d240b1a3c9ce991d21a16f2829877e649ad Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:23 -0700 Subject: [PATCH 16/32] khugepaged: introduce collapse_allowable_orders helper function ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-13-npache@redhat.com/ Add collapse_allowable_orders() to generalize THP order eligibility. The function determines which THP orders are permitted based on collapse context (khugepaged vs madv_collapse). This consolidates collapse configuration logic and provides a clean interface for future mTHP collapse support where the orders may be different. Reviewed-by: Baolin Wang Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 71a6b7596b3c..fa6c04057135 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -515,12 +515,22 @@ static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan) return -EINVAL; } +/* Check what orders are allowed based on the vma and collapse type */ +static unsigned long collapse_allowable_orders(struct vm_area_struct *vma, + vm_flags_t vm_flags, bool is_khugepaged) +{ + enum tva_type tva_flags = is_khugepaged ? 
TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; + unsigned long orders = BIT(HPAGE_PMD_ORDER); + + return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); +} + void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) + if (collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true)) __khugepaged_enter(vma->vm_mm); } } @@ -2582,7 +2592,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { + if (!collapse_allowable_orders(vma, vma->vm_flags, /*is_khugepaged=*/true)) { skip: progress++; continue; @@ -2913,7 +2923,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!collapse_allowable_orders(vma, vma->vm_flags, /*is_khugepaged=*/false)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); -- Gitee

From 63e363ad8c4d35e1fc619b84ee3380d81b64541a Mon Sep 17 00:00:00 2001
From: Nico Pache
Date: Mon, 1 Dec 2025 10:46:24 -0700
Subject: [PATCH 17/32] khugepaged: Introduce mTHP collapse support

ANBZ: #28369

cherry-picked from https://lore.kernel.org/all/20251201174627.23295-14-npache@redhat.com/

Enable khugepaged to collapse to mTHP orders. This patch implements the
main scanning logic using a bitmap to track occupied pages and a stack
structure that allows us to find optimal collapse sizes.

Prior to this patch, PMD collapse had 3 main phases: a lightweight
scanning phase (mmap_read_lock) that determines a potential PMD
collapse, an alloc phase (mmap unlocked), and finally a heavier collapse
phase (mmap_write_lock).

To enable mTHP collapse we make the following changes:

During the PMD scan phase, track occupied pages in a bitmap. When mTHP
orders are enabled, we remove the restriction of max_ptes_none during
the scan phase to avoid missing potential mTHP collapse candidates.

Once we have scanned the full PMD range and updated the bitmap to track
occupied pages, we use the bitmap to find the optimal mTHP size.
Implement mthp_collapse() to perform binary recursion on the bitmap and
determine the best eligible order for the collapse. A stack structure is
used instead of traditional recursion to manage the search. The
algorithm recursively splits the bitmap into smaller chunks to find the
highest order mTHPs that satisfy the collapse criteria. We start by
attempting the PMD order, then move on to consecutively lower orders
(mTHP collapse).

The stack maintains a pair of variables (offset, order), indicating the
number of PTEs from the start of the PMD, and the order of the potential
collapse candidate. The algorithm for consuming the bitmap works as
such:

1) push (0, HPAGE_PMD_ORDER) onto the stack
2) pop the stack
3) check if the number of set bits in that (offset,order) pair satisfies
   the max_ptes_none threshold for that order
4) if yes, attempt collapse
5) if no (or collapse fails), push two new stack items representing the
   left and right halves of the current bitmap range, at the next lower
   order
6) repeat at step (2) until stack is empty.
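For readers who want to trace the walk outside the kernel, the following
stand-alone C sketch models the same loop under simplifying assumptions:
the constants, the occupied[] array, try_collapse() and the single
allow_none flag stand in for cc->mthp_bitmap, collapse_huge_page() and
the per-order collapse_max_ptes_none() policy, and the enabled-order
mask is ignored.

  #include <stdbool.h>
  #include <stdio.h>

  #define PMD_ORDER 9                     /* 512 PTEs per PMD with 4K pages */
  #define MIN_ORDER 2                     /* mirrors KHUGEPAGED_MIN_MTHP_ORDER */
  #define NR_PTES   (1 << PMD_ORDER)

  static bool occupied[NR_PTES];          /* models cc->mthp_bitmap */

  static int weight(int offset, int nr)   /* occupied PTEs in [offset, offset+nr) */
  {
      int i, w = 0;

      for (i = offset; i < offset + nr; i++)
          w += occupied[i];
      return w;
  }

  /* Stand-in for collapse_huge_page(); pretend every attempt succeeds. */
  static bool try_collapse(int offset, int order)
  {
      printf("collapse order-%d at PTE offset %d\n", order, offset);
      return true;
  }

  static void bitmap_walk(bool allow_none)
  {
      struct { int offset, order; } stack[1 << (PMD_ORDER - MIN_ORDER)];
      int top = 0;

      stack[top].offset = 0;              /* step 1: start with the whole PMD */
      stack[top++].order = PMD_ORDER;

      while (top > 0) {                   /* steps 2-6 */
          int offset = stack[--top].offset;
          int order = stack[top].order;
          int nr = 1 << order;
          /* 0 or "all but one": the two modes collapse_max_ptes_none() allows */
          int max_none = allow_none ? nr - 1 : 0;

          if (weight(offset, nr) >= nr - max_none &&
              try_collapse(offset, order))
              continue;                   /* step 4: range collapsed, move on */

          if (order > MIN_ORDER) {        /* step 5: split into two halves */
              stack[top].offset = offset + nr / 2;
              stack[top++].order = order - 1;
              stack[top].offset = offset;
              stack[top++].order = order - 1;
          }
      }
  }

  int main(void)
  {
      int i;

      for (i = 0; i < 64; i++)            /* populate the first 64 PTEs */
          occupied[i] = true;
      bitmap_walk(false);
      return 0;
  }

With the first 64 PTEs populated and allow_none=false, the walk prints a
single order-6 collapse at offset 0: the largest fully populated, aligned
range wins, and the empty remainder of the PMD is never collapsed.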
Below is a diagram representing the algorithm and stack items:

 offset               mid_offset
   |                      |
   |                      |
   v                      v
   ____________________________________
  |           PTE Page Table           |
   --------------------------------------
   <-------><------->
    order-1  order-1

We currently only support mTHP collapse for max_ptes_none values of 0
and HPAGE_PMD_NR - 1, resulting in the following behavior:

- max_ptes_none=0: Never introduce new empty pages during collapse
- max_ptes_none=HPAGE_PMD_NR-1: Always try to collapse to the highest
  available mTHP order

Any other max_ptes_none value will emit a warning and skip mTHP collapse
attempts. There should be no behavior change for PMD collapse.

Once we determine which mTHP size fits best in that PMD range, a
collapse is attempted. A minimum collapse order of 2 is used, as this is
the lowest order supported by anon memory as defined by
THP_ORDERS_ALL_ANON.

mTHP collapses reject regions containing swapped out or shared pages.
This is because adding new entries can lead to new none pages, and these
may lead to constant promotion into a higher order (m)THP. A similar
issue can occur with "max_ptes_none > HPAGE_PMD_NR/2" due to a collapse
introducing at least 2x the number of pages, which on a future scan will
satisfy the promotion condition once again. This issue is prevented via
the collapse_max_ptes_none() function, which imposes the max_ptes_none
restrictions above.

Currently madvise_collapse does not support mTHP collapse and will only
attempt PMD collapse. We can also remove the check for is_khugepaged
inside the PMD scan as the collapse_max_ptes_none() function handles
this logic now.

Reviewed-by: Baolin Wang
Tested-by: Baolin Wang
Signed-off-by: Nico Pache
Signed-off-by: Yuanhe Shu
---
 mm/khugepaged.c | 184 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 176 insertions(+), 8 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c index fa6c04057135..c8c67a800d60 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -94,6 +94,32 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __read_mostly; +#define KHUGEPAGED_MIN_MTHP_ORDER 2 +/* + * The maximum number of mTHP ranges that can be stored on the stack. + * This is calculated based on the number of PTE entries in a PTE page table + * and the minimum mTHP order. + * + * ilog2(MAX_PTRS_PER_PTE) is log2 of the maximum number of PTE entries. + * This gives you the PMD_ORDER, and is needed in place of HPAGE_PMD_ORDER due + * to restrictions of some architectures (ie ppc64le). + * + * At most there will be 1 << (PMD_ORDER - KHUGEPAGED_MIN_MTHP_ORDER) mTHP ranges + */ +#define MTHP_STACK_SIZE (1UL << (ilog2(MAX_PTRS_PER_PTE) - KHUGEPAGED_MIN_MTHP_ORDER)) + +/* + * Defines a range of PTE entries in a PTE page table which are being + * considered for (m)THP collapse. + * + * @offset: the offset of the first PTE entry in a PMD range. + * @order: the order of the PTE entries being considered for collapse.
+ */ +struct mthp_range { + u16 offset; + u8 order; +}; + struct collapse_control { bool is_khugepaged; @@ -102,6 +128,11 @@ struct collapse_control { /* nodemask for allocation fallback */ nodemask_t alloc_nmask; + + /* bitmap used for mTHP collapse */ + DECLARE_BITMAP(mthp_bitmap, MAX_PTRS_PER_PTE); + DECLARE_BITMAP(mthp_bitmap_mask, MAX_PTRS_PER_PTE); + struct mthp_range mthp_bitmap_stack[MTHP_STACK_SIZE]; }; /** @@ -1396,6 +1427,121 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, return result; } +static void mthp_stack_push(struct collapse_control *cc, int *stack_size, + u16 offset, u8 order) +{ + const int size = *stack_size; + struct mthp_range *stack = &cc->mthp_bitmap_stack[size]; + + VM_WARN_ON_ONCE(size >= MTHP_STACK_SIZE); + stack->order = order; + stack->offset = offset; + (*stack_size)++; +} + +static struct mthp_range mthp_stack_pop(struct collapse_control *cc, int *stack_size) +{ + const int size = *stack_size; + + VM_WARN_ON_ONCE(size <= 0); + (*stack_size)--; + return cc->mthp_bitmap_stack[size - 1]; +} + +static unsigned int mthp_nr_occupied_pte_entries(struct collapse_control *cc, + u16 offset, unsigned long nr_pte_entries) +{ + bitmap_zero(cc->mthp_bitmap_mask, HPAGE_PMD_NR); + bitmap_set(cc->mthp_bitmap_mask, offset, nr_pte_entries); + return bitmap_weight_and(cc->mthp_bitmap, cc->mthp_bitmap_mask, HPAGE_PMD_NR); +} + +/* + * mthp_collapse() consumes the bitmap that is generated during + * collapse_scan_pmd() to determine what regions and mTHP orders fit best. + * + * Each bit in cc->mthp_bitmap represents a single occupied (!none/zero) page. + * A stack structure cc->mthp_bitmap_stack is used to check different regions + * of the bitmap for collapse eligibility. The stack maintains a pair of + * variables (offset, order), indicating the number of PTEs from the start of + * the PMD, and the order of the potential collapse candidate respectively. We + * start at the PMD order and check if it is eligible for collapse; if not, we + * add two entries to the stack at a lower order to represent the left and right + * halves of the PTE page table we are examining. + * + * offset mid_offset + * | | + * | | + * v v + * -------------------------------------- + * | cc->mthp_bitmap | + * -------------------------------------- + * <-------><-------> + * order-1 order-1 + * + * For each of these, we determine how many PTE entries are occupied in the + * range of PTE entries we propose to collapse, then we compare this to a + * threshold number of PTE entries which would need to be occupied for a + * collapse to be permitted at that order (accounting for max_ptes_none). + + * If a collapse is permitted, we attempt to collapse the PTE range into a + * mTHP. 
+ */ +static int mthp_collapse(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, struct collapse_control *cc, + bool *mmap_locked, unsigned long enabled_orders) +{ + unsigned int max_ptes_none, nr_occupied_ptes; + struct mthp_range range; + unsigned long collapse_address; + int collapsed = 0, stack_size = 0; + unsigned long nr_pte_entries; + u16 offset; + u8 order; + + mthp_stack_push(cc, &stack_size, 0, HPAGE_PMD_ORDER); + + while (stack_size > 0) { + range = mthp_stack_pop(cc, &stack_size); + order = range.order; + offset = range.offset; + nr_pte_entries = 1UL << order; + + if (!test_bit(order, &enabled_orders)) + goto next_order; + + max_ptes_none = collapse_max_ptes_none(order, !cc->is_khugepaged); + + if (max_ptes_none == -EINVAL) + return collapsed; + + nr_occupied_ptes = mthp_nr_occupied_pte_entries(cc, offset, nr_pte_entries); + + if (nr_occupied_ptes >= nr_pte_entries - max_ptes_none) { + int ret; + + collapse_address = address + offset * PAGE_SIZE; + ret = collapse_huge_page(mm, collapse_address, referenced, + unmapped, cc, mmap_locked, + order); + if (ret == SCAN_SUCCEED) { + collapsed += nr_pte_entries; + continue; + } + } + +next_order: + if (order > KHUGEPAGED_MIN_MTHP_ORDER) { + const u8 next_order = order - 1; + const u16 mid_offset = offset + (nr_pte_entries / 2); + + mthp_stack_push(cc, &stack_size, mid_offset, next_order); + mthp_stack_push(cc, &stack_size, offset, next_order); + } + } + return collapsed; +} + static int collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, bool *mmap_locked, @@ -1403,11 +1549,15 @@ static int collapse_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; + int i; int result = SCAN_FAIL, referenced = 0; - int none_or_zero = 0, shared = 0; + int none_or_zero = 0, shared = 0, nr_collapsed = 0; struct page *page = NULL; + unsigned int max_ptes_none; struct folio *folio = NULL; unsigned long _address; + unsigned long enabled_orders; + bool full_scan = true; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; bool writable = false; @@ -1418,16 +1568,29 @@ static int collapse_scan_pmd(struct mm_struct *mm, if (result != SCAN_SUCCEED) goto out; + bitmap_zero(cc->mthp_bitmap, HPAGE_PMD_NR); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); + + enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged); + + /* + * If PMD is the only enabled order, enforce max_ptes_none, otherwise + * scan all pages to populate the bitmap for mTHP collapse. 
+ */ + if (cc->is_khugepaged && enabled_orders == BIT(HPAGE_PMD_ORDER)) + full_scan = false; + max_ptes_none = collapse_max_ptes_none(HPAGE_PMD_ORDER, full_scan); + pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte) { result = SCAN_PMD_NULL; goto out; } - for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, _address += PAGE_SIZE) { + for (i = 0; i < HPAGE_PMD_NR; i++) { + _pte = pte + i; + _address = address + i * PAGE_SIZE; pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; @@ -1452,8 +1615,7 @@ static int collapse_scan_pmd(struct mm_struct *mm, if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { ++none_or_zero; if (!userfaultfd_armed(vma) && - (!cc->is_khugepaged || - none_or_zero <= khugepaged_max_ptes_none)) { + none_or_zero <= max_ptes_none) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1494,6 +1656,9 @@ static int collapse_scan_pmd(struct mm_struct *mm, } folio = page_folio(page); + + /* Set bit for occupied pages */ + bitmap_set(cc->mthp_bitmap, i, 1); /* * Record which node the original page is from and save this * information to cc->node_load[]. @@ -1556,9 +1721,12 @@ static int collapse_scan_pmd(struct mm_struct *mm, out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { - result = collapse_huge_page(mm, address, referenced, - unmapped, cc, mmap_locked, - HPAGE_PMD_ORDER); + nr_collapsed = mthp_collapse(mm, address, referenced, unmapped, + cc, mmap_locked, enabled_orders); + if (nr_collapsed > 0) + result = SCAN_SUCCEED; + else + result = SCAN_FAIL; } out: trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, -- Gitee From e61145eef413d78100399f90f92f7fdacb3d95fe Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:25 -0700 Subject: [PATCH 18/32] khugepaged: avoid unnecessary mTHP collapse attempts ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-15-npache@redhat.com/ There are cases where, if an attempted collapse fails, all subsequent orders are guaranteed to also fail. Avoid these collapse attempts by bailing out early. [backport note] Modify 'case SCAN_NO_PTE_TABLE:' to 'case SCAN_PMD_NULL:' and 'case SCAN_PMD_NONE:' as we have not backported 9e01407 ("mm/khugepaged: unify SCAN_PMD_NONE and SCAN_PMD_NULL into SCAN_NO_PTE_TABLE"). 
Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c8c67a800d60..8be72ac41996 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1524,9 +1524,43 @@ static int mthp_collapse(struct mm_struct *mm, unsigned long address, ret = collapse_huge_page(mm, collapse_address, referenced, unmapped, cc, mmap_locked, order); - if (ret == SCAN_SUCCEED) { + + switch (ret) { + /* Cases were we continue to next collapse candidate */ + case SCAN_SUCCEED: collapsed += nr_pte_entries; + fallthrough; + case SCAN_PTE_MAPPED_HUGEPAGE: continue; + /* Cases were lower orders might still succeed */ + case SCAN_LACK_REFERENCED_PAGE: + case SCAN_EXCEED_NONE_PTE: + case SCAN_EXCEED_SWAP_PTE: + case SCAN_EXCEED_SHARED_PTE: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LRU: + case SCAN_PAGE_NULL: + case SCAN_DEL_PAGE_LRU: + case SCAN_PTE_NON_PRESENT: + case SCAN_PTE_UFFD_WP: + case SCAN_ALLOC_HUGE_PAGE_FAIL: + goto next_order; + /* Cases were no further collapse is possible */ + case SCAN_CGROUP_CHARGE_FAIL: + case SCAN_COPY_MC: + case SCAN_ADDRESS_RANGE: + case SCAN_PMD_NULL: + case SCAN_PMD_NONE: + case SCAN_ANY_PROCESS: + case SCAN_VMA_NULL: + case SCAN_VMA_CHECK: + case SCAN_SCAN_ABORT: + case SCAN_PAGE_ANON: + case SCAN_PMD_MAPPED: + case SCAN_FAIL: + default: + return collapsed; } } -- Gitee From a9121810b2a1e58265088ae038598776d962d686 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 1 Dec 2025 10:46:26 -0700 Subject: [PATCH 19/32] khugepaged: run khugepaged for all orders ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-16-npache@redhat.com/ If any order (m)THP is enabled we should allow running khugepaged to attempt scanning and collapsing mTHPs. In order for khugepaged to operate when only mTHP sizes are specified in sysfs, we must modify the predicate function that determines whether it ought to run to do so. This function is currently called hugepage_pmd_enabled(), this patch renames it to hugepage_enabled() and updates the logic to check to determine whether any valid orders may exist which would justify khugepaged running. We must also update collapse_allowable_orders() to check all orders if the vma is anonymous and the collapse is khugepaged. After this patch khugepaged mTHP collapse is fully enabled. Signed-off-by: Baolin Wang Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- mm/khugepaged.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8be72ac41996..5da1413950bb 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -446,23 +446,23 @@ static inline int collapse_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } -static bool hugepage_pmd_enabled(void) +static bool hugepage_enabled(void) { /* * We cover the anon, shmem and the file-backed case here; file-backed * hugepages, when configured in, are determined by the global control. - * Anon pmd-sized hugepages are determined by the pmd-size control. + * Anon hugepages are determined by its per-size mTHP control. * Shmem pmd-sized hugepages are also determined by its pmd-size control, * except when the global shmem_huge is set to SHMEM_HUGE_DENY. 
*/ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && hugepage_global_enabled()) return true; - if (test_bit(PMD_ORDER, &huge_anon_orders_always)) + if (READ_ONCE(huge_anon_orders_always)) return true; - if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) + if (READ_ONCE(huge_anon_orders_madvise)) return true; - if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && + if (READ_ONCE(huge_anon_orders_inherit) && hugepage_global_enabled()) return true; if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) @@ -550,8 +550,14 @@ static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan) static unsigned long collapse_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, bool is_khugepaged) { + unsigned long orders; enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; - unsigned long orders = BIT(HPAGE_PMD_ORDER); + + /* If khugepaged is scanning an anonymous vma, allow mTHP collapse */ + if (is_khugepaged && vma_is_anonymous(vma)) + orders = THP_ORDERS_ALL_ANON; + else + orders = BIT(HPAGE_PMD_ORDER); return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } @@ -560,7 +566,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - hugepage_pmd_enabled()) { + hugepage_enabled()) { if (collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true)) __khugepaged_enter(vma->vm_mm); } @@ -2872,7 +2878,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result, static int khugepaged_has_work(void) { - return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); + return !list_empty(&khugepaged_scan.mm_head) && hugepage_enabled(); } static int khugepaged_wait_event(void) @@ -2945,7 +2951,7 @@ static void khugepaged_wait_work(void) return; } - if (hugepage_pmd_enabled()) + if (hugepage_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } @@ -2995,7 +3001,7 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - if (!hugepage_pmd_enabled()) { + if (!hugepage_enabled()) { calculate_min_free_kbytes(); goto update_wmarks; } @@ -3045,7 +3051,7 @@ int start_stop_khugepaged(void) int err = 0; mutex_lock(&khugepaged_mutex); - if (hugepage_pmd_enabled()) { + if (hugepage_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -3071,7 +3077,7 @@ int start_stop_khugepaged(void) void khugepaged_min_free_kbytes_update(void) { mutex_lock(&khugepaged_mutex); - if (hugepage_pmd_enabled() && khugepaged_thread) + if (hugepage_enabled() && khugepaged_thread) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } -- Gitee From 1673880830c2ecdd39a88720c62b9e00a8e84663 Mon Sep 17 00:00:00 2001 From: Nico Pache Date: Mon, 1 Dec 2025 10:46:27 -0700 Subject: [PATCH 20/32] Documentation: mm: update the admin guide for mTHP collapse ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251201174627.23295-17-npache@redhat.com/ Now that we can collapse to mTHPs lets update the admin guide to reflect these changes and provide proper guidance on how to utilize it. 
Reviewed-by: Bagas Sanjaya Signed-off-by: Nico Pache Signed-off-by: Yuanhe Shu --- Documentation/admin-guide/mm/transhuge.rst | 48 +++++++++++++--------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index ef822314a595..1227685c52b1 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -63,7 +63,8 @@ often. THP can be enabled system wide or restricted to certain tasks or even memory ranges inside task's address space. Unless THP is completely disabled, there is ``khugepaged`` daemon that scans memory and -collapses sequences of basic pages into PMD-sized huge pages. +collapses sequences of basic pages into huge pages of either PMD size +or mTHP sizes, if the system is configured to do so The THP behaviour is controlled via :ref:`sysfs ` interface and using madvise(2) and prctl(2) system calls. @@ -212,20 +213,15 @@ this behaviour by writing 0 to shrink_underused, and enable it by writing echo 0 > /sys/kernel/mm/transparent_hugepage/shrink_underused echo 1 > /sys/kernel/mm/transparent_hugepage/shrink_underused -khugepaged will be automatically started when PMD-sized THP is enabled +khugepaged will be automatically started when any THP size is enabled (either of the per-size anon control or the top-level control are set to "always" or "madvise"), and it'll be automatically shutdown when -PMD-sized THP is disabled (when both the per-size anon control and the +all THP sizes are disabled (when both the per-size anon control and the top-level control are "never") Khugepaged controls ------------------- -.. note:: - khugepaged currently only searches for opportunities to collapse to - PMD-sized THP and no attempt is made to collapse to other THP - sizes. - khugepaged runs usually at low frequency so while one may not want to invoke defrag algorithms synchronously during the page faults, it should be worth invoking defrag at least in khugepaged. However it's @@ -253,11 +249,11 @@ allocation failure to throttle the next allocation attempt:: The khugepaged progress can be seen in the number of pages collapsed (note that this counter may not be an exact count of the number of pages collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping -being replaced by a PMD mapping, or (2) All 4K physical pages replaced by -one 2M hugepage. Each may happen independently, or together, depending on -the type of memory and the failures that occur. As such, this value should -be interpreted roughly as a sign of progress, and counters in /proc/vmstat -consulted for more accurate accounting):: +being replaced by a PMD mapping, or (2) physical pages replaced by one +hugepage of various sizes (PMD-sized or mTHP). Each may happen independently, +or together, depending on the type of memory and the failures that occur. 
+As such, this value should be interpreted roughly as a sign of progress, +and counters in /proc/vmstat consulted for more accurate accounting):: /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed @@ -265,16 +261,19 @@ for each pass:: /sys/kernel/mm/transparent_hugepage/khugepaged/full_scans -``max_ptes_none`` specifies how many extra small pages (that are -not already mapped) can be allocated when collapsing a group -of small pages into one large page:: +``max_ptes_none`` specifies how many empty (none/zero) pages are allowed +when collapsing a group of small pages into one large page:: /sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none -A higher value leads to use additional memory for programs. -A lower value leads to gain less thp performance. Value of -max_ptes_none can waste cpu time very little, you can -ignore it. +For PMD-sized THP collapse, this directly limits the number of empty pages +allowed in the 2MB region. For mTHP collapse, only 0 or (HPAGE_PMD_NR - 1) +are supported. Any other value will emit a warning and no mTHP collapse +will be attempted. + +A higher value allows more empty pages, potentially leading to more memory +usage but better THP performance. A lower value is more conservative and +may result in fewer THP collapses. ``max_ptes_swap`` specifies how many pages can be brought in from swap when collapsing a group of pages into a transparent huge page:: @@ -293,6 +292,15 @@ processes. Exceeding the number would block the collapse:: A higher value may increase memory footprint for some workloads. +.. note:: + For mTHP collapse, khugepaged does not support collapsing regions that + contain shared or swapped out pages, as this could lead to continuous + promotion to higher orders. The collapse will fail if any shared or + swapped PTEs are encountered during the scan. + + Currently, madvise_collapse only supports collapsing to PMD-sized THPs + and does not attempt mTHP collapses. + File-Backed Hugepages --------------------- -- Gitee From f933c831aa0b0d6f4a4231c27153d61fff740859 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:50 +0800 Subject: [PATCH 21/32] mm: thp: remove vm_flags parameter from khugepaged_enter_vma() ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-2-laoar.shao@gmail.com/ The khugepaged_enter_vma() function requires handling in two specific scenarios: 1. New VMA creation When a new VMA is created (for anon vma, it is deferred to pagefault), if vma->vm_mm is not present in khugepaged_mm_slot, it must be added. In this case, khugepaged_enter_vma() is called after vma->vm_flags have been set, allowing direct use of the VMA's flags. 2. VMA flag modification When vma->vm_flags are modified (particularly when VM_HUGEPAGE is set), the system must recheck whether to add vma->vm_mm to khugepaged_mm_slot. Currently, khugepaged_enter_vma() is called before the flag update, so the call must be relocated to occur after vma->vm_flags have been set. In the VMA merging path, khugepaged_enter_vma() is also called. For this case, since VMA merging only occurs when the vm_flags of both VMAs are identical (excluding special flags like VM_SOFTDIRTY), we can safely use target->vm_flags instead. (It is worth noting that khugepaged_enter_vma() can be removed from the VMA merging path because the VMA has already been added in the two aforementioned cases. We will address this cleanup in a separate patch.) After this change, we can further remove vm_flags parameter from thp_vma_allowable_order(). 
That will be handled in a followup patch. [backport note] Remove vm_flags in khugepaged_enter_vma() in vma_merge(), __mmap_region() and do_brk_flags() as they are the same as vma->vm_flags. Signed-off-by: Yafang Shao Cc: Yang Shi Cc: Usama Arif Signed-off-by: Yuanhe Shu --- include/linux/khugepaged.h | 10 ++++++---- mm/huge_memory.c | 2 +- mm/khugepaged.c | 27 ++++++++++++++------------- mm/madvise.c | 7 +++++++ mm/mmap.c | 12 ++++++------ 5 files changed, 34 insertions(+), 24 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 30baae91b225..50d94ade1739 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -13,8 +13,8 @@ extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); -extern void khugepaged_enter_vma(struct vm_area_struct *vma, - unsigned long vm_flags); +extern void khugepaged_enter_vma(struct vm_area_struct *vma); +extern void khugepaged_enter_mm(struct mm_struct *mm); extern void khugepaged_min_free_kbytes_update(void); extern bool current_is_khugepaged(void); #ifdef CONFIG_SHMEM @@ -46,8 +46,10 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline void khugepaged_enter_vma(struct vm_area_struct *vma, - unsigned long vm_flags) +static inline void khugepaged_enter_vma(struct vm_area_struct *vma) +{ +} +static inline void khugepaged_enter_mm(struct mm_struct *mm) { } static inline int collapse_pte_mapped_thp(struct mm_struct *mm, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d504955dc78f..b5bf5c95c4da 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1434,7 +1434,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - khugepaged_enter_vma(vma, vma->vm_flags); + khugepaged_enter_vma(vma); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5da1413950bb..bd2a77bfb228 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -398,12 +398,6 @@ int hugepage_madvise(struct vm_area_struct *vma, #endif *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; - /* - * If the vma become good for khugepaged to scan, - * register it here without waiting a page fault that - * may not happen any time soon. 
- */ - khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; @@ -562,14 +556,21 @@ static unsigned long collapse_allowable_orders(struct vm_area_struct *vma, return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } -void khugepaged_enter_vma(struct vm_area_struct *vma, - unsigned long vm_flags) +void khugepaged_enter_mm(struct mm_struct *mm) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - hugepage_enabled()) { - if (collapse_allowable_orders(vma, vm_flags, /*is_khugepaged=*/true)) - __khugepaged_enter(vma->vm_mm); - } + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + return; + if (!hugepage_enabled()) + return; + + __khugepaged_enter(mm); +} + +void khugepaged_enter_vma(struct vm_area_struct *vma) +{ + if (!collapse_allowable_orders(vma, vma->vm_flags, true)) + return; + khugepaged_enter_mm(vma->vm_mm); } void __khugepaged_exit(struct mm_struct *mm) diff --git a/mm/madvise.c b/mm/madvise.c index 7d64cf8d037b..882a26f56b3b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1135,6 +1135,13 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, anon_name); anon_vma_name_put(anon_name); + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (!error && new_flags & VM_HUGEPAGE) + khugepaged_enter_mm(vma->vm_mm); out: /* * madvise() returns EAGAIN if kernel resources, such as diff --git a/mm/mmap.c b/mm/mmap.c index d9c64ab36b29..58e7ee7abfa5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1098,7 +1098,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } vma_complete(&vp, vmi, mm); - khugepaged_enter_vma(res, vm_flags); + khugepaged_enter_vma(res); return res; prealloc_fail: @@ -2122,7 +2122,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); + khugepaged_enter_vma(vma); mas_destroy(&mas); validate_mm(mm); return error; @@ -2218,7 +2218,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) } } anon_vma_unlock_write(vma->anon_vma); - khugepaged_enter_vma(vma, vma->vm_flags); + khugepaged_enter_vma(vma); mas_destroy(&mas); validate_mm(mm); return error; @@ -2831,7 +2831,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, /* Actually expand, if possible */ if (vma && !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { - khugepaged_enter_vma(vma, vm_flags); + khugepaged_enter_vma(vma); goto expanded; } @@ -2939,7 +2939,7 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr, * vma_merge() calls khugepaged_enter_vma() either, the below * call covers the non-merge case. 
*/ - khugepaged_enter_vma(vma, vma->vm_flags); + khugepaged_enter_vma(vma); file_expanded: file = vma->vm_file; @@ -3259,7 +3259,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_store(vmi, vma); vma_complete(&vp, vmi, mm); - khugepaged_enter_vma(vma, flags); + khugepaged_enter_vma(vma); goto out; } -- Gitee From fba98eeae41be8b62864bc7e07c6c8e44c690eac Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:51 +0800 Subject: [PATCH 22/32] mm: thp: remove vm_flags parameter from thp_vma_allowable_order() ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-3-laoar.shao@gmail.com/ Because all calls to thp_vma_allowable_order() pass vma->vm_flags as the vma_flags argument, we can remove the parameter and have the function access vma->vm_flags directly. Signed-off-by: Yafang Shao Acked-by: Usama Arif Signed-off-by: Yuanhe Shu --- fs/proc/task_mmu.c | 3 +-- include/linux/huge_mm.h | 17 ++++++++--------- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 18 +++++++++--------- mm/memory.c | 11 +++++------ 5 files changed, 25 insertions(+), 28 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1232ab6e297e..e6ac22f3bead 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -879,8 +879,7 @@ static int show_smap(struct seq_file *m, void *v) #endif seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS, - THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, TVA_SMAPS, THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 5464672dadc2..7b5c3240ae04 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -99,8 +99,8 @@ enum tva_type { TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */ }; -#define thp_vma_allowable_order(vma, vm_flags, type, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) +#define thp_vma_allowable_order(vma, type, order) \ + (!!thp_vma_allowable_orders(vma, type, BIT(order))) static inline int lowest_order(unsigned long orders) { @@ -287,14 +287,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, enum tva_type type, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check - * @vm_flags: use these vm_flags instead of vma->vm_flags * @type: TVA type * @orders: bitfield of all orders to consider * @@ -308,10 +306,11 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, */ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, enum tva_type type, unsigned long orders) { + vm_flags_t vm_flags = vma->vm_flags; + /* * Optimization to check if required orders are enabled early. Only * forced collapse ignores sysfs configs. 
@@ -330,7 +329,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, type, orders); + return __thp_vma_allowable_orders(vma, type, orders); } struct thpsize { @@ -345,9 +344,10 @@ struct thpsize { (transparent_hugepage_flags & \ (1<vm_flags; + /* * Explicitly disabled through madvise or prctl, or some * architectures may disable THP for some mappings, for @@ -506,7 +506,6 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, } static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, enum tva_type type, unsigned long orders) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b5bf5c95c4da..57987b303ce4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -89,7 +89,6 @@ static bool anon_orders_configured __initdata; static bool file_orders_configured; unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, enum tva_type type, unsigned long orders) { @@ -97,6 +96,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, const bool in_pf = type == TVA_PAGEFAULT; const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; unsigned long supported_orders; + vm_flags_t vm_flags = vma->vm_flags; /* Check the intersection of requested and supported orders. */ if (vma_is_anonymous(vma)) @@ -113,7 +113,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma)) return false; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index bd2a77bfb228..8af7362fd3c0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -542,10 +542,10 @@ static unsigned int collapse_max_ptes_none(unsigned int order, bool full_scan) /* Check what orders are allowed based on the vma and collapse type */ static unsigned long collapse_allowable_orders(struct vm_area_struct *vma, - vm_flags_t vm_flags, bool is_khugepaged) + bool is_khugepaged) { unsigned long orders; - enum tva_type tva_flags = is_khugepaged ? TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; + enum tva_type tva_type = is_khugepaged ? 
TVA_KHUGEPAGED : TVA_FORCED_COLLAPSE; /* If khugepaged is scanning an anonymous vma, allow mTHP collapse */ if (is_khugepaged && vma_is_anonymous(vma)) @@ -553,7 +553,7 @@ static unsigned long collapse_allowable_orders(struct vm_area_struct *vma, else orders = BIT(HPAGE_PMD_ORDER); - return thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); + return thp_vma_allowable_orders(vma, tva_type, orders); } void khugepaged_enter_mm(struct mm_struct *mm) @@ -568,7 +568,7 @@ void khugepaged_enter_mm(struct mm_struct *mm) void khugepaged_enter_vma(struct vm_area_struct *vma) { - if (!collapse_allowable_orders(vma, vma->vm_flags, true)) + if (!collapse_allowable_orders(vma, TVA_KHUGEPAGED)) return; khugepaged_enter_mm(vma->vm_mm); } @@ -1053,7 +1053,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, /* Always check the PMD order to ensure its not shared by another VMA */ if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_orders(vma, vma->vm_flags, type, BIT(order))) + if (!thp_vma_allowable_orders(vma, type, BIT(order))) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1613,7 +1613,7 @@ static int collapse_scan_pmd(struct mm_struct *mm, memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - enabled_orders = collapse_allowable_orders(vma, vma->vm_flags, cc->is_khugepaged); + enabled_orders = collapse_allowable_orders(vma, cc->is_khugepaged); /* * If PMD is the only enabled order, enforce max_ptes_none, otherwise @@ -1865,7 +1865,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * and map it by a PMD, regardless of sysfs THP settings. As such, let's * analogously elide sysfs THP settings here and force collapse. */ - if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2801,7 +2801,7 @@ static unsigned int collapse_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!collapse_allowable_orders(vma, vma->vm_flags, /*is_khugepaged=*/true)) { + if (!collapse_allowable_orders(vma, true)) { skip: progress++; continue; @@ -3132,7 +3132,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - if (!collapse_allowable_orders(vma, vma->vm_flags, /*is_khugepaged=*/false)) + if (!collapse_allowable_orders(vma, false)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index 552a53ec8179..f79d224f883e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4473,7 +4473,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + orders = thp_vma_allowable_orders(vma, TVA_PAGEFAULT, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), @@ -5003,7 +5003,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. 
*/ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + orders = thp_vma_allowable_orders(vma, TVA_PAGEFAULT, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); @@ -5267,7 +5267,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any * PMD mappings if THPs are disabled. */ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) @@ -6268,7 +6268,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; - unsigned long vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -6283,7 +6282,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { + thp_vma_allowable_order(vma, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -6317,7 +6316,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { + thp_vma_allowable_order(vma, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- Gitee From fec4b3467d98333a3e60952e0d3da7a5a040e83d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:52 +0800 Subject: [PATCH 23/32] mm: thp: add support for BPF based THP order selection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-4-laoar.shao@gmail.com/ The Motivation ============== This patch introduces a new BPF struct_ops called bpf_thp_ops for dynamic THP tuning. It includes a hook bpf_hook_thp_get_order(), allowing BPF programs to influence THP order selection based on factors such as: - Workload identity For example, workloads running in specific containers or cgroups. - Allocation context Whether the allocation occurs during a page fault, khugepaged, swap or other paths. - VMA's memory advice settings MADV_HUGEPAGE or MADV_NOHUGEPAGE - Memory pressure PSI system data or associated cgroup PSI metrics The BPF-THP Interface ===================== The kernel API of this new BPF hook is as follows, /** * thp_get_order: Get the suggested THP order from a BPF program for allocation * @vma: vm_area_struct associated with the THP allocation * @type: TVA type for current @vma * @orders: Bitmask of available THP orders for this allocation * * Return: The suggested THP order for allocation from the BPF program. * Returns a negative value to preserve the original available @orders, * which is useful in specific cases—for example, when only a particular * @type is handled and others are ignored. */ int thp_get_order(struct vm_area_struct *vma, enum tva_type type, unsigned long orders); This functionality is only active when system-wide THP is configured to madvise or always mode. It remains disabled in never mode. Additionally, if THP is explicitly disabled for a specific task via prctl(), this BPF functionality will also be unavailable for that task. 
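For reference, a minimal policy program against this struct_ops could look like
the sketch below. It is adapted from the selftest added later in this series;
the program and map names are illustrative, and the policy shown (hide THP
eligibility from smaps, leave every other decision to the kernel) is only an
example:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("struct_ops/thp_get_order")
int BPF_PROG(hide_from_smaps, struct vm_area_struct *vma,
             enum tva_type type, unsigned long orders)
{
        /* Order 0 is never a THP order, so smaps reports THPeligible: 0. */
        if (type == TVA_SMAPS)
                return 0;
        /* A negative return preserves the kernel-selected @orders. */
        return -1;
}

SEC(".struct_ops.link")
struct bpf_thp_ops hide_smaps_ops = {
        /* .pid is set by the loader through the skeleton before load. */
        .thp_get_order = (void *)hide_from_smaps,
};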
The Design of Per Process BPF-THP
=================================

As suggested by Alexei, we need to scope the BPF-THP [0].

Scoping BPF-THP to cgroup is not acceptable
-------------------------------------------

As explained by Gutierrez: [1]

1. It breaks the cgroup hierarchy when two siblings have different THP
   policies
2. Cgroup was designed for resource management, not for grouping processes
   and tuning those processes
3. We set a precedent for other people adding new flags to cgroup and
   potentially polluting cgroups. We may end up with cgroups having tens of
   different flags, making the sysadmin's job more complex

Scoping BPF-THP to process
--------------------------

To eliminate potential conflicts among competing BPF-THP instances, we
enforce that each process is exclusively managed by a single BPF-THP. This
approach has received agreement from David [2].

When registering a BPF-THP, we specify the PID of a target task. The
BPF-THP is then installed in the task's `mm_struct`:

  struct mm_struct {
          struct bpf_thp_ops __rcu *bpf_thp;
  };

Inheritance Behavior:

- Existing child processes are unaffected
- Newly forked children inherit the BPF-THP from their parent
- The BPF-THP persists across execve() calls

A new linked list tracks all tasks managed by each BPF-THP instance:

- Newly managed tasks are added to the list
- Exiting tasks are automatically removed from the list
- During BPF-THP unregistration (e.g., when the BPF link is removed), all
  managed tasks have their bpf_thp pointer set to NULL
- BPF-THP instances can be dynamically updated, with all tracked tasks
  automatically migrating to the new version.

This design simplifies BPF-THP management in production environments by
providing clear lifecycle management and preventing conflicts between
multiple BPF-THP instances.

WARNING
=======

This feature requires CONFIG_BPF_THP (EXPERIMENTAL) to be enabled. Note
that this capability is currently unstable and may undergo significant
changes, including potential removal, in future kernel versions.
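A typical user-space lifecycle, sketched with libbpf against the selftest
skeleton that is added later in this series (the skeleton, map and variable
names come from that selftest; target_pid is a placeholder and error handling
is omitted):

struct test_thp_adjust *skel;
struct bpf_link *link;

skel = test_thp_adjust__open();
/* Scope the policy to a single process before loading. */
skel->struct_ops.thp_eligible_ops->pid = target_pid;
test_thp_adjust__load(skel);

/* Register: the policy now applies to target_pid and to children
 * forked from now on. */
link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops);

/* Update: every managed task migrates to the new instance. */
bpf_link__update_map(link, skel->maps.thp_eligible_ops2);

/* Unregister: managed tasks get their bpf_thp pointer cleared. */
bpf_link__destroy(link);
test_thp_adjust__destroy(skel);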
Link: https://lore.kernel.org/linux-mm/CAADnVQJtrJZOCWZKH498GBA8M0mYVztApk54mOEejs8Wr3nSiw@mail.gmail.com/ [0] Link: https://lore.kernel.org/linux-mm/1940d681-94a6-48fb-b889-cd8f0b91b330@huawei-partners.com/ [1] Link: https://lore.kernel.org/linux-mm/3577f7fd-429a-49c5-973b-38174a67be15@redhat.com/ [2] Signed-off-by: Yafang Shao Signed-off-by: Yuanhe Shu --- fs/exec.c | 1 + include/linux/huge_mm.h | 39 +++++ include/linux/mm_types.h | 17 +++ kernel/fork.c | 2 + mm/Kconfig | 22 +++ mm/Makefile | 1 + mm/huge_memory_bpf.c | 316 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 398 insertions(+) create mode 100644 mm/huge_memory_bpf.c diff --git a/fs/exec.c b/fs/exec.c index 55f2ad29d64e..2b1e5e8dd6d7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1025,6 +1025,7 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); + bpf_thp_retain_mm(mm, old_mm); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7b5c3240ae04..001f9743d937 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -174,6 +174,40 @@ static inline void count_mthp_stat(int order, enum mthp_stat_item item) } #endif +#ifdef CONFIG_BPF_THP + +unsigned long +bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type, + unsigned long orders); +void bpf_thp_exit_mm(struct mm_struct *mm); +void bpf_thp_retain_mm(struct mm_struct *mm, struct mm_struct *old_mm); +void bpf_thp_fork(struct mm_struct *mm, struct mm_struct *old_mm); + +#else /* CONFIG_BPF_THP */ + +static inline unsigned long +bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type, + unsigned long orders) +{ + return orders; +} + +static inline void bpf_thp_exit_mm(struct mm_struct *mm) +{ +} + +static inline void +bpf_thp_retain_mm(struct mm_struct *mm, struct mm_struct *old_mm) +{ +} + +static inline void +bpf_thp_fork(struct mm_struct *mm, struct mm_struct *old_mm) +{ +} + +#endif /* CONFIG_BPF_THP */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) @@ -311,6 +345,11 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, { vm_flags_t vm_flags = vma->vm_flags; + /* The BPF-specified order overrides which order is selected. */ + orders &= bpf_hook_thp_get_orders(vma, type, orders); + if (!orders) + return 0; + /* * Optimization to check if required orders are enabled early. Only * forced collapse ignores sysfs configs. 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b386a10d7ece..1ccf2d357eda 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -780,6 +780,19 @@ struct mm_cid { }; #endif +#ifdef CONFIG_BPF_THP +struct bpf_thp_ops; +#endif + +#ifdef CONFIG_BPF_MM +struct bpf_mm_ops { +#ifdef CONFIG_BPF_THP + struct bpf_thp_ops __rcu *bpf_thp; + struct list_head bpf_thp_list; +#endif +}; +#endif + struct kioctx_table; struct iommu_mm_data; struct mm_struct { @@ -1042,6 +1055,10 @@ struct mm_struct { #ifdef CONFIG_FUTEX unsigned int futex_nid; #endif + +#ifdef CONFIG_BPF_MM + struct bpf_mm_ops bpf_mm; +#endif } __randomize_layout; CK_KABI_RESERVE(1) diff --git a/kernel/fork.c b/kernel/fork.c index fd3fe8b8a87c..9c6e8ddcec55 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -797,6 +797,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); + bpf_thp_fork(mm, oldmm); ksm_fork(mm, oldmm); khugepaged_fork(mm, oldmm); } else if (mpnt) { @@ -1404,6 +1405,7 @@ static inline void __mmput(struct mm_struct *mm) exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ + bpf_thp_exit_mm(mm); exit_mmap(mm); mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); diff --git a/mm/Kconfig b/mm/Kconfig index 8e2cd7fd26df..4d19c28546d1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1322,6 +1322,28 @@ config DUPTEXT config IOMMU_MM_DATA bool +menuconfig BPF_MM + bool "BPF-based Memory Management (EXPERIMENTAL)" + depends on BPF_SYSCALL + + help + Enable BPF-based Memory Management Policy. This feature is currently + experimental. + + WARNING: This feature is unstable and may change in future kernel + +if BPF_MM +config BPF_THP + bool "BPF-based THP Policy (EXPERIMENTAL)" + depends on TRANSPARENT_HUGEPAGE && BPF_MM + + help + Enable dynamic THP policy adjustment using BPF programs. This feature + is currently experimental. + + WARNING: This feature is unstable and may change in future kernel +endif # BPF_MM + source "mm/damon/Kconfig" config ASYNC_FORK diff --git a/mm/Makefile b/mm/Makefile index 67af22e50f12..a5aacb2ffb0e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -94,6 +94,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_NUMA) += memory-tiers.o obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o +obj-$(CONFIG_BPF_THP) += huge_memory_bpf.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o memcg_zombie_reaper.o ifdef CONFIG_SWAP diff --git a/mm/huge_memory_bpf.c b/mm/huge_memory_bpf.c new file mode 100644 index 000000000000..f7eb25c0db3a --- /dev/null +++ b/mm/huge_memory_bpf.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BPF-based THP policy management + * + * Author: Yafang Shao + */ + +#include +#include +#include +#include + +/** + * @thp_order_fn_t: Get the suggested THP order from a BPF program for allocation + * @vma: vm_area_struct associated with the THP allocation + * @type: TVA type for current @vma + * @orders: Bitmask of available THP orders for this allocation + * + * Return: The suggested THP order for allocation from the BPF program. + * Returns a negative value to preserve the original available @orders, + * which is useful in specific cases—for example, when only a particular + * @type is handled and others are ignored. 
+ */ +typedef int thp_order_fn_t(struct vm_area_struct *vma, + enum tva_type type, + unsigned long orders); + +struct bpf_thp_ops { + pid_t pid; /* The pid to attach */ + thp_order_fn_t *thp_get_order; + + /* private */ + /* The list of mm_struct objects managed by this BPF-THP instance. */ + struct list_head mm_list; +}; + +static DEFINE_SPINLOCK(thp_ops_lock); + +unsigned long bpf_hook_thp_get_orders(struct vm_area_struct *vma, + enum tva_type type, + unsigned long orders) +{ + struct mm_struct *mm = vma->vm_mm; + struct bpf_thp_ops *bpf_thp; + int bpf_order; + + if (!mm) + return orders; + + rcu_read_lock(); + bpf_thp = rcu_dereference(mm->bpf_mm.bpf_thp); + if (!bpf_thp || !bpf_thp->thp_get_order) + goto out; + + bpf_order = bpf_thp->thp_get_order(vma, type, orders); + if (bpf_order < 0) + goto out; + orders &= BIT(bpf_order); + +out: + rcu_read_unlock(); + return orders; +} + +void bpf_thp_exit_mm(struct mm_struct *mm) +{ + if (!rcu_access_pointer(mm->bpf_mm.bpf_thp)) + return; + + spin_lock(&thp_ops_lock); + if (!rcu_access_pointer(mm->bpf_mm.bpf_thp)) { + spin_unlock(&thp_ops_lock); + return; + } + list_del(&mm->bpf_mm.bpf_thp_list); + RCU_INIT_POINTER(mm->bpf_mm.bpf_thp, NULL); + spin_unlock(&thp_ops_lock); + +} + +void bpf_thp_retain_mm(struct mm_struct *mm, struct mm_struct *old_mm) +{ + struct bpf_thp_ops *bpf_thp; + + if (!old_mm || !rcu_access_pointer(old_mm->bpf_mm.bpf_thp)) + return; + + spin_lock(&thp_ops_lock); + bpf_thp = rcu_dereference_protected(old_mm->bpf_mm.bpf_thp, + lockdep_is_held(&thp_ops_lock)); + if (!bpf_thp) { + spin_unlock(&thp_ops_lock); + return; + } + + /* The new mm_struct is under initialization. */ + RCU_INIT_POINTER(mm->bpf_mm.bpf_thp, bpf_thp); + + /* The old mm_struct is being destroyed. */ + RCU_INIT_POINTER(old_mm->bpf_mm.bpf_thp, NULL); + list_replace(&old_mm->bpf_mm.bpf_thp_list, &mm->bpf_mm.bpf_thp_list); + spin_unlock(&thp_ops_lock); +} + +void bpf_thp_fork(struct mm_struct *mm, struct mm_struct *old_mm) +{ + struct bpf_thp_ops *bpf_thp; + + if (!rcu_access_pointer(old_mm->bpf_mm.bpf_thp)) + return; + + spin_lock(&thp_ops_lock); + bpf_thp = rcu_dereference_protected(old_mm->bpf_mm.bpf_thp, + lockdep_is_held(&thp_ops_lock)); + if (!bpf_thp) { + spin_unlock(&thp_ops_lock); + return; + } + + /* The new mm_struct is under initialization. */ + RCU_INIT_POINTER(mm->bpf_mm.bpf_thp, bpf_thp); + + list_add_tail(&mm->bpf_mm.bpf_thp_list, &bpf_thp->mm_list); + spin_unlock(&thp_ops_lock); +} + +static bool bpf_thp_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static const struct bpf_func_proto * +bpf_thp_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id); +} + +static const struct bpf_verifier_ops thp_bpf_verifier_ops = { + .get_func_proto = bpf_thp_get_func_proto, + .is_valid_access = bpf_thp_ops_is_valid_access, +}; + +static int bpf_thp_init(struct btf *btf) +{ + return 0; +} + +static int bpf_thp_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + /* The call site operates under RCU protection. 
*/ + if (prog->sleepable) + return -EINVAL; + return 0; +} + +static int bpf_thp_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct bpf_thp_ops *ubpf_thp; + struct bpf_thp_ops *kbpf_thp; + u32 moff; + + ubpf_thp = (const struct bpf_thp_ops *)udata; + kbpf_thp = (struct bpf_thp_ops *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct bpf_thp_ops, pid): + /* bpf_struct_ops only handles func ptrs and zero-ed members. + * Return 1 to bypass the default handler. + */ + kbpf_thp->pid = ubpf_thp->pid; + return 1; + } + return 0; +} + +static int bpf_thp_reg(void *kdata) +{ + struct bpf_thp_ops *bpf_thp = kdata; + struct list_head *mm_list; + struct task_struct *p; + struct mm_struct *mm; + int err = 0; + pid_t pid; + + pid = bpf_thp->pid; + p = find_get_task_by_vpid(pid); + if (!p) + return -ESRCH; + + if (p->flags & PF_EXITING) { + put_task_struct(p); + return -ESRCH; + } + + mm = get_task_mm(p); + put_task_struct(p); + if (!mm) + return -EINVAL; + + /* To prevent conflicts, use this lock when multiple BPF-THP instances + * might register this task simultaneously. + */ + spin_lock(&thp_ops_lock); + /* Each process is exclusively managed by a single BPF-THP. */ + if (rcu_access_pointer(mm->bpf_mm.bpf_thp)) { + err = -EBUSY; + goto out; + } + rcu_assign_pointer(mm->bpf_mm.bpf_thp, bpf_thp); + + mm_list = &bpf_thp->mm_list; + INIT_LIST_HEAD(mm_list); + list_add_tail(&mm->bpf_mm.bpf_thp_list, mm_list); + +out: + spin_unlock(&thp_ops_lock); + mmput(mm); + return err; +} + +static void bpf_thp_unreg(void *kdata) +{ + struct bpf_thp_ops *bpf_thp = kdata; + struct bpf_mm_ops *bpf_mm; + struct list_head *pos, *n; + + spin_lock(&thp_ops_lock); + list_for_each_safe(pos, n, &bpf_thp->mm_list) { + bpf_mm = list_entry(pos, struct bpf_mm_ops, bpf_thp_list); + WARN_ON_ONCE(!bpf_mm); + rcu_replace_pointer(bpf_mm->bpf_thp, NULL, lockdep_is_held(&thp_ops_lock)); + list_del(pos); + } + spin_unlock(&thp_ops_lock); + + synchronize_rcu(); +} + +static int bpf_thp_update(void *kdata, void *old_kdata) +{ + struct bpf_thp_ops *old_bpf_thp = old_kdata; + struct bpf_thp_ops *bpf_thp = kdata; + struct bpf_mm_ops *bpf_mm; + struct list_head *pos, *n; + + INIT_LIST_HEAD(&bpf_thp->mm_list); + + /* Could be optimized to a per-instance lock if this lock becomes a bottleneck. 
*/ + spin_lock(&thp_ops_lock); + list_for_each_safe(pos, n, &old_bpf_thp->mm_list) { + bpf_mm = list_entry(pos, struct bpf_mm_ops, bpf_thp_list); + WARN_ON_ONCE(!bpf_mm); + rcu_replace_pointer(bpf_mm->bpf_thp, bpf_thp, lockdep_is_held(&thp_ops_lock)); + list_del(pos); + list_add_tail(&bpf_mm->bpf_thp_list, &bpf_thp->mm_list); + } + spin_unlock(&thp_ops_lock); + + synchronize_rcu(); + return 0; +} + +static int bpf_thp_validate(void *kdata) +{ + struct bpf_thp_ops *ops = kdata; + + if (!ops->thp_get_order) { + pr_err("bpf_thp: required ops isn't implemented\n"); + return -EINVAL; + } + return 0; +} + +static int bpf_thp_get_order(struct vm_area_struct *vma, + enum tva_type type, + unsigned long orders) +{ + return -1; +} + +static struct bpf_thp_ops __bpf_thp_ops = { + .thp_get_order = (thp_order_fn_t *)bpf_thp_get_order, +}; + +static struct bpf_struct_ops bpf_bpf_thp_ops = { + .verifier_ops = &thp_bpf_verifier_ops, + .init = bpf_thp_init, + .check_member = bpf_thp_check_member, + .init_member = bpf_thp_init_member, + .reg = bpf_thp_reg, + .unreg = bpf_thp_unreg, + .update = bpf_thp_update, + .validate = bpf_thp_validate, + .cfi_stubs = &__bpf_thp_ops, + .owner = THIS_MODULE, + .name = "bpf_thp_ops", +}; + +static int __init bpf_thp_ops_init(void) +{ + int err; + + err = register_bpf_struct_ops(&bpf_bpf_thp_ops, bpf_thp_ops); + if (err) + pr_err("bpf_thp: Failed to register struct_ops (%d)\n", err); + return err; +} +late_initcall(bpf_thp_ops_init); -- Gitee From 9743f14775e7f6f317f2030f548d5aa932135ced Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:53 +0800 Subject: [PATCH 24/32] mm: thp: decouple THP allocation between swap and page fault paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-5-laoar.shao@gmail.com/ The new BPF capability enables finer-grained THP policy decisions by introducing separate handling for swap faults versus normal page faults. As highlighted by Barry: We’ve observed that swapping in large folios can lead to more swap thrashing for some workloads- e.g. kernel build. Consequently, some workloads might prefer swapping in smaller folios than those allocated by alloc_anon_folio(). While prtcl() could potentially be extended to leverage this new policy, doing so would require modifications to the uAPI. Signed-off-by: Yafang Shao Reviewed-by: Lorenzo Stoakes Acked-by: Usama Arif Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Yuanhe Shu --- include/linux/huge_mm.h | 3 ++- mm/huge_memory.c | 2 +- mm/memory.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 001f9743d937..fa7ab950c36b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -94,9 +94,10 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; enum tva_type { TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ - TVA_PAGEFAULT, /* Serving a page fault. */ + TVA_PAGEFAULT, /* Serving a non-swap page fault. */ TVA_KHUGEPAGED, /* Khugepaged collapse. */ TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */ + TVA_SWAP_PAGEFAULT, /* serving a swap page fault. 
*/ }; #define thp_vma_allowable_order(vma, type, order) \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 57987b303ce4..67ae5acdb171 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -93,7 +93,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long orders) { const bool smaps = type == TVA_SMAPS; - const bool in_pf = type == TVA_PAGEFAULT; + const bool in_pf = (type == TVA_PAGEFAULT || type == TVA_SWAP_PAGEFAULT); const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; unsigned long supported_orders; vm_flags_t vm_flags = vma->vm_flags; diff --git a/mm/memory.c b/mm/memory.c index f79d224f883e..d340637d022b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4473,7 +4473,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, TVA_PAGEFAULT, + orders = thp_vma_allowable_orders(vma, TVA_SWAP_PAGEFAULT, BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), -- Gitee From fe93e462b4663d2a404b44bde28ef07a6f2e81c6 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:54 +0800 Subject: [PATCH 25/32] mm: thp: enable THP allocation exclusively through khugepaged ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-6-laoar.shao@gmail.com/ khugepaged_enter_vma() ultimately invokes any attached BPF function with the TVA_KHUGEPAGED flag set when determining whether or not to enable khugepaged THP for a freshly faulted in VMA. Currently, on fault, we invoke this in do_huge_pmd_anonymous_page(), as invoked by create_huge_pmd() and only when we have already checked to see if an allowable TVA_PAGEFAULT order is specified. Since we might want to disallow THP on fault-in but allow it via khugepaged, we move things around so we always attempt to enter khugepaged upon fault. This change is safe because: - khugepaged operates at the MM level rather than per-VMA. The THP allocation might fail during page faults due to transient conditions (e.g., memory pressure), it is safe to add this MM to khugepaged for subsequent defragmentation. - If __thp_vma_allowable_orders(TVA_PAGEFAULT) returns 0, then __thp_vma_allowable_orders(TVA_KHUGEPAGED) will also return 0. While we could also extend prctl() to utilize this new policy, such a change would require a uAPI modification to PR_SET_THP_DISABLE. 
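As a concrete illustration of the policy this enables, a BPF-THP program can
now decline THP at fault time while still letting khugepaged collapse the
range later. A minimal sketch (the function name is illustrative; includes,
license and the struct_ops registration follow the earlier sketch in this
series):

SEC("struct_ops/thp_get_order")
int BPF_PROG(khugepaged_only, struct vm_area_struct *vma,
             enum tva_type type, unsigned long orders)
{
        /* No THP straight from the fault path. */
        if (type == TVA_PAGEFAULT)
                return 0;
        /* Keep the kernel's choice for khugepaged and everything else. */
        return -1;
}

With such a policy the fault path falls back to small pages, while
khugepaged_enter_vma(), now invoked unconditionally for anonymous VMAs on
fault, still registers the mm so the range can be collapsed in the background.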
Signed-off-by: Yafang Shao Acked-by: Lance Yang Cc: Usama Arif Signed-off-by: Yuanhe Shu --- mm/huge_memory.c | 1 - mm/memory.c | 13 ++++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 67ae5acdb171..d37fe882bc41 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1434,7 +1434,6 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - khugepaged_enter_vma(vma); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && diff --git a/mm/memory.c b/mm/memory.c index d340637d022b..f246a3e821fc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6315,11 +6315,14 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, if (pud_trans_unstable(vmf.pud)) goto retry_pud; - if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, TVA_PAGEFAULT, PMD_ORDER)) { - ret = create_huge_pmd(&vmf); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; + if (pmd_none(*vmf.pmd)) { + if (vma_is_anonymous(vma)) + khugepaged_enter_vma(vma); + if (thp_vma_allowable_order(vma, TVA_PAGEFAULT, PMD_ORDER)) { + ret = create_huge_pmd(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } } else { vmf.orig_pmd = pmdp_get_lockless(vmf.pmd); -- Gitee From 83a5cdb06b7b4b84c33de560aba19c12a4adc1bb Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:55 +0800 Subject: [PATCH 26/32] mm: bpf-thp: add support for global mode ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-7-laoar.shao@gmail.com/ The per-process BPF-THP mode is unsuitable for managing shared resources such as shmem THP and file-backed THP. This aligns with known cgroup limitations for similar scenarios [0]. Introduce a global BPF-THP mode to address this gap. When registered: - All existing per-process instances are disabled - New per-process registrations are blocked - Existing per-process instances remain registered (no forced unregistration) The global mode takes precedence over per-process instances. Updates are type-isolated: global instances can only be updated by new global instances, and per-process instances by new per-process instances. Link: https://lore.kernel.org/linux-mm/YwNold0GMOappUxc@slm.duckdns.org/ [0] Signed-off-by: Yafang Shao Signed-off-by: Yuanhe Shu --- mm/huge_memory_bpf.c | 111 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory_bpf.c b/mm/huge_memory_bpf.c index f7eb25c0db3a..33d6133b11e6 100644 --- a/mm/huge_memory_bpf.c +++ b/mm/huge_memory_bpf.c @@ -35,6 +35,30 @@ struct bpf_thp_ops { }; static DEFINE_SPINLOCK(thp_ops_lock); +static struct bpf_thp_ops __rcu *bpf_thp_global; /* global mode */ + +static unsigned long +bpf_hook_thp_get_orders_global(struct vm_area_struct *vma, + enum tva_type type, + unsigned long orders) +{ + static struct bpf_thp_ops *bpf_thp; + int bpf_order; + + rcu_read_lock(); + bpf_thp = rcu_dereference(bpf_thp_global); + if (!bpf_thp || !bpf_thp->thp_get_order) + goto out; + + bpf_order = bpf_thp->thp_get_order(vma, type, orders); + if (bpf_order < 0) + goto out; + orders &= BIT(bpf_order); + +out: + rcu_read_unlock(); + return orders; +} unsigned long bpf_hook_thp_get_orders(struct vm_area_struct *vma, enum tva_type type, @@ -47,6 +71,10 @@ unsigned long bpf_hook_thp_get_orders(struct vm_area_struct *vma, if (!mm) return orders; + /* Global BPF-THP takes precedence over per-process BPF-THP. 
*/ + if (rcu_access_pointer(bpf_thp_global)) + return bpf_hook_thp_get_orders_global(vma, type, orders); + rcu_read_lock(); bpf_thp = rcu_dereference(mm->bpf_mm.bpf_thp); if (!bpf_thp || !bpf_thp->thp_get_order) @@ -181,6 +209,23 @@ static int bpf_thp_init_member(const struct btf_type *t, return 0; } +static int bpf_thp_reg_gloabl(void *kdata, struct bpf_link *link) +{ + struct bpf_thp_ops *ops = kdata; + + /* Protect the global pointer bpf_thp_global from concurrent writes. */ + spin_lock(&thp_ops_lock); + /* Only one instance is allowed. */ + if (rcu_access_pointer(bpf_thp_global)) { + spin_unlock(&thp_ops_lock); + return -EBUSY; + } + + rcu_assign_pointer(bpf_thp_global, ops); + spin_unlock(&thp_ops_lock); + return 0; +} + static int bpf_thp_reg(void *kdata) { struct bpf_thp_ops *bpf_thp = kdata; @@ -191,6 +236,11 @@ static int bpf_thp_reg(void *kdata) pid_t pid; pid = bpf_thp->pid; + + /* Fallback to global mode if pid is not set. */ + if (!pid) + return bpf_thp_reg_gloabl(kdata, link); + p = find_get_task_by_vpid(pid); if (!p) return -ESRCH; @@ -209,8 +259,10 @@ static int bpf_thp_reg(void *kdata) * might register this task simultaneously. */ spin_lock(&thp_ops_lock); - /* Each process is exclusively managed by a single BPF-THP. */ - if (rcu_access_pointer(mm->bpf_mm.bpf_thp)) { + /* Each process is exclusively managed by a single BPF-THP. + * Global mode disables per-process instances. + */ + if (rcu_access_pointer(mm->bpf_mm.bpf_thp) || rcu_access_pointer(bpf_thp_global)) { err = -EBUSY; goto out; } @@ -226,12 +278,33 @@ static int bpf_thp_reg(void *kdata) return err; } +static void bpf_thp_unreg_global(void *kdata, struct bpf_link *link) +{ + struct bpf_thp_ops *bpf_thp; + + spin_lock(&thp_ops_lock); + if (!rcu_access_pointer(bpf_thp_global)) { + spin_unlock(&thp_ops_lock); + return; + } + + bpf_thp = rcu_replace_pointer(bpf_thp_global, NULL, + lockdep_is_held(&thp_ops_lock)); + WARN_ON_ONCE(!bpf_thp); + spin_unlock(&thp_ops_lock); + + synchronize_rcu(); +} + static void bpf_thp_unreg(void *kdata) { struct bpf_thp_ops *bpf_thp = kdata; struct bpf_mm_ops *bpf_mm; struct list_head *pos, *n; + if (!bpf_thp->pid) + return bpf_thp_unreg_global(kdata, link); + spin_lock(&thp_ops_lock); list_for_each_safe(pos, n, &bpf_thp->mm_list) { bpf_mm = list_entry(pos, struct bpf_mm_ops, bpf_thp_list); @@ -244,6 +317,31 @@ static void bpf_thp_unreg(void *kdata) synchronize_rcu(); } +static int bpf_thp_update_global(void *kdata, void *old_kdata, struct bpf_link *link) +{ + struct bpf_thp_ops *old_bpf_thp = old_kdata; + struct bpf_thp_ops *bpf_thp = kdata; + struct bpf_thp_ops *old_global; + + if (!old_bpf_thp || !bpf_thp) + return -EINVAL; + + spin_lock(&thp_ops_lock); + /* BPF-THP global instance has already been removed. */ + if (!rcu_access_pointer(bpf_thp_global)) { + spin_unlock(&thp_ops_lock); + return -ENOENT; + } + + old_global = rcu_replace_pointer(bpf_thp_global, bpf_thp, + lockdep_is_held(&thp_ops_lock)); + WARN_ON_ONCE(!old_global); + spin_unlock(&thp_ops_lock); + + synchronize_rcu(); + return 0; +} + static int bpf_thp_update(void *kdata, void *old_kdata) { struct bpf_thp_ops *old_bpf_thp = old_kdata; @@ -251,6 +349,15 @@ static int bpf_thp_update(void *kdata, void *old_kdata) struct bpf_mm_ops *bpf_mm; struct list_head *pos, *n; + /* Updates are confined to instances of the same scope: + * global to global, process-local to process-local. 
+ */ + if (!!old_bpf_thp->pid != !!bpf_thp->pid) + return -EINVAL; + + if (!old_bpf_thp->pid) + return bpf_thp_update_global(kdata, old_kdata, link); + INIT_LIST_HEAD(&bpf_thp->mm_list); /* Could be optimized to a per-instance lock if this lock becomes a bottleneck. */ -- Gitee From fe704d5e0e2a4608f2b7c03567062f5221ea953a Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:56 +0800 Subject: [PATCH 27/32] Documentation: add BPF THP ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-8-laoar.shao@gmail.com/ Add the documentation. Signed-off-by: Yafang Shao Signed-off-by: Yuanhe Shu --- Documentation/admin-guide/mm/transhuge.rst | 113 +++++++++++++++++++++ mm/Kconfig | 2 + 2 files changed, 115 insertions(+) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 1227685c52b1..73867bb4c12b 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -760,3 +760,116 @@ support enabled just fine as always. No difference can be noted in hugetlbfs other than there will be less overall fragmentation. All usual features belonging to hugetlbfs are preserved and unaffected. libhugetlbfs will also work fine as usual. + +BPF THP +======= + +:Author: Yafang Shao +:Date: October 2025 + +Overview +-------- + +When the system is configured with "always" or "madvise" THP mode, a BPF program +can be used to adjust THP allocation policies dynamically. This enables +fine-grained control over THP decisions based on various factors including +workload identity, allocation context, and system memory pressure. + +Program Interface +----------------- + +This feature implements a struct_ops BPF program with the following interface:: + + struct bpf_thp_ops { + pid_t pid; + thp_order_fn_t *thp_get_order; + }; + +Callback Functions +------------------ + +thp_get_order() +~~~~~~~~~~~~~~~ + +.. code-block:: c + + int thp_get_order(struct vm_area_struct *vma, + enum tva_type type, + unsigned long orders); + +Parameters +^^^^^^^^^^ + +``vma`` + ``vm_area_struct`` associated with the THP allocation. + +``type`` + TVA type for the current ``vma``. + +``orders`` + Bitmask of available THP orders for this allocation. + +Return value +^^^^^^^^^^^^ + +- The suggested THP order for allocation from the BPF program +- Must be a valid, available order from the provided ``orders`` bitmask + +Operation Modes +--------------- + +Per Process Mode +~~~~~~~~~~~~~~~~ + +When registering a BPF-THP with a specific PID, the program is installed in the +target task's ``mm_struct``:: + + struct mm_struct { + struct bpf_thp_ops __rcu *bpf_thp; + }; + +Inheritance Behavior +^^^^^^^^^^^^^^^^^^^^ + +- Existing child processes are unaffected +- Newly forked children inherit the BPF-THP from their parent +- The BPF-THP persists across execve() calls + +Management Rules +^^^^^^^^^^^^^^^^ + +- When a BPF-THP instance is unregistered, all managed tasks' ``bpf_thp`` + pointers are reset to ``NULL`` +- When a BPF-THP instance is updated, all managed tasks' ``bpf_thp`` pointers + are automatically updated to the new version +- Each process can be managed by only one BPF-THP instance at a time + +Global Mode +~~~~~~~~~~~ + +If no PID is specified during registration, the BPF-THP operates in global mode. +In this mode, all tasks in the system are managed by the global instance. 
+ +Global Mode Precedence +^^^^^^^^^^^^^^^^^^^^^^ + +- The global instance takes precedence over all per-process instances +- All existing per-process instances are disabled when a global instance is + registered +- New per-process registrations are blocked while a global instance is active +- Existing per-process instances remain registered (no forced unregistration) + +Instance Management +^^^^^^^^^^^^^^^^^^^ + +- Updates are type-isolated: global instances can only be updated by new global + instances, and per-process instances by new per-process instances +- Only one global BPF-THP can be registered at a time +- Global instances can be updated dynamically without requiring task restarts + +Implementation Notes +-------------------- + +- This is currently an experimental feature +- ``CONFIG_BPF_THP`` must be enabled to use this functionality +- The feature depends on proper THP configuration ("always" or "madvise" mode) diff --git a/mm/Kconfig b/mm/Kconfig index 4d19c28546d1..e08b4c16593a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1341,6 +1341,8 @@ config BPF_THP Enable dynamic THP policy adjustment using BPF programs. This feature is currently experimental. + See Documentation/admin-guide/mm/transhuge.rst for more information. + WARNING: This feature is unstable and may change in future kernel endif # BPF_MM -- Gitee From 1151b70b14d13fa57b3f42ff105bfde896c25398 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:57 +0800 Subject: [PATCH 28/32] selftests/bpf: add a simple BPF based THP policy ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-9-laoar.shao@gmail.com/ This test case implements a basic THP policy that sets THPeligible to 0 for a specific task. I selected THPeligible for verification because its straightforward nature makes it ideal for validating the BPF THP policy functionality. Below configs must be enabled for this test: CONFIG_BPF_MM=y CONFIG_BPF_THP=y CONFIG_TRANSPARENT_HUGEPAGE=y Signed-off-by: Yafang Shao Signed-off-by: Yuanhe Shu --- mm/huge_memory_bpf.c | 12 +- tools/testing/selftests/bpf/config | 3 + .../selftests/bpf/prog_tests/thp_adjust.c | 245 ++++++++++++++++++ .../selftests/bpf/progs/test_thp_adjust.c | 24 ++ 4 files changed, 278 insertions(+), 6 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/thp_adjust.c create mode 100644 tools/testing/selftests/bpf/progs/test_thp_adjust.c diff --git a/mm/huge_memory_bpf.c b/mm/huge_memory_bpf.c index 33d6133b11e6..5a7d0cd63e1d 100644 --- a/mm/huge_memory_bpf.c +++ b/mm/huge_memory_bpf.c @@ -209,7 +209,7 @@ static int bpf_thp_init_member(const struct btf_type *t, return 0; } -static int bpf_thp_reg_gloabl(void *kdata, struct bpf_link *link) +static int bpf_thp_reg_gloabl(void *kdata) { struct bpf_thp_ops *ops = kdata; @@ -239,7 +239,7 @@ static int bpf_thp_reg(void *kdata) /* Fallback to global mode if pid is not set. 
*/ if (!pid) - return bpf_thp_reg_gloabl(kdata, link); + return bpf_thp_reg_gloabl(kdata); p = find_get_task_by_vpid(pid); if (!p) @@ -278,7 +278,7 @@ static int bpf_thp_reg(void *kdata) return err; } -static void bpf_thp_unreg_global(void *kdata, struct bpf_link *link) +static void bpf_thp_unreg_global(void *kdata) { struct bpf_thp_ops *bpf_thp; @@ -303,7 +303,7 @@ static void bpf_thp_unreg(void *kdata) struct list_head *pos, *n; if (!bpf_thp->pid) - return bpf_thp_unreg_global(kdata, link); + return bpf_thp_unreg_global(kdata); spin_lock(&thp_ops_lock); list_for_each_safe(pos, n, &bpf_thp->mm_list) { @@ -317,7 +317,7 @@ static void bpf_thp_unreg(void *kdata) synchronize_rcu(); } -static int bpf_thp_update_global(void *kdata, void *old_kdata, struct bpf_link *link) +static int bpf_thp_update_global(void *kdata, void *old_kdata) { struct bpf_thp_ops *old_bpf_thp = old_kdata; struct bpf_thp_ops *bpf_thp = kdata; @@ -356,7 +356,7 @@ static int bpf_thp_update(void *kdata, void *old_kdata) return -EINVAL; if (!old_bpf_thp->pid) - return bpf_thp_update_global(kdata, old_kdata, link); + return bpf_thp_update_global(kdata, old_kdata); INIT_LIST_HEAD(&bpf_thp->mm_list); diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 5751614aef6a..bdcdbad8d43d 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -7,8 +7,10 @@ CONFIG_BPF_JIT=y CONFIG_BPF_KPROBE_OVERRIDE=y CONFIG_BPF_LIRC_MODE2=y CONFIG_BPF_LSM=y +CONFIG_BPF_MM=y CONFIG_BPF_STREAM_PARSER=y CONFIG_BPF_SYSCALL=y +CONFIG_BPF_THP=y # CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set CONFIG_CGROUP_BPF=y CONFIG_CRYPTO_HMAC=y @@ -83,6 +85,7 @@ CONFIG_RC_CORE=y CONFIG_SECURITY=y CONFIG_SECURITYFS=y CONFIG_TEST_BPF=m +CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_USERFAULTFD=y CONFIG_VXLAN=y CONFIG_XDP_SOCKETS=y diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c new file mode 100644 index 000000000000..2b23e2d08092 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "test_thp_adjust.skel.h" + +#define LEN (16 * 1024 * 1024) /* 16MB */ +#define THP_ENABLED_FILE "/sys/kernel/mm/transparent_hugepage/enabled" +#define PMD_SIZE_FILE "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" + +static struct test_thp_adjust *skel; +static char old_mode[32]; +static long pagesize; + +static int thp_mode_save(void) +{ + const char *start, *end; + char buf[128]; + int fd, err; + size_t len; + + fd = open(THP_ENABLED_FILE, O_RDONLY); + if (fd == -1) + return -1; + + err = read(fd, buf, sizeof(buf) - 1); + if (err == -1) + goto close; + + start = strchr(buf, '['); + end = start ? 
strchr(start, ']') : NULL; + if (!start || !end || end <= start) { + err = -1; + goto close; + } + + len = end - start - 1; + if (len >= sizeof(old_mode)) + len = sizeof(old_mode) - 1; + strncpy(old_mode, start + 1, len); + old_mode[len] = '\0'; + +close: + close(fd); + return err; +} + +static int thp_mode_set(const char *desired_mode) +{ + int fd, err; + + fd = open(THP_ENABLED_FILE, O_RDWR); + if (fd == -1) + return -1; + + err = write(fd, desired_mode, strlen(desired_mode)); + close(fd); + return err; +} + +static int thp_mode_reset(void) +{ + int fd, err; + + fd = open(THP_ENABLED_FILE, O_WRONLY); + if (fd == -1) + return -1; + + err = write(fd, old_mode, strlen(old_mode)); + close(fd); + return err; +} + +static char *thp_alloc(void) +{ + char *addr; + int err, i; + + addr = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (addr == MAP_FAILED) + return NULL; + + err = madvise(addr, LEN, MADV_HUGEPAGE); + if (err == -1) + goto unmap; + + /* Accessing a single byte within a page is sufficient to trigger a page fault. */ + for (i = 0; i < LEN; i += pagesize) + addr[i] = 1; + return addr; + +unmap: + munmap(addr, LEN); + return NULL; +} + +static void thp_free(char *ptr) +{ + munmap(ptr, LEN); +} + +static int get_pmd_order(void) +{ + ssize_t bytes_read, size; + int fd, order, ret = -1; + char buf[64], *endptr; + + fd = open(PMD_SIZE_FILE, O_RDONLY); + if (fd < 0) + return -1; + + bytes_read = read(fd, buf, sizeof(buf) - 1); + if (bytes_read <= 0) + goto close_fd; + + /* Remove potential newline character */ + if (buf[bytes_read - 1] == '\n') + buf[bytes_read - 1] = '\0'; + + size = strtoul(buf, &endptr, 10); + if (endptr == buf || *endptr != '\0') + goto close_fd; + if (size % pagesize != 0) + goto close_fd; + ret = size / pagesize; + if ((ret & (ret - 1)) == 0) { + order = 0; + while (ret > 1) { + ret >>= 1; + order++; + } + ret = order; + } + +close_fd: + close(fd); + return ret; +} + +static int get_thp_eligible(pid_t pid, unsigned long addr) +{ + int this_vma = 0, eligible = -1; + unsigned long start, end; + char smaps_path[64]; + FILE *smaps_file; + char line[4096]; + + snprintf(smaps_path, sizeof(smaps_path), "/proc/%d/smaps", pid); + smaps_file = fopen(smaps_path, "r"); + if (!smaps_file) + return -1; + + while (fgets(line, sizeof(line), smaps_file)) { + if (sscanf(line, "%lx-%lx", &start, &end) == 2) { + /* addr is monotonic */ + if (addr < start) + break; + this_vma = (addr >= start && addr < end) ? 
1 : 0; + continue; + } + + if (!this_vma) + continue; + + if (strstr(line, "THPeligible:")) { + sscanf(line, "THPeligible: %d", &eligible); + break; + } + } + + fclose(smaps_file); + return eligible; +} + +static void subtest_thp_eligible(void) +{ + struct bpf_link *ops_link; + int elighble; + char *ptr; + + ops_link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops); + if (!ASSERT_OK_PTR(ops_link, "attach struct_ops")) + return; + + ptr = thp_alloc(); + if (!ASSERT_OK_PTR(ptr, "THP alloc")) + goto detach; + + elighble = get_thp_eligible(getpid(), (unsigned long)ptr); + ASSERT_EQ(elighble, 0, "THPeligible"); + + thp_free(ptr); +detach: + bpf_link__destroy(ops_link); +} + +static int thp_adjust_setup(void) +{ + int err = -1, pmd_order; + + pagesize = sysconf(_SC_PAGESIZE); + pmd_order = get_pmd_order(); + if (!ASSERT_NEQ(pmd_order, -1, "get_pmd_order")) + return -1; + + if (!ASSERT_NEQ(thp_mode_save(), -1, "THP mode save")) + return -1; + if (!ASSERT_GE(thp_mode_set("madvise"), 0, "THP mode set")) + return -1; + + skel = test_thp_adjust__open(); + if (!ASSERT_OK_PTR(skel, "open")) + goto thp_reset; + + skel->bss->pmd_order = pmd_order; + skel->struct_ops.thp_eligible_ops->pid = getpid(); + + err = test_thp_adjust__load(skel); + if (!ASSERT_OK(err, "load")) + goto destroy; + return 0; + +destroy: + test_thp_adjust__destroy(skel); +thp_reset: + ASSERT_GE(thp_mode_reset(), 0, "THP mode reset"); + return err; +} + +static void thp_adjust_destroy(void) +{ + test_thp_adjust__destroy(skel); + ASSERT_GE(thp_mode_reset(), 0, "THP mode reset"); +} + +void test_thp_adjust(void) +{ + if (thp_adjust_setup() == -1) + return; + + if (test__start_subtest("thp_eligible")) + subtest_thp_eligible(); + + thp_adjust_destroy(); +} diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust.c b/tools/testing/selftests/bpf/progs/test_thp_adjust.c new file mode 100644 index 000000000000..b180a7f9b923 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_thp_adjust.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "vmlinux.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +int pmd_order; + +SEC("struct_ops/thp_get_order") +int BPF_PROG(thp_not_eligible, struct vm_area_struct *vma, enum tva_type type, + unsigned long orders) +{ + /* THPeligible in /proc/pid/smaps is 0 */ + if (type == TVA_SMAPS) + return 0; + return pmd_order; +} + +SEC(".struct_ops.link") +struct bpf_thp_ops thp_eligible_ops = { + .thp_get_order = (void *)thp_not_eligible, +}; -- Gitee From 487bc7a3bebb990117e614f44bbb1a6d0cc19b4a Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:58 +0800 Subject: [PATCH 29/32] selftests/bpf: add test case to update THP policy ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-10-laoar.shao@gmail.com/ This test case exercises the BPF THP update mechanism by modifying an existing policy. 
The behavior confirms that: - EBUSY error occurs when attempting to install a BPF program on a process that already has an active BPF program - Updates to currently running programs are successfully processed - Local prog can't be updated by a global prog - Global prog can't be updated by a local prog - Global prog can be attached even if there's a local prog - Local prog can't be attached if there's a global prog Signed-off-by: Yafang Shao Signed-off-by: Yuanhe Shu --- .../selftests/bpf/prog_tests/thp_adjust.c | 79 +++++++++++++++++++ .../selftests/bpf/progs/test_thp_adjust.c | 29 +++++++ 2 files changed, 108 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c index 2b23e2d08092..0d570cee9006 100644 --- a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c +++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c @@ -194,6 +194,79 @@ static void subtest_thp_eligible(void) bpf_link__destroy(ops_link); } +static void subtest_thp_policy_update(void) +{ + struct bpf_link *old_link, *new_link; + int elighble, err, pid; + char *ptr; + + pid = getpid(); + ptr = thp_alloc(); + + old_link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops); + if (!ASSERT_OK_PTR(old_link, "attach_old_link")) + goto free; + + elighble = get_thp_eligible(pid, (unsigned long)ptr); + ASSERT_EQ(elighble, 0, "THPeligible"); + + /* Attach multi BPF-THP to a single process is rejected. */ + new_link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops2); + if (!ASSERT_NULL(new_link, "attach_new_link")) + goto destory_old; + ASSERT_EQ(errno, EBUSY, "attach_new_link"); + + elighble = get_thp_eligible(pid, (unsigned long)ptr); + ASSERT_EQ(elighble, 0, "THPeligible"); + + err = bpf_link__update_map(old_link, skel->maps.thp_eligible_ops2); + ASSERT_EQ(err, 0, "update_old_link"); + + elighble = get_thp_eligible(pid, (unsigned long)ptr); + ASSERT_EQ(elighble, 1, "THPeligible"); + + /* Per process prog can't be update by a global prog */ + err = bpf_link__update_map(old_link, skel->maps.swap_ops); + ASSERT_EQ(err, -EINVAL, "update_old_link"); + +destory_old: + bpf_link__destroy(old_link); +free: + thp_free(ptr); +} + +static void subtest_thp_global_policy(void) +{ + struct bpf_link *local_link, *global_link; + int err; + + local_link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops); + if (!ASSERT_OK_PTR(local_link, "attach_local_link")) + return; + + /* global prog can be attached even if there is a local prog */ + global_link = bpf_map__attach_struct_ops(skel->maps.swap_ops); + if (!ASSERT_OK_PTR(global_link, "attach_global_link")) { + bpf_link__destroy(local_link); + return; + } + + bpf_link__destroy(local_link); + + /* local prog can't be attaached if there is a global prog */ + local_link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops); + if (!ASSERT_NULL(local_link, "attach_new_link")) + goto destory_global; + ASSERT_EQ(errno, EBUSY, "attach_new_link"); + + /* global prog can't be updated by a local prog */ + err = bpf_link__update_map(global_link, skel->maps.thp_eligible_ops); + ASSERT_EQ(err, -EINVAL, "update_old_link"); + +destory_global: + bpf_link__destroy(global_link); +} + static int thp_adjust_setup(void) { int err = -1, pmd_order; @@ -214,6 +287,8 @@ static int thp_adjust_setup(void) skel->bss->pmd_order = pmd_order; skel->struct_ops.thp_eligible_ops->pid = getpid(); + skel->struct_ops.thp_eligible_ops2->pid = getpid(); + /* swap_ops is a global prog since its pid is not set. 
*/ err = test_thp_adjust__load(skel); if (!ASSERT_OK(err, "load")) @@ -240,6 +315,10 @@ void test_thp_adjust(void) if (test__start_subtest("thp_eligible")) subtest_thp_eligible(); + if (test__start_subtest("policy_update")) + subtest_thp_policy_update(); + if (test__start_subtest("global_policy")) + subtest_thp_global_policy(); thp_adjust_destroy(); } diff --git a/tools/testing/selftests/bpf/progs/test_thp_adjust.c b/tools/testing/selftests/bpf/progs/test_thp_adjust.c index b180a7f9b923..44648326819a 100644 --- a/tools/testing/selftests/bpf/progs/test_thp_adjust.c +++ b/tools/testing/selftests/bpf/progs/test_thp_adjust.c @@ -22,3 +22,32 @@ SEC(".struct_ops.link") struct bpf_thp_ops thp_eligible_ops = { .thp_get_order = (void *)thp_not_eligible, }; + +SEC("struct_ops/thp_get_order") +int BPF_PROG(thp_eligible, struct vm_area_struct *vma, enum tva_type type, + unsigned long orders) +{ + /* THPeligible in /proc/pid/smaps is 1 */ + if (type == TVA_SMAPS) + return pmd_order; + return pmd_order; +} + +SEC(".struct_ops.link") +struct bpf_thp_ops thp_eligible_ops2 = { + .thp_get_order = (void *)thp_eligible, +}; + +SEC("struct_ops/thp_get_order") +int BPF_PROG(alloc_not_in_swap, struct vm_area_struct *vma, enum tva_type type, + unsigned long orders) +{ + if (type == TVA_SWAP_PAGEFAULT) + return 0; + return -1; +} + +SEC(".struct_ops.link") +struct bpf_thp_ops swap_ops = { + .thp_get_order = (void *)alloc_not_in_swap, +}; -- Gitee From 3a7ae24037cb64073a5c8f4096de1511759d23f2 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 26 Oct 2025 18:01:59 +0800 Subject: [PATCH 30/32] selftests/bpf: add test case for BPF-THP inheritance across fork ANBZ: #28369 cherry-picked from https://lore.kernel.org/all/20251026100159.6103-11-laoar.shao@gmail.com/ Verify that child processes correctly inherit BPF-THP policy from their parent during fork() operations. 
Signed-off-by: Yafang Shao Signed-off-by: Yuanhe Shu --- .../selftests/bpf/prog_tests/thp_adjust.c | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c index 0d570cee9006..f585e60882e8 100644 --- a/tools/testing/selftests/bpf/prog_tests/thp_adjust.c +++ b/tools/testing/selftests/bpf/prog_tests/thp_adjust.c @@ -267,6 +267,37 @@ static void subtest_thp_global_policy(void) bpf_link__destroy(global_link); } +static void subtest_thp_fork(void) +{ + int elighble, child, pid, status; + struct bpf_link *ops_link; + char *ptr; + + ops_link = bpf_map__attach_struct_ops(skel->maps.thp_eligible_ops); + if (!ASSERT_OK_PTR(ops_link, "attach struct_ops")) + return; + + child = fork(); + if (!ASSERT_GE(child, 0, "fork")) + goto destroy; + + if (child == 0) { + ptr = thp_alloc(); + elighble = get_thp_eligible(getpid(), (unsigned long)ptr); + ASSERT_EQ(elighble, 0, "THPeligible"); + thp_free(ptr); + + exit(EXIT_SUCCESS); + } + + pid = waitpid(child, &status, 0); + ASSERT_EQ(pid, child, "waitpid"); + +destroy: + bpf_link__destroy(ops_link); + +} + static int thp_adjust_setup(void) { int err = -1, pmd_order; @@ -319,6 +350,8 @@ void test_thp_adjust(void) subtest_thp_policy_update(); if (test__start_subtest("global_policy")) subtest_thp_global_policy(); + if (test__start_subtest("thp_fork")) + subtest_thp_fork(); thp_adjust_destroy(); } -- Gitee From 39e06b837d58c7485b30be8568e63f7eadb2b703 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 15 Mar 2024 11:48:54 -0700 Subject: [PATCH 31/32] bpf: Allow helper bpf_get_[ns_]current_pid_tgid() for all prog types ANBZ: #28369 commit eb166e522c77699fc19bfa705652327a1e51a117 upstream. Currently bpf_get_current_pid_tgid() is allowed in tracing, cgroup and sk_msg progs while bpf_get_ns_current_pid_tgid() is only allowed in tracing progs. We have an internal use case where for an application running in a container (with pid namespace), user wants to get the pid associated with the pid namespace in a cgroup bpf program. Currently, cgroup bpf progs already allow bpf_get_current_pid_tgid(). Let us allow bpf_get_ns_current_pid_tgid() as well. With auditing the code, bpf_get_current_pid_tgid() is also used by sk_msg prog. But there are no side effect to expose these two helpers to all prog types since they do not reveal any kernel specific data. The detailed discussion is in [1]. So with this patch, both bpf_get_current_pid_tgid() and bpf_get_ns_current_pid_tgid() are put in bpf_base_func_proto(), making them available to all program types. 
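For example, a cgroup program can now resolve the namespaced pid/tgid of the current task just as tracing programs already could. A minimal sketch follows; the helper and struct bpf_pidns_info are the existing UAPI ones, while the attach point (cgroup/sock_create), the global variable names and what is done with the result are illustrative. Userspace is expected to fill the namespace identity (dev/ino of /proc/self/ns/pid, e.g. via stat()) before loading:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  char _license[] SEC("license") = "GPL";

  /* Identity of the pid namespace to resolve against, filled in by
   * userspace before the object is loaded. */
  const volatile __u64 pidns_dev;
  const volatile __u64 pidns_ino;

  SEC("cgroup/sock_create")
  int ns_pid_logger(struct bpf_sock *ctx)
  {
      struct bpf_pidns_info ns = {};

      /* Previously tracing-only; now reachable from cgroup progs via
       * bpf_base_func_proto(). */
      if (bpf_get_ns_current_pid_tgid(pidns_dev, pidns_ino, &ns, sizeof(ns)))
          return 1;       /* lookup failed, allow the socket anyway */

      bpf_printk("socket created by namespaced tgid %u", ns.tgid);
      return 1;           /* 1 == allow for cgroup/sock_create */
  }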
[1] https://lore.kernel.org/bpf/20240307232659.1115872-1-yonghong.song@linux.dev/ Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20240315184854.2975190-1-yonghong.song@linux.dev Signed-off-by: Yuanhe Shu --- kernel/bpf/cgroup.c | 2 -- kernel/bpf/helpers.c | 4 ++++ kernel/trace/bpf_trace.c | 4 ---- net/core/filter.c | 2 -- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 684fb450ad08..8abaa2a32596 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2605,8 +2605,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_comm: return &bpf_get_current_comm_proto; #ifdef CONFIG_CGROUP_NET_CLASSID diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 285f86babb1f..371f23bcdeaf 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1810,6 +1810,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_strtol_proto; case BPF_FUNC_strtoul: return &bpf_strtoul_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_get_ns_current_pid_tgid: + return &bpf_get_ns_current_pid_tgid_proto; default: break; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e67437c8de08..1849be428b5d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1455,8 +1455,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_boot_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_get_current_task_btf: @@ -1512,8 +1510,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_send_signal_thread_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; - case BPF_FUNC_get_ns_current_pid_tgid: - return &bpf_get_ns_current_pid_tgid_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/net/core/filter.c b/net/core/filter.c index e786ff34ea19..2c6267da5c23 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8376,8 +8376,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_event_output_data_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: -- Gitee From 2ea659e610e74bdaf96cd5fbc1d08116a9e68f69 Mon Sep 17 00:00:00 2001 From: Yuanhe Shu Date: Tue, 23 Dec 2025 19:25:59 +0800 Subject: [PATCH 32/32] anolis: configs: Enable CONFIG_BPF_MM and CONFIG_BPF_THP ANBZ: #28369 Enable CONFIG_BPF_MM and CONFIG_BPF_THP to support thp order selection by bpf prog. 
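With these options enabled, the bpf_thp_ops struct_ops type is visible in vmlinux BTF (struct_ops types are described by kernel BTF), which offers a quick runtime check that a given kernel build actually carries BPF-THP support. A small sketch using stock libbpf; the only assumption beyond this series is that the struct_ops type keeps the bpf_thp_ops name used by the selftests above:

  #include <stdio.h>
  #include <linux/btf.h>
  #include <bpf/btf.h>

  int main(void)
  {
      struct btf *btf = btf__load_vmlinux_btf();
      int id;

      if (!btf) {
          perror("btf__load_vmlinux_btf");
          return 1;
      }

      /* The type is only expected in BTF when the BPF-THP struct_ops
       * was compiled into the kernel. */
      id = btf__find_by_name_kind(btf, "bpf_thp_ops", BTF_KIND_STRUCT);
      printf("bpf_thp_ops: %s\n",
             id < 0 ? "not found (CONFIG_BPF_THP likely disabled)" : "present");

      btf__free(btf);
      return id < 0;
  }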
Signed-off-by: Yuanhe Shu --- anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_MM | 1 + anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_THP | 1 + 2 files changed, 2 insertions(+) create mode 100644 anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_MM create mode 100644 anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_THP diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_MM b/anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_MM new file mode 100644 index 000000000000..ba92373f4c5f --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_MM @@ -0,0 +1 @@ +CONFIG_BPF_MM=y diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_THP b/anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_THP new file mode 100644 index 000000000000..47ed1bfe9c31 --- /dev/null +++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_BPF_THP @@ -0,0 +1 @@ +CONFIG_BPF_THP=y -- Gitee