From 57a196a58421a4b0c45949ae7309f21829aaa77f Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sun, 18 Sep 2022 19:13:48 -0700 Subject: [PATCH 001/313] hugetlb: simplify hugetlb handling in follow_page_mask During discussions of this series [1], it was suggested that hugetlb handling code in follow_page_mask could be simplified. At the beginning of follow_page_mask, there currently is a call to follow_huge_addr which 'may' handle hugetlb pages. ia64 is the only architecture which provides a follow_huge_addr routine that does not return error. Instead, at each level of the page table a check is made for a hugetlb entry. If a hugetlb entry is found, a call to a routine associated with that entry is made. Currently, there are two checks for hugetlb entries at each page table level. The first check is of the form: if (p?d_huge()) page = follow_huge_p?d(); the second check is of the form: if (is_hugepd()) page = follow_huge_pd(). We can replace these checks, as well as the special handling routines such as follow_huge_p?d() and follow_huge_pd() with a single routine to handle hugetlb vmas. A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the beginning of follow_page_mask. hugetlb_follow_page_mask will use the existing routine huge_pte_offset to walk page tables looking for hugetlb entries. huge_pte_offset can be overwritten by architectures, and already handles special cases such as hugepd entries. [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/ [mike.kravetz@oracle.com: remove vma (pmd sharing) per Peter] Link: https://lkml.kernel.org/r/20221028181108.119432-1-mike.kravetz@oracle.com [mike.kravetz@oracle.com: remove left over hugetlb_vma_unlock_read()] Link: https://lkml.kernel.org/r/20221030225825.40872-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220919021348.22151-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Aneesh Kumar K.V Cc: Christophe Leroy Cc: Michael Ellerman Cc: Muchun Song Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- arch/ia64/mm/hugetlbpage.c | 15 --- arch/powerpc/mm/hugetlbpage.c | 37 -------- include/linux/hugetlb.h | 50 ++-------- mm/gup.c | 80 +++------------- mm/hugetlb.c | 172 +++++++++++----------------------- 5 files changed, 76 insertions(+), 278 deletions(-) diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index f993cb36c062..380d2f3966c9 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file, return 0; } -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write) -{ - struct page *page; - pte_t *ptep; - - if (REGION_NUMBER(addr) != RGN_HPAGE) - return ERR_PTR(-EINVAL); - - ptep = huge_pte_offset(mm, addr, HPAGE_SIZE); - if (!ptep || pte_none(*ptep)) - return NULL; - page = pte_page(*ptep); - page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT); - return page; -} int pmd_huge(pmd_t pmd) { return 0; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 5852a86d990d..f1ba8d1e8c1a 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -506,43 +506,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } -struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, - int flags, int pdshift) -{ - pte_t *ptep; - spinlock_t *ptl; - struct page *page = NULL; - unsigned long mask; - int shift = hugepd_shift(hpd); - struct mm_struct *mm = vma->vm_mm; - -retry: - /* - * hugepage directory entries are protected by mm->page_table_lock - * Use this instead of huge_pte_lockptr - */ - ptl = &mm->page_table_lock; - spin_lock(ptl); - - ptep = hugepte_offset(hpd, address, pdshift); - if (pte_present(*ptep)) { - mask = (1UL << shift) - 1; - page = pte_page(*ptep); - page += ((address & mask) >> PAGE_SHIFT); - if (flags & FOLL_GET) - get_page(page); - } else { - if (is_hugetlb_entry_migration(*ptep)) { - spin_unlock(ptl); - __migration_entry_wait(mm, ptep, ptl); - goto retry; - } - } - spin_unlock(ptl); - return page; -} - bool __init arch_hugetlb_valid_size(unsigned long size) { int shift = __ffs(size); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8b4f93e84868..4a76c0fc6bbf 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -149,6 +149,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long len); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, unsigned long *, long, unsigned int, @@ -209,17 +211,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write); -struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, - int flags, int pdshift); -struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, - int flags); -struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags); -struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, - pgd_t *pgd, int flags); void hugetlb_vma_lock_read(struct vm_area_struct *vma); void hugetlb_vma_unlock_read(struct vm_area_struct *vma); @@ -272,6 +263,12 @@ static inline void adjust_range_if_pmd_sharing_possible( { } +static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ +} + static inline long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, @@ -282,12 +279,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm, return 0; } -static inline struct page *follow_huge_addr(struct mm_struct *mm, - unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, @@ -320,31 +311,6 @@ static inline void hugetlb_show_meminfo_node(int nid) { } -static inline struct page *follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, - int pdshift) -{ - return NULL; -} - -static inline struct page *follow_huge_pmd_pte(struct vm_area_struct *vma, - unsigned long address, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pud(struct mm_struct *mm, - unsigned long address, pud_t *pud, int flags) -{ - return NULL; -} - -static inline struct page *follow_huge_pgd(struct mm_struct *mm, - unsigned long address, pgd_t *pgd, int flags) -{ - return NULL; -} - static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { diff --git a/mm/gup.c b/mm/gup.c index fe195d47de74..6b16aecf5d2c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -537,18 +537,6 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); - - /* - * Considering PTE level hugetlb, like continuous-PTE hugetlb on - * ARM64 architecture. - */ - if (is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd_pte(vma, address, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); @@ -680,20 +668,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmdval = READ_ONCE(*pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd_pte(vma, address, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pmd_val(pmdval)))) { - page = follow_huge_pd(vma, address, - __hugepd(pmd_val(pmdval)), flags, - PMD_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } retry: if (!pmd_present(pmdval)) { /* @@ -783,20 +757,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, pud = pud_offset(p4dp, address); if (pud_none(*pud)) return no_page_table(vma, flags); - if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pud(mm, address, pud, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pud_val(*pud)))) { - page = follow_huge_pd(vma, address, - __hugepd(pud_val(*pud)), flags, - PUD_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); @@ -816,7 +776,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, struct follow_page_context *ctx) { p4d_t *p4d; - struct page *page; p4d = p4d_offset(pgdp, address); if (p4d_none(*p4d)) @@ -825,14 +784,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); - if (is_hugepd(__hugepd(p4d_val(*p4d)))) { - page = follow_huge_pd(vma, address, - __hugepd(p4d_val(*p4d)), flags, - P4D_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } return follow_pud_mask(vma, address, p4d, flags, ctx); } @@ -870,10 +821,18 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, ctx->page_mask = 0; - /* make this handle hugepd */ - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN)); + /* + * Call hugetlb_follow_page_mask for hugetlb vmas as it will use + * special hugetlb page table walking code. This eliminates the + * need to check for hugetlb entries in the general walking code. + * + * hugetlb_follow_page_mask is only for follow_page() handling here. + * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. + */ + if (is_vm_hugetlb_page(vma)) { + page = hugetlb_follow_page_mask(vma, address, flags); + if (!page) + page = no_page_table(vma, flags); return page; } @@ -882,21 +841,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); - if (pgd_huge(*pgd)) { - page = follow_huge_pgd(mm, address, pgd, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pgd_val(*pgd)))) { - page = follow_huge_pd(vma, address, - __hugepd(pgd_val(*pgd)), flags, - PGDIR_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } - return follow_p4d_mask(vma, address, pgd, flags, ctx); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 546df97c31e4..0af18c1e4b31 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6209,6 +6209,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, return false; } +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & huge_page_mask(h); + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte, entry; + + /* + * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via + * follow_hugetlb_page(). + */ + if (WARN_ON_ONCE(flags & FOLL_PIN)) + return NULL; + +retry: + pte = huge_pte_offset(mm, haddr, huge_page_size(h)); + if (!pte) + return NULL; + + ptl = huge_pte_lock(h, mm, pte); + entry = huge_ptep_get(pte); + if (pte_present(entry)) { + page = pte_page(entry) + + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + /* + * Note that page may be a sub-page, and with vmemmap + * optimizations the page struct may be read only. + * try_grab_page() will increase the ref count on the + * head page, so this will be OK. + * + * try_grab_page() should always succeed here, because we hold + * the ptl lock and have verified pte_present(). + */ + if (WARN_ON_ONCE(!try_grab_page(page, flags))) { + page = NULL; + goto out; + } + } else { + if (is_hugetlb_entry_migration(entry)) { + spin_unlock(ptl); + __migration_entry_wait_huge(pte, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). + */ + } +out: + spin_unlock(ptl); + return page; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, @@ -7201,122 +7257,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. */ -struct page * __weak -follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - -struct page * __weak -follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, int pdshift) -{ - WARN(1, "hugepd follow called with no support for hugepage directory format\n"); - return NULL; -} - -struct page * __weak -follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) -{ - struct hstate *h = hstate_vma(vma); - struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; - spinlock_t *ptl; - pte_t *ptep, pte; - - /* - * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via - * follow_hugetlb_page(). - */ - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptep = huge_pte_offset(mm, address, huge_page_size(h)); - if (!ptep) - return NULL; - - ptl = huge_pte_lock(h, mm, ptep); - pte = huge_ptep_get(ptep); - if (pte_present(pte)) { - page = pte_page(pte) + - ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); - /* - * try_grab_page() should always succeed here, because: a) we - * hold the pmd (ptl) lock, and b) we've just checked that the - * huge pmd (head) page is present in the page tables. The ptl - * prevents the head page and tail pages from being rearranged - * in any way. So this page must be available at this point, - * unless the page refcount overflowed: - */ - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait_huge(ptep, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - struct page *page = NULL; - spinlock_t *ptl; - pte_t pte; - - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud); - if (!pud_huge(*pud)) - goto out; - pte = huge_ptep_get((pte_t *)pud); - if (pte_present(pte)) { - page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pud, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) -{ - if (flags & (FOLL_GET | FOLL_PIN)) - return NULL; - - return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); -} - int isolate_hugetlb(struct page *page, struct list_head *list) { int ret = 0; From 0538a82c39e94d49fa6985c6a0101ca819be11ee Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Oct 2022 15:31:13 -0400 Subject: [PATCH 002/313] mm: vmscan: make rotations a secondary factor in balancing anon vs file We noticed a 2% webserver throughput regression after upgrading from 5.6. This could be tracked down to a shift in the anon/file reclaim balance (confirmed with swappiness) that resulted in worse reclaim efficiency and thus more kswapd activity for the same outcome. The change that exposed the problem is aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU"). By qualifying swapins based on their refault distance, it lowered the cost of anon reclaim in this workload, in turn causing (much) more anon scanning than before. Scanning the anon list is more expensive due to the higher ratio of mmapped pages that may rotate during reclaim, and so the result was an increase in %sys time. Right now, rotations aren't considered a cost when balancing scan pressure between LRUs. We can end up with very few file refaults putting all the scan pressure on hot anon pages that are rotated en masse, don't get reclaimed, and never push back on the file LRU again. We still only reclaim file cache in that case, but we burn a lot CPU rotating anon pages. It's "fair" from an LRU age POV, but doesn't reflect the real cost it imposes on the system. Consider rotations as a secondary factor in balancing the LRUs. This doesn't attempt to make a precise comparison between IO cost and CPU cost, it just says: if reloads are about comparable between the lists, or rotations are overwhelmingly different, adjust for CPU work. This fixed the regression on our webservers. It has since been deployed to the entire Meta fleet and hasn't caused any problems. Link: https://lkml.kernel.org/r/20221013193113.726425-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++-- mm/swap.c | 22 +++++++++++++++++----- mm/vmscan.c | 4 +++- mm/workingset.c | 2 +- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index a18cf4b7c724..369d7799205d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -384,8 +384,9 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); -void lru_note_cost_folio(struct folio *); +void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated); +void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); void lru_cache_add(struct page *); diff --git a/mm/swap.c b/mm/swap.c index 955930f41d20..2f12a2ee1d3a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -295,8 +295,20 @@ void folio_rotate_reclaimable(struct folio *folio) } } -void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) +void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated) { + unsigned long cost; + + /* + * Reflect the relative cost of incurring IO and spending CPU + * time on rotations. This doesn't attempt to make a precise + * comparison, it just says: if reloads are about comparable + * between the LRU lists, or rotations are overwhelmingly + * different between them, adjust scan balance for CPU work. + */ + cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; + do { unsigned long lrusize; @@ -310,9 +322,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) - lruvec->file_cost += nr_pages; + lruvec->file_cost += cost; else - lruvec->anon_cost += nr_pages; + lruvec->anon_cost += cost; /* * Decay previous events @@ -335,10 +347,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) } while ((lruvec = parent_lruvec(lruvec))); } -void lru_note_cost_folio(struct folio *folio) +void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), - folio_nr_pages(folio)); + folio_nr_pages(folio), 0); } static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) diff --git a/mm/vmscan.c b/mm/vmscan.c index 04d8b88e5216..ffe402e095d3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2499,7 +2499,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); - lru_note_cost(lruvec, file, stat.nr_pageout); + lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); mem_cgroup_uncharge_list(&folio_list); free_unref_page_list(&folio_list); @@ -2639,6 +2639,8 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&lruvec->lru_lock); + if (nr_rotated) + lru_note_cost(lruvec, file, 0, nr_rotated); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, diff --git a/mm/workingset.c b/mm/workingset.c index ae7e984b23c6..d2d02978588c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -493,7 +493,7 @@ void workingset_refault(struct folio *folio, void *shadow) if (workingset) { folio_set_workingset(folio); /* XXX: Move to lru_cache_add() when it supports new vs putback */ - lru_note_cost_folio(folio); + lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } out: From de2baa880de3d5eecf2e46ebdee0aaeb565f5917 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 14 Oct 2022 10:39:18 -0400 Subject: [PATCH 003/313] selftests/vm: use memfd for uffd hugetlb tests Patch series "selftests/vm: Drop hugetlb mntpoint in run_vmtests.sh", v2. Clean the code up so we can use the same memfd for both hugetlb and shmem which is cleaner. This patch (of 4): We already used memfd for shmem test, move it forward with hugetlb too so that we don't need user to specify the hugetlb file path explicitly when running hugetlb shared tests. Link: https://lkml.kernel.org/r/20221014143921.93887-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221014143921.93887-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Cc: Axel Rasmussen Cc: Mike Kravetz Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/userfaultfd.c | 62 ++++++++---------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 297f250c1d95..7f22844ed704 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -93,10 +93,8 @@ static volatile bool test_uffdio_zeropage_eexist = true; static bool test_uffdio_wp = true; /* Whether to test uffd minor faults */ static bool test_uffdio_minor = false; - static bool map_shared; -static int shm_fd; -static int huge_fd; +static int mem_fd; static unsigned long long *count_verify; static int uffd = -1; static int uffd_flags, finished, *pipefd; @@ -143,7 +141,7 @@ const char *examples = "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" "./userfaultfd hugetlb 256 50\n\n" "# Run the same hugetlb test but using shared file:\n" - "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n" + "./userfaultfd hugetlb_shared 256 50\n\n" "# 10MiB-~6GiB 999 bounces anonymous test, " "continue forever unless an error triggers\n" "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; @@ -260,35 +258,21 @@ static void hugetlb_release_pages(char *rel_area) static void hugetlb_allocate_area(void **alloc_area, bool is_src) { + off_t size = nr_pages * page_size; + off_t offset = is_src ? 0 : size; void *area_alias = NULL; char **alloc_area_alias; - if (!map_shared) - *alloc_area = mmap(NULL, - nr_pages * page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | - (is_src ? 0 : MAP_NORESERVE), - -1, - 0); - else - *alloc_area = mmap(NULL, - nr_pages * page_size, - PROT_READ | PROT_WRITE, - MAP_SHARED | - (is_src ? 0 : MAP_NORESERVE), - huge_fd, - is_src ? 0 : nr_pages * page_size); + *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, + (map_shared ? MAP_SHARED : MAP_PRIVATE) | + (is_src ? 0 : MAP_NORESERVE), + mem_fd, offset); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); if (map_shared) { - area_alias = mmap(NULL, - nr_pages * page_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, - huge_fd, - is_src ? 0 : nr_pages * page_size); + area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, mem_fd, offset); if (area_alias == MAP_FAILED) err("mmap of hugetlb file alias failed"); } @@ -334,14 +318,14 @@ static void shmem_allocate_area(void **alloc_area, bool is_src) } *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - shm_fd, offset); + mem_fd, offset); if (*alloc_area == MAP_FAILED) err("mmap of memfd failed"); if (test_collapse && *alloc_area != p) err("mmap of memfd failed at %p", p); area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - shm_fd, offset); + mem_fd, offset); if (area_alias == MAP_FAILED) err("mmap of memfd alias failed"); if (test_collapse && area_alias != p_alias) @@ -1841,21 +1825,17 @@ int main(int argc, char **argv) } nr_pages = nr_pages_per_cpu * nr_cpus; - if (test_type == TEST_HUGETLB && map_shared) { - if (argc < 5) - usage(); - huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755); - if (huge_fd < 0) - err("Open of %s failed", argv[4]); - if (ftruncate(huge_fd, 0)) - err("ftruncate %s to size 0 failed", argv[4]); - } else if (test_type == TEST_SHMEM) { - shm_fd = memfd_create(argv[0], 0); - if (shm_fd < 0) + if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { + unsigned int memfd_flags = 0; + + if (test_type == TEST_HUGETLB) + memfd_flags = MFD_HUGETLB; + mem_fd = memfd_create(argv[0], memfd_flags); + if (mem_fd < 0) err("memfd_create"); - if (ftruncate(shm_fd, nr_pages * page_size * 2)) + if (ftruncate(mem_fd, nr_pages * page_size * 2)) err("ftruncate"); - if (fallocate(shm_fd, + if (fallocate(mem_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, nr_pages * page_size * 2)) err("fallocate"); From 62f33fa228003a419a9484eed5133447a280387c Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 14 Oct 2022 10:39:19 -0400 Subject: [PATCH 004/313] selftests/vm: use memfd for hugetlb-madvise test For dropping the hugetlb mountpoint in run_vmtests.sh. Since no parameter is needed, drop USAGE too. Link: https://lkml.kernel.org/r/20221014143921.93887-3-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: Mike Kravetz Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hugetlb-madvise.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c index 3c9943131881..f96435b70986 100644 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -12,6 +12,7 @@ * directory. */ +#define _GNU_SOURCE #include #include #include @@ -19,7 +20,6 @@ #define __USE_GNU #include -#define USAGE "USAGE: %s \n" #define MIN_FREE_PAGES 20 #define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ @@ -103,11 +103,6 @@ int main(int argc, char **argv) int fd; int ret; - if (argc != 2) { - printf(USAGE, argv[0]); - exit(1); - } - huge_page_size = default_huge_page_size(); if (!huge_page_size) { printf("Unable to determine huge page size, exiting!\n"); @@ -125,9 +120,9 @@ int main(int argc, char **argv) exit(1); } - fd = open(argv[1], O_CREAT | O_RDWR, 0755); + fd = memfd_create(argv[0], MFD_HUGETLB); if (fd < 0) { - perror("Open failed"); + perror("memfd_create() failed"); exit(1); } @@ -406,6 +401,5 @@ int main(int argc, char **argv) (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); close(fd); - unlink(argv[1]); return 0; } From 4705700d4fef3178e93230f53c2c528569744bb6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 14 Oct 2022 10:40:13 -0400 Subject: [PATCH 005/313] selftests/vm: use memfd for hugepage-mremap test For dropping the hugetlb mountpoint in run_vmtests.sh. Cleaned it up a little bit around the changed codes. Link: https://lkml.kernel.org/r/20221014144013.94027-1-peterx@redhat.com Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: Mike Kravetz Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hugepage-mremap.c | 21 +++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index e63a0214f639..e53b5eaa8fce 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -22,6 +22,7 @@ #include /* Definition of SYS_* constants */ #include #include +#include #define DEFAULT_LENGTH_MB 10UL #define MB_TO_BYTES(x) (x * 1024 * 1024) @@ -108,26 +109,23 @@ static void register_region_with_uffd(char *addr, size_t len) int main(int argc, char *argv[]) { size_t length = 0; + int ret = 0, fd; - if (argc != 2 && argc != 3) { - printf("Usage: %s [length_in_MB] \n", argv[0]); + if (argc >= 2 && !strcmp(argv[1], "-h")) { + printf("Usage: %s [length_in_MB]\n", argv[0]); exit(1); } /* Read memory length as the first arg if valid, otherwise fallback to * the default length. */ - if (argc == 3) - length = argc > 2 ? (size_t)atoi(argv[1]) : 0UL; + if (argc >= 2) + length = (size_t)atoi(argv[1]); + else + length = DEFAULT_LENGTH_MB; - length = length > 0 ? length : DEFAULT_LENGTH_MB; length = MB_TO_BYTES(length); - - int ret = 0; - - /* last arg is the hugetlb file name */ - int fd = open(argv[argc-1], O_CREAT | O_RDWR, 0755); - + fd = memfd_create(argv[0], MFD_HUGETLB); if (fd < 0) { perror("Open failed"); exit(1); @@ -185,7 +183,6 @@ int main(int argc, char *argv[]) } close(fd); - unlink(argv[argc-1]); return ret; } From 0796c7b8be84415994fa37e9a022e5595d915dd7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 14 Oct 2022 10:40:15 -0400 Subject: [PATCH 006/313] selftests/vm: drop mnt point for hugetlb in run_vmtests.sh After converting all the three relevant testcases (uffd, madvise, mremap) to use memfd, no test will need the hugetlb mount point anymore. Drop the code. Link: https://lkml.kernel.org/r/20221014144015.94039-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Axel Rasmussen Cc: Mike Kravetz Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/run_vmtests.sh | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index e780e76c26b8..0dc9f545a32d 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -5,7 +5,6 @@ # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 -mnt=./huge exitcode=0 #get huge pagesize and freepages from /proc/meminfo @@ -84,9 +83,6 @@ run_test() { fi } -mkdir "$mnt" -mount -t hugetlbfs none "$mnt" - run_test ./hugepage-mmap shmmax=$(cat /proc/sys/kernel/shmmax) @@ -98,14 +94,9 @@ echo "$shmmax" > /proc/sys/kernel/shmmax echo "$shmall" > /proc/sys/kernel/shmall run_test ./map_hugetlb - -run_test ./hugepage-mremap "$mnt"/huge_mremap -rm -f "$mnt"/huge_mremap - +run_test ./hugepage-mremap run_test ./hugepage-vmemmap - -run_test ./hugetlb-madvise "$mnt"/madvise-test -rm -f "$mnt"/madvise-test +run_test ./hugetlb-madvise echo "NOTE: The above hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" @@ -126,14 +117,11 @@ for mod in "${uffd_mods[@]}"; do # Hugetlb tests require source and destination huge pages. Pass in half # the size ($half_ufd_size_MB), which is used for *each*. run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 - run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 "$mnt"/uffd-test - rm -f "$mnt"/uffd-test + run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 run_test ./userfaultfd shmem${mod} 20 16 done #cleanup -umount "$mnt" -rm -rf "$mnt" echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages run_test ./compaction_test From d03c376d9066532551dc56837c7c5490e4fcbbfe Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:03 -0500 Subject: [PATCH 007/313] mm/hugetlb: add folio support to hugetlb specific flag macros Patch series "begin converting hugetlb code to folios", v4. This patch series starts the conversion of the hugetlb code to operate on struct folios rather than struct pages. This removes the ambiguitiy of whether functions are operating on head pages, tail pages of compound pages, or base pages. This series passes the linux test project hugetlb test cases. Patch 1 adds hugeltb specific page macros that can operate on folios. Patch 2 adds the private field of the first tail page to struct page. For 32-bit, _private_1 alinging with page[1].private was confirmed by using pahole. Patch 3 introduces hugetlb subpool helper functions which operate on struct folios. These patches were tested using the hugepage-mmap.c selftest along with the migratepages command. Patch 4 converts hugetlb_delete_from_page_cache() to use folios. Patch 5 adds a folio_hstate() function to get hstate information from a folio and adds a user of folio_hstate(). Bpftrace was used to track time spent in the free_huge_pages function during the ltp test cases as it is a caller of the hugetlb subpool functions. From the histogram, the performance is similar before and after the patch series. Time spent in 'free_huge_page' 6.0.0-rc2.master.20220823 @nsecs: [256, 512) 14770 |@@@@@@@@@@@@@@@@@@@@@@@@@@@ |@@@@@@@@@@@@@@@@@@@@@@@@@ | [512, 1K) 155 | | [1K, 2K) 169 | | [2K, 4K) 50 | | [4K, 8K) 14 | | [8K, 16K) 3 | | [16K, 32K) 3 | | 6.0.0-rc2.master.20220823 + patch series @nsecs: [256, 512) 13678 |@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |@@@@@@@@@@@@@@@@@@@@@@@@@ | [512, 1K) 142 | | [1K, 2K) 199 | | [2K, 4K) 44 | | [4K, 8K) 13 | | [8K, 16K) 4 | | [16K, 32K) 1 | | This patch (of 5): Allow the macros which test, set, and clear hugetlb specific page flags to take a hugetlb folio as an input. The macrros are generated as folio_{test, set, clear}_hugetlb_{restore_reserve, migratable, temporary, freed, vmemmap_optimized, raw_hwp_unreliable}. Link: https://lkml.kernel.org/r/20220922154207.1575343-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20220922154207.1575343-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4a76c0fc6bbf..3ff5d2dd3ca3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -589,26 +589,50 @@ enum hugetlb_page_flags { */ #ifdef CONFIG_HUGETLB_PAGE #define TESTHPAGEFLAG(uname, flname) \ +static __always_inline \ +bool folio_test_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + return test_bit(HPG_##flname, private); \ + } \ static inline int HPage##uname(struct page *page) \ { return test_bit(HPG_##flname, &(page->private)); } #define SETHPAGEFLAG(uname, flname) \ +static __always_inline \ +void folio_set_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + set_bit(HPG_##flname, private); \ + } \ static inline void SetHPage##uname(struct page *page) \ { set_bit(HPG_##flname, &(page->private)); } #define CLEARHPAGEFLAG(uname, flname) \ +static __always_inline \ +void folio_clear_hugetlb_##flname(struct folio *folio) \ + { void *private = &folio->private; \ + clear_bit(HPG_##flname, private); \ + } \ static inline void ClearHPage##uname(struct page *page) \ { clear_bit(HPG_##flname, &(page->private)); } #else #define TESTHPAGEFLAG(uname, flname) \ +static inline bool \ +folio_test_hugetlb_##flname(struct folio *folio) \ + { return 0; } \ static inline int HPage##uname(struct page *page) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ +static inline void \ +folio_set_hugetlb_##flname(struct folio *folio) \ + { } \ static inline void SetHPage##uname(struct page *page) \ { } #define CLEARHPAGEFLAG(uname, flname) \ +static inline void \ +folio_clear_hugetlb_##flname(struct folio *folio) \ + { } \ static inline void ClearHPage##uname(struct page *page) \ { } #endif From d340625f4849ab5dbfebbc7d84709fbfcd39e52f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:04 -0500 Subject: [PATCH 008/313] mm: add private field of first tail to struct page and struct folio Allow struct folio to store hugetlb metadata that is contained in the private field of the first tail page. On 32-bit, _private_1 aligns with page[1].private. Link: https://lkml.kernel.org/r/20220922154207.1575343-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 500e536796ca..2d5b1575ffe0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -144,6 +144,7 @@ struct page { atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ + unsigned long _private_1; #endif }; struct { /* Second tail page of compound page */ @@ -264,6 +265,7 @@ struct page { * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). + * @_private_1: Do not use directly, call folio_get_private_1(). * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -311,6 +313,7 @@ struct folio { #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; #endif + unsigned long _private_1; }; #define FOLIO_MATCH(pg, fl) \ @@ -338,6 +341,7 @@ FOLIO_MATCH(compound_mapcount, _total_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); +FOLIO_MATCH(_private_1, _private_1); #endif #undef FOLIO_MATCH @@ -383,6 +387,16 @@ static inline void *folio_get_private(struct folio *folio) return folio->private; } +static inline void folio_set_private_1(struct folio *folio, unsigned long private) +{ + folio->_private_1 = private; +} + +static inline unsigned long folio_get_private_1(struct folio *folio) +{ + return folio->_private_1; +} + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) From 149562f7509404c382c32c3fa8a6ba356135e5cf Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:05 -0500 Subject: [PATCH 009/313] mm/hugetlb: add hugetlb_folio_subpool() helpers Allow hugetlbfs_migrate_folio to check and read subpool information by passing in a folio. Link: https://lkml.kernel.org/r/20220922154207.1575343-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: kernel test robot Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 8 ++++---- include/linux/hugetlb.h | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dd54f67e47fd..c5137607e523 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1091,10 +1091,10 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - if (hugetlb_page_subpool(&src->page)) { - hugetlb_set_page_subpool(&dst->page, - hugetlb_page_subpool(&src->page)); - hugetlb_set_page_subpool(&src->page, NULL); + if (hugetlb_folio_subpool(src)) { + hugetlb_set_folio_subpool(dst, + hugetlb_folio_subpool(src)); + hugetlb_set_folio_subpool(src, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3ff5d2dd3ca3..496d02bdb997 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -718,18 +718,29 @@ extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) +static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) +{ + return (void *)folio_get_private_1(folio); +} + /* * hugetlb page subpool pointer located in hpage[1].private */ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { - return (void *)page_private(hpage + SUBPAGE_INDEX_SUBPOOL); + return hugetlb_folio_subpool(page_folio(hpage)); +} + +static inline void hugetlb_set_folio_subpool(struct folio *folio, + struct hugepage_subpool *subpool) +{ + folio_set_private_1(folio, (unsigned long)subpool); } static inline void hugetlb_set_page_subpool(struct page *hpage, struct hugepage_subpool *subpool) { - set_page_private(hpage + SUBPAGE_INDEX_SUBPOOL, (unsigned long)subpool); + hugetlb_set_folio_subpool(page_folio(hpage), subpool); } static inline struct hstate *hstate_file(struct file *f) From ece62684dcfb714b7d8452056b4a33d426b16457 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:06 -0500 Subject: [PATCH 010/313] hugetlbfs: convert hugetlb_delete_from_page_cache() to use folios Remove the last caller of delete_from_page_cache() by converting the code to its folio equivalent. Link: https://lkml.kernel.org/r/20220922154207.1575343-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 12 ++++++------ include/linux/pagemap.h | 1 - mm/folio-compat.c | 5 ----- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5137607e523..00495fc128c5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -364,11 +364,11 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void hugetlb_delete_from_page_cache(struct page *page) +static void hugetlb_delete_from_page_cache(struct folio *folio) { - ClearPageDirty(page); - ClearPageUptodate(page); - delete_from_page_cache(page); + folio_clear_dirty(folio); + folio_clear_uptodate(folio); + filemap_remove_folio(folio); } /* @@ -574,8 +574,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, * map could fail. Correspondingly, the subpool and global * reserve usage count can need to be adjusted. */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); + VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio); + hugetlb_delete_from_page_cache(folio); ret = true; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, index, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bbccb4044222..060ee98474ef 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1102,7 +1102,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); -void delete_from_page_cache(struct page *page); void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index e1e23b4947d7..8ae39c06da62 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -124,11 +124,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, } EXPORT_SYMBOL(grab_cache_page_write_begin); -void delete_from_page_cache(struct page *page) -{ - return filemap_remove_folio(page_folio(page)); -} - int try_to_release_page(struct page *page, gfp_t gfp) { return filemap_release_folio(page_folio(page), gfp); From e51da3a9b6c2f67879880259a25c51dbda01c462 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 22 Sep 2022 10:42:07 -0500 Subject: [PATCH 011/313] mm/hugetlb: add folio_hstate() Helper function to retrieve hstate information from a hugetlb folio. Link: https://lkml.kernel.org/r/20220922154207.1575343-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reported-by: kernel test robot Reviewed-by: Mike Kravetz Cc: Arnd Bergmann Cc: Colin Cross Cc: David Howells Cc: "Eric W . Biederman" Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 14 ++++++++++++-- mm/migrate.c | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 496d02bdb997..20a0d5a08395 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -824,10 +824,15 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, } #endif +static inline struct hstate *folio_hstate(struct folio *folio) +{ + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); + return size_to_hstate(folio_size(folio)); +} + static inline struct hstate *page_hstate(struct page *page) { - VM_BUG_ON_PAGE(!PageHuge(page), page); - return size_to_hstate(page_size(page)); + return folio_hstate(page_folio(page)); } static inline unsigned hstate_index_to_shift(unsigned index) @@ -1036,6 +1041,11 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma) return NULL; } +static inline struct hstate *folio_hstate(struct folio *folio) +{ + return NULL; +} + static inline struct hstate *page_hstate(struct page *page) { return NULL; diff --git a/mm/migrate.c b/mm/migrate.c index dff333593a8a..556cb1c86e53 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1620,7 +1620,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) nid = folio_nid(folio); if (folio_test_hugetlb(folio)) { - struct hstate *h = page_hstate(&folio->page); + struct hstate *h = folio_hstate(folio); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); From d8e454eb44473b2270e2675fb44a9d79dee36097 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 14 Oct 2022 09:39:31 +0800 Subject: [PATCH 012/313] mm/rmap: fix comment in anon_vma_clone() Commit 2555283eb40d ("mm/rmap: Fix anon_vma->degree ambiguity leading to double-reuse") use num_children and num_active_vmas to replace the origin degree to fix anon_vma UAF problem. Update the comment in anon_vma_clone to fit this change. Link: https://lkml.kernel.org/r/20221014013931.1565969-1-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Signed-off-by: Andrew Morton --- mm/rmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 2ec925e5fa6a..92ed6fe3d038 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -315,8 +315,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) enomem_failure: /* - * dst->anon_vma is dropped here otherwise its degree can be incorrectly - * decremented in unlink_anon_vmas(). + * dst->anon_vma is dropped here otherwise its num_active_vmas can + * be incorrectly decremented in unlink_anon_vmas(). * We can safely do this because callers of anon_vma_clone() don't care * about dst->anon_vma if anon_vma_clone() failed. */ From 3392ca121872dd8c33015c7703d4981c78819be3 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 17 Oct 2022 09:17:59 -0700 Subject: [PATCH 013/313] filemap: find_lock_entries() now updates start offset Patch series "Rework find_get_entries() and find_lock_entries()", v3. Originally the callers of find_get_entries() and find_lock_entries() were keeping track of the start index themselves as they traverse the search range. This resulted in hacky code such as in shmem_undo_range(): index = folio->index + folio_nr_pages(folio) - 1; where the - 1 is only present to stay in the right spot after incrementing index later. This sort of calculation was also being done on every folio despite not even using index later within that function. These patches change find_get_entries() and find_lock_entries() to calculate the new index instead of leaving it to the callers so we can avoid all these complications. This patch (of 2): Initially, find_lock_entries() was being passed in the start offset as a value. That left the calculation of the offset to the callers. This led to complexity in the callers trying to keep track of the index. Now find_lock_entries() takes in a pointer to the start offset and updates the value to be directly after the last entry found. If no entry is found, the offset is not changed. This gets rid of multiple hacky calculations that kept track of the start offset. Link: https://lkml.kernel.org/r/20221017161800.2003-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221017161800.2003-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/filemap.c | 15 ++++++++++++--- mm/internal.h | 2 +- mm/shmem.c | 8 ++------ mm/truncate.c | 11 +++-------- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 08341616ae7a..3a73b7b8c2a4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2085,16 +2085,16 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, * * Return: The number of entries which were found. */ -unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, +unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { - XA_STATE(xas, &mapping->i_pages, start); + XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { if (!xa_is_value(folio)) { - if (folio->index < start) + if (folio->index < *start) goto put; if (folio->index + folio_nr_pages(folio) - 1 > end) goto put; @@ -2117,6 +2117,15 @@ put: } rcu_read_unlock(); + if (folio_batch_count(fbatch)) { + unsigned long nr = 1; + int idx = folio_batch_count(fbatch) - 1; + + folio = fbatch->folios[idx]; + if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + nr = folio_nr_pages(folio); + *start = indices[idx] + nr; + } return folio_batch_count(fbatch); } diff --git a/mm/internal.h b/mm/internal.h index 6b7ef495b56d..c504ac7267e0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -106,7 +106,7 @@ static inline void force_page_cache_readahead(struct address_space *mapping, force_page_cache_ra(&ractl, nr_to_read); } -unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, +unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); diff --git a/mm/shmem.c b/mm/shmem.c index c1d8b8a1aa3b..6b560c3915af 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -922,21 +922,18 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_batch_init(&fbatch); index = start; - while (index < end && find_lock_entries(mapping, index, end - 1, + while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; - index = indices[i]; - if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, - index, folio); + indices[i], folio); continue; } - index += folio_nr_pages(folio) - 1; if (!unfalloc || !folio_test_uptodate(folio)) truncate_inode_folio(mapping, folio); @@ -945,7 +942,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); - index++; } same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); diff --git a/mm/truncate.c b/mm/truncate.c index c0be77e5c008..b6065a494c71 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -361,9 +361,8 @@ void truncate_inode_pages_range(struct address_space *mapping, folio_batch_init(&fbatch); index = start; - while (index < end && find_lock_entries(mapping, index, end - 1, + while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { - index = indices[folio_batch_count(&fbatch) - 1] + 1; truncate_folio_batch_exceptionals(mapping, &fbatch, indices); for (i = 0; i < folio_batch_count(&fbatch); i++) truncate_cleanup_folio(fbatch.folios[i]); @@ -510,20 +509,17 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping, int i; folio_batch_init(&fbatch); - while (find_lock_entries(mapping, index, end, &fbatch, indices)) { + while (find_lock_entries(mapping, &index, end, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ - index = indices[i]; if (xa_is_value(folio)) { count += invalidate_exceptional_entry(mapping, - index, - folio); + indices[i], folio); continue; } - index += folio_nr_pages(folio) - 1; ret = mapping_evict_folio(mapping, folio); folio_unlock(folio); @@ -542,7 +538,6 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping, folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); - index++; } return count; } From 9fb6beea79c6e7c959adf4fb7b94cf9a6028b941 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 17 Oct 2022 09:18:00 -0700 Subject: [PATCH 014/313] filemap: find_get_entries() now updates start offset Initially, find_get_entries() was being passed in the start offset as a value. That left the calculation of the offset to the callers. This led to complexity in the callers trying to keep track of the index. Now find_get_entries() takes in a pointer to the start offset and updates the value to be directly after the last entry found. If no entry is found, the offset is not changed. This gets rid of multiple hacky calculations that kept track of the start offset. Link: https://lkml.kernel.org/r/20221017161800.2003-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/filemap.c | 13 +++++++++++-- mm/internal.h | 2 +- mm/shmem.c | 11 ++++------- mm/truncate.c | 19 +++++++------------ 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3a73b7b8c2a4..65eee6ec1066 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2048,10 +2048,10 @@ reset: * * Return: The number of entries which were found. */ -unsigned find_get_entries(struct address_space *mapping, pgoff_t start, +unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { - XA_STATE(xas, &mapping->i_pages, start); + XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); @@ -2062,6 +2062,15 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, } rcu_read_unlock(); + if (folio_batch_count(fbatch)) { + unsigned long nr = 1; + int idx = folio_batch_count(fbatch) - 1; + + folio = fbatch->folios[idx]; + if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + nr = folio_nr_pages(folio); + *start = indices[idx] + nr; + } return folio_batch_count(fbatch); } diff --git a/mm/internal.h b/mm/internal.h index c504ac7267e0..68afdbe7106e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -108,7 +108,7 @@ static inline void force_page_cache_readahead(struct address_space *mapping, unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); -unsigned find_get_entries(struct address_space *mapping, pgoff_t start, +unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); diff --git a/mm/shmem.c b/mm/shmem.c index 6b560c3915af..9c897cf3fb99 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -973,7 +973,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &fbatch, + if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -985,13 +985,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; - index = indices[i]; if (xa_is_value(folio)) { if (unfalloc) continue; - if (shmem_free_swap(mapping, index, folio)) { + if (shmem_free_swap(mapping, indices[i], folio)) { /* Swap was replaced by page: retry */ - index--; + index = indices[i]; break; } nr_swaps_freed++; @@ -1004,19 +1003,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ folio_unlock(folio); - index--; + index = indices[i]; break; } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); truncate_inode_folio(mapping, folio); } - index = folio->index + folio_nr_pages(folio) - 1; folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); - index++; } spin_lock_irq(&info->lock); diff --git a/mm/truncate.c b/mm/truncate.c index b6065a494c71..c7bfd247a651 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -400,7 +400,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &fbatch, + if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@ -414,21 +414,18 @@ void truncate_inode_pages_range(struct address_space *mapping, struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing page->index */ - index = indices[i]; if (xa_is_value(folio)) continue; folio_lock(folio); - VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); truncate_inode_folio(mapping, folio); folio_unlock(folio); - index = folio_index(folio) + folio_nr_pages(folio) - 1; } truncate_folio_batch_exceptionals(mapping, &fbatch, indices); folio_batch_release(&fbatch); - index++; } } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -636,16 +633,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping, folio_batch_init(&fbatch); index = start; - while (find_get_entries(mapping, index, end, &fbatch, indices)) { + while (find_get_entries(mapping, &index, end, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ - index = indices[i]; if (xa_is_value(folio)) { if (!invalidate_exceptional_entry2(mapping, - index, folio)) + indices[i], folio)) ret = -EBUSY; continue; } @@ -655,13 +651,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ - unmap_mapping_pages(mapping, index, - (1 + end - index), false); + unmap_mapping_pages(mapping, indices[i], + (1 + end - indices[i]), false); did_range_unmap = 1; } folio_lock(folio); - VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); if (folio->mapping != mapping) { folio_unlock(folio); continue; @@ -684,7 +680,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); - index++; } /* * For DAX we invalidate page tables after invalidating page cache. We From 70ec04f3486103819807b061b50a99f6e1d2bf36 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 18 Oct 2022 16:51:54 +0200 Subject: [PATCH 015/313] zram: use try_cmpxchg in update_used_max Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in update_used_max. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, reorder code a bit to remove additional compare and conditional jump from the assembly code. Together, hese two changes save 15 bytes from the function when compiled for x86_64. No functional change intended. Link: https://lkml.kernel.org/r/20221018145154.3699-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 966aab902d19..87711ddf4b54 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -188,16 +188,13 @@ static void update_position(u32 *index, int *offset, struct bio_vec *bvec) static inline void update_used_max(struct zram *zram, const unsigned long pages) { - unsigned long old_max, cur_max; - - old_max = atomic_long_read(&zram->stats.max_used_pages); + unsigned long cur_max = atomic_long_read(&zram->stats.max_used_pages); do { - cur_max = old_max; - if (pages > cur_max) - old_max = atomic_long_cmpxchg( - &zram->stats.max_used_pages, cur_max, pages); - } while (old_max != cur_max); + if (cur_max >= pages) + return; + } while (!atomic_long_try_cmpxchg(&zram->stats.max_used_pages, + &cur_max, pages)); } static inline void zram_fill_page(void *ptr, unsigned long len, From 3e0ee843427a573e3e1187a5331e4b7fb00a76f3 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Fri, 7 Oct 2022 13:37:41 +0200 Subject: [PATCH 016/313] mm: fix typo in struct vm_operations_struct comments There is no eprotect(), so I assume this is about mprotect(). Link: https://lkml.kernel.org/r/2385684.8vm7BOzihM@mobilepool36.emlix.com Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bbcccbc5565..f6d2d2d9e284 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -549,7 +549,7 @@ struct vm_operations_struct { /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not - * be modified. Returns 0 if eprotect() can proceed. + * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); From 6fe7d712d798e9312e3dff69ec3f5f62f4d03a04 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 7 Oct 2022 10:50:27 +0200 Subject: [PATCH 017/313] mm/shmem: remove unneeded assignments in shmem_get_folio_gfp() After the rework of shmem_get_folio_gfp() to use a folio, the local variable hindex is only needed to be set once before passing it to shmem_add_to_page_cache(). Remove the unneeded initialization and assignments of the variable hindex before the actual effective assignment and first use. No functional change. No change in object code. Link: https://lkml.kernel.org/r/20221007085027.6309-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 9c897cf3fb99..57d878b6391d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1826,7 +1826,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct folio *folio; - pgoff_t hindex = index; + pgoff_t hindex; gfp_t huge_gfp; int error; int once = 0; @@ -1864,7 +1864,6 @@ repeat: } if (folio) { - hindex = folio->index; if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (folio_test_uptodate(folio)) From 97955f6941f0e7dea64dea22711382daf1db2f76 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Thu, 6 Oct 2022 23:03:45 -0400 Subject: [PATCH 018/313] mm/mincore.c: use vma_lookup() instead of find_vma() Using vma_lookup() verifies the start address is contained in the found vma. This results in easier to read the code. Link: https://lkml.kernel.org/r/20221007030345.5029-1-wangdeming@inspur.com Signed-off-by: Deming Wang Signed-off-by: Andrew Morton --- mm/mincore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mincore.c b/mm/mincore.c index fa200c14185f..e7e046fe17d7 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -190,8 +190,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v unsigned long end; int err; - vma = find_vma(current->mm, addr); - if (!vma || addr < vma->vm_start) + vma = vma_lookup(current->mm, addr); + if (!vma) return -ENOMEM; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); if (!can_do_mincore(vma)) { From 7848ed6284ec4791eba22026e28edb2062790a3d Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Fri, 30 Sep 2022 19:14:33 +0530 Subject: [PATCH 019/313] mm: memcontrol: use mem_cgroup_is_root() helper Replace the checks for memcg is root memcg, with mem_cgroup_is_root() helper. Link: https://lkml.kernel.org/r/20220930134433.338103-1-kamalesh.babulal@oracle.com Signed-off-by: Kamalesh Babulal Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Kamalesh Babulal Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tom Hromatka Signed-off-by: Andrew Morton --- mm/memcontrol.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d8549ae1b30..f264a856ba86 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1219,7 +1219,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * cgroup root (root_mem_cgroup). So we have to handle * dead_memcg from cgroup root separately. */ - if (last != root_mem_cgroup) + if (!mem_cgroup_is_root(last)) __invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg); } @@ -1243,7 +1243,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct mem_cgroup *iter; int ret = 0; - BUG_ON(memcg == root_mem_cgroup); + BUG_ON(mem_cgroup_is_root(memcg)); for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1272,7 +1272,7 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) memcg = folio_memcg(folio); if (!memcg) - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio); + VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); else VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); } @@ -2036,7 +2036,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, rcu_read_lock(); memcg = mem_cgroup_from_task(victim); - if (memcg == root_mem_cgroup) + if (mem_cgroup_is_root(memcg)) goto out; /* @@ -2995,7 +2995,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { struct obj_cgroup *objcg = NULL; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { objcg = rcu_dereference(memcg->objcg); if (objcg && obj_cgroup_tryget(objcg)) break; @@ -7163,7 +7163,7 @@ void mem_cgroup_sk_alloc(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); - if (memcg == root_mem_cgroup) + if (mem_cgroup_is_root(memcg)) goto out; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) goto out; @@ -7298,7 +7298,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. */ - if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) { VM_BUG_ON(1); break; } @@ -7462,7 +7462,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) if (mem_cgroup_disabled() || do_memsw_account()) return nr_swap_pages; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); @@ -7484,7 +7484,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (!memcg) return false; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { unsigned long usage = page_counter_read(&memcg->swap); if (usage * 2 >= READ_ONCE(memcg->swap.high) || @@ -7648,7 +7648,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) return true; original_memcg = get_mem_cgroup_from_objcg(objcg); - for (memcg = original_memcg; memcg != root_mem_cgroup; + for (memcg = original_memcg; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { unsigned long max = READ_ONCE(memcg->zswap_max); unsigned long pages; From a5454f95246aa1d3527ef5e128cd3a10bc8371de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 28 Sep 2022 12:45:35 +0200 Subject: [PATCH 020/313] tmpfs: ensure O_LARGEFILE with generic_file_open() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this check open() will open large files on tmpfs although O_LARGEFILE was not specified. This is inconsistent with other filesystems. Also it will later result in EOVERFLOW on stat() or EFBIG on write(). Link: https://lore.kernel.org/lkml/76bedae6-22ea-4abc-8c06-b424ceb39217@t-8ch.de/ Link: https://lkml.kernel.org/r/20220928104535.61186-1-linux@weissschuh.net Signed-off-by: Thomas Weißschuh Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/shmem.c b/mm/shmem.c index 57d878b6391d..0a7c4a748811 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3902,6 +3902,7 @@ EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, + .open = generic_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, From 7ce0ea19d50e4e97a8da69f616ffa8afbb532a93 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 27 Sep 2022 19:09:09 +0200 Subject: [PATCH 021/313] kasan: switch kunit tests to console tracepoints Switch KUnit-compatible KASAN tests from using per-task KUnit resources to console tracepoints. This allows for two things: 1. Migrating tests that trigger a KASAN report in the context of a task other than current to KUnit framework. This is implemented in the patches that follow. 2. Parsing and matching the contents of KASAN reports. This is not yet implemented. Link: https://lkml.kernel.org/r/9345acdd11e953b207b0ed4724ff780e63afeb36.1664298455.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- lib/Kconfig.kasan | 2 +- mm/kasan/kasan.h | 8 ---- mm/kasan/kasan_test.c | 85 +++++++++++++++++++++++++++++++------------ mm/kasan/report.c | 31 ---------------- 4 files changed, 63 insertions(+), 63 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index ca09b1cf8ee9..ba5b27962c34 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -181,7 +181,7 @@ config KASAN_VMALLOC config KASAN_KUNIT_TEST tristate "KUnit-compatible tests of KASAN bug detection capabilities" if !KUNIT_ALL_TESTS - depends on KASAN && KUNIT + depends on KASAN && KUNIT && TRACEPOINTS default KUNIT_ALL_TESTS help A KUnit-based KASAN test suite. Triggers different kinds of diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index abbcc1b0eec5..a84491bc4867 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -261,14 +261,6 @@ struct kasan_stack_ring { #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -/* Used in KUnit-compatible KASAN tests. */ -struct kunit_kasan_status { - bool report_found; - bool sync_fault; -}; -#endif - #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) static inline const void *kasan_shadow_to_mem(const void *shadow_addr) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 0d59098f0876..0ff20bfa3376 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -5,8 +5,12 @@ * Author: Andrey Ryabinin */ +#define pr_fmt(fmt) "kasan_test: " fmt + +#include #include #include +#include #include #include #include @@ -14,21 +18,28 @@ #include #include #include +#include #include #include +#include #include -#include #include -#include +#include #include -#include - #include "kasan.h" #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) +static bool multishot; + +/* Fields set based on lines observed in the console. */ +static struct { + bool report_found; + bool async_fault; +} test_status; + /* * Some tests use these global variables to store return values from function * calls that could otherwise be eliminated by the compiler as dead code. @@ -36,35 +47,61 @@ void *kasan_ptr_result; int kasan_int_result; -static struct kunit_resource resource; -static struct kunit_kasan_status test_status; -static bool multishot; +/* Probe for console output: obtains test_status lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + if (strnstr(buf, "BUG: KASAN: ", len)) + WRITE_ONCE(test_status.report_found, true); + else if (strnstr(buf, "Asynchronous fault: ", len)) + WRITE_ONCE(test_status.async_fault, true); +} -/* - * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the - * first detected bug and panic the kernel if panic_on_warn is enabled. For - * hardware tag-based KASAN also allow tag checking to be reenabled for each - * test, see the comment for KUNIT_EXPECT_KASAN_FAIL(). - */ -static int kasan_test_init(struct kunit *test) +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +static int kasan_suite_init(struct kunit_suite *suite) { if (!kasan_enabled()) { - kunit_err(test, "can't run KASAN tests with KASAN disabled"); + pr_err("Can't run KASAN tests with KASAN disabled"); return -1; } + /* + * Temporarily enable multi-shot mode. Otherwise, KASAN would only + * report the first detected bug and panic the kernel if panic_on_warn + * is enabled. + */ multishot = kasan_save_enable_multi_shot(); - test_status.report_found = false; - test_status.sync_fault = false; - kunit_add_named_resource(test, NULL, NULL, &resource, - "kasan_status", &test_status); + + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); return 0; } +static void kasan_suite_exit(struct kunit_suite *suite) +{ + kasan_restore_multi_shot(multishot); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + static void kasan_test_exit(struct kunit *test) { - kasan_restore_multi_shot(multishot); - KUNIT_EXPECT_FALSE(test, test_status.report_found); + KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); } /** @@ -106,11 +143,12 @@ static void kasan_test_exit(struct kunit *test) if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ kasan_sync_fault_possible()) { \ if (READ_ONCE(test_status.report_found) && \ - READ_ONCE(test_status.sync_fault)) \ + !READ_ONCE(test_status.async_fault)) \ kasan_enable_tagging(); \ migrate_enable(); \ } \ WRITE_ONCE(test_status.report_found, false); \ + WRITE_ONCE(test_status.async_fault, false); \ } while (0) #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ @@ -1447,9 +1485,10 @@ static struct kunit_case kasan_kunit_test_cases[] = { static struct kunit_suite kasan_kunit_test_suite = { .name = "kasan", - .init = kasan_test_init, .test_cases = kasan_kunit_test_cases, .exit = kasan_test_exit, + .suite_init = kasan_suite_init, + .suite_exit = kasan_suite_exit, }; kunit_test_suite(kasan_kunit_test_suite); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index df3602062bfd..31355851a5ec 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -30,8 +30,6 @@ #include -#include - #include "kasan.h" #include "../slab.h" @@ -114,41 +112,12 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); #endif -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -static void update_kunit_status(bool sync) -{ - struct kunit *test; - struct kunit_resource *resource; - struct kunit_kasan_status *status; - - test = current->kunit_test; - if (!test) - return; - - resource = kunit_find_named_resource(test, "kasan_status"); - if (!resource) { - kunit_set_failure(test); - return; - } - - status = (struct kunit_kasan_status *)resource->data; - WRITE_ONCE(status->report_found, true); - WRITE_ONCE(status->sync_fault, sync); - - kunit_put_resource(resource); -} -#else -static void update_kunit_status(bool sync) { } -#endif - static DEFINE_SPINLOCK(report_lock); static void start_report(unsigned long *flags, bool sync) { /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */ disable_trace_on_warning(); - /* Update status of the currently running KASAN test. */ - update_kunit_status(sync); /* Do not allow LOCKDEP mangling KASAN reports. */ lockdep_off(); /* Make sure we don't end up in loop. */ From 8516e837cab0b2c740b90603b66039aa7dcecda4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 27 Sep 2022 19:09:10 +0200 Subject: [PATCH 022/313] kasan: migrate kasan_rcu_uaf test to kunit Migrate the kasan_rcu_uaf test to the KUnit framework. Changes to the implementation of the test: - Call rcu_barrier() after call_rcu() to make that the RCU callbacks get triggered before the test is over. - Cast pointer passed to rcu_dereference_protected as __rcu to get rid of the Sparse warning. - Check that KASAN prints a report via KUNIT_EXPECT_KASAN_FAIL. Initially, this test was intended to check that Generic KASAN prints auxiliary stack traces for RCU objects. Nevertheless, the test is enabled for all modes to make that KASAN reports bad accesses in RCU callbacks. The presence of auxiliary stack traces for the Generic mode needs to be inspected manually. Link: https://lkml.kernel.org/r/897ee08d6cd0ba7e8a4fbfd9d8502823a2f922e6.1664298455.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 37 ++++++++++++++++++++++++++++++++++++ mm/kasan/kasan_test_module.c | 30 ----------------------------- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 0ff20bfa3376..38bf6ed61cb8 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -1141,6 +1141,42 @@ static void kmalloc_double_kzfree(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); } +static struct kasan_rcu_info { + int i; + struct rcu_head rcu; +} *global_rcu_ptr; + +static void rcu_uaf_reclaim(struct rcu_head *rp) +{ + struct kasan_rcu_info *fp = + container_of(rp, struct kasan_rcu_info, rcu); + + kfree(fp); + ((volatile struct kasan_rcu_info *)fp)->i; +} + +/* + * Check that Generic KASAN prints auxiliary stack traces for RCU callbacks. + * The report needs to be inspected manually. + * + * This test is still enabled for other KASAN modes to make sure that all modes + * report bad accesses in tested scenarios. + */ +static void rcu_uaf(struct kunit *test) +{ + struct kasan_rcu_info *ptr; + + ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + global_rcu_ptr = rcu_dereference_protected( + (struct kasan_rcu_info __rcu *)ptr, NULL); + + KUNIT_EXPECT_KASAN_FAIL(test, + call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim); + rcu_barrier()); +} + static void vmalloc_helpers_tags(struct kunit *test) { void *ptr; @@ -1472,6 +1508,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(rcu_uaf), KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c index e4ca82dc2c16..4688cbcd722d 100644 --- a/mm/kasan/kasan_test_module.c +++ b/mm/kasan/kasan_test_module.c @@ -62,35 +62,6 @@ static noinline void __init copy_user_test(void) kfree(kmem); } -static struct kasan_rcu_info { - int i; - struct rcu_head rcu; -} *global_rcu_ptr; - -static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp) -{ - struct kasan_rcu_info *fp = container_of(rp, - struct kasan_rcu_info, rcu); - - kfree(fp); - ((volatile struct kasan_rcu_info *)fp)->i; -} - -static noinline void __init kasan_rcu_uaf(void) -{ - struct kasan_rcu_info *ptr; - - pr_info("use-after-free in kasan_rcu_reclaim\n"); - ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - global_rcu_ptr = rcu_dereference_protected(ptr, NULL); - call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim); -} - static noinline void __init kasan_workqueue_work(struct work_struct *work) { kfree(work); @@ -130,7 +101,6 @@ static int __init test_kasan_module_init(void) bool multishot = kasan_save_enable_multi_shot(); copy_user_test(); - kasan_rcu_uaf(); kasan_workqueue_uaf(); kasan_restore_multi_shot(multishot); From b2c5bd4c69ce28500ed2176d11002a4e9b30da36 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 27 Sep 2022 19:09:11 +0200 Subject: [PATCH 023/313] kasan: migrate workqueue_uaf test to kunit Migrate the workqueue_uaf test to the KUnit framework. Initially, this test was intended to check that Generic KASAN prints auxiliary stack traces for workqueues. Nevertheless, the test is enabled for all modes to make that KASAN reports bad accesses in the tested scenario. The presence of auxiliary stack traces for the Generic mode needs to be inspected manually. Link: https://lkml.kernel.org/r/1d81b6cc2a58985126283d1e0de8e663716dd930.1664298455.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 40 +++++++++++++++++++++++++++++------- mm/kasan/kasan_test_module.c | 30 --------------------------- 2 files changed, 33 insertions(+), 37 deletions(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 38bf6ed61cb8..e27591ef2777 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -1141,6 +1141,14 @@ static void kmalloc_double_kzfree(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); } +/* + * The two tests below check that Generic KASAN prints auxiliary stack traces + * for RCU callbacks and workqueues. The reports need to be inspected manually. + * + * These tests are still enabled for other KASAN modes to make sure that all + * modes report bad accesses in tested scenarios. + */ + static struct kasan_rcu_info { int i; struct rcu_head rcu; @@ -1155,13 +1163,6 @@ static void rcu_uaf_reclaim(struct rcu_head *rp) ((volatile struct kasan_rcu_info *)fp)->i; } -/* - * Check that Generic KASAN prints auxiliary stack traces for RCU callbacks. - * The report needs to be inspected manually. - * - * This test is still enabled for other KASAN modes to make sure that all modes - * report bad accesses in tested scenarios. - */ static void rcu_uaf(struct kunit *test) { struct kasan_rcu_info *ptr; @@ -1177,6 +1178,30 @@ static void rcu_uaf(struct kunit *test) rcu_barrier()); } +static void workqueue_uaf_work(struct work_struct *work) +{ + kfree(work); +} + +static void workqueue_uaf(struct kunit *test) +{ + struct workqueue_struct *workqueue; + struct work_struct *work; + + workqueue = create_workqueue("kasan_workqueue_test"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, workqueue); + + work = kmalloc(sizeof(struct work_struct), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, work); + + INIT_WORK(work, workqueue_uaf_work); + queue_work(workqueue, work); + destroy_workqueue(workqueue); + + KUNIT_EXPECT_KASAN_FAIL(test, + ((volatile struct work_struct *)work)->data); +} + static void vmalloc_helpers_tags(struct kunit *test) { void *ptr; @@ -1509,6 +1534,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), KUNIT_CASE(rcu_uaf), + KUNIT_CASE(workqueue_uaf), KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c index 4688cbcd722d..7be7bed456ef 100644 --- a/mm/kasan/kasan_test_module.c +++ b/mm/kasan/kasan_test_module.c @@ -62,35 +62,6 @@ static noinline void __init copy_user_test(void) kfree(kmem); } -static noinline void __init kasan_workqueue_work(struct work_struct *work) -{ - kfree(work); -} - -static noinline void __init kasan_workqueue_uaf(void) -{ - struct workqueue_struct *workqueue; - struct work_struct *work; - - workqueue = create_workqueue("kasan_wq_test"); - if (!workqueue) { - pr_err("Allocation failed\n"); - return; - } - work = kmalloc(sizeof(struct work_struct), GFP_KERNEL); - if (!work) { - pr_err("Allocation failed\n"); - return; - } - - INIT_WORK(work, kasan_workqueue_work); - queue_work(workqueue, work); - destroy_workqueue(workqueue); - - pr_info("use-after-free on workqueue\n"); - ((volatile struct work_struct *)work)->data; -} - static int __init test_kasan_module_init(void) { /* @@ -101,7 +72,6 @@ static int __init test_kasan_module_init(void) bool multishot = kasan_save_enable_multi_shot(); copy_user_test(); - kasan_workqueue_uaf(); kasan_restore_multi_shot(multishot); return -EAGAIN; From 69c66add566395eaf4c08cb5975b45dec70dbe85 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 27 Sep 2022 13:01:14 +0200 Subject: [PATCH 024/313] selftests/vm: anon_cow: test COW handling of anonymous memory Patch series "selftests/vm: test COW handling of anonymous memory". This is my current set of tests for testing COW handling of anonymous memory, especially when interacting with GUP. I developed these tests while working on PageAnonExclusive and managed to clean them up just now. On current upstream Linux, all tests pass except the hugetlb tests that rely on vmsplice -- these tests should pass as soon as vmsplice properly uses FOLL_PIN instead of FOLL_GET. I'm working on additional tests for COW handling in private mappings, focusing on long-term R/O pinning e.g., of the shared zeropage, pagecache pages and KSM pages. These tests, however, will go into a different file. So this is everything I have regarding tests for anonymous memory. This patch (of 7): Let's start adding tests for our COW handling of anonymous memory. We'll focus on basic tests that we can achieve without additional libraries or gup_test extensions. We'll add THP and hugetlb tests separately. [david@redhat.com: s/size_t/ssize_t/ on `cur', `total', `transferred';] Link: https://lkml.kernel.org/r/51302b9e-dc69-d709-3214-f23868028555@redhat.com Link: https://lkml.kernel.org/r/20220927110120.106906-1-david@redhat.com Link: https://lkml.kernel.org/r/20220927110120.106906-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph von Recklinghausen Cc: Don Dutile Cc: Jason Gunthorpe Cc: John Hubbard Cc: Mike Rapoport Cc: Nadav Amit Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 4 +- tools/testing/selftests/vm/anon_cow.c | 401 ++++++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 3 + tools/testing/selftests/vm/vm_util.c | 7 + tools/testing/selftests/vm/vm_util.h | 1 + 6 files changed, 416 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/vm/anon_cow.c diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 7b9dc2426f18..8a536c731e3c 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +anon_cow hugepage-mmap hugepage-mremap hugepage-shm diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 163c2fde3cb3..ad07d7a84c3e 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -25,7 +25,8 @@ MAKEFLAGS += --no-builtin-rules CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) LDLIBS = -lrt -lpthread -TEST_GEN_FILES = compaction_test +TEST_GEN_FILES = anon_cow +TEST_GEN_FILES += compaction_test TEST_GEN_FILES += gup_test TEST_GEN_FILES += hmm-tests TEST_GEN_FILES += hugetlb-madvise @@ -95,6 +96,7 @@ TEST_FILES += va_128TBswitch.sh include ../lib.mk +$(OUTPUT)/anon_cow: vm_util.c $(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c diff --git a/tools/testing/selftests/vm/anon_cow.c b/tools/testing/selftests/vm/anon_cow.c new file mode 100644 index 000000000000..4613294af758 --- /dev/null +++ b/tools/testing/selftests/vm/anon_cow.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * COW (Copy On Write) tests for anonymous memory. + * + * Copyright 2022, Red Hat, Inc. + * + * Author(s): David Hildenbrand + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +static size_t pagesize; +static int pagemap_fd; + +struct comm_pipes { + int child_ready[2]; + int parent_ready[2]; +}; + +static int setup_comm_pipes(struct comm_pipes *comm_pipes) +{ + if (pipe(comm_pipes->child_ready) < 0) + return -errno; + if (pipe(comm_pipes->parent_ready) < 0) { + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + return -errno; + } + + return 0; +} + +static void close_comm_pipes(struct comm_pipes *comm_pipes) +{ + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + close(comm_pipes->parent_ready[0]); + close(comm_pipes->parent_ready[1]); +} + +static int child_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + char *old = malloc(size); + char buf; + + /* Backup the original content. */ + memcpy(old, mem, size); + + /* Wait until the parent modified the page. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values. */ + return memcmp(old, mem, size); +} + +static int child_vmsplice_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + char *old, *new; + int fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + /* Backup the original content. */ + memcpy(old, mem, size); + + if (pipe(fds) < 0) + return -errno; + + /* Trigger a read-only pin. */ + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred < 0) + return -errno; + if (transferred == 0) + return -EINVAL; + + /* Unmap it from our page tables. */ + if (munmap(mem, size) < 0) + return -errno; + + /* Wait until the parent modified it. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values via the pipe. */ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) + return -errno; + } + + return memcmp(old, new, transferred); +} + +typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); + +static void do_test_cow_in_parent(char *mem, size_t size, child_fn fn) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + exit(fn(mem, size, &comm_pipes)); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_cow_in_parent(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, child_memcmp_fn); +} + +static void test_vmsplice_in_child(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, child_vmsplice_memcmp_fn); +} + +static void do_test_vmsplice_in_parent(char *mem, size_t size, + bool before_fork) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + struct comm_pipes comm_pipes; + char *old, *new; + int ret, fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + memcpy(old, mem, size); + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free; + } + + if (pipe(fds) < 0) { + ksft_test_result_fail("pipe() failed\n"); + goto close_comm_pipes; + } + + if (before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + goto close_pipe; + } + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_pipe; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + /* Modify page content in the child. */ + memset(mem, 0xff, size); + exit(0); + } + + if (!before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + wait(&ret); + goto close_pipe; + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + if (munmap(mem, size) < 0) { + ksft_test_result_fail("munmap() failed\n"); + goto close_pipe; + } + write(comm_pipes.parent_ready[1], "0", 1); + + /* Wait until the child is done writing. */ + wait(&ret); + if (!WIFEXITED(ret)) { + ksft_test_result_fail("wait() failed\n"); + goto close_pipe; + } + + /* See if we still read the old values. */ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) { + ksft_test_result_fail("read() failed\n"); + goto close_pipe; + } + } + + ksft_test_result(!memcmp(old, new, transferred), + "No leak from child into parent\n"); +close_pipe: + close(fds[0]); + close(fds[1]); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free: + free(old); + free(new); +} + +static void test_vmsplice_before_fork(char *mem, size_t size) +{ + do_test_vmsplice_in_parent(mem, size, true); +} + +static void test_vmsplice_after_fork(char *mem, size_t size) +{ + do_test_vmsplice_in_parent(mem, size, false); +} + +typedef void (*test_fn)(char *mem, size_t size); + +static void do_run_with_base_page(test_fn fn, bool swapout) +{ + char *mem; + int ret; + + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); + /* Ignore if not around on a kernel. */ + if (ret && errno != EINVAL) { + ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + goto munmap; + } + + /* Populate a base page. */ + memset(mem, 0, pagesize); + + if (swapout) { + madvise(mem, pagesize, MADV_PAGEOUT); + if (!pagemap_is_swapped(pagemap_fd, mem)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + } + + fn(mem, pagesize); +munmap: + munmap(mem, pagesize); +} + +static void run_with_base_page(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with base page\n", desc); + do_run_with_base_page(fn, false); +} + +static void run_with_base_page_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped out base page\n", desc); + do_run_with_base_page(fn, true); +} + +struct test_case { + const char *desc; + test_fn fn; +}; + +static const struct test_case test_cases[] = { + /* + * Basic COW tests for fork() without any GUP. If we miss to break COW, + * either the child can observe modifications by the parent or the + * other way around. + */ + { + "Basic COW after fork()", + test_cow_in_parent, + }, + /* + * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If + * we miss to break COW, the child observes modifications by the parent. + * This is CVE-2020-29374 reported by Jann Horn. + */ + { + "vmsplice() + unmap in child", + test_vmsplice_in_child + }, + /* + * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after + * fork(); modify in the child. If we miss to break COW, the parent + * observes modifications by the child. + */ + { + "vmsplice() before fork(), unmap in parent after fork()", + test_vmsplice_before_fork, + }, + /* + * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the + * child. If we miss to break COW, the parent observes modifications by + * the child. + */ + { + "vmsplice() + unmap in parent after fork()", + test_vmsplice_after_fork, + }, +}; + +static void run_test_case(struct test_case const *test_case) +{ + run_with_base_page(test_case->fn, test_case->desc); + run_with_base_page_swap(test_case->fn, test_case->desc); +} + +static void run_test_cases(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) + run_test_case(&test_cases[i]); +} + +int main(int argc, char **argv) +{ + int nr_test_cases = ARRAY_SIZE(test_cases); + int err; + + pagesize = getpagesize(); + + ksft_print_header(); + ksft_set_plan(nr_test_cases * 2); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("opening pagemap failed\n"); + + run_test_cases(); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 0dc9f545a32d..1fa783732296 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -185,4 +185,7 @@ fi run_test ./soft-dirty +# COW tests for anonymous memory +run_test ./anon_cow + exit $exitcode diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index f11f8adda521..37dd230673ee 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -28,6 +28,13 @@ bool pagemap_is_softdirty(int fd, char *start) return entry & 0x0080000000000000ull; } +bool pagemap_is_swapped(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + return entry & 0x4000000000000000ull; +} + void clear_softdirty(void) { int ret; diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 5c35de454e08..833df81b2694 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -4,6 +4,7 @@ uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); +bool pagemap_is_swapped(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); From a905e82ae44b22a25ea73415a3ec1228775eb9a9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 27 Sep 2022 13:01:15 +0200 Subject: [PATCH 025/313] selftests/vm: factor out pagemap_is_populated() into vm_util We'll reuse it in the anon_cow test next. Link: https://lkml.kernel.org/r/20220927110120.106906-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph von Recklinghausen Cc: Don Dutile Cc: Jason Gunthorpe Cc: John Hubbard Cc: Mike Rapoport Cc: Nadav Amit Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/madv_populate.c | 8 -------- tools/testing/selftests/vm/vm_util.c | 8 ++++++++ tools/testing/selftests/vm/vm_util.h | 1 + 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c index 715a42e8e2cd..60547245e479 100644 --- a/tools/testing/selftests/vm/madv_populate.c +++ b/tools/testing/selftests/vm/madv_populate.c @@ -27,14 +27,6 @@ static size_t pagesize; -static bool pagemap_is_populated(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - /* Present or swapped. */ - return entry & 0xc000000000000000ull; -} - static void sense_support(void) { char *addr; diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index 37dd230673ee..5bbf7641a0f0 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -35,6 +35,14 @@ bool pagemap_is_swapped(int fd, char *start) return entry & 0x4000000000000000ull; } +bool pagemap_is_populated(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* Present or swapped. */ + return entry & 0xc000000000000000ull; +} + void clear_softdirty(void) { int ret; diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 833df81b2694..80d5a6ad413b 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -5,6 +5,7 @@ uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); bool pagemap_is_swapped(int fd, char *start); +bool pagemap_is_populated(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); From f4b5fd6946e244cdedc3bbb9a1f24c8133b2077a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 27 Sep 2022 13:01:16 +0200 Subject: [PATCH 026/313] selftests/vm: anon_cow: THP tests Let's add various THP variants that we'll run with our existing test cases. Link: https://lkml.kernel.org/r/20220927110120.106906-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph von Recklinghausen Cc: Don Dutile Cc: Jason Gunthorpe Cc: John Hubbard Cc: Mike Rapoport Cc: Nadav Amit Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/anon_cow.c | 259 +++++++++++++++++++++++++- 1 file changed, 258 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/anon_cow.c b/tools/testing/selftests/vm/anon_cow.c index 4613294af758..c1681c9d255f 100644 --- a/tools/testing/selftests/vm/anon_cow.c +++ b/tools/testing/selftests/vm/anon_cow.c @@ -24,6 +24,43 @@ static size_t pagesize; static int pagemap_fd; +static size_t thpsize; + +static void detect_thpsize(void) +{ + int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", + O_RDONLY); + size_t size = 0; + char buf[15]; + int ret; + + if (fd < 0) + return; + + ret = pread(fd, buf, sizeof(buf), 0); + if (ret > 0 && ret < sizeof(buf)) { + buf[ret] = 0; + + size = strtoul(buf, NULL, 10); + if (size < pagesize) + size = 0; + if (size > 0) { + thpsize = size; + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", + thpsize / 1024); + } + } + + close(fd); +} + +static bool range_is_swapped(void *addr, size_t size) +{ + for (; size; addr += pagesize, size -= pagesize) + if (!pagemap_is_swapped(pagemap_fd, addr)) + return false; + return true; +} struct comm_pipes { int child_ready[2]; @@ -319,6 +356,206 @@ static void run_with_base_page_swap(test_fn fn, const char *desc) do_run_with_base_page(fn, true); } +enum thp_run { + THP_RUN_PMD, + THP_RUN_PMD_SWAPOUT, + THP_RUN_PTE, + THP_RUN_PTE_SWAPOUT, + THP_RUN_SINGLE_PTE, + THP_RUN_SINGLE_PTE_SWAPOUT, + THP_RUN_PARTIAL_MREMAP, + THP_RUN_PARTIAL_SHARED, +}; + +static void do_run_with_thp(test_fn fn, enum thp_run thp_run) +{ + char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; + size_t size, mmap_size, mremap_size; + int ret; + + /* For alignment purposes, we need twice the thp size. */ + mmap_size = 2 * thpsize; + mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + /* We need a THP-aligned memory area. */ + mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); + + ret = madvise(mem, thpsize, MADV_HUGEPAGE); + if (ret) { + ksft_test_result_fail("MADV_HUGEPAGE failed\n"); + goto munmap; + } + + /* + * Try to populate a THP. Touch the first sub-page and test if we get + * another sub-page populated automatically. + */ + mem[0] = 0; + if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + memset(mem, 0, thpsize); + + size = thpsize; + switch (thp_run) { + case THP_RUN_PMD: + case THP_RUN_PMD_SWAPOUT: + break; + case THP_RUN_PTE: + case THP_RUN_PTE_SWAPOUT: + /* + * Trigger PTE-mapping the THP by temporarily mapping a single + * subpage R/O. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + break; + case THP_RUN_SINGLE_PTE: + case THP_RUN_SINGLE_PTE_SWAPOUT: + /* + * Discard all but a single subpage of that PTE-mapped THP. What + * remains is a single PTE mapping a single subpage. + */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED); + if (ret) { + ksft_test_result_fail("MADV_DONTNEED failed\n"); + goto munmap; + } + size = pagesize; + break; + case THP_RUN_PARTIAL_MREMAP: + /* + * Remap half of the THP. We need some new memory location + * for that. + */ + mremap_size = thpsize / 2; + mremap_mem = mmap(NULL, mremap_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + tmp = mremap(mem + mremap_size, mremap_size, mremap_size, + MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem); + if (tmp != mremap_mem) { + ksft_test_result_fail("mremap() failed\n"); + goto munmap; + } + size = mremap_size; + break; + case THP_RUN_PARTIAL_SHARED: + /* + * Share the first page of the THP with a child and quit the + * child. This will result in some parts of the THP never + * have been shared. + */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto munmap; + } + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto munmap; + } else if (!ret) { + exit(0); + } + wait(&ret); + /* Allow for sharing all pages again. */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + goto munmap; + } + break; + default: + assert(false); + } + + switch (thp_run) { + case THP_RUN_PMD_SWAPOUT: + case THP_RUN_PTE_SWAPOUT: + case THP_RUN_SINGLE_PTE_SWAPOUT: + madvise(mem, size, MADV_PAGEOUT); + if (!range_is_swapped(mem, size)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + break; + default: + break; + } + + fn(mem, size); +munmap: + munmap(mmap_mem, mmap_size); + if (mremap_mem != MAP_FAILED) + munmap(mremap_mem, mremap_size); +} + +static void run_with_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with THP\n", desc); + do_run_with_thp(fn, THP_RUN_PMD); +} + +static void run_with_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc); + do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT); +} + +static void run_with_pte_mapped_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc); + do_run_with_thp(fn, THP_RUN_PTE); +} + +static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc); + do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT); +} + +static void run_with_single_pte_of_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE); +} + +static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT); +} + +static void run_with_partial_mremap_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc); + do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP); +} + +static void run_with_partial_shared_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc); + do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED); +} + struct test_case { const char *desc; test_fn fn; @@ -367,6 +604,16 @@ static void run_test_case(struct test_case const *test_case) { run_with_base_page(test_case->fn, test_case->desc); run_with_base_page_swap(test_case->fn, test_case->desc); + if (thpsize) { + run_with_thp(test_case->fn, test_case->desc); + run_with_thp_swap(test_case->fn, test_case->desc); + run_with_pte_mapped_thp(test_case->fn, test_case->desc); + run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc); + run_with_single_pte_of_thp(test_case->fn, test_case->desc); + run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc); + run_with_partial_mremap_thp(test_case->fn, test_case->desc); + run_with_partial_shared_thp(test_case->fn, test_case->desc); + } } static void run_test_cases(void) @@ -377,15 +624,25 @@ static void run_test_cases(void) run_test_case(&test_cases[i]); } +static int tests_per_test_case(void) +{ + int tests = 2; + + if (thpsize) + tests += 8; + return tests; +} + int main(int argc, char **argv) { int nr_test_cases = ARRAY_SIZE(test_cases); int err; pagesize = getpagesize(); + detect_thpsize(); ksft_print_header(); - ksft_set_plan(nr_test_cases * 2); + ksft_set_plan(nr_test_cases * tests_per_test_case()); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) From 7dad331be7816103eba8c12caeb88fbd3599c0b9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 27 Sep 2022 13:01:17 +0200 Subject: [PATCH 027/313] selftests/vm: anon_cow: hugetlb tests Let's run all existing test cases with all hugetlb sizes we're able to detect. Note that some tests cases still fail. This will, for example, be fixed once vmsplice properly uses FOLL_PIN instead of FOLL_GET for pinning. With 2 MiB and 1 GiB hugetlb on x86_64, the expected failures are: # [RUN] vmsplice() + unmap in child ... with hugetlb (2048 kB) not ok 23 No leak from parent into child # [RUN] vmsplice() + unmap in child ... with hugetlb (1048576 kB) not ok 24 No leak from parent into child # [RUN] vmsplice() before fork(), unmap in parent after fork() ... with hugetlb (2048 kB) not ok 35 No leak from child into parent # [RUN] vmsplice() before fork(), unmap in parent after fork() ... with hugetlb (1048576 kB) not ok 36 No leak from child into parent # [RUN] vmsplice() + unmap in parent after fork() ... with hugetlb (2048 kB) not ok 47 No leak from child into parent # [RUN] vmsplice() + unmap in parent after fork() ... with hugetlb (1048576 kB) not ok 48 No leak from child into parent Link: https://lkml.kernel.org/r/20220927110120.106906-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph von Recklinghausen Cc: Don Dutile Cc: Jason Gunthorpe Cc: John Hubbard Cc: Mike Rapoport Cc: Nadav Amit Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/anon_cow.c | 70 ++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/anon_cow.c b/tools/testing/selftests/vm/anon_cow.c index c1681c9d255f..78dfdd983380 100644 --- a/tools/testing/selftests/vm/anon_cow.c +++ b/tools/testing/selftests/vm/anon_cow.c @@ -25,6 +25,8 @@ static size_t pagesize; static int pagemap_fd; static size_t thpsize; +static int nr_hugetlbsizes; +static size_t hugetlbsizes[10]; static void detect_thpsize(void) { @@ -54,6 +56,31 @@ static void detect_thpsize(void) close(fd); } +static void detect_hugetlbsizes(void) +{ + DIR *dir = opendir("/sys/kernel/mm/hugepages/"); + + if (!dir) + return; + + while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) { + struct dirent *entry = readdir(dir); + size_t kb; + + if (!entry) + break; + if (entry->d_type != DT_DIR) + continue; + if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) + continue; + hugetlbsizes[nr_hugetlbsizes] = kb * 1024; + nr_hugetlbsizes++; + ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n", + kb); + } + closedir(dir); +} + static bool range_is_swapped(void *addr, size_t size) { for (; size; addr += pagesize, size -= pagesize) @@ -556,6 +583,41 @@ static void run_with_partial_shared_thp(test_fn fn, const char *desc) do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED); } +static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) +{ + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; + char *mem, *dummy; + + ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc, + hugetlbsize / 1024); + + flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT; + + mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + return; + } + + /* Populate an huge page. */ + memset(mem, 0, hugetlbsize); + + /* + * We need a total of two hugetlb pages to handle COW/unsharing + * properly, otherwise we might get zapped by a SIGBUS. + */ + dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (dummy == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + goto munmap; + } + munmap(dummy, hugetlbsize); + + fn(mem, hugetlbsize); +munmap: + munmap(mem, hugetlbsize); +} + struct test_case { const char *desc; test_fn fn; @@ -602,6 +664,8 @@ static const struct test_case test_cases[] = { static void run_test_case(struct test_case const *test_case) { + int i; + run_with_base_page(test_case->fn, test_case->desc); run_with_base_page_swap(test_case->fn, test_case->desc); if (thpsize) { @@ -614,6 +678,9 @@ static void run_test_case(struct test_case const *test_case) run_with_partial_mremap_thp(test_case->fn, test_case->desc); run_with_partial_shared_thp(test_case->fn, test_case->desc); } + for (i = 0; i < nr_hugetlbsizes; i++) + run_with_hugetlb(test_case->fn, test_case->desc, + hugetlbsizes[i]); } static void run_test_cases(void) @@ -626,7 +693,7 @@ static void run_test_cases(void) static int tests_per_test_case(void) { - int tests = 2; + int tests = 2 + nr_hugetlbsizes; if (thpsize) tests += 8; @@ -640,6 +707,7 @@ int main(int argc, char **argv) pagesize = getpagesize(); detect_thpsize(); + detect_hugetlbsizes(); ksft_print_header(); ksft_set_plan(nr_test_cases * tests_per_test_case()); From e487ebbd12986facc7f77129d3ca80de84841170 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 27 Sep 2022 13:01:18 +0200 Subject: [PATCH 028/313] selftests/vm: anon_cow: add liburing test cases io_uring provides a simple mechanism to test long-term, R/W GUP pins -- via fixed buffers -- and can be used to verify that GUP pins stay in sync with the pages in the page table even if a page would temporarily get mapped R/O or concurrent fork() could accidentially end up sharing pinned pages with the child. Note that this essentially re-introduces local_config support that was removed recently in commit 6f83d6c74ea5 ("Kselftests: remove support of libhugetlbfs from kselftests"). [david@redhat.com: s/size_t/ssize_t/ on `cur', `total'.] Link: https://lkml.kernel.org/r/445fe1ae-9e22-0d1d-4d09-272231d2f84a@redhat.com Link: https://lkml.kernel.org/r/20220927110120.106906-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph von Recklinghausen Cc: Don Dutile Cc: Jason Gunthorpe Cc: John Hubbard Cc: Mike Rapoport Cc: Nadav Amit Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 21 ++- tools/testing/selftests/vm/anon_cow.c | 190 +++++++++++++++++++++ tools/testing/selftests/vm/check_config.sh | 31 ++++ 3 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/vm/check_config.sh diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index ad07d7a84c3e..00920cb8b499 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -1,7 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 # Makefile for vm selftests -LOCAL_HDRS += $(top_srcdir)/mm/gup_test.h +LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h + +include local_config.mk uname_M := $(shell uname -m 2>/dev/null || echo not) MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') @@ -152,8 +154,25 @@ warn_32bit_failure: endif endif +# ANON_COW_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. +$(OUTPUT)/anon_cow: LDLIBS += $(ANON_COW_EXTRA_LIBS) + $(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap $(OUTPUT)/ksm_tests: LDLIBS += -lnuma $(OUTPUT)/migration: LDLIBS += -lnuma + +local_config.mk local_config.h: check_config.sh + /bin/sh ./check_config.sh $(CC) + +EXTRA_CLEAN += local_config.mk local_config.h + +ifeq ($(ANON_COW_EXTRA_LIBS),) +all: warn_missing_liburing + +warn_missing_liburing: + @echo ; \ + echo "Warning: missing liburing support. Some COW tests will be skipped." ; \ + echo +endif diff --git a/tools/testing/selftests/vm/anon_cow.c b/tools/testing/selftests/vm/anon_cow.c index 78dfdd983380..01417a604eda 100644 --- a/tools/testing/selftests/vm/anon_cow.c +++ b/tools/testing/selftests/vm/anon_cow.c @@ -19,6 +19,11 @@ #include #include +#include "local_config.h" +#ifdef LOCAL_CONFIG_HAVE_LIBURING +#include +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + #include "../kselftest.h" #include "vm_util.h" @@ -334,6 +339,170 @@ static void test_vmsplice_after_fork(char *mem, size_t size) do_test_vmsplice_in_parent(mem, size, false); } +#ifdef LOCAL_CONFIG_HAVE_LIBURING +static void do_test_iouring(char *mem, size_t size, bool use_fork) +{ + struct comm_pipes comm_pipes; + struct io_uring_cqe *cqe; + struct io_uring_sqe *sqe; + struct io_uring ring; + ssize_t cur, total; + struct iovec iov; + char *buf, *tmp; + int ret, fd; + FILE *file; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + file = tmpfile(); + if (!file) { + ksft_test_result_fail("tmpfile() failed\n"); + goto close_comm_pipes; + } + fd = fileno(file); + assert(fd); + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + goto close_file; + } + + /* Skip on errors, as we might just lack kernel support. */ + ret = io_uring_queue_init(1, &ring, 0); + if (ret < 0) { + ksft_test_result_skip("io_uring_queue_init() failed\n"); + goto free_tmp; + } + + /* + * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN + * | FOLL_LONGTERM the range. + * + * Skip on errors, as we might just lack kernel support or might not + * have sufficient MEMLOCK permissions. + */ + iov.iov_base = mem; + iov.iov_len = size; + ret = io_uring_register_buffers(&ring, &iov, 1); + if (ret) { + ksft_test_result_skip("io_uring_register_buffers() failed\n"); + goto queue_exit; + } + + if (use_fork) { + /* + * fork() and keep the child alive until we're done. Note that + * we expect the pinned page to not get shared with the child. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto unregister_buffers; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + } else { + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). + */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto unregister_buffers; + } + } + + /* + * Modify the page and write page content as observed by the fixed + * buffer pin to the file so we can verify it. + */ + memset(mem, 0xff, size); + sqe = io_uring_get_sqe(&ring); + if (!sqe) { + ksft_test_result_fail("io_uring_get_sqe() failed\n"); + goto quit_child; + } + io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); + + ret = io_uring_submit(&ring); + if (ret < 0) { + ksft_test_result_fail("io_uring_submit() failed\n"); + goto quit_child; + } + + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret < 0) { + ksft_test_result_fail("io_uring_wait_cqe() failed\n"); + goto quit_child; + } + + if (cqe->res != size) { + ksft_test_result_fail("write_fixed failed\n"); + goto quit_child; + } + io_uring_cqe_seen(&ring, cqe); + + /* Read back the file content to the temporary buffer. */ + total = 0; + while (total < size) { + cur = pread(fd, tmp + total, size - total, total); + if (cur < 0) { + ksft_test_result_fail("pread() failed\n"); + goto quit_child; + } + total += cur; + } + + /* Finally, check if we read what we expected. */ + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/W pin is reliable\n"); + +quit_child: + if (use_fork) { + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + } +unregister_buffers: + io_uring_unregister_buffers(&ring); +queue_exit: + io_uring_queue_exit(&ring); +free_tmp: + free(tmp); +close_file: + fclose(file); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_iouring_ro(char *mem, size_t size) +{ + do_test_iouring(mem, size, false); +} + +static void test_iouring_fork(char *mem, size_t size) +{ + do_test_iouring(mem, size, true); +} + +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + typedef void (*test_fn)(char *mem, size_t size); static void do_run_with_base_page(test_fn fn, bool swapout) @@ -660,6 +829,27 @@ static const struct test_case test_cases[] = { "vmsplice() + unmap in parent after fork()", test_vmsplice_after_fork, }, +#ifdef LOCAL_CONFIG_HAVE_LIBURING + /* + * Take a R/W longterm pin and then map the page R/O into the page + * table to trigger a write fault on next access. When modifying the + * page, the page content must be visible via the pin. + */ + { + "R/O-mapping a page registered as iouring fixed buffer", + test_iouring_ro, + }, + /* + * Take a R/W longterm pin and then fork() a child. When modifying the + * page, the page content must be visible via the pin. We expect the + * pinned page to not get shared with the child. + */ + { + "fork() with an iouring fixed buffer", + test_iouring_fork, + }, + +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ }; static void run_test_case(struct test_case const *test_case) diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh new file mode 100644 index 000000000000..9a44c6520925 --- /dev/null +++ b/tools/testing/selftests/vm/check_config.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Probe for libraries and create header files to record the results. Both C +# header files and Makefile include fragments are created. + +OUTPUT_H_FILE=local_config.h +OUTPUT_MKFILE=local_config.mk + +tmpname=$(mktemp) +tmpfile_c=${tmpname}.c +tmpfile_o=${tmpname}.o + +# liburing +echo "#include " > $tmpfile_c +echo "#include " >> $tmpfile_c +echo "int func(void) { return 0; }" >> $tmpfile_c + +CC=${1:?"Usage: $0 # example compiler: gcc"} +$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 + +if [ -f $tmpfile_o ]; then + echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE + echo "ANON_COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE +else + echo "// No liburing support found" > $OUTPUT_H_FILE + echo "# No liburing support found, so:" > $OUTPUT_MKFILE + echo "ANON_COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE +fi + +rm ${tmpname}.* From c77369b437f983a862bb6741814670d4ad38478c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 27 Sep 2022 13:01:19 +0200 Subject: [PATCH 029/313] mm/gup_test: start/stop/read functionality for PIN LONGTERM test We want an easy way to take a R/O or R/W longterm pin on a range and be able to observe the content of the pinned pages, so we can properly test how longterm puns interact with our COW logic. [david@redhat.com: silence a warning on 32-bit] Link: https://lkml.kernel.org/r/74adbb51-6e33-f636-8a9c-2ad87bd9007e@redhat.com [yang.lee@linux.alibaba.com: ./mm/gup_test.c:281:2-3: Unneeded semicolon] Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2455 Link: https://lkml.kernel.org/r/20221020024035.113619-1-yang.lee@linux.alibaba.com Link: https://lkml.kernel.org/r/20220927110120.106906-7-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Yang Li Cc: Andrea Arcangeli Cc: Christoph von Recklinghausen Cc: Don Dutile Cc: Jason Gunthorpe Cc: John Hubbard Cc: Mike Rapoport Cc: Nadav Amit Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/gup_test.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/gup_test.h | 12 +++++ 2 files changed, 153 insertions(+) diff --git a/mm/gup_test.c b/mm/gup_test.c index 12b0a91767d3..0d76d9b4bb5a 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -203,6 +203,135 @@ free_pages: return ret; } +static DEFINE_MUTEX(pin_longterm_test_mutex); +static struct page **pin_longterm_test_pages; +static unsigned long pin_longterm_test_nr_pages; + +static inline void pin_longterm_test_stop(void) +{ + if (pin_longterm_test_pages) { + if (pin_longterm_test_nr_pages) + unpin_user_pages(pin_longterm_test_pages, + pin_longterm_test_nr_pages); + kfree(pin_longterm_test_pages); + pin_longterm_test_pages = NULL; + pin_longterm_test_nr_pages = 0; + } +} + +static inline int pin_longterm_test_start(unsigned long arg) +{ + long nr_pages, cur_pages, addr, remaining_pages; + int gup_flags = FOLL_LONGTERM; + struct pin_longterm_test args; + struct page **pages; + int ret = 0; + bool fast; + + if (pin_longterm_test_pages) + return -EINVAL; + + if (copy_from_user(&args, (void __user *)arg, sizeof(args))) + return -EFAULT; + + if (args.flags & + ~(PIN_LONGTERM_TEST_FLAG_USE_WRITE|PIN_LONGTERM_TEST_FLAG_USE_FAST)) + return -EINVAL; + if (!IS_ALIGNED(args.addr | args.size, PAGE_SIZE)) + return -EINVAL; + if (args.size > LONG_MAX) + return -EINVAL; + nr_pages = args.size / PAGE_SIZE; + if (!nr_pages) + return -EINVAL; + + pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (args.flags & PIN_LONGTERM_TEST_FLAG_USE_WRITE) + gup_flags |= FOLL_WRITE; + fast = !!(args.flags & PIN_LONGTERM_TEST_FLAG_USE_FAST); + + if (!fast && mmap_read_lock_killable(current->mm)) { + kfree(pages); + return -EINTR; + } + + pin_longterm_test_pages = pages; + pin_longterm_test_nr_pages = 0; + + while (nr_pages - pin_longterm_test_nr_pages) { + remaining_pages = nr_pages - pin_longterm_test_nr_pages; + addr = args.addr + pin_longterm_test_nr_pages * PAGE_SIZE; + + if (fast) + cur_pages = pin_user_pages_fast(addr, remaining_pages, + gup_flags, pages); + else + cur_pages = pin_user_pages(addr, remaining_pages, + gup_flags, pages, NULL); + if (cur_pages < 0) { + pin_longterm_test_stop(); + ret = cur_pages; + break; + } + pin_longterm_test_nr_pages += cur_pages; + pages += cur_pages; + } + + if (!fast) + mmap_read_unlock(current->mm); + return ret; +} + +static inline int pin_longterm_test_read(unsigned long arg) +{ + __u64 user_addr; + unsigned long i; + + if (!pin_longterm_test_pages) + return -EINVAL; + + if (copy_from_user(&user_addr, (void __user *)arg, sizeof(user_addr))) + return -EFAULT; + + for (i = 0; i < pin_longterm_test_nr_pages; i++) { + void *addr = page_to_virt(pin_longterm_test_pages[i]); + + if (copy_to_user((void __user *)(unsigned long)user_addr, addr, + PAGE_SIZE)) + return -EFAULT; + user_addr += PAGE_SIZE; + } + return 0; +} + +static long pin_longterm_test_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + int ret = -EINVAL; + + if (mutex_lock_killable(&pin_longterm_test_mutex)) + return -EINTR; + + switch (cmd) { + case PIN_LONGTERM_TEST_START: + ret = pin_longterm_test_start(arg); + break; + case PIN_LONGTERM_TEST_STOP: + pin_longterm_test_stop(); + ret = 0; + break; + case PIN_LONGTERM_TEST_READ: + ret = pin_longterm_test_read(arg); + break; + } + + mutex_unlock(&pin_longterm_test_mutex); + return ret; +} + static long gup_test_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -217,6 +346,10 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd, case PIN_BASIC_TEST: case DUMP_USER_PAGES_TEST: break; + case PIN_LONGTERM_TEST_START: + case PIN_LONGTERM_TEST_STOP: + case PIN_LONGTERM_TEST_READ: + return pin_longterm_test_ioctl(filep, cmd, arg); default: return -EINVAL; } @@ -234,9 +367,17 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd, return 0; } +static int gup_test_release(struct inode *inode, struct file *file) +{ + pin_longterm_test_stop(); + + return 0; +} + static const struct file_operations gup_test_fops = { .open = nonseekable_open, .unlocked_ioctl = gup_test_ioctl, + .release = gup_test_release, }; static int __init gup_test_init(void) diff --git a/mm/gup_test.h b/mm/gup_test.h index 887ac1d5f5bc..5b37b54e8bea 100644 --- a/mm/gup_test.h +++ b/mm/gup_test.h @@ -10,6 +10,9 @@ #define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test) #define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test) #define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test) +#define PIN_LONGTERM_TEST_START _IOW('g', 7, struct pin_longterm_test) +#define PIN_LONGTERM_TEST_STOP _IO('g', 8) +#define PIN_LONGTERM_TEST_READ _IOW('g', 9, __u64) #define GUP_TEST_MAX_PAGES_TO_DUMP 8 @@ -30,4 +33,13 @@ struct gup_test { __u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP]; }; +#define PIN_LONGTERM_TEST_FLAG_USE_WRITE 1 +#define PIN_LONGTERM_TEST_FLAG_USE_FAST 2 + +struct pin_longterm_test { + __u64 addr; + __u64 size; + __u32 flags; +}; + #endif /* __GUP_TEST_H */ From 6f1405efc61b6e686bdf6e2e09b41d2d3a5c14e2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 27 Sep 2022 13:01:20 +0200 Subject: [PATCH 030/313] selftests/vm: anon_cow: add R/O longterm tests via gup_test Let's trigger a R/O longterm pin on three cases of R/O mapped anonymous pages: * exclusive (never shared) * shared (child still alive) * previously shared (child no longer alive) ... and make sure that the pin is reliable: whatever we write via the page tables has to be observable via the pin. Link: https://lkml.kernel.org/r/20220927110120.106906-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Christoph von Recklinghausen Cc: Don Dutile Cc: Jason Gunthorpe Cc: John Hubbard Cc: Mike Rapoport Cc: Nadav Amit Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/anon_cow.c | 210 ++++++++++++++++++++++++++ 1 file changed, 210 insertions(+) diff --git a/tools/testing/selftests/vm/anon_cow.c b/tools/testing/selftests/vm/anon_cow.c index 01417a604eda..705bd0b3db11 100644 --- a/tools/testing/selftests/vm/anon_cow.c +++ b/tools/testing/selftests/vm/anon_cow.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include "local_config.h" @@ -24,6 +25,7 @@ #include #endif /* LOCAL_CONFIG_HAVE_LIBURING */ +#include "../../../../mm/gup_test.h" #include "../kselftest.h" #include "vm_util.h" @@ -32,6 +34,7 @@ static int pagemap_fd; static size_t thpsize; static int nr_hugetlbsizes; static size_t hugetlbsizes[10]; +static int gup_fd; static void detect_thpsize(void) { @@ -503,6 +506,170 @@ static void test_iouring_fork(char *mem, size_t size) #endif /* LOCAL_CONFIG_HAVE_LIBURING */ +enum ro_pin_test { + RO_PIN_TEST_SHARED, + RO_PIN_TEST_PREVIOUSLY_SHARED, + RO_PIN_TEST_RO_EXCLUSIVE, +}; + +static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, + bool fast) +{ + struct pin_longterm_test args; + struct comm_pipes comm_pipes; + char *tmp, buf; + __u64 tmp_val; + int ret; + + if (gup_fd < 0) { + ksft_test_result_skip("gup_test not available\n"); + return; + } + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + return; + } + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free_tmp; + } + + switch (test) { + case RO_PIN_TEST_SHARED: + case RO_PIN_TEST_PREVIOUSLY_SHARED: + /* + * Share the pages with our child. As the pages are not pinned, + * this should just work. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + /* Wait until our child is ready. */ + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { + /* + * Tell the child to quit now and wait until it quit. + * The pages should now be mapped R/O into our page + * tables, but they are no longer shared. + */ + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + } + break; + case RO_PIN_TEST_RO_EXCLUSIVE: + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). + */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Take a R/O pin. This should trigger unsharing. */ + args.addr = (__u64)mem; + args.size = size; + args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); + if (ret) { + if (errno == EINVAL) + ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); + else + ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); + goto wait; + } + + /* Modify the page. */ + memset(mem, 0xff, size); + + /* + * Read back the content via the pin to the temporary buffer and + * test if we observed the modification. + */ + tmp_val = (__u64)tmp; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); + if (ret) + ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); + else + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/O pin is reliable\n"); + + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); + if (ret) + ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); +wait: + switch (test) { + case RO_PIN_TEST_SHARED: + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + break; + default: + break; + } +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free_tmp: + free(tmp); +} + +static void test_ro_pin_on_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); +} + +static void test_ro_fast_pin_on_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); +} + +static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); +} + +static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); +} + +static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); +} + +static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); +} + typedef void (*test_fn)(char *mem, size_t size); static void do_run_with_base_page(test_fn fn, bool swapout) @@ -850,6 +1017,48 @@ static const struct test_case test_cases[] = { }, #endif /* LOCAL_CONFIG_HAVE_LIBURING */ + /* + * Take a R/O longterm pin on a R/O-mapped shared anonymous page. + * When modifying the page via the page table, the page content change + * must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped shared page", + test_ro_pin_on_shared, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped shared page", + test_ro_fast_pin_on_shared, + }, + /* + * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that + * was previously shared. When modifying the page via the page table, + * the page content change must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped previously-shared page", + test_ro_pin_on_ro_previously_shared, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped previously-shared page", + test_ro_fast_pin_on_ro_previously_shared, + }, + /* + * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page. + * When modifying the page via the page table, the page content change + * must be visible via the pin. + */ + { + "R/O GUP pin on R/O-mapped exclusive page", + test_ro_pin_on_ro_exclusive, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O GUP-fast pin on R/O-mapped exclusive page", + test_ro_fast_pin_on_ro_exclusive, + }, }; static void run_test_case(struct test_case const *test_case) @@ -902,6 +1111,7 @@ int main(int argc, char **argv) ksft_print_header(); ksft_set_plan(nr_test_cases * tests_per_test_case()); + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) ksft_exit_fail_msg("opening pagemap failed\n"); From f3ad032c2d06be970d78384885c63917974a4af4 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 27 Sep 2022 14:38:26 +0800 Subject: [PATCH 031/313] mm: rmap: rename page_not_mapped() to folio_not_mapped() Since commit 2f031c6f042c ("mm/rmap: Convert rmap_walk() to take a folio"), page_not_mapped() takes folio as parameter, rename it to be consistent. Link: https://lkml.kernel.org/r/20220927063826.159590-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/rmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 92ed6fe3d038..9bba65b30e4d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1801,7 +1801,7 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return vma_is_temporary_stack(vma); } -static int page_not_mapped(struct folio *folio) +static int folio_not_mapped(struct folio *folio) { return !folio_mapped(folio); } @@ -1822,7 +1822,7 @@ void try_to_unmap(struct folio *folio, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, - .done = page_not_mapped, + .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; @@ -2150,7 +2150,7 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, .arg = (void *)flags, - .done = page_not_mapped, + .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; @@ -2297,7 +2297,7 @@ static bool folio_make_device_exclusive(struct folio *folio, }; struct rmap_walk_control rwc = { .rmap_one = page_make_device_exclusive_one, - .done = page_not_mapped, + .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, .arg = &args, }; From f9e60beceee5c85dc9d5e71c1090cfed97ab0897 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:40 +0800 Subject: [PATCH 032/313] cgroup/cpuset: use hotplug_memory_notifier() directly Patch series "mm: Use hotplug_memory_notifier() instead of register_hotmemory_notifier()", v4. Commit f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") introduced register_hotmemory_notifier() to avoid a compile problem with gcc-4.4.4: When CONFIG_MEMORY_HOTPLUG=n, we don't want the memory-hotplug notifier handlers to be included in the .o files, for space reasons. The existing hotplug_memory_notifier() tries to handle this but testing with gcc-4.4.4 shows that it doesn't work - the hotplug functions are still present in the .o files. Since commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") has already updated the minimum gcc version to 5.1. The previous problem mentioned in f02c69680088 does not exist. So we can now revert to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). In the last patch, we move all hotplug memory notifier priority to same file for easy sorting. This patch (of 8): Commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") updated the minimum gcc version to 5.1. So the problem mentioned in f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") no longer exist. So we can now switch to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). Link: https://lkml.kernel.org/r/20220923033347.3935160-1-liushixin2@huawei.com Link: https://lkml.kernel.org/r/20220923033347.3935160-2-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- kernel/cgroup/cpuset.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b474289c15b8..0c6db6a4f427 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3630,11 +3630,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block cpuset_track_online_nodes_nb = { - .notifier_call = cpuset_track_online_nodes, - .priority = 10, /* ??! */ -}; - /** * cpuset_init_smp - initialize cpus_allowed * @@ -3652,7 +3647,7 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + hotplug_memory_notifier(cpuset_track_online_nodes, 10); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); From 5d89c224328bce791d051bf60aa92d90bae93c01 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:41 +0800 Subject: [PATCH 033/313] fs/proc/kcore.c: use hotplug_memory_notifier() directly Commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") updated the minimum gcc version to 5.1. So the problem mentioned in f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") no longer exist. So we can now switch to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). Link: https://lkml.kernel.org/r/20220923033347.3935160-3-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index dff921f7ca33..7692a360972d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -638,10 +637,6 @@ static int __meminit kcore_callback(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block kcore_callback_nb __meminitdata = { - .notifier_call = kcore_callback, - .priority = 0, -}; static struct kcore_list kcore_vmalloc; @@ -694,7 +689,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - register_hotmemory_notifier(&kcore_callback_nb); + hotplug_memory_notifier(kcore_callback, 0); return 0; } From 946d5f9c9dcdbaedcd664fad08cea7910139d10f Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:42 +0800 Subject: [PATCH 034/313] mm/slub.c: use hotplug_memory_notifier() directly Commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") updated the minimum gcc version to 5.1. So the problem mentioned in f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") no longer exist. So we can now switch to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). Link: https://lkml.kernel.org/r/20220923033347.3935160-4-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- mm/slub.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 157527d7101b..f37e6a51e233 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4771,11 +4771,6 @@ static int slab_memory_callback(struct notifier_block *self, return ret; } -static struct notifier_block slab_memory_callback_nb = { - .notifier_call = slab_memory_callback, - .priority = SLAB_CALLBACK_PRI, -}; - /******************************************************************** * Basic setup of slabs *******************************************************************/ @@ -4841,7 +4836,7 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); - register_hotmemory_notifier(&slab_memory_callback_nb); + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; From cddb8d09ff1e477de8236a061a5017b21bab3c14 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:43 +0800 Subject: [PATCH 035/313] mm/mmap: use hotplug_memory_notifier() directly Commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") updated the minimum gcc version to 5.1. So the problem mentioned in f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") no longer exist. So we can now switch to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). Link: https://lkml.kernel.org/r/20220923033347.3935160-5-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- mm/mmap.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 2def55555e05..3f47fd57d165 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3749,13 +3749,9 @@ static int reserve_mem_notifier(struct notifier_block *nb, return NOTIFY_OK; } -static struct notifier_block reserve_mem_nb = { - .notifier_call = reserve_mem_notifier, -}; - static int __meminit init_reserve_notifier(void) { - if (register_hotmemory_notifier(&reserve_mem_nb)) + if (hotplug_memory_notifier(reserve_mem_notifier, 0)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; From d46722ef1c090541d56f706f3a90f3f2e84cdf0c Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:44 +0800 Subject: [PATCH 036/313] mm/mm_init.c: use hotplug_memory_notifier() directly Commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") updated the minimum gcc version to 5.1. So the problem mentioned in f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") no longer exist. So we can now switch to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). Link: https://lkml.kernel.org/r/20220923033347.3935160-6-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- mm/mm_init.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 0d7b2bd2454a..44aadc162d1f 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -178,16 +178,10 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block compute_batch_nb __meminitdata = { - .notifier_call = mm_compute_batch_notifier, - .priority = IPC_CALLBACK_PRI, /* use lowest priority */ -}; - static int __init mm_compute_batch_init(void) { mm_compute_batch(sysctl_overcommit_memory); - register_hotmemory_notifier(&compute_batch_nb); - + hotplug_memory_notifier(mm_compute_batch_notifier, IPC_CALLBACK_PRI); return 0; } From 82f8661a7982efea3e4437777e9f914781fac640 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:45 +0800 Subject: [PATCH 037/313] ACPI: HMAT: use hotplug_memory_notifier() directly Commit 76ae847497bc52 ("Documentation: raise minimum supported version of GCC to 5.1") updated the minimum gcc version to 5.1. So the problem mentioned in f02c69680088 ("include/linux/memory.h: implement register_hotmemory_notifier()") no longer exist. So we can now switch to use hotplug_memory_notifier() directly rather than register_hotmemory_notifier(). Link: https://lkml.kernel.org/r/20220923033347.3935160-7-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- drivers/acpi/numa/hmat.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 23f49a2f4d14..0ecefb604734 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -767,11 +767,6 @@ static int hmat_callback(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block hmat_callback_nb = { - .notifier_call = hmat_callback, - .priority = 2, -}; - static __init void hmat_free_structures(void) { struct memory_target *target, *tnext; @@ -854,7 +849,7 @@ static __init int hmat_init(void) hmat_register_targets(); /* Keep the table and structures if the notifier may use them */ - if (!register_hotmemory_notifier(&hmat_callback_nb)) + if (!hotplug_memory_notifier(hmat_callback, 2)) return 0; out_put: hmat_free_structures(); From eafd296e0cc0cc03b4ae01c2b3b07273514d757c Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:46 +0800 Subject: [PATCH 038/313] memory: remove unused register_hotmemory_notifier() Remove unused register_hotmemory_notifier(). Link: https://lkml.kernel.org/r/20220923033347.3935160-8-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- include/linux/memory.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/memory.h b/include/linux/memory.h index aa619464a1df..98d2a2ebcc10 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -19,7 +19,6 @@ #include #include #include -#include #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) @@ -136,9 +135,6 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) { return 0; } -/* These aren't inline functions due to a GCC bug. */ -#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; }) -#define unregister_hotmemory_notifier(nb) ({ (void)(nb); }) #else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); @@ -166,8 +162,6 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, { .notifier_call = fn, .priority = pri };\ register_memory_notifier(&fn##_mem_nb); \ }) -#define register_hotmemory_notifier(nb) register_memory_notifier(nb) -#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb) #ifdef CONFIG_NUMA void memory_block_add_nid(struct memory_block *mem, int nid, From 1eeaa4fd39b0b1b3e986f8eab6978e69b01e3c5e Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 11:33:47 +0800 Subject: [PATCH 039/313] memory: move hotplug memory notifier priority to same file for easy sorting The priority of hotplug memory callback is defined in a different file. And there are some callers using numbers directly. Collect them together into include/linux/memory.h for easy reading. This allows us to sort their priorities more intuitively without additional comments. Link: https://lkml.kernel.org/r/20220923033347.3935160-9-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Christoph Lameter Cc: David Hildenbrand Cc: Kefeng Wang Cc: Waiman Long Cc: zefan li Signed-off-by: Andrew Morton --- drivers/acpi/numa/hmat.c | 2 +- fs/proc/kcore.c | 2 +- include/linux/memory-tiers.h | 1 - include/linux/memory.h | 9 +++++++-- kernel/cgroup/cpuset.c | 2 +- mm/kasan/shadow.c | 2 +- mm/ksm.c | 2 +- mm/memory-tiers.c | 2 +- mm/mm_init.c | 2 +- mm/mmap.c | 2 +- mm/page_ext.c | 2 +- 11 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 0ecefb604734..139e3b41653e 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -849,7 +849,7 @@ static __init int hmat_init(void) hmat_register_targets(); /* Keep the table and structures if the notifier may use them */ - if (!hotplug_memory_notifier(hmat_callback, 2)) + if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI)) return 0; out_put: hmat_free_structures(); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 7692a360972d..98f3289556e4 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -689,7 +689,7 @@ static int __init proc_kcore_init(void) add_modules_range(); /* Store direct-map area from physical memory map */ kcore_update_ram(); - hotplug_memory_notifier(kcore_callback, 0); + hotplug_memory_notifier(kcore_callback, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 965009aa01d7..fc9647b1b4f9 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -18,7 +18,6 @@ * the same memory tier. */ #define MEMTIER_ADISTANCE_DRAM ((4 * MEMTIER_CHUNK_SIZE) + (MEMTIER_CHUNK_SIZE >> 1)) -#define MEMTIER_HOTPLUG_PRIO 100 struct memory_tier; struct memory_dev_type { diff --git a/include/linux/memory.h b/include/linux/memory.h index 98d2a2ebcc10..463662ef7614 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -112,8 +112,13 @@ struct mem_section; * Priorities for the hotplug memory callback routines (stored in decreasing * order in the callback chain) */ -#define SLAB_CALLBACK_PRI 1 -#define IPC_CALLBACK_PRI 10 +#define DEFAULT_CALLBACK_PRI 0 +#define SLAB_CALLBACK_PRI 1 +#define HMAT_CALLBACK_PRI 2 +#define MM_COMPUTE_BATCH_PRI 10 +#define CPUSET_CALLBACK_PRI 10 +#define MEMTIER_HOTPLUG_PRI 100 +#define KSM_CALLBACK_PRI 100 #ifndef CONFIG_MEMORY_HOTPLUG static inline void memory_dev_init(void) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0c6db6a4f427..3ea2e836e93e 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3647,7 +3647,7 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, 10); + hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 0e3648b603a6..2fba1f51f042 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -244,7 +244,7 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - hotplug_memory_notifier(kasan_mem_notifier, 0); + hotplug_memory_notifier(kasan_mem_notifier, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/mm/ksm.c b/mm/ksm.c index c19fcca9bc03..7ba97f86d831 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3211,7 +3211,7 @@ static int __init ksm_init(void) #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ - hotplug_memory_notifier(ksm_memory_callback, 100); + hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index fa8c9d07f9ce..939e200c283b 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -664,7 +664,7 @@ static int __init memory_tier_init(void) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); - hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO); + hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); return 0; } subsys_initcall(memory_tier_init); diff --git a/mm/mm_init.c b/mm/mm_init.c index 44aadc162d1f..c1883362e71d 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -181,7 +181,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, static int __init mm_compute_batch_init(void) { mm_compute_batch(sysctl_overcommit_memory); - hotplug_memory_notifier(mm_compute_batch_notifier, IPC_CALLBACK_PRI); + hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 3f47fd57d165..c697771d406b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3751,7 +3751,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, static int __meminit init_reserve_notifier(void) { - if (hotplug_memory_notifier(reserve_mem_notifier, 0)) + if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; diff --git a/mm/page_ext.c b/mm/page_ext.c index affe80243b6d..b2ff5c9129f4 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -513,7 +513,7 @@ void __init page_ext_init(void) cond_resched(); } } - hotplug_memory_notifier(page_ext_callback, 0); + hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; From 3c0c9bc9c9596d5cd69529da822526f88673365b Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:47 +0200 Subject: [PATCH 040/313] mm: vmalloc: add alloc_vmap_area trace event Patch series "Add basic trace events for vmap/vmalloc (v2)", v2. This small series add some basic trace events for the vmap/vmalloc code. Since currently we lack any, sometimes it is hard to start debuging vmap code if an issue is reported or occured. For example https://lore.kernel.org/linux-mm/Y0p8BZIiDXLQbde%2F@pc636/T/ The final patch adds two reviewers for vmalloc code. This patch (of 7): It is for debug purposes and for validation of passed parameters. Link: https://lkml.kernel.org/r/20221018181053.434508-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20221018181053.434508-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- include/trace/events/vmalloc.h | 56 ++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 include/trace/events/vmalloc.h diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h new file mode 100644 index 000000000000..39fbd77c91e7 --- /dev/null +++ b/include/trace/events/vmalloc.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vmalloc + +#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VMALLOC_H + +#include + +/** + * alloc_vmap_area - called when a new vmap allocation occurs + * @addr: an allocated address + * @size: a requested size + * @align: a requested alignment + * @vstart: a requested start range + * @vend: a requested end range + * @failed: an allocation failed or not + * + * This event is used for a debug purpose, it can give an extra + * information for a developer about how often it occurs and which + * parameters are passed for further validation. + */ +TRACE_EVENT(alloc_vmap_area, + + TP_PROTO(unsigned long addr, unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, int failed), + + TP_ARGS(addr, size, align, vstart, vend, failed), + + TP_STRUCT__entry( + __field(unsigned long, addr) + __field(unsigned long, size) + __field(unsigned long, align) + __field(unsigned long, vstart) + __field(unsigned long, vend) + __field(int, failed) + ), + + TP_fast_assign( + __entry->addr = addr; + __entry->size = size; + __entry->align = align; + __entry->vstart = vstart; + __entry->vend = vend; + __entry->failed = failed; + ), + + TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d", + __entry->addr, __entry->size, __entry->align, + __entry->vstart, __entry->vend, __entry->failed) +); + +#endif /* _TRACE_VMALLOC_H */ + +/* This part must be outside protection */ +#include From b3a5a7b099162e1b11db459f8128d4374f7d1c05 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:48 +0200 Subject: [PATCH 041/313] mm: vmalloc: add purge_vmap_area_lazy trace event It is for debug purposes to track number of freed vmap areas including a range it occurs on. Link: https://lkml.kernel.org/r/20221018181053.434508-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- include/trace/events/vmalloc.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h index 39fbd77c91e7..afeb8003a0f2 100644 --- a/include/trace/events/vmalloc.h +++ b/include/trace/events/vmalloc.h @@ -50,6 +50,39 @@ TRACE_EVENT(alloc_vmap_area, __entry->vstart, __entry->vend, __entry->failed) ); +/** + * purge_vmap_area_lazy - called when vmap areas were lazily freed + * @start: purging start address + * @end: purging end address + * @npurged: numbed of purged vmap areas + * + * This event is used for a debug purpose. It gives some + * indication about start:end range and how many objects + * are released. + */ +TRACE_EVENT(purge_vmap_area_lazy, + + TP_PROTO(unsigned long start, unsigned long end, + unsigned int npurged), + + TP_ARGS(start, end, npurged), + + TP_STRUCT__entry( + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, npurged) + ), + + TP_fast_assign( + __entry->start = start; + __entry->end = end; + __entry->npurged = npurged; + ), + + TP_printk("start=0x%lx end=0x%lx num_purged=%u", + __entry->start, __entry->end, __entry->npurged) +); + #endif /* _TRACE_VMALLOC_H */ /* This part must be outside protection */ From fabc27f7649e070c4f6c742e436a51ff68c4a280 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:49 +0200 Subject: [PATCH 042/313] mm: vmalloc: add free_vmap_area_noflush trace event This event is used in order to validate/debug a start address of freed VA, number of currently outstanding and maximum allowed areas. Link: https://lkml.kernel.org/r/20221018181053.434508-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- include/trace/events/vmalloc.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/include/trace/events/vmalloc.h b/include/trace/events/vmalloc.h index afeb8003a0f2..ad4e02191f35 100644 --- a/include/trace/events/vmalloc.h +++ b/include/trace/events/vmalloc.h @@ -83,6 +83,40 @@ TRACE_EVENT(purge_vmap_area_lazy, __entry->start, __entry->end, __entry->npurged) ); +/** + * free_vmap_area_noflush - called when a vmap area is freed + * @va_start: a start address of VA + * @nr_lazy: number of current lazy pages + * @nr_lazy_max: number of maximum lazy pages + * + * This event is used for a debug purpose. It gives some + * indication about a VA that is released, number of current + * outstanding areas and a maximum allowed threshold before + * dropping all of them. + */ +TRACE_EVENT(free_vmap_area_noflush, + + TP_PROTO(unsigned long va_start, unsigned long nr_lazy, + unsigned long nr_lazy_max), + + TP_ARGS(va_start, nr_lazy, nr_lazy_max), + + TP_STRUCT__entry( + __field(unsigned long, va_start) + __field(unsigned long, nr_lazy) + __field(unsigned long, nr_lazy_max) + ), + + TP_fast_assign( + __entry->va_start = va_start; + __entry->nr_lazy = nr_lazy; + __entry->nr_lazy_max = nr_lazy_max; + ), + + TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu", + __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max) +); + #endif /* _TRACE_VMALLOC_H */ /* This part must be outside protection */ From cf243da6ab3987b65b95357194926a31415095b8 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:50 +0200 Subject: [PATCH 043/313] mm: vmalloc: use trace_alloc_vmap_area event This is for debug purpose and is called when an allocation attempt occurs. This event gives some information about: - start address of allocated area; - size that is requested; - alignment that is required; - vstart/vend restriction; - if an allocation fails. Link: https://lkml.kernel.org/r/20221018181053.434508-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ccaa461998f3..849563d334fb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -43,6 +43,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "internal.h" #include "pgalloc-track.h" @@ -1620,6 +1623,8 @@ retry: size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); + trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); + /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. From 6030fd5fd1f7baaac3661a5301cc7838d4e3b7f6 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:51 +0200 Subject: [PATCH 044/313] mm: vmalloc: use trace_purge_vmap_area_lazy event This is for debug purposes and is called when all outstanding areas are removed back to the vmap space. It gives some extra information about: - a start:end range where set of vmap ares were freed; - a number of purged areas which were backed off. [urezki@gmail.com: simplify return boolean expression] Link: https://lkml.kernel.org/r/20221020125247.5053-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20221018181053.434508-6-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 849563d334fb..1b1205ade1cf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1730,6 +1730,7 @@ static void purge_fragmented_blocks_allcpus(void); static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; + unsigned int num_purged_areas = 0; struct list_head local_purge_list; struct vmap_area *va, *n_va; @@ -1741,7 +1742,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) spin_unlock(&purge_vmap_area_lock); if (unlikely(list_empty(&local_purge_list))) - return false; + goto out; start = min(start, list_first_entry(&local_purge_list, @@ -1776,12 +1777,16 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); + num_purged_areas++; if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) cond_resched_lock(&free_vmap_area_lock); } spin_unlock(&free_vmap_area_lock); - return true; + +out: + trace_purge_vmap_area_lazy(start, end, num_purged_areas); + return num_purged_areas > 0; } /* From 8c4196fe810a6717a8f9e528083911703f6a5a51 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:52 +0200 Subject: [PATCH 045/313] mm: vmalloc: use trace_free_vmap_area_noflush event It is for debug purposes and is called when a vmap area gets freed. This event gives some indication about: - a start address of released area; - a current number of outstanding pages; - a maximum number of allowed outstanding pages. Link: https://lkml.kernel.org/r/20221018181053.434508-7-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1b1205ade1cf..ca71de7c9d77 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1821,6 +1821,8 @@ static void drain_vmap_area_work(struct work_struct *work) */ static void free_vmap_area_noflush(struct vmap_area *va) { + unsigned long nr_lazy_max = lazy_max_pages(); + unsigned long va_start = va->va_start; unsigned long nr_lazy; spin_lock(&vmap_area_lock); @@ -1838,8 +1840,10 @@ static void free_vmap_area_noflush(struct vmap_area *va) &purge_vmap_area_root, &purge_vmap_area_list); spin_unlock(&purge_vmap_area_lock); + trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); + /* After this point, we may free va at any time */ - if (unlikely(nr_lazy > lazy_max_pages())) + if (unlikely(nr_lazy > nr_lazy_max)) schedule_work(&drain_vmap_work); } From 65f199b2b40d82e48c79af2f4b5e9fafb290b231 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 18 Oct 2022 20:10:53 +0200 Subject: [PATCH 046/313] vmalloc: add reviewers for vmalloc code Add myself and Christoph Hellwig as reviewers for vmalloc. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20221018181053.434508-8-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (Google) Acked-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- MAINTAINERS | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 046ff06ff97f..631bd2e8c5c4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13294,10 +13294,20 @@ F: include/linux/memory_hotplug.h F: include/linux/mm.h F: include/linux/mmzone.h F: include/linux/pagewalk.h -F: include/linux/vmalloc.h F: mm/ F: tools/testing/selftests/vm/ +VMALLOC +M: Andrew Morton +R: Uladzislau Rezki +R: Christoph Hellwig +L: linux-mm@kvack.org +S: Maintained +W: http://www.linux-mm.org +T: git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: include/linux/vmalloc.h +F: mm/vmalloc.c + MEMORY HOT(UN)PLUG M: David Hildenbrand M: Oscar Salvador From e025ab842ec35225b1a8e163d1f311beb9e38ce9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 18 Oct 2022 15:40:14 +0800 Subject: [PATCH 047/313] mm: remove kern_addr_valid() completely Most architectures (except arm64/x86/sparc) simply return 1 for kern_addr_valid(), which is only used in read_kcore(), and it calls copy_from_kernel_nofault() which could check whether the address is a valid kernel address. So as there is no need for kern_addr_valid(), let's remove it. Link: https://lkml.kernel.org/r/20221018074014.185687-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Geert Uytterhoeven [m68k] Acked-by: Heiko Carstens [s390] Acked-by: Christoph Hellwig Acked-by: Helge Deller [parisc] Acked-by: Michael Ellerman [powerpc] Acked-by: Guo Ren [csky] Acked-by: Catalin Marinas [arm64] Cc: Alexander Gordeev Cc: Andy Lutomirski Cc: Anton Ivanov Cc: Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Greg Ungerer Cc: H. Peter Anvin Cc: Huacai Chen Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Johannes Berg Cc: Jonas Bonn Cc: Matt Turner Cc: Max Filippov Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Xuerui Wang Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/include/asm/pgtable.h | 2 - arch/arc/include/asm/pgtable-bits-arcv2.h | 2 - arch/arm/include/asm/pgtable-nommu.h | 2 - arch/arm/include/asm/pgtable.h | 4 -- arch/arm64/include/asm/pgtable.h | 2 - arch/arm64/mm/mmu.c | 47 ----------------------- arch/arm64/mm/pageattr.c | 3 +- arch/csky/include/asm/pgtable.h | 3 -- arch/hexagon/include/asm/page.h | 7 ---- arch/ia64/include/asm/pgtable.h | 16 -------- arch/loongarch/include/asm/pgtable.h | 2 - arch/m68k/include/asm/pgtable_mm.h | 2 - arch/m68k/include/asm/pgtable_no.h | 1 - arch/microblaze/include/asm/pgtable.h | 3 -- arch/mips/include/asm/pgtable.h | 2 - arch/nios2/include/asm/pgtable.h | 2 - arch/openrisc/include/asm/pgtable.h | 2 - arch/parisc/include/asm/pgtable.h | 15 -------- arch/powerpc/include/asm/pgtable.h | 7 ---- arch/riscv/include/asm/pgtable.h | 2 - arch/s390/include/asm/pgtable.h | 2 - arch/sh/include/asm/pgtable.h | 2 - arch/sparc/include/asm/pgtable_32.h | 6 --- arch/sparc/mm/init_32.c | 3 +- arch/sparc/mm/init_64.c | 1 - arch/um/include/asm/pgtable.h | 2 - arch/x86/include/asm/pgtable_32.h | 9 ----- arch/x86/include/asm/pgtable_64.h | 1 - arch/x86/mm/init_64.c | 41 -------------------- arch/xtensa/include/asm/pgtable.h | 2 - fs/proc/kcore.c | 26 +++++-------- 31 files changed, 11 insertions(+), 210 deletions(-) diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 3ea9661c09ff..9e45f6735d5d 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -313,8 +313,6 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - #define pte_ERROR(e) \ printk("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index b23be557403e..515e82db519f 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -120,8 +120,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - #ifdef CONFIG_TRANSPARENT_HUGEPAGE #include #endif diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h index d16aba48fa0a..25d8c7bb07e0 100644 --- a/arch/arm/include/asm/pgtable-nommu.h +++ b/arch/arm/include/asm/pgtable-nommu.h @@ -21,8 +21,6 @@ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) -#define kern_addr_valid(addr) (1) -/* FIXME */ /* * PMD_SHIFT determines the size of the area a second-level page table can map * PGDIR_SHIFT determines what a third-level page table entry can map diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 78a532068fec..00954ab1a039 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -298,10 +298,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -/* FIXME: this is not correct */ -#define kern_addr_valid(addr) (1) - /* * We provide our own arch_get_unmapped_area to cope with VIPT caches. */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 71a1af42f0e8..4873c1d6e7d0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1021,8 +1021,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, */ #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS) -extern int kern_addr_valid(unsigned long addr); - #ifdef CONFIG_ARM64_MTE #define __HAVE_ARCH_PREPARE_TO_SWAP diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9a7c38965154..556154d821bf 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -814,53 +814,6 @@ void __init paging_init(void) create_idmap(); } -/* - * Check whether a kernel address is valid (derived from arch/x86/). - */ -int kern_addr_valid(unsigned long addr) -{ - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp, pud; - pmd_t *pmdp, pmd; - pte_t *ptep, pte; - - addr = arch_kasan_reset_tag(addr); - if ((((long)addr) >> VA_BITS) != -1UL) - return 0; - - pgdp = pgd_offset_k(addr); - if (pgd_none(READ_ONCE(*pgdp))) - return 0; - - p4dp = p4d_offset(pgdp, addr); - if (p4d_none(READ_ONCE(*p4dp))) - return 0; - - pudp = pud_offset(p4dp, addr); - pud = READ_ONCE(*pudp); - if (pud_none(pud)) - return 0; - - if (pud_sect(pud)) - return pfn_valid(pud_pfn(pud)); - - pmdp = pmd_offset(pudp, addr); - pmd = READ_ONCE(*pmdp); - if (pmd_none(pmd)) - return 0; - - if (pmd_sect(pmd)) - return pfn_valid(pmd_pfn(pmd)); - - ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); - if (pte_none(pte)) - return 0; - - return pfn_valid(pte_pfn(pte)); -} - #ifdef CONFIG_MEMORY_HOTPLUG static void free_hotplug_page_range(struct page *page, size_t size, struct vmem_altmap *altmap) diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index d107c3d434e2..0a741a910a6a 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -201,8 +201,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) /* * This function is used to determine if a linear map page has been marked as - * not-valid. Walk the page table and check the PTE_VALID bit. This is based - * on kern_addr_valid(), which almost does what we need. + * not-valid. Walk the page table and check the PTE_VALID bit. * * Because this is only called on the kernel linear map, p?d_sect() implies * p?d_present(). When debug_pagealloc is enabled, sections mappings are diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index c3d9b92cbe61..77bc6caff2d2 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -249,9 +249,6 @@ extern void paging_init(void); void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte); -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -#define kern_addr_valid(addr) (1) - #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ remap_pfn_range(vma, vaddr, pfn, size, prot) diff --git a/arch/hexagon/include/asm/page.h b/arch/hexagon/include/asm/page.h index 7cbf719c578e..d7d4f9fca327 100644 --- a/arch/hexagon/include/asm/page.h +++ b/arch/hexagon/include/asm/page.h @@ -131,13 +131,6 @@ static inline void clear_page(void *page) #define page_to_virt(page) __va(page_to_phys(page)) -/* - * For port to Hexagon Virtual Machine, MAYBE we check for attempts - * to reference reserved HVM space, but in any case, the VM will be - * protected. - */ -#define kern_addr_valid(addr) (1) - #include #include /* XXX Todo: implement assembly-optimized version of getorder. */ diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 6925e28ae61d..01517a5e6778 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -181,22 +181,6 @@ ia64_phys_addr_valid (unsigned long addr) return (addr & (local_cpu_data->unimpl_pa_mask)) == 0; } -/* - * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel - * memory. For the return value to be meaningful, ADDR must be >= - * PAGE_OFFSET. This operation can be relatively expensive (e.g., - * require a hash-, or multi-level tree-lookup or something of that - * sort) but it guarantees to return TRUE only if accessing the page - * at that address does not cause an error. Note that there may be - * addresses for which kern_addr_valid() returns FALSE even though an - * access would not cause an error (e.g., this is typically true for - * memory mapped I/O regions. - * - * XXX Need to implement this for IA-64. - */ -#define kern_addr_valid(addr) (1) - - /* * Now come the defines and routines to manage and access the three-level * page table. diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 946704bee599..fc70b7041b76 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -421,8 +421,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, __update_tlb(vma, address, (pte_t *)pmdp); } -#define kern_addr_valid(addr) (1) - static inline unsigned long pmd_pfn(pmd_t pmd) { return (pmd_val(pmd) & _PFN_MASK) >> _PFN_SHIFT; diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index 9b4e2fe2ac82..b93c41fe2067 100644 --- a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -145,8 +145,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #endif /* !__ASSEMBLY__ */ -#define kern_addr_valid(addr) (1) - /* MMU-specific headers */ #ifdef CONFIG_SUN3 diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h index bce5ca56c388..fed58da3a6b6 100644 --- a/arch/m68k/include/asm/pgtable_no.h +++ b/arch/m68k/include/asm/pgtable_no.h @@ -20,7 +20,6 @@ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) -#define kern_addr_valid(addr) (1) #define pmd_offset(a, b) ((void *)0) #define PAGE_NONE __pgprot(0) diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index ba348e997dbb..42f5988e998b 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -416,9 +416,6 @@ extern unsigned long iopa(unsigned long addr); #define IOMAP_NOCACHE_NONSER 2 #define IOMAP_NO_COPYBACK 3 -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -#define kern_addr_valid(addr) (1) - void do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code); diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 6caec386ad2f..364a06033105 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -550,8 +550,6 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, __update_tlb(vma, address, pte); } -#define kern_addr_valid(addr) (1) - /* * Allow physical addresses to be fixed up to help 36-bit peripherals. */ diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index b3d45e815295..ab793bc517f5 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -249,8 +249,6 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -#define kern_addr_valid(addr) (1) - extern void __init paging_init(void); extern void __init mmu_init(void); diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index dcae8aea132f..6477c17b3062 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -395,8 +395,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - typedef pte_t *pte_addr_t; #endif /* __ASSEMBLY__ */ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index ecd028854469..bd09a44cfb2d 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -23,21 +23,6 @@ #include #include -/* - * kern_addr_valid(ADDR) tests if ADDR is pointing to valid kernel - * memory. For the return value to be meaningful, ADDR must be >= - * PAGE_OFFSET. This operation can be relatively expensive (e.g., - * require a hash-, or multi-level tree-lookup or something of that - * sort) but it guarantees to return TRUE only if accessing the page - * at that address does not cause an error. Note that there may be - * addresses for which kern_addr_valid() returns FALSE even though an - * access would not cause an error (e.g., this is typically true for - * memory mapped I/O regions. - * - * XXX Need to implement this for parisc. - */ -#define kern_addr_valid(addr) (1) - /* This is for the serialization of PxTLB broadcasts. At least on the N class * systems, only one PxTLB inter processor broadcast can be active at any one * time on the Merced bus. */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 283f40d05a4d..9972626ddaf6 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -81,13 +81,6 @@ void poking_init(void); extern unsigned long ioremap_bot; extern const pgprot_t protection_map[16]; -/* - * kern_addr_valid is intended to indicate whether an address is a valid - * kernel address. Most 32-bit archs define it as always true (like this) - * but most 64-bit archs actually perform a test. What should we do here? - */ -#define kern_addr_valid(addr) (1) - #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 #endif diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 7ec936910a96..c7993bdf749f 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -801,8 +801,6 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #endif /* !CONFIG_MMU */ -#define kern_addr_valid(addr) (1) /* FIXME */ - extern char _start[]; extern void *_dtb_early_va; extern uintptr_t _dtb_early_pa; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index f1cb9391190d..e1db07211818 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1773,8 +1773,6 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - extern int vmem_add_mapping(unsigned long start, unsigned long size); extern void vmem_remove_mapping(unsigned long start, unsigned long size); extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 6fb9ec54cf9b..3ce30becf6df 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -92,8 +92,6 @@ static inline unsigned long phys_addr_mask(void) typedef pte_t *pte_addr_t; -#define kern_addr_valid(addr) (1) - #define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) struct vm_area_struct; diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 8ff549004fac..5acc05b572e6 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -368,12 +368,6 @@ __get_iospace (unsigned long addr) } } -extern unsigned long *sparc_valid_addr_bitmap; - -/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */ -#define kern_addr_valid(addr) \ - (test_bit(__pa((unsigned long)(addr))>>20, sparc_valid_addr_bitmap)) - /* * For sparc32&64, the pfn in io_remap_pfn_range() carries in * its high 4 bits. These macros/functions put it there or get it from there. diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index d88e774c8eb4..9c0ea457bdf0 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -37,8 +37,7 @@ #include "mm_32.h" -unsigned long *sparc_valid_addr_bitmap; -EXPORT_SYMBOL(sparc_valid_addr_bitmap); +static unsigned long *sparc_valid_addr_bitmap; unsigned long phys_base; EXPORT_SYMBOL(phys_base); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index d6faee23c77d..04f9db0c3111 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1667,7 +1667,6 @@ bool kern_addr_valid(unsigned long addr) return pfn_valid(pte_pfn(*pte)); } -EXPORT_SYMBOL(kern_addr_valid); static unsigned long __ref kernel_map_hugepud(unsigned long vstart, unsigned long vend, diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 66bc3f99d9be..4e3052f2671a 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -298,8 +298,6 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -#define kern_addr_valid(addr) (1) - /* Clear a kernel PTE and flush it from the TLB */ #define kpte_clear_flush(ptep, vaddr) \ do { \ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 7c9c968a42ef..7d4ad8907297 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -47,15 +47,6 @@ do { \ #endif /* !__ASSEMBLY__ */ -/* - * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM - */ -#ifdef CONFIG_FLATMEM -#define kern_addr_valid(addr) (1) -#else -#define kern_addr_valid(kaddr) (0) -#endif - /* * This is used to calculate the .brk reservation for initial pagetables. * Enough space is reserved to allocate pagetables sufficient to cover all diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e479491da8d5..7929327abe00 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -240,7 +240,6 @@ static inline void native_pgd_clear(pgd_t *pgd) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) #define __swp_entry_to_pmd(x) ((pmd_t) { .pmd = (x).val }) -extern int kern_addr_valid(unsigned long addr); extern void cleanup_highmap(void); #define HAVE_ARCH_UNMAPPED_AREA diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3f040c6e5d13..e8db4edd7cc9 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1416,47 +1416,6 @@ void mark_rodata_ro(void) debug_checkwx(); } -int kern_addr_valid(unsigned long addr) -{ - unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - if (above != 0 && above != -1UL) - return 0; - - pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) - return 0; - - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) - return 0; - - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) - return 0; - - if (pud_large(*pud)) - return pfn_valid(pud_pfn(*pud)); - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - return 0; - - if (pmd_large(*pmd)) - return pfn_valid(pmd_pfn(*pmd)); - - pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) - return 0; - - return pfn_valid(pte_pfn(*pte)); -} - /* * Block size is the minimum amount of memory which can be hotplugged or * hotremoved. It must be power of two and must be equal or larger than diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 54f577c13afa..5b5484d707b2 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -386,8 +386,6 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) #else -#define kern_addr_valid(addr) (1) - extern void update_mmu_cache(struct vm_area_struct * vma, unsigned long address, pte_t *ptep); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 98f3289556e4..71157ee35c1a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -540,25 +540,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) fallthrough; case KCORE_VMEMMAP: case KCORE_TEXT: - if (kern_addr_valid(start)) { - /* - * Using bounce buffer to bypass the - * hardened user copy kernel text checks. - */ - if (copy_from_kernel_nofault(buf, (void *)start, - tsz)) { - if (clear_user(buffer, tsz)) { - ret = -EFAULT; - goto out; - } - } else { - if (copy_to_user(buffer, buf, tsz)) { - ret = -EFAULT; - goto out; - } + /* + * Using bounce buffer to bypass the + * hardened user copy kernel text checks. + */ + if (copy_from_kernel_nofault(buf, (void *)start, tsz)) { + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; } } else { - if (clear_user(buffer, tsz)) { + if (copy_to_user(buffer, buf, tsz)) { ret = -EFAULT; goto out; } From 9ee2c086271639d82ad8f6e96b91fa7991800c0a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 19 Oct 2022 19:33:29 +0100 Subject: [PATCH 048/313] mm/huge_memory: convert split_huge_pages_in_file() to use a folio Patch series "Remove FGP_HEAD flag". We have just two users left of the FGP_HEAD flag and both of them are better off; sometimes startlingly so as a result of conversion to use folios. This patch (of 4): Removes a number of calls to compound_head() and a call to pagecache_get_page(). Link: https://lkml.kernel.org/r/20221019183332.2802139-1-willy@infradead.org Link: https://lkml.kernel.org/r/20221019183332.2802139-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 561a42567477..bc5dda3d2ad7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3066,28 +3066,28 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, mapping = candidate->f_mapping; for (index = off_start; index < off_end; index += nr_pages) { - struct page *fpage = pagecache_get_page(mapping, index, - FGP_ENTRY | FGP_HEAD, 0); + struct folio *folio = __filemap_get_folio(mapping, index, + FGP_ENTRY, 0); nr_pages = 1; - if (xa_is_value(fpage) || !fpage) + if (xa_is_value(folio) || !folio) continue; - if (!is_transparent_hugepage(fpage)) + if (!folio_test_large(folio)) goto next; total++; - nr_pages = thp_nr_pages(fpage); + nr_pages = folio_nr_pages(folio); - if (!trylock_page(fpage)) + if (!folio_trylock(folio)) goto next; - if (!split_huge_page(fpage)) + if (!split_folio(folio)) split++; - unlock_page(fpage); + folio_unlock(folio); next: - put_page(fpage); + folio_put(folio); cond_resched(); } From dd8095b15a6034165bc48da1eb6d0acc73c1558a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 19 Oct 2022 19:33:30 +0100 Subject: [PATCH 049/313] mm/swap: convert find_get_incore_page to use folios Eliminates a use of FGP_HEAD and saves 35 bytes of text. Link: https://lkml.kernel.org/r/20221019183332.2802139-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap_state.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 438d0676c5be..44e3530520e8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -386,17 +386,14 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; - struct page *page = pagecache_get_page(mapping, index, - FGP_ENTRY | FGP_HEAD, 0); + struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0); - if (!page) - return page; - if (!xa_is_value(page)) - return find_subpage(page, index); + if (!xa_is_value(folio)) + goto out; if (!shmem_mapping(mapping)) return NULL; - swp = radix_to_swp_entry(page); + swp = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. */ if (non_swap_entry(swp)) return NULL; @@ -404,9 +401,13 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) si = get_swap_device(swp); if (!si) return NULL; - page = find_get_page(swap_address_space(swp), swp_offset(swp)); + index = swp_offset(swp); + folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); - return page; +out: + if (!folio) + return NULL; + return folio_file_page(folio, index); } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, From 524984ff66ee4b63264dffe568c3547a20b4136c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 19 Oct 2022 19:33:31 +0100 Subject: [PATCH 050/313] mm: convert find_get_incore_page() to filemap_get_incore_folio() Return the containing folio instead of the precise page. One of the callers wants the folio and the other can do the folio->page conversion itself. Nets 442 bytes of text size reduction, 478 bytes removed and 36 bytes added. Link: https://lkml.kernel.org/r/20221019183332.2802139-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 12 +++++++++--- mm/mincore.c | 10 +++++----- mm/swap.h | 8 +++++--- mm/swap_state.c | 15 +++++++-------- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f264a856ba86..fd707dcd6d04 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5648,15 +5648,21 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, static struct page *mc_handle_file_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent) { + unsigned long index; + struct folio *folio; + if (!vma->vm_file) /* anonymous vma */ return NULL; if (!(mc.flags & MOVE_FILE)) return NULL; - /* page is moved even if it's not RSS of this task(page-faulted). */ + /* folio is moved even if it's not RSS of this task(page-faulted). */ /* shmem/tmpfs may report page out on swap: account for that too. */ - return find_get_incore_page(vma->vm_file->f_mapping, - linear_page_index(vma, addr)); + index = linear_page_index(vma, addr); + folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); + if (!folio) + return NULL; + return folio_file_page(folio, index); } /** diff --git a/mm/mincore.c b/mm/mincore.c index e7e046fe17d7..a085a2aeabd8 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -52,7 +52,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) { unsigned char present = 0; - struct page *page; + struct folio *folio; /* * When tmpfs swaps out a page from a file, any process mapping that @@ -60,10 +60,10 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - page = find_get_incore_page(mapping, index); - if (page) { - present = PageUptodate(page); - put_page(page); + folio = filemap_get_incore_folio(mapping, index); + if (folio) { + present = folio_test_uptodate(folio); + folio_put(folio); } return present; diff --git a/mm/swap.h b/mm/swap.h index cc08c459c619..f78065c8ef52 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,7 +41,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index); struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, @@ -105,9 +106,10 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry, } static inline -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index) { - return find_get_page(mapping, index); + return filemap_get_folio(mapping, index); } static inline bool add_to_swap(struct folio *folio) diff --git a/mm/swap_state.c b/mm/swap_state.c index 44e3530520e8..40fe6f23e105 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -373,16 +373,17 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, } /** - * find_get_incore_page - Find and get a page from the page or swap caches. + * filemap_get_incore_folio - Find and get a folio from the page or swap caches. * @mapping: The address_space to search. * @index: The page cache index. * - * This differs from find_get_page() in that it will also look for the - * page in the swap cache. + * This differs from filemap_get_folio() in that it will also look for the + * folio in the swap cache. * - * Return: The found page or %NULL. + * Return: The found folio or %NULL. */ -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; @@ -405,9 +406,7 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); out: - if (!folio) - return NULL; - return folio_file_page(folio, index); + return folio; } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, From c5255b421fd04ba6a405809b7216a3b6ebd5493a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 19 Oct 2022 19:33:32 +0100 Subject: [PATCH 051/313] mm: remove FGP_HEAD This is no longer used; all callers have been converted to use folios instead. Somehow this manages to save 11 bytes of text. Link: https://lkml.kernel.org/r/20221019183332.2802139-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ++--- mm/folio-compat.c | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 060ee98474ef..b33ab86d5dca 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -504,9 +504,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 -#define FGP_HEAD 0x00000080 -#define FGP_ENTRY 0x00000100 -#define FGP_STABLE 0x00000200 +#define FGP_ENTRY 0x00000080 +#define FGP_STABLE 0x00000100 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 8ae39c06da62..bac2a366aada 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -108,7 +108,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); - if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio)) + if (!folio || xa_is_value(folio)) return &folio->page; return folio_file_page(folio, index); } From 93d38b72e4c1a518f46fd6dfcb5ad7c5003fa372 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Oct 2022 11:49:41 +0800 Subject: [PATCH 052/313] nios2: remove unused INIT_MMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: cleanup with VM_ACCESS_FLAGS". This patch (of 5): It seems that INIT_MMAP is gone in 2.4.10, not sure, anyways, it is useless now, kill it. Link: https://lkml.kernel.org/r/20221019034945.93081-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20221019034945.93081-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Dinh Nguyen Cc: Dave Hansen Cc: Jarkko Sakkinen Cc: Kefeng Wang Cc: Alex Deucher Cc: "Christian König" Cc: Daniel Vetter Cc: David Airlie Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton --- arch/nios2/include/asm/processor.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h index 8916d93d5c2d..eb44130364a9 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -50,9 +50,6 @@ struct thread_struct { unsigned long kpsr; }; -#define INIT_MMAP \ - { &init_mm, (0), (0), __pgprot(0x0), VM_READ | VM_WRITE | VM_EXEC } - # define INIT_THREAD { \ .kregs = NULL, \ .ksp = 0, \ From 4f20566f5c0f42c451f6a43be9bfa6f0b3d142df Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Oct 2022 11:49:42 +0800 Subject: [PATCH 053/313] x86/sgx: use VM_ACCESS_FLAGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify VM_READ|VM_WRITE|VM_EXEC with VM_ACCESS_FLAGS. Link: https://lkml.kernel.org/r/20221019034945.93081-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Jarkko Sakkinen Cc: Dave Hansen Cc: Alex Deucher Cc: "Christian König" Cc: Daniel Vetter Cc: David Airlie Cc: Dinh Nguyen Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton --- arch/x86/kernel/cpu/sgx/encl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 1ec20807de1e..6225c525372d 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -268,7 +268,7 @@ static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl, unsigned long addr, unsigned long vm_flags) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; struct sgx_encl_page *entry; entry = xa_load(&encl->page_array, PFN_DOWN(addr)); @@ -502,7 +502,7 @@ static void sgx_vma_open(struct vm_area_struct *vma) int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start, unsigned long end, unsigned long vm_flags) { - unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC); + unsigned long vm_prot_bits = vm_flags & VM_ACCESS_FLAGS; struct sgx_encl_page *page; unsigned long count = 0; int ret = 0; From e39ee675f42e993bbf1c04b1ad7526db820ccdce Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Oct 2022 11:49:43 +0800 Subject: [PATCH 054/313] mm: mprotect: use VM_ACCESS_FLAGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify VM_READ|VM_WRITE|VM_EXEC with VM_ACCESS_FLAGS. Link: https://lkml.kernel.org/r/20221019034945.93081-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Alex Deucher Cc: "Christian König" Cc: Daniel Vetter Cc: Dave Hansen Cc: David Airlie Cc: Dinh Nguyen Cc: Jarkko Sakkinen Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton --- mm/mprotect.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 668bfaa6ed2a..99762403cc8f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -756,8 +756,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, * If a permission is not passed to mprotect(), it must be * cleared from the VMA. */ - mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC | - VM_FLAGS_CLEAR; + mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR; new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); newflags = calc_vm_prot_bits(prot, new_vma_pkey); From d7e679b6f9d9f7337f0fdc5011f2ecc9b16f821b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Oct 2022 11:49:44 +0800 Subject: [PATCH 055/313] mm: debug_vm_pgtable: use VM_ACCESS_FLAGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Directly use VM_ACCESS_FLAGS instead VMFLAGS. Link: https://lkml.kernel.org/r/20221019034945.93081-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Alex Deucher Cc: "Christian König" Cc: Daniel Vetter Cc: Dave Hansen Cc: David Airlie Cc: Dinh Nguyen Cc: Jarkko Sakkinen Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index dc7df1254f0a..2b61fde8c38c 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -38,11 +38,7 @@ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. - */ - -#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC) - -/* + * * On s390 platform, the lower 4 bits are used to identify given page table * entry type. But these bits might affect the ability to clear entries with * pxx_clear() because of how dynamic page table folding works on s390. So @@ -1125,7 +1121,7 @@ static int __init init_args(struct pgtable_debug_args *args) */ memset(args, 0, sizeof(*args)); args->vaddr = get_random_vaddr(); - args->page_prot = vm_get_page_prot(VMFLAGS); + args->page_prot = vm_get_page_prot(VM_ACCESS_FLAGS); args->page_prot_none = vm_get_page_prot(VM_NONE); args->is_contiguous_page = false; args->pud_pfn = ULONG_MAX; From cc03817c0e8417419ede18a8e0749c5b9699b135 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Oct 2022 11:49:45 +0800 Subject: [PATCH 056/313] amdgpu: use VM_ACCESS_FLAGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify VM_READ|VM_WRITE|VM_EXEC with VM_ACCESS_FLAGS. Link: https://lkml.kernel.org/r/20221019034945.93081-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Alex Deucher Cc: "Christian König" Cc: "Pan, Xinhui" Cc: David Airlie Cc: Daniel Vetter Cc: Dave Hansen Cc: Dinh Nguyen Cc: Jarkko Sakkinen Signed-off-by: Andrew Morton --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 8ef31d687ef3..4728be161828 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -255,7 +255,7 @@ static int amdgpu_gem_object_mmap(struct drm_gem_object *obj, struct vm_area_str * becoming writable and makes is_cow_mapping(vm_flags) false. */ if (is_cow_mapping(vma->vm_flags) && - !(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))) + !(vma->vm_flags & VM_ACCESS_FLAGS)) vma->vm_flags &= ~VM_MAYWRITE; return drm_gem_ttm_mmap(obj, vma); From 4781593d5dbae50500d1c7975be03b590ae2b92a Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 20 Oct 2022 15:38:32 -0400 Subject: [PATCH 057/313] mm/hugetlb: unify clearing of RestoreReserve for private pages A trivial cleanup to move clearing of RestoreReserve into adding anon rmap of private hugetlb mappings. It matches with the shared mappings where we only clear the bit when adding into page cache, rather than spreading it around the code paths. Link: https://lkml.kernel.org/r/20221020193832.776173-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 ++++---------- mm/rmap.c | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0af18c1e4b31..d11e92117d4a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4775,7 +4775,6 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr hugepage_add_new_anon_rmap(new_page, vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); - ClearHPageRestoreReserve(new_page); SetHPageMigratable(new_page); } @@ -5438,8 +5437,6 @@ retry_avoidcopy: spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { - ClearHPageRestoreReserve(new_page); - /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); @@ -5734,10 +5731,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (!pte_same(huge_ptep_get(ptep), old_pte)) goto backout; - if (anon_rmap) { - ClearHPageRestoreReserve(page); + if (anon_rmap) hugepage_add_new_anon_rmap(page, vma, haddr); - } else + else page_dup_file_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); @@ -6120,12 +6116,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) { + if (page_in_pagecache) page_dup_file_rmap(page, true); - } else { - ClearHPageRestoreReserve(page); + else hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); - } /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY diff --git a/mm/rmap.c b/mm/rmap.c index 9bba65b30e4d..3b2d18bbdc44 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2571,7 +2571,7 @@ void hugepage_add_new_anon_rmap(struct page *page, BUG_ON(address < vma->vm_start || address >= vma->vm_end); atomic_set(compound_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); - + ClearHPageRestoreReserve(page); __page_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ From 6e2be1f2ebcea42ed6044432f72f32434e60b34d Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:52 +0200 Subject: [PATCH 058/313] compiler-gcc: be consistent with underscores use for `no_sanitize` Patch series "compiler-gcc: be consistent with underscores use for `no_sanitize`". This patch (of 5): Other macros that define shorthands for attributes in e.g. `compiler_attributes.h` and elsewhere use underscores. Link: https://lkml.kernel.org/r/20221021115956.9947-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index f55a37efdb97..b9530d3515ac 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -83,25 +83,25 @@ #endif #if __has_attribute(__no_sanitize_address__) -#define __no_sanitize_address __attribute__((no_sanitize_address)) +#define __no_sanitize_address __attribute__((__no_sanitize_address__)) #else #define __no_sanitize_address #endif #if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) -#define __no_sanitize_thread __attribute__((no_sanitize_thread)) +#define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) #else #define __no_sanitize_thread #endif #if __has_attribute(__no_sanitize_undefined__) -#define __no_sanitize_undefined __attribute__((no_sanitize_undefined)) +#define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) #else #define __no_sanitize_undefined #endif #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) -#define __no_sanitize_coverage __attribute__((no_sanitize_coverage)) +#define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) #else #define __no_sanitize_coverage #endif From ae37a9a2c2d0960d643d782b426ea1aa9c05727a Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:53 +0200 Subject: [PATCH 059/313] compiler-gcc: remove attribute support check for `__no_sanitize_address__` The attribute was added in GCC 4.8, while the minimum GCC version supported by the kernel is GCC 5.1. Therefore, remove the check. Link: https://godbolt.org/z/84v56vcn8 Link: https://lkml.kernel.org/r/20221021115956.9947-2-ojeda@kernel.org Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Marco Elver Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index b9530d3515ac..bfce7f4d0978 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -82,11 +82,7 @@ #define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) #endif -#if __has_attribute(__no_sanitize_address__) #define __no_sanitize_address __attribute__((__no_sanitize_address__)) -#else -#define __no_sanitize_address -#endif #if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) #define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) From 095ac0763ac507dd4e1a71ad9784f49f51498483 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:54 +0200 Subject: [PATCH 060/313] compiler-gcc: remove attribute support check for `__no_sanitize_thread__` The attribute was added in GCC 5.1, which matches the minimum GCC version supported by the kernel. Therefore, remove the check. Link: https://godbolt.org/z/vbxKejxbx Link: https://lkml.kernel.org/r/20221021115956.9947-3-ojeda@kernel.org Signed-off-by: Miguel Ojeda Acked-by: Marco Elver Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index bfce7f4d0978..ba207deb77ca 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -84,7 +84,7 @@ #define __no_sanitize_address __attribute__((__no_sanitize_address__)) -#if defined(__SANITIZE_THREAD__) && __has_attribute(__no_sanitize_thread__) +#if defined(__SANITIZE_THREAD__) #define __no_sanitize_thread __attribute__((__no_sanitize_thread__)) #else #define __no_sanitize_thread From 689540cbda7f69594ae5e13fef4c8239519d8b66 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:55 +0200 Subject: [PATCH 061/313] compiler-gcc: remove attribute support check for `__no_sanitize_undefined__` The attribute was added in GCC 4.9, while the minimum GCC version supported by the kernel is GCC 5.1. Therefore, remove the check. Link: https://godbolt.org/z/GrMeo6fYr Link: https://lkml.kernel.org/r/20221021115956.9947-4-ojeda@kernel.org Signed-off-by: Miguel Ojeda Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Marco Elver Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index ba207deb77ca..7f2c2bb73815 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -90,11 +90,7 @@ #define __no_sanitize_thread #endif -#if __has_attribute(__no_sanitize_undefined__) #define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) -#else -#define __no_sanitize_undefined -#endif #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) #define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) From f39556bc2530c83a22bc11b73c7a46df9a340685 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 21 Oct 2022 13:59:56 +0200 Subject: [PATCH 062/313] compiler-gcc: document minimum version for `__no_sanitize_coverage__` The attribute was added in GCC 12.1. This will simplify future cleanups, and is closer to what we do in `compiler_attributes.h`. Link: https://godbolt.org/z/MGbT76j6G Link: https://lkml.kernel.org/r/20221021115956.9947-5-ojeda@kernel.org Signed-off-by: Miguel Ojeda Acked-by: Marco Elver Reviewed-by: Nathan Chancellor Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dan Li Cc: Kees Cook Cc: Kumar Kartikeya Dwivedi Cc: Nick Desaulniers Cc: Uros Bizjak Signed-off-by: Andrew Morton --- include/linux/compiler-gcc.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7f2c2bb73815..7af9e34ec261 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -92,6 +92,9 @@ #define __no_sanitize_undefined __attribute__((__no_sanitize_undefined__)) +/* + * Only supported since gcc >= 12 + */ #if defined(CONFIG_KCOV) && __has_attribute(__no_sanitize_coverage__) #define __no_sanitize_coverage __attribute__((__no_sanitize_coverage__)) #else From b5f1fc98c62b6b75e9f7499e7519dc67684affd3 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 21 Oct 2022 16:46:09 +0800 Subject: [PATCH 063/313] mm: memory-failure: make put_ref_page() more useful Pass pfn/flags to put_ref_page(), then check MF_COUNT_INCREASED and drop refcount to make the code look cleaner. Link: https://lkml.kernel.org/r/20221021084611.53765-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 145bb561ddb3..8c6a19b9790f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1910,17 +1910,25 @@ static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag) } #endif /* CONFIG_HUGETLB_PAGE */ +/* Drop the extra refcount in case we come from madvise() */ +static void put_ref_page(unsigned long pfn, int flags) +{ + struct page *page; + + if (!(flags & MF_COUNT_INCREASED)) + return; + + page = pfn_to_page(pfn); + if (page) + put_page(page); +} + static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) { - struct page *page = pfn_to_page(pfn); int rc = -ENXIO; - if (flags & MF_COUNT_INCREASED) - /* - * Drop the extra refcount in case we come from madvise(). - */ - put_page(page); + put_ref_page(pfn, flags); /* device metadata space is not recoverable */ if (!pgmap_pfn_valid(pgmap, pfn)) @@ -2513,12 +2521,6 @@ static int soft_offline_in_use_page(struct page *page) return ret; } -static void put_ref_page(struct page *page) -{ - if (page) - put_page(page); -} - /** * soft_offline_page - Soft offline a page. * @pfn: pfn to soft-offline @@ -2547,19 +2549,17 @@ int soft_offline_page(unsigned long pfn, int flags) { int ret; bool try_again = true; - struct page *page, *ref_page = NULL; + struct page *page; WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED)); if (!pfn_valid(pfn)) return -ENXIO; - if (flags & MF_COUNT_INCREASED) - ref_page = pfn_to_page(pfn); /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ page = pfn_to_online_page(pfn); if (!page) { - put_ref_page(ref_page); + put_ref_page(pfn, flags); return -EIO; } @@ -2567,7 +2567,7 @@ int soft_offline_page(unsigned long pfn, int flags) if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); - put_ref_page(ref_page); + put_ref_page(pfn, flags); mutex_unlock(&mf_mutex); return 0; } From 183a7c5d15d3c56f49955662d3edd0092141df78 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 21 Oct 2022 16:46:10 +0800 Subject: [PATCH 064/313] mm: memory-failure: avoid pfn_valid() twice in soft_offline_page() Simplify WARN_ON_ONCE(flags & MF_COUNT_INCREASED) under !pfn_valid(). Link: https://lkml.kernel.org/r/20221021084611.53765-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8c6a19b9790f..b5e0dba02192 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2551,10 +2551,10 @@ int soft_offline_page(unsigned long pfn, int flags) bool try_again = true; struct page *page; - WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED)); - - if (!pfn_valid(pfn)) + if (!pfn_valid(pfn)) { + WARN_ON_ONCE(flags & MF_COUNT_INCREASED); return -ENXIO; + } /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ page = pfn_to_online_page(pfn); From b66d00dfebe79ebd0d5a0ec4ee4e26583432c381 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 21 Oct 2022 16:46:11 +0800 Subject: [PATCH 065/313] mm: memory-failure: make action_result() return int Check mf_result in action_result(), only return 0 when MF_RECOVERED, or return -EBUSY, which will simplify code a bit. [wangkefeng.wang@huawei.com: v2] Link: https://lkml.kernel.org/r/20221024035138.99119-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20221021084611.53765-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b5e0dba02192..13594556146c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1179,14 +1179,16 @@ static struct page_state error_states[] = { * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). */ -static void action_result(unsigned long pfn, enum mf_action_page_type type, - enum mf_result result) +static int action_result(unsigned long pfn, enum mf_action_page_type type, + enum mf_result result) { trace_memory_failure_event(pfn, type, result); num_poisoned_pages_inc(); pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); + + return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } static int page_action(struct page_state *ps, struct page *p, @@ -1197,14 +1199,12 @@ static int page_action(struct page_state *ps, struct page *p, /* page p should be unlocked after returning from ps->action(). */ result = ps->action(ps, p); - action_result(pfn, ps->type, result); - /* Could do more checks here if page looks ok */ /* * Could adjust zone counters here to correct for the missing page. */ - return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; + return action_result(pfn, ps->type, result); } static inline bool PageHWPoisonTakenOff(struct page *page) @@ -1853,8 +1853,7 @@ retry: flags |= MF_NO_RETRY; goto retry; } - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - return res; + return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); } head = compound_head(p); @@ -1880,22 +1879,17 @@ retry: } else { res = MF_FAILED; } - action_result(pfn, MF_MSG_FREE_HUGE, res); - return res == MF_RECOVERED ? 0 : -EBUSY; + return action_result(pfn, MF_MSG_FREE_HUGE, res); } page_flags = head->flags; if (!hwpoison_user_mappings(p, pfn, flags, head)) { - action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); - res = -EBUSY; - goto out; + unlock_page(head); + return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } return identify_page_state(pfn, p, page_flags); -out: - unlock_page(head); - return res; } #else @@ -2060,16 +2054,13 @@ try_again: } res = MF_FAILED; } - action_result(pfn, MF_MSG_BUDDY, res); - res = res == MF_RECOVERED ? 0 : -EBUSY; + res = action_result(pfn, MF_MSG_BUDDY, res); } else { - action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); } goto unlock_mutex; } else if (res < 0) { - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); goto unlock_mutex; } } @@ -2090,8 +2081,7 @@ try_again: */ SetPageHasHWPoisoned(hpage); if (try_to_split_thp_page(p) < 0) { - action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); @@ -2124,8 +2114,7 @@ try_again: retry = false; goto try_again; } - action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); goto unlock_page; } @@ -2165,8 +2154,7 @@ try_again: * Abort on fail: __filemap_remove_folio() assumes unmapped page. */ if (!hwpoison_user_mappings(p, pfn, flags, p)) { - action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); goto unlock_page; } @@ -2174,8 +2162,7 @@ try_again: * Torn down by someone else? */ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { - action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); goto unlock_page; } From 26215b7ee923b9251f7bb12c4e5f09dc465d35f2 Mon Sep 17 00:00:00 2001 From: Hawkins Jiawei Date: Fri, 21 Oct 2022 07:16:08 +0800 Subject: [PATCH 066/313] hugetlbfs: fix null-ptr-deref in hugetlbfs_parse_param() Syzkaller reports a null-ptr-deref bug as follows: ====================================================== KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:hugetlbfs_parse_param+0x1dd/0x8e0 fs/hugetlbfs/inode.c:1380 [...] Call Trace: vfs_parse_fs_param fs/fs_context.c:148 [inline] vfs_parse_fs_param+0x1f9/0x3c0 fs/fs_context.c:129 vfs_parse_fs_string+0xdb/0x170 fs/fs_context.c:191 generic_parse_monolithic+0x16f/0x1f0 fs/fs_context.c:231 do_new_mount fs/namespace.c:3036 [inline] path_mount+0x12de/0x1e20 fs/namespace.c:3370 do_mount fs/namespace.c:3383 [inline] __do_sys_mount fs/namespace.c:3591 [inline] __se_sys_mount fs/namespace.c:3568 [inline] __x64_sys_mount+0x27f/0x300 fs/namespace.c:3568 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] ====================================================== According to commit "vfs: parse: deal with zero length string value", kernel will set the param->string to null pointer in vfs_parse_fs_string() if fs string has zero length. Yet the problem is that, hugetlbfs_parse_param() will dereference the param->string, without checking whether it is a null pointer. To be more specific, if hugetlbfs_parse_param() parses an illegal mount parameter, such as "size=,", kernel will constructs struct fs_parameter with null pointer in vfs_parse_fs_string(), then passes this struct fs_parameter to hugetlbfs_parse_param(), which triggers the above null-ptr-deref bug. This patch solves it by adding sanity check on param->string in hugetlbfs_parse_param(). Link: https://lkml.kernel.org/r/20221020231609.4810-1-yin31149@gmail.com Reported-by: syzbot+a3e6acd85ded5c16a709@syzkaller.appspotmail.com Tested-by: syzbot+a3e6acd85ded5c16a709@syzkaller.appspotmail.com Link: https://lore.kernel.org/all/0000000000005ad00405eb7148c6@google.com/ Signed-off-by: Hawkins Jiawei Reviewed-by: Mike Kravetz Cc: Hawkins Jiawei Cc: Muchun Song Cc: Ian Kent Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 00495fc128c5..09e644f80a4a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1378,7 +1378,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->max_size_opt = memparse(param->string, &rest); ctx->max_val_type = SIZE_STD; @@ -1388,7 +1388,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_nr_inodes: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->nr_inodes = memparse(param->string, &rest); return 0; @@ -1404,7 +1404,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par case Opt_min_size: /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(param->string[0])) + if (!param->string || !isdigit(param->string[0])) goto bad_val; ctx->min_size_opt = memparse(param->string, &rest); ctx->min_val_type = SIZE_STD; From b12fdbf15f92b6cf5fecdd8a1855afe8809e5c58 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 24 Oct 2022 15:33:36 -0400 Subject: [PATCH 067/313] Revert "mm/uffd: fix warning without PTE_MARKER_UFFD_WP compiled in" With " mm/uffd: Fix vma check on userfault for wp" to fix the registration, we'll be safe to remove the macro hacks now. Link: https://lkml.kernel.org/r/20221024193336.1233616-3-peterx@redhat.com Signed-off-by: Peter Xu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ---- mm/memory.c | 2 -- mm/mprotect.c | 2 -- 3 files changed, 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d11e92117d4a..fc8908d715d6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5114,7 +5114,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to @@ -5126,7 +5125,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); else -#endif huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; @@ -5155,13 +5153,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); -#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); -#endif hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, vma, true); diff --git a/mm/memory.c b/mm/memory.c index f88c351aecd4..81cc75e71888 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1393,12 +1393,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, struct zap_details *details, pte_t pteval) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP if (zap_drop_file_uffd_wp(details)) return; pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); -#endif } static unsigned long zap_pte_range(struct mmu_gather *tlb, diff --git a/mm/mprotect.c b/mm/mprotect.c index 99762403cc8f..8d770855b591 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -267,7 +267,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, } else { /* It must be an none page, or what else?.. */ WARN_ON_ONCE(!pte_none(oldpte)); -#ifdef CONFIG_PTE_MARKER_UFFD_WP if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { /* * For file-backed mem, we need to be able to @@ -279,7 +278,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, make_pte_marker(PTE_MARKER_UFFD_WP)); pages++; } -#endif } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); From fd4a7ac32918d3d7a2d17dc06c5520f45e36eb52 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 24 Oct 2022 16:34:22 +0800 Subject: [PATCH 068/313] mm: migrate: try again if THP split is failed due to page refcnt When creating a virtual machine, we will use memfd_create() to get a file descriptor which can be used to create share memory mappings using the mmap function, meanwhile the mmap() will set the MAP_POPULATE flag to allocate physical pages for the virtual machine. When allocating physical pages for the guest, the host can fallback to allocate some CMA pages for the guest when over half of the zone's free memory is in the CMA area. In guest os, when the application wants to do some data transaction with DMA, our QEMU will call VFIO_IOMMU_MAP_DMA ioctl to do longterm-pin and create IOMMU mappings for the DMA pages. However, when calling VFIO_IOMMU_MAP_DMA ioctl to pin the physical pages, we found it will be failed to longterm-pin sometimes. After some invetigation, we found the pages used to do DMA mapping can contain some CMA pages, and these CMA pages will cause a possible failure of the longterm-pin, due to failed to migrate the CMA pages. The reason of migration failure may be temporary reference count or memory allocation failure. So that will cause the VFIO_IOMMU_MAP_DMA ioctl returns error, which makes the application failed to start. I observed one migration failure case (which is not easy to reproduce) is that, the 'thp_migration_fail' count is 1 and the 'thp_split_page_failed' count is also 1. That means when migrating a THP which is in CMA area, but can not allocate a new THP due to memory fragmentation, so it will split the THP. However THP split is also failed, probably the reason is temporary reference count of this THP. And the temporary reference count can be caused by dropping page caches (I observed the drop caches operation in the system), but we can not drop the shmem page caches due to they are already dirty at that time. Especially for THP split failure, which is caused by temporary reference count, we can try again to mitigate the failure of migration in this case according to previous discussion [1]. [1] https://lore.kernel.org/all/470dc638-a300-f261-94b4-e27250e42f96@redhat.com/ Link: https://lkml.kernel.org/r/6784730480a1df82e8f4cba1ed088e4ac767994b.1666599848.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: "Huang, Ying" Cc: Alistair Popple Cc: David Hildenbrand Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- mm/migrate.c | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bc5dda3d2ad7..a00e9c335e69 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2712,7 +2712,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * split PMDs */ if (!can_split_folio(folio, &extra_pins)) { - ret = -EBUSY; + ret = -EAGAIN; goto out_unlock; } @@ -2762,7 +2762,7 @@ fail: xas_unlock(&xas); local_irq_enable(); remap_page(folio, folio_nr_pages(folio)); - ret = -EBUSY; + ret = -EAGAIN; } out_unlock: diff --git a/mm/migrate.c b/mm/migrate.c index 556cb1c86e53..f8c85b42e2bc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1506,9 +1506,22 @@ thp_subpage_migration: if (is_thp) { nr_thp_failed++; /* THP NUMA faulting doesn't split THP to retry. */ - if (!nosplit && !try_split_thp(page, &thp_split_pages)) { - nr_thp_split++; - break; + if (!nosplit) { + int ret = try_split_thp(page, &thp_split_pages); + + if (!ret) { + nr_thp_split++; + break; + } else if (reason == MR_LONGTERM_PIN && + ret == -EAGAIN) { + /* + * Try again to split THP to mitigate + * the failure of longterm pinning. + */ + thp_retry++; + nr_retry_pages += nr_subpages; + break; + } } } else if (!no_subpage_counting) { nr_failed++; From e591ef7d96d6ea249916f351dc26a636e565c635 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:09 +0900 Subject: [PATCH 069/313] mm,hwpoison,hugetlb,memory_hotplug: hotremove memory section with hwpoisoned hugepage Patch series "mm, hwpoison: improve handling workload related to hugetlb and memory_hotplug", v7. This patchset tries to solve the issue among memory_hotplug, hugetlb and hwpoison. In this patchset, memory hotplug handles hwpoison pages like below: - hwpoison pages should not prevent memory hotremove, - memory block with hwpoison pages should not be onlined. This patch (of 4): HWPoisoned page is not supposed to be accessed once marked, but currently such accesses can happen during memory hotremove because do_migrate_range() can be called before dissolve_free_huge_pages() is called. Clear HPageMigratable for hwpoisoned hugepages to prevent them from being migrated. This should be done in hugetlb_lock to avoid race against isolate_hugetlb(). get_hwpoison_huge_page() needs to have a flag to show it's called from unpoison to take refcount of hwpoisoned hugepages, so add it. [naoya.horiguchi@linux.dev: remove TestClearHPageMigratable and reduce to test and clear separately] Link: https://lkml.kernel.org/r/20221025053559.GA2104800@ik1-406-35019.vs.sakura.ne.jp Link: https://lkml.kernel.org/r/20221024062012.1520887-1-naoya.horiguchi@linux.dev Link: https://lkml.kernel.org/r/20221024062012.1520887-2-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: Miaohe Lin Reviewed-by: Oscar Salvador Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ++++++---- include/linux/mm.h | 6 ++++-- mm/hugetlb.c | 9 +++++---- mm/memory-failure.c | 21 +++++++++++++++++---- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 20a0d5a08395..65ea34022aa2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -183,8 +183,9 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); int isolate_hugetlb(struct page *page, struct list_head *list); -int get_hwpoison_huge_page(struct page *page, bool *hugetlb); -int get_huge_page_for_hwpoison(unsigned long pfn, int flags); +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); +int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); void putback_active_hugepage(struct page *page); void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); @@ -391,12 +392,13 @@ static inline int isolate_hugetlb(struct page *page, struct list_head *list) return -EBUSY; } -static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { return 0; } -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index f6d2d2d9e284..e2ac6fff03a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3277,9 +3277,11 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags); +extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared); #else -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fc8908d715d6..fdb36afea2b2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7265,7 +7265,7 @@ unlock: return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { int ret = 0; @@ -7275,7 +7275,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) *hugetlb = true; if (HPageFreed(page)) ret = 0; - else if (HPageMigratable(page)) + else if (HPageMigratable(page) || unpoison) ret = get_page_unless_zero(page); else ret = -EBUSY; @@ -7284,12 +7284,13 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return ret; } -int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); - ret = __get_huge_page_for_hwpoison(pfn, flags); + ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 13594556146c..4fff0b36c61d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1244,7 +1244,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, false); if (hugetlb) return ret; @@ -1334,7 +1334,7 @@ static int __get_unpoison_page(struct page *page) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, true); if (hugetlb) return ret; @@ -1785,7 +1785,8 @@ void hugetlb_clear_page_hwpoison(struct page *hpage) * -EBUSY - the hugepage is busy (try to retry) * -EHWPOISON - the hugepage is already hwpoisoned */ -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); struct page *head = compound_head(page); @@ -1815,6 +1816,15 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) goto out; } + /* + * Clearing HPageMigratable for hwpoisoned hugepages to prevent them + * from being migrated by memory hotremove. + */ + if (count_increased && HPageMigratable(head)) { + ClearHPageMigratable(head); + *migratable_cleared = true; + } + return ret; out: if (count_increased) @@ -1834,10 +1844,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb struct page *p = pfn_to_page(pfn); struct page *head; unsigned long page_flags; + bool migratable_cleared = false; *hugetlb = 1; retry: - res = get_huge_page_for_hwpoison(pfn, flags); + res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); if (res == 2) { /* fallback to normal page handling */ *hugetlb = 0; return 0; @@ -1861,6 +1872,8 @@ retry: if (hwpoison_filter(p)) { hugetlb_clear_page_hwpoison(head); + if (migratable_cleared) + SetHPageMigratable(head); unlock_page(head); if (res == 1) put_page(head); From d027122d8363e58cd8bc2fa6a16917f7f69b85bb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:10 +0900 Subject: [PATCH 070/313] mm/hwpoison: move definitions of num_poisoned_pages_* to memory-failure.c These interfaces will be used by drivers/base/memory.c by later patch, so as a preparatory work move them to more common header file visible to the file. Link: https://lkml.kernel.org/r/20221024062012.1520887-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/parisc/kernel/pdt.c | 3 +-- include/linux/mm.h | 5 +++++ include/linux/swapops.h | 24 ++---------------------- mm/memory-failure.c | 10 ++++++++++ 4 files changed, 18 insertions(+), 24 deletions(-) diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index e391b175f5ec..fdc880e2575a 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -18,8 +18,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index e2ac6fff03a8..c667a6c4b657 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,12 +3279,17 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); +extern void num_poisoned_pages_inc(void); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } + +static inline void num_poisoned_pages_inc(void) +{ +} #endif #ifndef arch_memory_failure diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 86b95ccb81bb..3ba9bf56899d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -581,8 +581,6 @@ static inline int is_pmd_migration_entry(pmd_t pmd) #ifdef CONFIG_MEMORY_FAILURE -extern atomic_long_t num_poisoned_pages __read_mostly; - /* * Support for hardware poisoned pages */ @@ -597,17 +595,7 @@ static inline int is_hwpoison_entry(swp_entry_t entry) return swp_type(entry) == SWP_HWPOISON; } -static inline void num_poisoned_pages_inc(void) -{ - atomic_long_inc(&num_poisoned_pages); -} - -static inline void num_poisoned_pages_sub(long i) -{ - atomic_long_sub(i, &num_poisoned_pages); -} - -#else /* CONFIG_MEMORY_FAILURE */ +#else static inline swp_entry_t make_hwpoison_entry(struct page *page) { @@ -618,15 +606,7 @@ static inline int is_hwpoison_entry(swp_entry_t swp) { return 0; } - -static inline void num_poisoned_pages_inc(void) -{ -} - -static inline void num_poisoned_pages_sub(long i) -{ -} -#endif /* CONFIG_MEMORY_FAILURE */ +#endif static inline int non_swap_entry(swp_entry_t entry) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4fff0b36c61d..44d7bf6ff214 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,6 +74,16 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; +inline void num_poisoned_pages_inc(void) +{ + atomic_long_inc(&num_poisoned_pages); +} + +static inline void num_poisoned_pages_sub(long i) +{ + atomic_long_sub(i, &num_poisoned_pages); +} + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, From a46c9304b4bbf1b164154976cbb7e648980c7b5b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:11 +0900 Subject: [PATCH 071/313] mm/hwpoison: pass pfn to num_poisoned_pages_*() No functional change. Link: https://lkml.kernel.org/r/20221024062012.1520887-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/parisc/kernel/pdt.c | 2 +- include/linux/mm.h | 4 ++-- mm/memory-failure.c | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/parisc/kernel/pdt.c b/arch/parisc/kernel/pdt.c index fdc880e2575a..80943a00e245 100644 --- a/arch/parisc/kernel/pdt.c +++ b/arch/parisc/kernel/pdt.c @@ -231,7 +231,7 @@ void __init pdc_pdt_init(void) /* mark memory page bad */ memblock_reserve(pdt_entry[i] & PAGE_MASK, PAGE_SIZE); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(addr >> PAGE_SHIFT); } } diff --git a/include/linux/mm.h b/include/linux/mm.h index c667a6c4b657..78ae2ee09a24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,7 +3279,7 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -extern void num_poisoned_pages_inc(void); +extern void num_poisoned_pages_inc(unsigned long pfn); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) @@ -3287,7 +3287,7 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void num_poisoned_pages_inc(void) +static inline void num_poisoned_pages_inc(unsigned long pfn) { } #endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44d7bf6ff214..757a46e172de 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,12 +74,12 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; -inline void num_poisoned_pages_inc(void) +inline void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); } -static inline void num_poisoned_pages_sub(long i) +static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { atomic_long_sub(i, &num_poisoned_pages); } @@ -125,7 +125,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo if (release) put_page(page); page_ref_inc(page); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); return true; } @@ -1194,7 +1194,7 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(pfn); pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); @@ -1741,7 +1741,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) llist_add(&raw_hwp->node, head); /* the first error event will be counted in action_result(). */ if (ret) - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); } else { /* * Failed to save raw error info. We no longer trace all @@ -2414,7 +2414,7 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); if (!ret || freeit) { - num_poisoned_pages_sub(count); + num_poisoned_pages_sub(pfn, count); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); } @@ -2630,5 +2630,5 @@ void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } } if (total) - num_poisoned_pages_sub(total); + num_poisoned_pages_sub(0, total); } From 5033091de814ab4b5623faed2755f3064e19e2d2 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:12 +0900 Subject: [PATCH 072/313] mm/hwpoison: introduce per-memory_block hwpoison counter Currently PageHWPoison flag does not behave well when experiencing memory hotremove/hotplug. Any data field in struct page is unreliable when the associated memory is offlined, and the current mechanism can't tell whether a memory block is onlined because a new memory devices is installed or because previous failed offline operations are undone. Especially if there's a hwpoisoned memory, it's unclear what the best option is. So introduce a new mechanism to make struct memory_block remember that a memory block has hwpoisoned memory inside it. And make any online event fail if the onlining memory block contains hwpoison. struct memory_block is freed and reallocated over ACPI-based hotremove/hotplug, but not over sysfs-based hotremove/hotplug. So the new counter can distinguish these cases. Link: https://lkml.kernel.org/r/20221024062012.1520887-5-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: kernel test robot Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- drivers/base/memory.c | 38 ++++++++++++++++++++++++++++++++++++++ include/linux/memory.h | 3 +++ include/linux/mm.h | 20 +++++++++++++++++++- mm/internal.h | 8 -------- mm/memory-failure.c | 36 +++++++++++------------------------- mm/sparse.c | 2 -- 6 files changed, 71 insertions(+), 36 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 9aa0da991cfb..fe98fb8d94e5 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -175,6 +175,15 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +static unsigned long memblk_nr_poison(struct memory_block *mem); +#else +static inline unsigned long memblk_nr_poison(struct memory_block *mem) +{ + return 0; +} +#endif + static int memory_block_online(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); @@ -183,6 +192,9 @@ static int memory_block_online(struct memory_block *mem) struct zone *zone; int ret; + if (memblk_nr_poison(mem)) + return -EHWPOISON; + zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group, start_pfn, nr_pages); @@ -864,6 +876,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; + num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem)); unregister_memory_block_under_nodes(mem); remove_memory_block(mem); } @@ -1164,3 +1177,28 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, } return ret; } + +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +void memblk_nr_poison_inc(unsigned long pfn) +{ + const unsigned long block_id = pfn_to_block_id(pfn); + struct memory_block *mem = find_memory_block_by_id(block_id); + + if (mem) + atomic_long_inc(&mem->nr_hwpoison); +} + +void memblk_nr_poison_sub(unsigned long pfn, long i) +{ + const unsigned long block_id = pfn_to_block_id(pfn); + struct memory_block *mem = find_memory_block_by_id(block_id); + + if (mem) + atomic_long_sub(i, &mem->nr_hwpoison); +} + +static unsigned long memblk_nr_poison(struct memory_block *mem) +{ + return atomic_long_read(&mem->nr_hwpoison); +} +#endif diff --git a/include/linux/memory.h b/include/linux/memory.h index 463662ef7614..31343566c221 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -84,6 +84,9 @@ struct memory_block { unsigned long nr_vmemmap_pages; struct memory_group *group; /* group (if any) for this block */ struct list_head group_next; /* next block inside memory group */ +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) + atomic_long_t nr_hwpoison; +#endif }; int arch_get_memory_phys_device(unsigned long start_pfn); diff --git a/include/linux/mm.h b/include/linux/mm.h index 78ae2ee09a24..429ff89bfe06 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3279,7 +3279,8 @@ extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -extern void num_poisoned_pages_inc(unsigned long pfn); +void num_poisoned_pages_inc(unsigned long pfn); +void num_poisoned_pages_sub(unsigned long pfn, long i); #else static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) @@ -3290,6 +3291,23 @@ static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, static inline void num_poisoned_pages_inc(unsigned long pfn) { } + +static inline void num_poisoned_pages_sub(unsigned long pfn, long i) +{ +} +#endif + +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +extern void memblk_nr_poison_inc(unsigned long pfn); +extern void memblk_nr_poison_sub(unsigned long pfn, long i); +#else +static inline void memblk_nr_poison_inc(unsigned long pfn) +{ +} + +static inline void memblk_nr_poison_sub(unsigned long pfn, long i) +{ +} #endif #ifndef arch_memory_failure diff --git a/mm/internal.h b/mm/internal.h index 68afdbe7106e..bcf75a8b032d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -708,14 +708,6 @@ extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; -#ifdef CONFIG_MEMORY_FAILURE -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages); -#else -static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ -} -#endif - extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 757a46e172de..9b82402ec242 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -77,11 +77,14 @@ static bool hw_memory_failure __read_mostly = false; inline void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); + memblk_nr_poison_inc(pfn); } -static inline void num_poisoned_pages_sub(unsigned long pfn, long i) +inline void num_poisoned_pages_sub(unsigned long pfn, long i) { atomic_long_sub(i, &num_poisoned_pages); + if (pfn != -1UL) + memblk_nr_poison_sub(pfn, i); } /* @@ -1706,6 +1709,8 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) if (move_flag) SetPageHWPoison(p->page); + else + num_poisoned_pages_sub(page_to_pfn(p->page), 1); kfree(p); count++; } @@ -2332,6 +2337,7 @@ int unpoison_memory(unsigned long pfn) int ret = -EBUSY; int freeit = 0; unsigned long count = 1; + bool huge = false; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2380,6 +2386,7 @@ int unpoison_memory(unsigned long pfn) ret = get_hwpoison_page(p, MF_UNPOISON); if (!ret) { if (PageHuge(p)) { + huge = true; count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; @@ -2395,6 +2402,7 @@ int unpoison_memory(unsigned long pfn) pfn, &unpoison_rs); } else { if (PageHuge(p)) { + huge = true; count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; @@ -2414,7 +2422,8 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); if (!ret || freeit) { - num_poisoned_pages_sub(pfn, count); + if (!huge) + num_poisoned_pages_sub(pfn, 1); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); } @@ -2609,26 +2618,3 @@ retry: return ret; } - -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ - int i, total = 0; - - /* - * A further optimization is to have per section refcounted - * num_poisoned_pages. But that would need more space per memmap, so - * for now just do a quick global check to speed up this routine in the - * absence of bad pages. - */ - if (atomic_long_read(&num_poisoned_pages) == 0) - return; - - for (i = 0; i < nr_pages; i++) { - if (PageHWPoison(&memmap[i])) { - total++; - ClearPageHWPoison(&memmap[i]); - } - } - if (total) - num_poisoned_pages_sub(0, total); -} diff --git a/mm/sparse.c b/mm/sparse.c index e5a8a3a0edd7..2779b419ef2a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -926,8 +926,6 @@ void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, - nr_pages - map_offset); section_deactivate(pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ From ea0ffd0c08d0fef1f6e93eb07badbeeabf6b43d6 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 24 Oct 2022 00:25:33 +0800 Subject: [PATCH 073/313] swap: add a limit for readahead page-cluster value Currenty there is no upper limit for /proc/sys/vm/page-cluster, and it's a bit shift value, so it could result in overflow of the 32-bit integer. Add a reasonable upper limit for it, read-in at most 2**31 pages, which is a large enough value for readahead. Link: https://lkml.kernel.org/r/20221023162533.81561-1-ryncsn@gmail.com Signed-off-by: Kairui Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + kernel/sysctl.c | 1 + mm/swap.c | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 429ff89bfe06..255931ebf2dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -74,6 +74,7 @@ static inline void totalram_pages_add(long count) extern void * high_memory; extern int page_cluster; +extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 188c305aeb8b..71a4350ac601 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2125,6 +2125,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&page_cluster_max, }, { .procname = "dirtytime_expire_seconds", diff --git a/mm/swap.c b/mm/swap.c index 2f12a2ee1d3a..b9a6817e07ff 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -43,8 +43,9 @@ #define CREATE_TRACE_POINTS #include -/* How many pages do we try to swap or page in/out together? */ +/* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; +const int page_cluster_max = 31; /* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { From 2ea3498980f5e6f3001f2984b0b92736bf1b78cb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:32 +0000 Subject: [PATCH 074/313] mm/damon/core: split out DAMOS-charged region skip logic into a new function Patch series "mm/damon: cleanup and refactoring code", v2. This patchset cleans up and refactors a range of DAMON code including the core, DAMON sysfs interface, and DAMON modules, for better readability and convenient future feature implementations. In detail, this patchset splits unnecessarily long and complex functions in core into smaller functions (patches 1-4). Then, it cleans up the DAMON sysfs interface by using more type-safe code (patch 5) and removing unnecessary function parameters (patch 6). Further, it refactor the code by distributing the code into multiple files (patches 7-10). Last two patches (patches 11 and 12) deduplicates and remove unnecessary header inclusion in DAMON modules (reclaim and lru_sort). This patch (of 12): The DAMOS action applying function, 'damon_do_apply_schemes()', is quite long and not so simple. Split out the already quota-charged region skip code, which is not a small amount of simple code, into a new function with some comments for better readability. Link: https://lkml.kernel.org/r/20221026225943.100429-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221026225943.100429-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 96 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 31 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 36d098d06c55..06b50ede9cc6 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -694,6 +694,67 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; } +/* + * damos_skip_charged_region() - Check if the given region or starting part of + * it is already charged for the DAMOS quota. + * @t: The target of the region. + * @rp: The pointer to the region. + * @s: The scheme to be applied. + * + * If a quota of a scheme has exceeded in a quota charge window, the scheme's + * action would applied to only a part of the target access pattern fulfilling + * regions. To avoid applying the scheme action to only already applied + * regions, DAMON skips applying the scheme action to the regions that charged + * in the previous charge window. + * + * This function checks if a given region should be skipped or not for the + * reason. If only the starting part of the region has previously charged, + * this function splits the region into two so that the second one covers the + * area that not charged in the previous charge widnow and saves the second + * region in *rp and returns false, so that the caller can apply DAMON action + * to the second one. + * + * Return: true if the region should be entirely skipped, false otherwise. + */ +static bool damos_skip_charged_region(struct damon_target *t, + struct damon_region **rp, struct damos *s) +{ + struct damon_region *r = *rp; + struct damos_quota *quota = &s->quota; + unsigned long sz_to_skip; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + return true; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return true; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + return true; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz_to_skip) { + if (damon_sz_region(r) <= DAMON_MIN_REGION) + return true; + sz_to_skip = DAMON_MIN_REGION; + } + damon_split_region_at(t, r, sz_to_skip); + r = damon_next_region(r); + *rp = r; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + return false; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -702,7 +763,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = damon_sz_region(r); + unsigned long sz; struct timespec64 begin, end; unsigned long sz_applied = 0; @@ -713,41 +774,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - /* Skip previously charged regions */ - if (quota->charge_target_from) { - if (t != quota->charge_target_from) - continue; - if (r == damon_last_region(t)) { - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - continue; - } - if (quota->charge_addr_from && - r->ar.end <= quota->charge_addr_from) - continue; - - if (quota->charge_addr_from && r->ar.start < - quota->charge_addr_from) { - sz = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); - if (!sz) { - if (damon_sz_region(r) <= - DAMON_MIN_REGION) - continue; - sz = DAMON_MIN_REGION; - } - damon_split_region_at(t, r, sz); - r = damon_next_region(r); - sz = damon_sz_region(r); - } - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - } + if (damos_skip_charged_region(t, &r, s)) + continue; if (!damos_valid_target(c, t, r, s)) continue; /* Apply the scheme */ + sz = damon_sz_region(r); if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { From e63a30c51f8400915db401c05d3c4db6743857e8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:33 +0000 Subject: [PATCH 075/313] mm/damon/core: split damos application logic into a new function The DAMOS action applying function, 'damon_do_apply_schemes()', is still long and not easy to read. Split out the code for applying a single action to a single region into a new function for better readability. Link: https://lkml.kernel.org/r/20221026225943.100429-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 73 ++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 06b50ede9cc6..c1a912bc46ae 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -755,6 +755,44 @@ static bool damos_skip_charged_region(struct damon_target *t, return false; } +static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + unsigned long sz = damon_sz_region(r); + struct timespec64 begin, end; + unsigned long sz_applied = 0; + + if (c->ops.apply_scheme) { + if (quota->esz && quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(t, r, sz); + } + ktime_get_coarse_ts64(&begin); + sz_applied = c->ops.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -763,9 +801,6 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz; - struct timespec64 begin, end; - unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -780,37 +815,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (!damos_valid_target(c, t, r, s)) continue; - /* Apply the scheme */ - sz = damon_sz_region(r); - if (c->ops.apply_scheme) { - if (quota->esz && - quota->charged_sz + sz > quota->esz) { - sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); - if (!sz) - goto update_stat; - damon_split_region_at(t, r, sz); - } - ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); - ktime_get_coarse_ts64(&end); - quota->total_charged_ns += timespec64_to_ns(&end) - - timespec64_to_ns(&begin); - quota->charged_sz += sz; - if (quota->esz && quota->charged_sz >= quota->esz) { - quota->charge_target_from = t; - quota->charge_addr_from = r->ar.end + 1; - } - } - if (s->action != DAMOS_STAT) - r->age = 0; - -update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_apply_scheme(c, t, r, s); } } From d1cbbf621fc25950938be74a228ef518d05d93a1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:34 +0000 Subject: [PATCH 076/313] mm/damon/core: split out scheme stat update logic into a new function The function for applying a given DAMON scheme action to a given DAMON region, 'damos_apply_scheme()' is not quite short. Make it better to read by splitting out the stat update logic into a new function. Link: https://lkml.kernel.org/r/20221026225943.100429-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c1a912bc46ae..3a810c6e26bc 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -755,6 +755,16 @@ static bool damos_skip_charged_region(struct damon_target *t, return false; } +static void damos_update_stat(struct damos *s, + unsigned long sz_tried, unsigned long sz_applied) +{ + s->stat.nr_tried++; + s->stat.sz_tried += sz_tried; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -786,11 +796,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, r->age = 0; update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_update_stat(s, sz, sz_applied); } static void damon_do_apply_schemes(struct damon_ctx *c, From 898810e5ca54691f4e173f5ffc92bbce0335bc69 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:35 +0000 Subject: [PATCH 077/313] mm/damon/core: split out scheme quota adjustment logic into a new function DAMOS quota adjustment logic in 'kdamond_apply_schemes()', has some amount of code, and the logic is not so straightforward. Split it out to a new function for better readability. Link: https://lkml.kernel.org/r/20221026225943.100429-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 91 ++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 43 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 3a810c6e26bc..80d5937fe337 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -848,6 +848,53 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->esz = esz; } +static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + struct damon_target *t; + struct damon_region *r; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; + + if (!quota->ms && !quota->sz) + return; + + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies(quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } + + if (!c->ops.get_scheme_score) + return; + + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->ops.get_scheme_score(c, t, r, s); + quota->histogram[score] += damon_sz_region(r); + if (score > max_score) + max_score = score; + } + } + + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; +} + static void kdamond_apply_schemes(struct damon_ctx *c) { struct damon_target *t; @@ -855,52 +902,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c) struct damos *s; damon_for_each_scheme(s, c) { - struct damos_quota *quota = &s->quota; - unsigned long cumulated_sz; - unsigned int score, max_score = 0; - if (!s->wmarks.activated) continue; - if (!quota->ms && !quota->sz) - continue; - - /* New charge window starts */ - if (time_after_eq(jiffies, quota->charged_from + - msecs_to_jiffies( - quota->reset_interval))) { - if (quota->esz && quota->charged_sz >= quota->esz) - s->stat.qt_exceeds++; - quota->total_charged_sz += quota->charged_sz; - quota->charged_from = jiffies; - quota->charged_sz = 0; - damos_set_effective_quota(quota); - } - - if (!c->ops.get_scheme_score) - continue; - - /* Fill up the score histogram */ - memset(quota->histogram, 0, sizeof(quota->histogram)); - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - if (!__damos_valid_target(r, s)) - continue; - score = c->ops.get_scheme_score( - c, t, r, s); - quota->histogram[score] += damon_sz_region(r); - if (score > max_score) - max_score = score; - } - } - - /* Set the min score limit */ - for (cumulated_sz = 0, score = max_score; ; score--) { - cumulated_sz += quota->histogram[score]; - if (cumulated_sz >= quota->esz || !score) - break; - } - quota->min_score = score; + damos_adjust_quota(c, s); } damon_for_each_target(t, c) { From 789a230613c8dd14bdd41653de0c22783726276f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:36 +0000 Subject: [PATCH 078/313] mm/damon/sysfs: use damon_addr_range for region's start and end values DAMON has a struct for each address range but DAMON sysfs interface is using the low type (unsigned long) for storing the start and end addresses of regions. Use the dedicated struct for better type safety. Link: https://lkml.kernel.org/r/20221026225943.100429-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 07e5f1bdf025..a5ef503d8444 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1062,13 +1062,11 @@ static struct kobj_type damon_sysfs_schemes_ktype = { struct damon_sysfs_region { struct kobject kobj; - unsigned long start; - unsigned long end; + struct damon_addr_range ar; }; static struct damon_sysfs_region *damon_sysfs_region_alloc( - unsigned long start, - unsigned long end) + struct damon_addr_range ar) { struct damon_sysfs_region *region = kmalloc(sizeof(*region), GFP_KERNEL); @@ -1076,8 +1074,7 @@ static struct damon_sysfs_region *damon_sysfs_region_alloc( if (!region) return NULL; region->kobj = (struct kobject){}; - region->start = start; - region->end = end; + region->ar = ar; return region; } @@ -1087,7 +1084,7 @@ static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->start); + return sysfs_emit(buf, "%lu\n", region->ar.start); } static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1095,7 +1092,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->start); + int err = kstrtoul(buf, 0, ®ion->ar.start); return err ? err : count; } @@ -1106,7 +1103,7 @@ static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->end); + return sysfs_emit(buf, "%lu\n", region->ar.end); } static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1114,7 +1111,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, ®ion->end); + int err = kstrtoul(buf, 0, ®ion->ar.end); return err ? err : count; } @@ -1187,7 +1184,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, regions->regions_arr = regions_arr; for (i = 0; i < nr_regions; i++) { - region = damon_sysfs_region_alloc(0, 0); + region = damon_sysfs_region_alloc((struct damon_addr_range){}); if (!region) { damon_sysfs_regions_rm_dirs(regions); return -ENOMEM; @@ -2147,11 +2144,11 @@ static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_region *sys_region = sysfs_regions->regions_arr[i]; - if (sys_region->start > sys_region->end) + if (sys_region->ar.start > sys_region->ar.end) goto out; - ranges[i].start = sys_region->start; - ranges[i].end = sys_region->end; + ranges[i].start = sys_region->ar.start; + ranges[i].end = sys_region->ar.end; if (i == 0) continue; if (ranges[i - 1].end > ranges[i].start) From 1f71981408ef5696ad8544f282d336d4fc60a807 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:37 +0000 Subject: [PATCH 079/313] mm/damon/sysfs: remove parameters of damon_sysfs_region_alloc() 'damon_sysfs_region_alloc()' is always called with zero-filled 'struct damon_addr_range', because the start and end addresses should set by users. Remove unnecessary parameters of the function and simplify the body by using 'kzalloc()'. Link: https://lkml.kernel.org/r/20221026225943.100429-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a5ef503d8444..f3d7b34ea0ab 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1065,17 +1065,9 @@ struct damon_sysfs_region { struct damon_addr_range ar; }; -static struct damon_sysfs_region *damon_sysfs_region_alloc( - struct damon_addr_range ar) +static struct damon_sysfs_region *damon_sysfs_region_alloc(void) { - struct damon_sysfs_region *region = kmalloc(sizeof(*region), - GFP_KERNEL); - - if (!region) - return NULL; - region->kobj = (struct kobject){}; - region->ar = ar; - return region; + return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL); } static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1184,7 +1176,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, regions->regions_arr = regions_arr; for (i = 0; i < nr_regions; i++) { - region = damon_sysfs_region_alloc((struct damon_addr_range){}); + region = damon_sysfs_region_alloc(); if (!region) { damon_sysfs_regions_rm_dirs(regions); return -ENOMEM; From 39240595917ec0c4f71d7b9dd7909790715968b5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:38 +0000 Subject: [PATCH 080/313] mm/damon/sysfs: move sysfs_lock to common module DAMON sysfs interface is implemented in a single file, sysfs.c, which has about 2,800 lines of code. As the interface is hierarchical and some of the code can be reused by different hierarchies, it would make more sense to split out the implementation into common parts and different parts in multiple files. As the beginning of the work, create files for common code and move the global mutex for directories modifications protection into the new file. Link: https://lkml.kernel.org/r/20221026225943.100429-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/Makefile | 2 +- mm/damon/sysfs-common.c | 11 +++++++++++ mm/damon/sysfs-common.h | 11 +++++++++++ mm/damon/sysfs.c | 4 +--- 4 files changed, 24 insertions(+), 4 deletions(-) create mode 100644 mm/damon/sysfs-common.c create mode 100644 mm/damon/sysfs-common.h diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 3e6b8ad73858..f8d535a6253b 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c new file mode 100644 index 000000000000..9dc743868d5b --- /dev/null +++ b/mm/damon/sysfs-common.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include "sysfs-common.h" + +DEFINE_MUTEX(damon_sysfs_lock); + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h new file mode 100644 index 000000000000..745a918b94f5 --- /dev/null +++ b/mm/damon/sysfs-common.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park + */ + +#include +#include + +extern struct mutex damon_sysfs_lock; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f3d7b34ea0ab..a847b9159718 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -5,13 +5,11 @@ * Copyright (c) 2022 SeongJae Park */ -#include -#include #include #include #include -static DEFINE_MUTEX(damon_sysfs_lock); +#include "sysfs-common.h" /* * unsigned long range directory From d332fe11debe69fee3de4c2d84fa0b6649678ad2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:39 +0000 Subject: [PATCH 081/313] mm/damon/sysfs: move unsigned long range directory to common module The implementation of unsigned long type range directories can be reused by multiple DAMON sysfs directories including those for DAMON-based Operation Schemes and the range of number of monitoring regions. Move the code into the files for DAMON sysfs common logics. Link: https://lkml.kernel.org/r/20221026225943.100429-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.c | 96 ++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs-common.h | 13 ++++++ mm/damon/sysfs.c | 100 ---------------------------------------- 3 files changed, 109 insertions(+), 100 deletions(-) diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c index 9dc743868d5b..52bebf242f74 100644 --- a/mm/damon/sysfs-common.c +++ b/mm/damon/sysfs-common.c @@ -5,7 +5,103 @@ * Author: SeongJae Park */ +#include + #include "sysfs-common.h" DEFINE_MUTEX(damon_sysfs_lock); +/* + * unsigned long range directory + */ + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return err; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return err; + + range->max = max; + return count; +} + +void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 745a918b94f5..56e6a99e353b 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -9,3 +9,16 @@ #include extern struct mutex damon_sysfs_lock; + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max); +void damon_sysfs_ul_range_release(struct kobject *kobj); + +extern struct kobj_type damon_sysfs_ul_range_ktype; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a847b9159718..6774a669962e 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -11,106 +11,6 @@ #include "sysfs-common.h" -/* - * unsigned long range directory - */ - -struct damon_sysfs_ul_range { - struct kobject kobj; - unsigned long min; - unsigned long max; -}; - -static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( - unsigned long min, - unsigned long max) -{ - struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), - GFP_KERNEL); - - if (!range) - return NULL; - range->kobj = (struct kobject){}; - range->min = min; - range->max = max; - - return range; -} - -static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->min); -} - -static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long min; - int err; - - err = kstrtoul(buf, 0, &min); - if (err) - return err; - - range->min = min; - return count; -} - -static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->max); -} - -static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long max; - int err; - - err = kstrtoul(buf, 0, &max); - if (err) - return err; - - range->max = max; - return count; -} - -static void damon_sysfs_ul_range_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); -} - -static struct kobj_attribute damon_sysfs_ul_range_min_attr = - __ATTR_RW_MODE(min, 0600); - -static struct kobj_attribute damon_sysfs_ul_range_max_attr = - __ATTR_RW_MODE(max, 0600); - -static struct attribute *damon_sysfs_ul_range_attrs[] = { - &damon_sysfs_ul_range_min_attr.attr, - &damon_sysfs_ul_range_max_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_ul_range); - -static struct kobj_type damon_sysfs_ul_range_ktype = { - .release = damon_sysfs_ul_range_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_ul_range_groups, -}; - /* * schemes/stats directory */ From 4acd715ff57fd05a481c64d074db68f2cf5711aa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:40 +0000 Subject: [PATCH 082/313] mm/damon/sysfs: split out kdamond-independent schemes stats update logic into a new function 'damon_sysfs_schemes_update_stats()' is coupled with both damon_sysfs_kdamond and damon_sysfs_schemes. It's a wide range of types dependency. It makes splitting the logics a little bit distracting. Split the function so that each function is coupled with smaller range of types. Link: https://lkml.kernel.org/r/20221026225943.100429-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6774a669962e..836df19a7d86 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2246,25 +2246,13 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) mutex_unlock(&ctx->kdamond_lock); } -/* - * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. - * @kdamond: The kobject wrapper that associated to the kdamond thread. - * - * This function reads the schemes stats of specific kdamond and update the - * related values for sysfs files. This function should be called from DAMON - * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON - * contexts-internal data and DAMON sysfs variables. - */ -static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +static void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) { - struct damon_ctx *ctx = kdamond->damon_ctx; - struct damon_sysfs_schemes *sysfs_schemes; struct damos *scheme; int schemes_idx = 0; - if (!ctx) - return -EINVAL; - sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_stats *sysfs_stats; @@ -2279,6 +2267,25 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) sysfs_stats->sz_applied = scheme->stat.sz_applied; sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } +} + +/* + * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. + * @kdamond: The kobject wrapper that associated to the kdamond thread. + * + * This function reads the schemes stats of specific kdamond and update the + * related values for sysfs files. This function should be called from DAMON + * callbacks while holding ``damon_syfs_lock``, to safely access the DAMON + * contexts-internal data and DAMON sysfs variables. + */ +static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + damon_sysfs_schemes_update_stats( + kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } From c8e7b4d0ba348a8ef14956a80c780f152f433764 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:41 +0000 Subject: [PATCH 083/313] mm/damon/sysfs: split out schemes directory implementation to separate file DAMON sysfs interface for 'schemes' directory is implemented using about one thousand lines of code. It has no strong dependency with other parts of its file, so split it out to another file for better code management. Link: https://lkml.kernel.org/r/20221026225943.100429-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/Makefile | 2 +- mm/damon/sysfs-common.h | 22 + mm/damon/sysfs-schemes.c | 1068 ++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 1064 ------------------------------------- 4 files changed, 1091 insertions(+), 1065 deletions(-) create mode 100644 mm/damon/sysfs-schemes.c diff --git a/mm/damon/Makefile b/mm/damon/Makefile index f8d535a6253b..1e86f5253d7f 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 56e6a99e353b..4626b2784404 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -22,3 +22,25 @@ struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( void damon_sysfs_ul_range_release(struct kobject *kobj); extern struct kobj_type damon_sysfs_ul_range_ktype; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void); +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes); + +extern struct kobj_type damon_sysfs_schemes_ktype; + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes); + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c new file mode 100644 index 000000000000..9509d5c1e7fc --- /dev/null +++ b/mm/damon/sysfs-schemes.c @@ -0,0 +1,1068 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park + */ + +#include + +#include "sysfs-common.h" + +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_stats_groups, +}; + +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->interval_us); + + return err ? err : count; +} + +static ssize_t high_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + return err ? err : count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + return err ? err : count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + return err ? err : count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + return err ? err : count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + return err ? err : count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + return err ? err : count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + struct damon_sysfs_weights *weights; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + "as->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put("as->weights->kobj); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} + +static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->ms); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->sz); +} + +static ssize_t bytes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->sz); + + if (err) + return -EINVAL; + return count; +} + +static ssize_t reset_interval_ms_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); +} + +static ssize_t reset_interval_ms_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + int err = kstrtoul(buf, 0, "as->reset_interval_ms); + + if (err) + return -EINVAL; + return count; +} + +static void damon_sysfs_quotas_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); +} + +static struct kobj_attribute damon_sysfs_quotas_ms_attr = + __ATTR_RW_MODE(ms, 0600); + +static struct kobj_attribute damon_sysfs_quotas_sz_attr = + __ATTR_RW_MODE(bytes, 0600); + +static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = + __ATTR_RW_MODE(reset_interval_ms, 0600); + +static struct attribute *damon_sysfs_quotas_attrs[] = { + &damon_sysfs_quotas_ms_attr.attr, + &damon_sysfs_quotas_sz_attr.attr, + &damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; + struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_stats *stats; +}; + +/* This should match with enum damos_action */ +static const char * const damon_sysfs_damos_action_strs[] = { + "willneed", + "cold", + "pageout", + "hugepage", + "nohugepage", + "lru_prio", + "lru_deprio", + "stat", +}; + +static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( + enum damos_action action) +{ + struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), + GFP_KERNEL); + + if (!scheme) + return NULL; + scheme->kobj = (struct kobject){}; + scheme->action = action; + return scheme; +} + +static int damon_sysfs_scheme_set_access_pattern( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_access_pattern *access_pattern; + int err; + + access_pattern = damon_sysfs_access_pattern_alloc(); + if (!access_pattern) + return -ENOMEM; + err = kobject_init_and_add(&access_pattern->kobj, + &damon_sysfs_access_pattern_ktype, &scheme->kobj, + "access_pattern"); + if (err) + goto out; + err = damon_sysfs_access_pattern_add_dirs(access_pattern); + if (err) + goto out; + scheme->access_pattern = access_pattern; + return 0; + +out: + kobject_put(&access_pattern->kobj); + return err; +} + +static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); + int err; + + if (!quotas) + return -ENOMEM; + err = kobject_init_and_add("as->kobj, &damon_sysfs_quotas_ktype, + &scheme->kobj, "quotas"); + if (err) + goto out; + err = damon_sysfs_quotas_add_dirs(quotas); + if (err) + goto out; + scheme->quotas = quotas; + return 0; + +out: + kobject_put("as->kobj); + return err; +} + +static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_watermarks *watermarks = + damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); + int err; + + if (!watermarks) + return -ENOMEM; + err = kobject_init_and_add(&watermarks->kobj, + &damon_sysfs_watermarks_ktype, &scheme->kobj, + "watermarks"); + if (err) + kobject_put(&watermarks->kobj); + else + scheme->watermarks = watermarks; + return err; +} + +static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); + int err; + + if (!stats) + return -ENOMEM; + err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, + &scheme->kobj, "stats"); + if (err) + kobject_put(&stats->kobj); + else + scheme->stats = stats; + return err; +} + +static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) +{ + int err; + + err = damon_sysfs_scheme_set_access_pattern(scheme); + if (err) + return err; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_access_pattern_out; + err = damon_sysfs_scheme_set_watermarks(scheme); + if (err) + goto put_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_watermarks_quotas_access_pattern_out; + return 0; + +put_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->watermarks->kobj); + scheme->watermarks = NULL; +put_quotas_access_pattern_out: + kobject_put(&scheme->quotas->kobj); + scheme->quotas = NULL; +put_access_pattern_out: + kobject_put(&scheme->access_pattern->kobj); + scheme->access_pattern = NULL; + return err; +} + +static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) +{ + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); + kobject_put(&scheme->access_pattern->kobj); + damon_sysfs_quotas_rm_dirs(scheme->quotas); + kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->stats->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; +} + +static void damon_sysfs_schemes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); +} + +static struct kobj_attribute damon_sysfs_schemes_nr_attr = + __ATTR_RW_MODE(nr_schemes, 0600); + +static struct attribute *damon_sysfs_schemes_attrs[] = { + &damon_sysfs_schemes_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_schemes); + +struct kobj_type damon_sysfs_schemes_ktype = { + .release = damon_sysfs_schemes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_schemes_groups, +}; + +static struct damos *damon_sysfs_mk_scheme( + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; + struct damos_quota quota = { + .ms = sysfs_quotas->ms, + .sz = sysfs_quotas->sz, + .reset_interval = sysfs_quotas->reset_interval_ms, + .weight_sz = sysfs_weights->sz, + .weight_nr_accesses = sysfs_weights->nr_accesses, + .weight_age = sysfs_weights->age, + }; + struct damos_watermarks wmarks = { + .metric = sysfs_wmarks->metric, + .interval = sysfs_wmarks->interval_us, + .high = sysfs_wmarks->high, + .mid = sysfs_wmarks->mid, + .low = sysfs_wmarks->low, + }; + + return damon_new_scheme(&pattern, sysfs_scheme->action, "a, + &wmarks); +} + +static void damon_sysfs_update_scheme(struct damos *scheme, + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + scheme->pattern.min_sz_region = access_pattern->sz->min; + scheme->pattern.max_sz_region = access_pattern->sz->max; + scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; + scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; + scheme->pattern.min_age_region = access_pattern->age->min; + scheme->pattern.max_age_region = access_pattern->age->max; + + scheme->action = sysfs_scheme->action; + + scheme->quota.ms = sysfs_quotas->ms; + scheme->quota.sz = sysfs_quotas->sz; + scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; + scheme->quota.weight_sz = sysfs_weights->sz; + scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; + scheme->quota.weight_age = sysfs_weights->age; + + scheme->wmarks.metric = sysfs_wmarks->metric; + scheme->wmarks.interval = sysfs_wmarks->interval_us; + scheme->wmarks.high = sysfs_wmarks->high; + scheme->wmarks.mid = sysfs_wmarks->mid; + scheme->wmarks.low = sysfs_wmarks->low; +} + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes) +{ + struct damos *scheme, *next; + int i = 0; + + damon_for_each_scheme_safe(scheme, next, ctx) { + if (i < sysfs_schemes->nr) + damon_sysfs_update_scheme(scheme, + sysfs_schemes->schemes_arr[i]); + else + damon_destroy_scheme(scheme); + i++; + } + + for (; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 836df19a7d86..284daf274b3e 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -11,949 +11,6 @@ #include "sysfs-common.h" -/* - * schemes/stats directory - */ - -struct damon_sysfs_stats { - struct kobject kobj; - unsigned long nr_tried; - unsigned long sz_tried; - unsigned long nr_applied; - unsigned long sz_applied; - unsigned long qt_exceeds; -}; - -static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); -} - -static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_tried); -} - -static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_tried); -} - -static ssize_t nr_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_applied); -} - -static ssize_t sz_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_applied); -} - -static ssize_t qt_exceeds_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); -} - -static void damon_sysfs_stats_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); -} - -static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = - __ATTR_RO_MODE(nr_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = - __ATTR_RO_MODE(sz_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = - __ATTR_RO_MODE(nr_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = - __ATTR_RO_MODE(sz_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = - __ATTR_RO_MODE(qt_exceeds, 0400); - -static struct attribute *damon_sysfs_stats_attrs[] = { - &damon_sysfs_stats_nr_tried_attr.attr, - &damon_sysfs_stats_sz_tried_attr.attr, - &damon_sysfs_stats_nr_applied_attr.attr, - &damon_sysfs_stats_sz_applied_attr.attr, - &damon_sysfs_stats_qt_exceeds_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_stats); - -static struct kobj_type damon_sysfs_stats_ktype = { - .release = damon_sysfs_stats_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_stats_groups, -}; - -/* - * watermarks directory - */ - -struct damon_sysfs_watermarks { - struct kobject kobj; - enum damos_wmark_metric metric; - unsigned long interval_us; - unsigned long high; - unsigned long mid; - unsigned long low; -}; - -static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( - enum damos_wmark_metric metric, unsigned long interval_us, - unsigned long high, unsigned long mid, unsigned long low) -{ - struct damon_sysfs_watermarks *watermarks = kmalloc( - sizeof(*watermarks), GFP_KERNEL); - - if (!watermarks) - return NULL; - watermarks->kobj = (struct kobject){}; - watermarks->metric = metric; - watermarks->interval_us = interval_us; - watermarks->high = high; - watermarks->mid = mid; - watermarks->low = low; - return watermarks; -} - -/* Should match with enum damos_wmark_metric */ -static const char * const damon_sysfs_wmark_metric_strs[] = { - "none", - "free_mem_rate", -}; - -static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_wmark_metric_strs[watermarks->metric]); -} - -static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - enum damos_wmark_metric metric; - - for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { - if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { - watermarks->metric = metric; - return count; - } - } - return -EINVAL; -} - -static ssize_t interval_us_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->interval_us); -} - -static ssize_t interval_us_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->interval_us); - - return err ? err : count; -} - -static ssize_t high_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->high); -} - -static ssize_t high_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->high); - - return err ? err : count; -} - -static ssize_t mid_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->mid); -} - -static ssize_t mid_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->mid); - - return err ? err : count; -} - -static ssize_t low_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->low); -} - -static ssize_t low_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->low); - - return err ? err : count; -} - -static void damon_sysfs_watermarks_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); -} - -static struct kobj_attribute damon_sysfs_watermarks_metric_attr = - __ATTR_RW_MODE(metric, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = - __ATTR_RW_MODE(interval_us, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_high_attr = - __ATTR_RW_MODE(high, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_mid_attr = - __ATTR_RW_MODE(mid, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_low_attr = - __ATTR_RW_MODE(low, 0600); - -static struct attribute *damon_sysfs_watermarks_attrs[] = { - &damon_sysfs_watermarks_metric_attr.attr, - &damon_sysfs_watermarks_interval_us_attr.attr, - &damon_sysfs_watermarks_high_attr.attr, - &damon_sysfs_watermarks_mid_attr.attr, - &damon_sysfs_watermarks_low_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_watermarks); - -static struct kobj_type damon_sysfs_watermarks_ktype = { - .release = damon_sysfs_watermarks_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_watermarks_groups, -}; - -/* - * scheme/weights directory - */ - -struct damon_sysfs_weights { - struct kobject kobj; - unsigned int sz; - unsigned int nr_accesses; - unsigned int age; -}; - -static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, - unsigned int nr_accesses, unsigned int age) -{ - struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), - GFP_KERNEL); - - if (!weights) - return NULL; - weights->kobj = (struct kobject){}; - weights->sz = sz; - weights->nr_accesses = nr_accesses; - weights->age = age; - return weights; -} - -static ssize_t sz_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->sz); -} - -static ssize_t sz_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->sz); - - return err ? err : count; -} - -static ssize_t nr_accesses_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->nr_accesses); -} - -static ssize_t nr_accesses_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->nr_accesses); - - return err ? err : count; -} - -static ssize_t age_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->age); -} - -static ssize_t age_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->age); - - return err ? err : count; -} - -static void damon_sysfs_weights_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); -} - -static struct kobj_attribute damon_sysfs_weights_sz_attr = - __ATTR_RW_MODE(sz_permil, 0600); - -static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = - __ATTR_RW_MODE(nr_accesses_permil, 0600); - -static struct kobj_attribute damon_sysfs_weights_age_attr = - __ATTR_RW_MODE(age_permil, 0600); - -static struct attribute *damon_sysfs_weights_attrs[] = { - &damon_sysfs_weights_sz_attr.attr, - &damon_sysfs_weights_nr_accesses_attr.attr, - &damon_sysfs_weights_age_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_weights); - -static struct kobj_type damon_sysfs_weights_ktype = { - .release = damon_sysfs_weights_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_weights_groups, -}; - -/* - * quotas directory - */ - -struct damon_sysfs_quotas { - struct kobject kobj; - struct damon_sysfs_weights *weights; - unsigned long ms; - unsigned long sz; - unsigned long reset_interval_ms; -}; - -static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); -} - -static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) -{ - struct damon_sysfs_weights *weights; - int err; - - weights = damon_sysfs_weights_alloc(0, 0, 0); - if (!weights) - return -ENOMEM; - - err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, - "as->kobj, "weights"); - if (err) - kobject_put(&weights->kobj); - else - quotas->weights = weights; - return err; -} - -static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) -{ - kobject_put("as->weights->kobj); -} - -static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->ms); -} - -static ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, "as->ms); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->sz); -} - -static ssize_t bytes_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, "as->sz); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t reset_interval_ms_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); -} - -static ssize_t reset_interval_ms_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, "as->reset_interval_ms); - - if (err) - return -EINVAL; - return count; -} - -static void damon_sysfs_quotas_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); -} - -static struct kobj_attribute damon_sysfs_quotas_ms_attr = - __ATTR_RW_MODE(ms, 0600); - -static struct kobj_attribute damon_sysfs_quotas_sz_attr = - __ATTR_RW_MODE(bytes, 0600); - -static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = - __ATTR_RW_MODE(reset_interval_ms, 0600); - -static struct attribute *damon_sysfs_quotas_attrs[] = { - &damon_sysfs_quotas_ms_attr.attr, - &damon_sysfs_quotas_sz_attr.attr, - &damon_sysfs_quotas_reset_interval_ms_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_quotas); - -static struct kobj_type damon_sysfs_quotas_ktype = { - .release = damon_sysfs_quotas_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_quotas_groups, -}; - -/* - * access_pattern directory - */ - -struct damon_sysfs_access_pattern { - struct kobject kobj; - struct damon_sysfs_ul_range *sz; - struct damon_sysfs_ul_range *nr_accesses; - struct damon_sysfs_ul_range *age; -}; - -static -struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) -{ - struct damon_sysfs_access_pattern *access_pattern = - kmalloc(sizeof(*access_pattern), GFP_KERNEL); - - if (!access_pattern) - return NULL; - access_pattern->kobj = (struct kobject){}; - return access_pattern; -} - -static int damon_sysfs_access_pattern_add_range_dir( - struct damon_sysfs_access_pattern *access_pattern, - struct damon_sysfs_ul_range **range_dir_ptr, - char *name) -{ - struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); - int err; - - if (!range) - return -ENOMEM; - err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, - &access_pattern->kobj, name); - if (err) - kobject_put(&range->kobj); - else - *range_dir_ptr = range; - return err; -} - -static int damon_sysfs_access_pattern_add_dirs( - struct damon_sysfs_access_pattern *access_pattern) -{ - int err; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->sz, "sz"); - if (err) - goto put_sz_out; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->nr_accesses, "nr_accesses"); - if (err) - goto put_nr_accesses_sz_out; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->age, "age"); - if (err) - goto put_age_nr_accesses_sz_out; - return 0; - -put_age_nr_accesses_sz_out: - kobject_put(&access_pattern->age->kobj); - access_pattern->age = NULL; -put_nr_accesses_sz_out: - kobject_put(&access_pattern->nr_accesses->kobj); - access_pattern->nr_accesses = NULL; -put_sz_out: - kobject_put(&access_pattern->sz->kobj); - access_pattern->sz = NULL; - return err; -} - -static void damon_sysfs_access_pattern_rm_dirs( - struct damon_sysfs_access_pattern *access_pattern) -{ - kobject_put(&access_pattern->sz->kobj); - kobject_put(&access_pattern->nr_accesses->kobj); - kobject_put(&access_pattern->age->kobj); -} - -static void damon_sysfs_access_pattern_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); -} - -static struct attribute *damon_sysfs_access_pattern_attrs[] = { - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); - -static struct kobj_type damon_sysfs_access_pattern_ktype = { - .release = damon_sysfs_access_pattern_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_access_pattern_groups, -}; - -/* - * scheme directory - */ - -struct damon_sysfs_scheme { - struct kobject kobj; - enum damos_action action; - struct damon_sysfs_access_pattern *access_pattern; - struct damon_sysfs_quotas *quotas; - struct damon_sysfs_watermarks *watermarks; - struct damon_sysfs_stats *stats; -}; - -/* This should match with enum damos_action */ -static const char * const damon_sysfs_damos_action_strs[] = { - "willneed", - "cold", - "pageout", - "hugepage", - "nohugepage", - "lru_prio", - "lru_deprio", - "stat", -}; - -static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( - enum damos_action action) -{ - struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), - GFP_KERNEL); - - if (!scheme) - return NULL; - scheme->kobj = (struct kobject){}; - scheme->action = action; - return scheme; -} - -static int damon_sysfs_scheme_set_access_pattern( - struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_access_pattern *access_pattern; - int err; - - access_pattern = damon_sysfs_access_pattern_alloc(); - if (!access_pattern) - return -ENOMEM; - err = kobject_init_and_add(&access_pattern->kobj, - &damon_sysfs_access_pattern_ktype, &scheme->kobj, - "access_pattern"); - if (err) - goto out; - err = damon_sysfs_access_pattern_add_dirs(access_pattern); - if (err) - goto out; - scheme->access_pattern = access_pattern; - return 0; - -out: - kobject_put(&access_pattern->kobj); - return err; -} - -static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); - int err; - - if (!quotas) - return -ENOMEM; - err = kobject_init_and_add("as->kobj, &damon_sysfs_quotas_ktype, - &scheme->kobj, "quotas"); - if (err) - goto out; - err = damon_sysfs_quotas_add_dirs(quotas); - if (err) - goto out; - scheme->quotas = quotas; - return 0; - -out: - kobject_put("as->kobj); - return err; -} - -static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_watermarks *watermarks = - damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); - int err; - - if (!watermarks) - return -ENOMEM; - err = kobject_init_and_add(&watermarks->kobj, - &damon_sysfs_watermarks_ktype, &scheme->kobj, - "watermarks"); - if (err) - kobject_put(&watermarks->kobj); - else - scheme->watermarks = watermarks; - return err; -} - -static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); - int err; - - if (!stats) - return -ENOMEM; - err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, - &scheme->kobj, "stats"); - if (err) - kobject_put(&stats->kobj); - else - scheme->stats = stats; - return err; -} - -static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) -{ - int err; - - err = damon_sysfs_scheme_set_access_pattern(scheme); - if (err) - return err; - err = damon_sysfs_scheme_set_quotas(scheme); - if (err) - goto put_access_pattern_out; - err = damon_sysfs_scheme_set_watermarks(scheme); - if (err) - goto put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_stats(scheme); - if (err) - goto put_watermarks_quotas_access_pattern_out; - return 0; - -put_watermarks_quotas_access_pattern_out: - kobject_put(&scheme->watermarks->kobj); - scheme->watermarks = NULL; -put_quotas_access_pattern_out: - kobject_put(&scheme->quotas->kobj); - scheme->quotas = NULL; -put_access_pattern_out: - kobject_put(&scheme->access_pattern->kobj); - scheme->access_pattern = NULL; - return err; -} - -static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) -{ - damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); - kobject_put(&scheme->access_pattern->kobj); - damon_sysfs_quotas_rm_dirs(scheme->quotas); - kobject_put(&scheme->quotas->kobj); - kobject_put(&scheme->watermarks->kobj); - kobject_put(&scheme->stats->kobj); -} - -static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_damos_action_strs[scheme->action]); -} - -static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - enum damos_action action; - - for (action = 0; action < NR_DAMOS_ACTIONS; action++) { - if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { - scheme->action = action; - return count; - } - } - return -EINVAL; -} - -static void damon_sysfs_scheme_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); -} - -static struct kobj_attribute damon_sysfs_scheme_action_attr = - __ATTR_RW_MODE(action, 0600); - -static struct attribute *damon_sysfs_scheme_attrs[] = { - &damon_sysfs_scheme_action_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_scheme); - -static struct kobj_type damon_sysfs_scheme_ktype = { - .release = damon_sysfs_scheme_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_scheme_groups, -}; - -/* - * schemes directory - */ - -struct damon_sysfs_schemes { - struct kobject kobj; - struct damon_sysfs_scheme **schemes_arr; - int nr; -}; - -static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); -} - -static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) -{ - struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; - int i; - - for (i = 0; i < schemes->nr; i++) { - damon_sysfs_scheme_rm_dirs(schemes_arr[i]); - kobject_put(&schemes_arr[i]->kobj); - } - schemes->nr = 0; - kfree(schemes_arr); - schemes->schemes_arr = NULL; -} - -static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, - int nr_schemes) -{ - struct damon_sysfs_scheme **schemes_arr, *scheme; - int err, i; - - damon_sysfs_schemes_rm_dirs(schemes); - if (!nr_schemes) - return 0; - - schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), - GFP_KERNEL | __GFP_NOWARN); - if (!schemes_arr) - return -ENOMEM; - schemes->schemes_arr = schemes_arr; - - for (i = 0; i < nr_schemes; i++) { - scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); - if (!scheme) { - damon_sysfs_schemes_rm_dirs(schemes); - return -ENOMEM; - } - - err = kobject_init_and_add(&scheme->kobj, - &damon_sysfs_scheme_ktype, &schemes->kobj, - "%d", i); - if (err) - goto out; - err = damon_sysfs_scheme_add_dirs(scheme); - if (err) - goto out; - - schemes_arr[i] = scheme; - schemes->nr++; - } - return 0; - -out: - damon_sysfs_schemes_rm_dirs(schemes); - kobject_put(&scheme->kobj); - return err; -} - -static ssize_t nr_schemes_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_schemes *schemes = container_of(kobj, - struct damon_sysfs_schemes, kobj); - - return sysfs_emit(buf, "%d\n", schemes->nr); -} - -static ssize_t nr_schemes_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_schemes *schemes; - int nr, err = kstrtoint(buf, 0, &nr); - - if (err) - return err; - if (nr < 0) - return -EINVAL; - - schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); - - if (!mutex_trylock(&damon_sysfs_lock)) - return -EBUSY; - err = damon_sysfs_schemes_add_dirs(schemes, nr); - mutex_unlock(&damon_sysfs_lock); - if (err) - return err; - return count; -} - -static void damon_sysfs_schemes_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); -} - -static struct kobj_attribute damon_sysfs_schemes_nr_attr = - __ATTR_RW_MODE(nr_schemes, 0600); - -static struct attribute *damon_sysfs_schemes_attrs[] = { - &damon_sysfs_schemes_nr_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_schemes); - -static struct kobj_type damon_sysfs_schemes_ktype = { - .release = damon_sysfs_schemes_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_schemes_groups, -}; - /* * init region directory */ @@ -2133,104 +1190,6 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return 0; } -static struct damos *damon_sysfs_mk_scheme( - struct damon_sysfs_scheme *sysfs_scheme) -{ - struct damon_sysfs_access_pattern *access_pattern = - sysfs_scheme->access_pattern; - struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; - struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; - struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; - - struct damos_access_pattern pattern = { - .min_sz_region = access_pattern->sz->min, - .max_sz_region = access_pattern->sz->max, - .min_nr_accesses = access_pattern->nr_accesses->min, - .max_nr_accesses = access_pattern->nr_accesses->max, - .min_age_region = access_pattern->age->min, - .max_age_region = access_pattern->age->max, - }; - struct damos_quota quota = { - .ms = sysfs_quotas->ms, - .sz = sysfs_quotas->sz, - .reset_interval = sysfs_quotas->reset_interval_ms, - .weight_sz = sysfs_weights->sz, - .weight_nr_accesses = sysfs_weights->nr_accesses, - .weight_age = sysfs_weights->age, - }; - struct damos_watermarks wmarks = { - .metric = sysfs_wmarks->metric, - .interval = sysfs_wmarks->interval_us, - .high = sysfs_wmarks->high, - .mid = sysfs_wmarks->mid, - .low = sysfs_wmarks->low, - }; - - return damon_new_scheme(&pattern, sysfs_scheme->action, "a, - &wmarks); -} - -static void damon_sysfs_update_scheme(struct damos *scheme, - struct damon_sysfs_scheme *sysfs_scheme) -{ - struct damon_sysfs_access_pattern *access_pattern = - sysfs_scheme->access_pattern; - struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; - struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; - struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; - - scheme->pattern.min_sz_region = access_pattern->sz->min; - scheme->pattern.max_sz_region = access_pattern->sz->max; - scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; - scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; - scheme->pattern.min_age_region = access_pattern->age->min; - scheme->pattern.max_age_region = access_pattern->age->max; - - scheme->action = sysfs_scheme->action; - - scheme->quota.ms = sysfs_quotas->ms; - scheme->quota.sz = sysfs_quotas->sz; - scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; - scheme->quota.weight_sz = sysfs_weights->sz; - scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; - scheme->quota.weight_age = sysfs_weights->age; - - scheme->wmarks.metric = sysfs_wmarks->metric; - scheme->wmarks.interval = sysfs_wmarks->interval_us; - scheme->wmarks.high = sysfs_wmarks->high; - scheme->wmarks.mid = sysfs_wmarks->mid; - scheme->wmarks.low = sysfs_wmarks->low; -} - -static int damon_sysfs_set_schemes(struct damon_ctx *ctx, - struct damon_sysfs_schemes *sysfs_schemes) -{ - struct damos *scheme, *next; - int i = 0; - - damon_for_each_scheme_safe(scheme, next, ctx) { - if (i < sysfs_schemes->nr) - damon_sysfs_update_scheme(scheme, - sysfs_schemes->schemes_arr[i]); - else - damon_destroy_scheme(scheme); - i++; - } - - for (; i < sysfs_schemes->nr; i++) { - struct damos *scheme, *next; - - scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); - if (!scheme) { - damon_for_each_scheme_safe(scheme, next, ctx) - damon_destroy_scheme(scheme); - return -ENOMEM; - } - damon_add_scheme(ctx, scheme); - } - return 0; -} - static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; @@ -2246,29 +1205,6 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) mutex_unlock(&ctx->kdamond_lock); } -static void damon_sysfs_schemes_update_stats( - struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) -{ - struct damos *scheme; - int schemes_idx = 0; - - damon_for_each_scheme(scheme, ctx) { - struct damon_sysfs_stats *sysfs_stats; - - /* user could have removed the scheme sysfs dir */ - if (schemes_idx >= sysfs_schemes->nr) - break; - - sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; - sysfs_stats->nr_tried = scheme->stat.nr_tried; - sysfs_stats->sz_tried = scheme->stat.sz_tried; - sysfs_stats->nr_applied = scheme->stat.nr_applied; - sysfs_stats->sz_applied = scheme->stat.sz_applied; - sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; - } -} - /* * damon_sysfs_upd_schemes_stats() - Update schemes stats sysfs files. * @kdamond: The kobject wrapper that associated to the kdamond thread. From 7ae2c17f53d5054d1fe5c1a103ad46068034617d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:42 +0000 Subject: [PATCH 084/313] mm/damon/modules: deduplicate init steps for DAMON context setup DAMON_RECLAIM and DAMON_LRU_SORT has duplicated code for DAMON context and target initializations. Deduplicate the part by implementing a function for the initialization in 'modules-common.c' and using it. Link: https://lkml.kernel.org/r/20221026225943.100429-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/Makefile | 4 ++-- mm/damon/lru_sort.c | 17 +++------------- mm/damon/modules-common.c | 42 +++++++++++++++++++++++++++++++++++++++ mm/damon/modules-common.h | 3 +++ mm/damon/reclaim.c | 17 +++------------- 5 files changed, 53 insertions(+), 30 deletions(-) create mode 100644 mm/damon/modules-common.c diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 1e86f5253d7f..f7add3f4aa79 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o -obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o -obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o +obj-$(CONFIG_DAMON_RECLAIM) += modules-common.o reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index efbc2bda8b9c..a1896c5acfe9 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -314,25 +314,14 @@ static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) static int __init damon_lru_sort_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - schedule_delayed_work(&damon_lru_sort_timer, 0); damon_lru_sort_initialized = true; diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c new file mode 100644 index 000000000000..b2381a8466ec --- /dev/null +++ b/mm/damon/modules-common.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#include "modules-common.h" + +/* + * Allocate, set, and return a DAMON context for the physical address space. + * @ctxp: Pointer to save the point to the newly created context + * @targetp: Pointer to save the point to the newly created target + */ +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp) +{ + struct damon_ctx *ctx; + struct damon_target *target; + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + *ctxp = ctx; + *targetp = target; + return 0; +} diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 5a4921851d32..f49cdb417005 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -44,3 +44,6 @@ 0400); \ module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ 0400); + +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 162c9b1ca00f..3173f373435c 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -256,25 +256,14 @@ static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) static int __init damon_reclaim_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - schedule_delayed_work(&damon_reclaim_timer, 0); damon_reclaim_initialized = true; From b0d3dbd1b98660ec2154fccbd21c13916c967c05 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 26 Oct 2022 22:59:43 +0000 Subject: [PATCH 085/313] mm/damon/{reclaim,lru_sort}: remove unnecessarily included headers Some headers that 'reclaim.c' and 'lru_sort.c' are including are unnecessary now owing to previous cleanups and refactorings. Remove those. Link: https://lkml.kernel.org/r/20221026225943.100429-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 2 -- mm/damon/reclaim.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index a1896c5acfe9..5c60163e556c 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -8,9 +8,7 @@ #define pr_fmt(fmt) "damon-lru-sort: " fmt #include -#include #include -#include #include #include "modules-common.h" diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3173f373435c..e14eb30c01f4 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -8,9 +8,7 @@ #define pr_fmt(fmt) "damon-reclaim: " fmt #include -#include #include -#include #include #include "modules-common.h" From 04e98764befa371836a78b2b489e8b931a3a9e9a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:47 +0000 Subject: [PATCH 086/313] mm/damon/reclaim: enable and disable synchronously Patch series "mm/damon/reclaim,lru_sort: enable/disable synchronously". Writing a value to DAMON_RECLAIM and DAMON_LRU_SORT's 'enabled' parameters turns on or off DAMON in an ansychronous way. This means the parameter cannot be used to read the current status of them. 'kdamond_pid' parameter should be used instead for the purpose. The documentation is easy to be read as it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty. There's no real reason to have the asynchronous behavior, though. Simply make the parameter works synchronously, rather than updating the document. The first and second patches changes the behavior of the 'enabled' parameter for DAMON_RECLAIM and adds a selftest for the changed behavior, respectively. Following two patches make the same changes for DAMON_LRU_SORT. This patch (of 4): Writing a value to DAMON_RECLAIM's 'enabled' parameter turns on or off DAMON in an ansychronous way. This means the parameter cannot be used to read the current status of DAMON_RECLAIM. 'kdamond_pid' parameter should be used instead for the purpose. The documentation is easy to be read as it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty. There's no real reason to have the asynchronous behavior, though. Simply make the parameter works synchronously, rather than updating the document. Link: https://lkml.kernel.org/r/20221025173650.90624-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221025173650.90624-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 53 ++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e14eb30c01f4..e57604bec06d 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -9,7 +9,6 @@ #include #include -#include #include "modules-common.h" @@ -181,38 +180,31 @@ static int damon_reclaim_turn(bool on) return 0; } -static struct delayed_work damon_reclaim_timer; -static void damon_reclaim_timer_fn(struct work_struct *work) -{ - static bool last_enabled; - bool now_enabled; - - now_enabled = enabled; - if (last_enabled != now_enabled) { - if (!damon_reclaim_turn(now_enabled)) - last_enabled = now_enabled; - else - enabled = last_enabled; - } -} -static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); - -static bool damon_reclaim_initialized; - static int damon_reclaim_enabled_store(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); + bool is_enabled = enabled; + bool enable; + int err; - if (rc < 0) - return rc; + err = strtobool(val, &enable); + if (err) + return err; - /* system_wq might not initialized yet */ - if (!damon_reclaim_initialized) - return rc; + if (is_enabled == enable) + return 0; - schedule_delayed_work(&damon_reclaim_timer, 0); - return 0; + /* Called before init function. The function will handle this. */ + if (!ctx) + goto set_param_out; + + err = damon_reclaim_turn(enable); + if (err) + return err; + +set_param_out: + enabled = enable; + return err; } static const struct kernel_param_ops enabled_param_ops = { @@ -262,10 +254,11 @@ static int __init damon_reclaim_init(void) ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - schedule_delayed_work(&damon_reclaim_timer, 0); + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_reclaim_turn(true); - damon_reclaim_initialized = true; - return 0; + return err; } module_init(damon_reclaim_init); From 4cc0ee7787d7dc595752a8de2e073efa68f7c965 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:48 +0000 Subject: [PATCH 087/313] selftests/damon: add tests for DAMON_RECLAIM's enabled parameter Add simple test cases for DAMON_RECLAIM's 'enabled' parameter. Those tests are focusing on the synchronous behavior of DAMON_RECLAIM enabling and disabling. Link: https://lkml.kernel.org/r/20221025173650.90624-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + tools/testing/selftests/damon/reclaim.sh | 42 ++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tools/testing/selftests/damon/reclaim.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index a1fa2eff8192..dbbf18cb3e6b 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -8,5 +8,6 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh +TEST_PROGS += reclaim.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/reclaim.sh b/tools/testing/selftests/damon/reclaim.sh new file mode 100644 index 000000000000..78dbc2334cbe --- /dev/null +++ b/tools/testing/selftests/damon/reclaim.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ $EUID -ne 0 ] +then + echo "Run as root" + exit $ksft_skip +fi + +damon_reclaim_enabled="/sys/module/damon_reclaim/parameters/enabled" +if [ ! -f "$damon_reclaim_enabled" ] +then + echo "No 'enabled' file. Maybe DAMON_RECLAIM not built" + exit $ksft_skip +fi + +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "Another kdamond is running" + exit $ksft_skip +fi + +echo Y > "$damon_reclaim_enabled" + +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 1 ] +then + echo "kdamond is not turned on" + exit 1 +fi + +echo N > "$damon_reclaim_enabled" +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "kdamond is not turned off" + exit 1 +fi From 7a034fbba3361e94956431d17660d7c5674d13c3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:49 +0000 Subject: [PATCH 088/313] mm/damon/lru_sort: enable and disable synchronously Writing a value to DAMON_RECLAIM's 'enabled' parameter turns on or off DAMON in an ansychronous way. This means the parameter cannot be used to read the current status of DAMON_RECLAIM. 'kdamond_pid' parameter should be used instead for the purpose. The documentation is easy to be read as it works in a synchronous way, so it is a little bit confusing. It also makes the user space tooling dirty. There's no real reason to have the asynchronous behavior, though. Simply make the parameter works synchronously, rather than updating the document. Link: https://lkml.kernel.org/r/20221025173650.90624-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 51 +++++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 5c60163e556c..2a532e3983df 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -9,7 +9,6 @@ #include #include -#include #include "modules-common.h" @@ -235,38 +234,31 @@ static int damon_lru_sort_turn(bool on) return 0; } -static struct delayed_work damon_lru_sort_timer; -static void damon_lru_sort_timer_fn(struct work_struct *work) -{ - static bool last_enabled; - bool now_enabled; - - now_enabled = enabled; - if (last_enabled != now_enabled) { - if (!damon_lru_sort_turn(now_enabled)) - last_enabled = now_enabled; - else - enabled = last_enabled; - } -} -static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn); - -static bool damon_lru_sort_initialized; - static int damon_lru_sort_enabled_store(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); + bool is_enabled = enabled; + bool enable; + int err; - if (rc < 0) - return rc; + err = strtobool(val, &enable); + if (err) + return err; - if (!damon_lru_sort_initialized) - return rc; + if (is_enabled == enable) + return 0; - schedule_delayed_work(&damon_lru_sort_timer, 0); + /* Called before init function. The function will handle this. */ + if (!ctx) + goto set_param_out; - return 0; + err = damon_lru_sort_turn(enable); + if (err) + return err; + +set_param_out: + enabled = enable; + return err; } static const struct kernel_param_ops enabled_param_ops = { @@ -320,10 +312,11 @@ static int __init damon_lru_sort_init(void) ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; - schedule_delayed_work(&damon_lru_sort_timer, 0); + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_lru_sort_turn(true); - damon_lru_sort_initialized = true; - return 0; + return err; } module_init(damon_lru_sort_init); From 9cd6ffa60256e931503d347006049b8bef508203 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 25 Oct 2022 17:36:50 +0000 Subject: [PATCH 089/313] selftests/damon: add tests for DAMON_LRU_SORT's enabled parameter Add simple test cases for DAMON_LRU_SORT's 'enabled' parameter. Those tests are focusing on the synchronous behavior of DAMON_RECLAIM enabling and disabling. Link: https://lkml.kernel.org/r/20221025173650.90624-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- tools/testing/selftests/damon/lru_sort.sh | 41 +++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/damon/lru_sort.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index dbbf18cb3e6b..af490acc5348 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -8,6 +8,6 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh -TEST_PROGS += reclaim.sh +TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/lru_sort.sh b/tools/testing/selftests/damon/lru_sort.sh new file mode 100644 index 000000000000..61b80197c896 --- /dev/null +++ b/tools/testing/selftests/damon/lru_sort.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ $EUID -ne 0 ] +then + echo "Run as root" + exit $ksft_skip +fi + +damon_lru_sort_enabled="/sys/module/damon_lru_sort/parameters/enabled" +if [ ! -f "$damon_lru_sort_enabled" ] +then + echo "No 'enabled' file. Maybe DAMON_LRU_SORT not built" + exit $ksft_skip +fi + +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "Another kdamond is running" + exit $ksft_skip +fi + +echo Y > "$damon_lru_sort_enabled" +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 1 ] +then + echo "kdamond is not turned on" + exit 1 +fi + +echo N > "$damon_lru_sort_enabled" +nr_kdamonds=$(pgrep kdamond | wc -l) +if [ "$nr_kdamonds" -ne 0 ] +then + echo "kdamond is not turned off" + exit 1 +fi From f1a7941243c102a44e8847e3b94ff4ff3ec56f25 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 24 Oct 2022 05:28:41 +0000 Subject: [PATCH 090/313] mm: convert mm's rss stats into percpu_counter Currently mm_struct maintains rss_stats which are updated on page fault and the unmapping codepaths. For page fault codepath the updates are cached per thread with the batch of TASK_RSS_EVENTS_THRESH which is 64. The reason for caching is performance for multithreaded applications otherwise the rss_stats updates may become hotspot for such applications. However this optimization comes with the cost of error margin in the rss stats. The rss_stats for applications with large number of threads can be very skewed. At worst the error margin is (nr_threads * 64) and we have a lot of applications with 100s of threads, so the error margin can be very high. Internally we had to reduce TASK_RSS_EVENTS_THRESH to 32. Recently we started seeing the unbounded errors for rss_stats for specific applications which use TCP rx0cp. It seems like vm_insert_pages() codepath does not sync rss_stats at all. This patch converts the rss_stats into percpu_counter to convert the error margin from (nr_threads * 64) to approximately (nr_cpus ^ 2). However this conversion enable us to get the accurate stats for situations where accuracy is more important than the cpu cost. This patch does not make such tradeoffs - we can just use percpu_counter_add_local() for the updates and percpu_counter_sum() (or percpu_counter_sync() + percpu_counter_read) for the readers. At the moment the readers are either procfs interface, oom_killer and memory reclaim which I think are not performance critical and should be ok with slow read. However I think we can make that change in a separate patch. Link: https://lkml.kernel.org/r/20221024052841.3291983-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/mm.h | 26 ++++-------- include/linux/mm_types.h | 7 +--- include/linux/mm_types_task.h | 13 ------ include/linux/percpu_counter.h | 1 - include/linux/sched.h | 3 -- include/trace/events/kmem.h | 8 ++-- kernel/fork.c | 16 +++++++- mm/memory.c | 73 +++++----------------------------- 8 files changed, 40 insertions(+), 107 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f919befc8fac..0cb4e196d60b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2052,40 +2052,30 @@ static inline bool get_user_page_fast_only(unsigned long addr, */ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { - long val = atomic_long_read(&mm->rss_stat.count[member]); - -#ifdef SPLIT_RSS_COUNTING - /* - * counter is updated in asynchronous manner and may go to minus. - * But it's never be expected number for users. - */ - if (val < 0) - val = 0; -#endif - return (unsigned long)val; + return percpu_counter_read_positive(&mm->rss_stat[member]); } -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + percpu_counter_add(&mm->rss_stat[member], value); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + percpu_counter_inc(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + percpu_counter_dec(&mm->rss_stat[member]); - mm_trace_rss_stat(mm, member, count); + mm_trace_rss_stat(mm, member); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2d5b1575ffe0..e86861ff5bbd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -626,11 +627,7 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - /* - * Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. - */ - struct mm_rss_stat rss_stat; + struct percpu_counter rss_stat[NR_MM_COUNTERS]; struct linux_binfmt *binfmt; diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 0bb4b6da9993..5414b5c6a103 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -36,19 +36,6 @@ enum { NR_MM_COUNTERS }; -#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) -#define SPLIT_RSS_COUNTING -/* per-thread cached information, */ -struct task_rss_stat { - int events; /* for synchronization threshold */ - int count[NR_MM_COUNTERS]; -}; -#endif /* USE_SPLIT_PTE_PTLOCKS */ - -struct mm_rss_stat { - atomic_long_t count[NR_MM_COUNTERS]; -}; - struct page_frag { struct page *page; #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 8ed5fba6d156..bde6c4c1f405 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -13,7 +13,6 @@ #include #include #include -#include /* percpu_counter batch for local add or sub */ #define PERCPU_COUNTER_LOCAL_BATCH INT_MAX diff --git a/include/linux/sched.h b/include/linux/sched.h index ffb6eb55cd13..079d299fa465 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -870,9 +870,6 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; -#ifdef SPLIT_RSS_COUNTING - struct task_rss_stat rss_stat; -#endif int exit_state; int exit_code; int exit_signal; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 243073cfc29d..58688768ef0f 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -346,10 +346,9 @@ TRACE_MM_PAGES TRACE_EVENT(rss_stat, TP_PROTO(struct mm_struct *mm, - int member, - long count), + int member), - TP_ARGS(mm, member, count), + TP_ARGS(mm, member), TP_STRUCT__entry( __field(unsigned int, mm_id) @@ -362,7 +361,8 @@ TRACE_EVENT(rss_stat, __entry->mm_id = mm_ptr_to_hash(mm); __entry->curr = !!(current->mm == mm); __entry->member = member; - __entry->size = (count << PAGE_SHIFT); + __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member]) + << PAGE_SHIFT); ), TP_printk("mm_id=%u curr=%d type=%s size=%ldB", diff --git a/kernel/fork.c b/kernel/fork.c index 08969f5aa38d..0fef202434c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -753,7 +753,7 @@ static void check_mm(struct mm_struct *mm) "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); + long x = percpu_counter_sum(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", @@ -779,6 +779,8 @@ static void check_mm(struct mm_struct *mm) */ void __mmdrop(struct mm_struct *mm) { + int i; + BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); WARN_ON_ONCE(mm == current->active_mm); @@ -788,6 +790,9 @@ void __mmdrop(struct mm_struct *mm) check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); + + for (i = 0; i < NR_MM_COUNTERS; i++) + percpu_counter_destroy(&mm->rss_stat[i]); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1107,6 +1112,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { + int i; + mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); @@ -1148,10 +1155,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; + for (i = 0; i < NR_MM_COUNTERS; i++) + if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) + goto fail_pcpu; + mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; +fail_pcpu: + while (i > 0) + percpu_counter_destroy(&mm->rss_stat[--i]); fail_nocontext: mm_free_pgd(mm); fail_nopgd: diff --git a/mm/memory.c b/mm/memory.c index 7826143ec9cd..e0555ddd71b5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -162,58 +162,11 @@ static int __init init_zero_pfn(void) } early_initcall(init_zero_pfn); -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) +void mm_trace_rss_stat(struct mm_struct *mm, int member) { - trace_rss_stat(mm, member, count); + trace_rss_stat(mm, member); } -#if defined(SPLIT_RSS_COUNTING) - -void sync_mm_rss(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - if (current->rss_stat.count[i]) { - add_mm_counter(mm, i, current->rss_stat.count[i]); - current->rss_stat.count[i] = 0; - } - } - current->rss_stat.events = 0; -} - -static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) -{ - struct task_struct *task = current; - - if (likely(task->mm == mm)) - task->rss_stat.count[member] += val; - else - add_mm_counter(mm, member, val); -} -#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) -#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) - -/* sync counter once per 64 page faults */ -#define TASK_RSS_EVENTS_THRESH (64) -static void check_sync_rss_stat(struct task_struct *task) -{ - if (unlikely(task != current)) - return; - if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) - sync_mm_rss(task->mm); -} -#else /* SPLIT_RSS_COUNTING */ - -#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) -#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) - -static void check_sync_rss_stat(struct task_struct *task) -{ -} - -#endif /* SPLIT_RSS_COUNTING */ - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. @@ -1857,7 +1810,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; @@ -3153,12 +3106,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, - mm_counter_file(old_page)); - inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter(mm, mm_counter_file(old_page)); + inc_mm_counter(mm, MM_ANONPAGES); } } else { - inc_mm_counter_fast(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); @@ -3965,8 +3917,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); /* @@ -4146,7 +4098,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_MISSING); } - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(page, vma); setpte: @@ -4336,11 +4288,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) entry = pte_mkuffd_wp(pte_wrprotect(entry)); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } else { - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); @@ -5192,9 +5144,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, count_vm_event(PGFAULT); count_memcg_event_mm(vma->vm_mm, PGFAULT); - /* do counter updates before entering really critical section. */ - check_sync_rss_stat(current); - if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) From f689054aace2ff13af2e9a44a74fbba650ca31ba Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 9 Nov 2022 01:20:11 +0000 Subject: [PATCH 091/313] percpu_counter: add percpu_counter_sum_all interface The percpu_counter is used for scenarios where performance is more important than the accuracy. For percpu_counter users, who want more accurate information in their slowpath, percpu_counter_sum is provided which traverses all the online CPUs to accumulate the data. The reason it only needs to traverse online CPUs is because percpu_counter does implement CPU offline callback which syncs the local data of the offlined CPU. However there is a small race window between the online CPUs traversal of percpu_counter_sum and the CPU offline callback. The offline callback has to traverse all the percpu_counters on the system to flush the CPU local data which can be a lot. During that time, the CPU which is going offline has already been published as offline to all the readers. So, as the offline callback is running, percpu_counter_sum can be called for one counter which has some state on the CPU going offline. Since percpu_counter_sum only traverses online CPUs, it will skip that specific CPU and the offline callback might not have flushed the state for that specific percpu_counter on that offlined CPU. Normally this is not an issue because percpu_counter users can deal with some inaccuracy for small time window. However a new user i.e. mm_struct on the cleanup path wants to check the exact state of the percpu_counter through check_mm(). For such users, this patch introduces percpu_counter_sum_all() which traverses all possible CPUs and it is used in fork.c:check_mm() to avoid the potential race. This issue is exposed by the later patch "mm: convert mm's rss stats into percpu_counter". Link: https://lkml.kernel.org/r/20221109012011.881058-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 6 ++++++ kernel/fork.c | 5 +++++ lib/percpu_counter.c | 29 +++++++++++++++++++++++------ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index bde6c4c1f405..a3aae8d57a42 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -45,6 +45,7 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); +s64 percpu_counter_sum_all(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); @@ -193,6 +194,11 @@ static inline s64 percpu_counter_sum(struct percpu_counter *fbc) return percpu_counter_read(fbc); } +static inline s64 percpu_counter_sum_all(struct percpu_counter *fbc) +{ + return percpu_counter_read(fbc); +} + static inline bool percpu_counter_initialized(struct percpu_counter *fbc) { return true; diff --git a/kernel/fork.c b/kernel/fork.c index 0fef202434c3..1be4c4ab7f3e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -755,6 +755,11 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); + if (likely(!x)) + continue; + + /* Making sure this is not due to race with CPU offlining. */ + x = percpu_counter_sum_all(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index ed610b75dc32..42f729c8e56c 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -117,11 +117,8 @@ void percpu_counter_sync(struct percpu_counter *fbc) } EXPORT_SYMBOL(percpu_counter_sync); -/* - * Add up all the per-cpu counts, return the result. This is a more accurate - * but much slower version of percpu_counter_read_positive() - */ -s64 __percpu_counter_sum(struct percpu_counter *fbc) +static s64 __percpu_counter_sum_mask(struct percpu_counter *fbc, + const struct cpumask *cpu_mask) { s64 ret; int cpu; @@ -129,15 +126,35 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, cpu_mask) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } + +/* + * Add up all the per-cpu counts, return the result. This is a more accurate + * but much slower version of percpu_counter_read_positive() + */ +s64 __percpu_counter_sum(struct percpu_counter *fbc) +{ + return __percpu_counter_sum_mask(fbc, cpu_online_mask); +} EXPORT_SYMBOL(__percpu_counter_sum); +/* + * This is slower version of percpu_counter_sum as it traverses all possible + * cpus. Use this only in the cases where accurate data is needed in the + * presense of CPUs getting offlined. + */ +s64 percpu_counter_sum_all(struct percpu_counter *fbc) +{ + return __percpu_counter_sum_mask(fbc, cpu_possible_mask); +} +EXPORT_SYMBOL(percpu_counter_sum_all); + int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { From a873dfe1032a132bf89f9e19a6ac44f5a0b78754 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 21 Oct 2022 13:01:19 -0700 Subject: [PATCH 092/313] mm, hwpoison: try to recover from copy-on write faults Patch series "Copy-on-write poison recovery", v3. Part 1 deals with the process that triggered the copy on write fault with a store to a shared read-only page. That process is send a SIGBUS with the usual machine check decoration to specify the virtual address of the lost page, together with the scope. Part 2 sets up to asynchronously take the page with the uncorrected error offline to prevent additional machine check faults. H/t to Miaohe Lin and Shuai Xue for pointing me to the existing function to queue a call to memory_failure(). On x86 there is some duplicate reporting (because the error is also signalled by the memory controller as well as by the core that triggered the machine check). Console logs look like this: This patch (of 2): If the kernel is copying a page as the result of a copy-on-write fault and runs into an uncorrectable error, Linux will crash because it does not have recovery code for this case where poison is consumed by the kernel. It is easy to set up a test case. Just inject an error into a private page, fork(2), and have the child process write to the page. I wrapped that neatly into a test at: git://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git just enable ACPI error injection and run: # ./einj_mem-uc -f copy-on-write Add a new copy_user_highpage_mc() function that uses copy_mc_to_kernel() on architectures where that is available (currently x86 and powerpc). When an error is detected during the page copy, return VM_FAULT_HWPOISON to caller of wp_page_copy(). This propagates up the call stack. Both x86 and powerpc have code in their fault handler to deal with this code by sending a SIGBUS to the application. Note that this patch avoids a system crash and signals the process that triggered the copy-on-write action. It does not take any action for the memory error that is still in the shared page. To handle that a call to memory_failure() is needed. But this cannot be done from wp_page_copy() because it holds mmap_lock(). Perhaps the architecture fault handlers can deal with this loose end in a subsequent patch? On Intel/x86 this loose end will often be handled automatically because the memory controller provides an additional notification of the h/w poison in memory, the handler for this will call memory_failure(). This isn't a 100% solution. If there are multiple errors, not all may be logged in this way. [tony.luck@intel.com: add call to kmsan_unpoison_memory(), per Miaohe Lin] Link: https://lkml.kernel.org/r/20221031201029.102123-2-tony.luck@intel.com Link: https://lkml.kernel.org/r/20221021200120.175753-1-tony.luck@intel.com Link: https://lkml.kernel.org/r/20221021200120.175753-2-tony.luck@intel.com Signed-off-by: Tony Luck Reviewed-by: Dan Williams Reviewed-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Reviewed-by: Alexander Potapenko Tested-by: Shuai Xue Cc: Christophe Leroy Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- include/linux/highmem.h | 26 ++++++++++++++++++++++++++ mm/memory.c | 30 ++++++++++++++++++++---------- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index e9912da5441b..44242268f53b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -319,6 +319,32 @@ static inline void copy_user_highpage(struct page *to, struct page *from, #endif +#ifdef copy_mc_to_kernel +static inline int copy_mc_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + unsigned long ret; + char *vfrom, *vto; + + vfrom = kmap_local_page(from); + vto = kmap_local_page(to); + ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); + if (!ret) + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); + kunmap_local(vto); + kunmap_local(vfrom); + + return ret; +} +#else +static inline int copy_mc_user_highpage(struct page *to, struct page *from, + unsigned long vaddr, struct vm_area_struct *vma) +{ + copy_user_highpage(to, from, vaddr, vma); + return 0; +} +#endif + #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) diff --git a/mm/memory.c b/mm/memory.c index e0555ddd71b5..13b1fe661d86 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2798,10 +2798,16 @@ static inline int pte_unmap_same(struct vm_fault *vmf) return same; } -static inline bool __wp_page_copy_user(struct page *dst, struct page *src, - struct vm_fault *vmf) +/* + * Return: + * 0: copied succeeded + * -EHWPOISON: copy failed due to hwpoison in source page + * -EAGAIN: copied failed (some other reason) + */ +static inline int __wp_page_copy_user(struct page *dst, struct page *src, + struct vm_fault *vmf) { - bool ret; + int ret; void *kaddr; void __user *uaddr; bool locked = false; @@ -2810,8 +2816,9 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, unsigned long addr = vmf->address; if (likely(src)) { - copy_user_highpage(dst, src, addr, vma); - return true; + if (copy_mc_user_highpage(dst, src, addr, vma)) + return -EHWPOISON; + return 0; } /* @@ -2838,7 +2845,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, * and update local tlb only */ update_mmu_tlb(vma, addr, vmf->pte); - ret = false; + ret = -EAGAIN; goto pte_unlock; } @@ -2863,7 +2870,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ update_mmu_tlb(vma, addr, vmf->pte); - ret = false; + ret = -EAGAIN; goto pte_unlock; } @@ -2882,7 +2889,7 @@ warn: } } - ret = true; + ret = 0; pte_unlock: if (locked) @@ -3054,6 +3061,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; + int ret; delayacct_wpcopy_start(); @@ -3071,19 +3079,21 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!new_page) goto oom; - if (!__wp_page_copy_user(new_page, old_page, vmf)) { + ret = __wp_page_copy_user(new_page, old_page, vmf); + if (ret) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. + * The -EHWPOISON case will not be retried. */ put_page(new_page); if (old_page) put_page(old_page); delayacct_wpcopy_end(); - return 0; + return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(new_page, old_page); } From d302c2398ba269e788a4f37ae57c07a7fcabaa42 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 21 Oct 2022 13:01:20 -0700 Subject: [PATCH 093/313] mm, hwpoison: when copy-on-write hits poison, take page offline Cannot call memory_failure() directly from the fault handler because mmap_lock (and others) are held. It is important, but not urgent, to mark the source page as h/w poisoned and unmap it from other tasks. Use memory_failure_queue() to request a call to memory_failure() for the page with the error. Also provide a stub version for CONFIG_MEMORY_FAILURE=n Link: https://lkml.kernel.org/r/20221021200120.175753-3-tony.luck@intel.com Signed-off-by: Tony Luck Reviewed-by: Miaohe Lin Cc: Christophe Leroy Cc: Dan Williams Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Naoya Horiguchi Cc: Nicholas Piggin Cc: Shuai Xue Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++++- mm/memory.c | 4 +++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0cb4e196d60b..3950ef45b9a9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3280,7 +3280,6 @@ enum mf_flags { int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); -extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern int sysctl_memory_failure_early_kill; @@ -3289,11 +3288,16 @@ extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE +extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else +static inline void memory_failure_queue(unsigned long pfn, int flags) +{ +} + static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { diff --git a/mm/memory.c b/mm/memory.c index 13b1fe661d86..659620b6770f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2816,8 +2816,10 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, unsigned long addr = vmf->address; if (likely(src)) { - if (copy_mc_user_highpage(dst, src, addr, vma)) + if (copy_mc_user_highpage(dst, src, addr, vma)) { + memory_failure_queue(page_to_pfn(src), 0); return -EHWPOISON; + } return 0; } From 1cc53a047b0b9389c2d8f4a69499c6135572f23e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 27 Oct 2022 11:36:41 +0800 Subject: [PATCH 094/313] mm: hugetlb_vmemmap: remove redundant list_del() The ->lru field will be assigned to a new value in __free_page(). So it is unnecessary to delete it from the @list. Just remove it to simplify the code. Link: https://lkml.kernel.org/r/20221027033641.66709-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4962dd1ba4a6..7898c2c75e35 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -232,10 +232,8 @@ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; - list_for_each_entry_safe(page, next, list, lru) { - list_del(&page->lru); + list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); - } } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, From bd4149290c3edc09454a8a7e7ef3a5544cb9eed6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 24 Oct 2022 17:46:18 +0000 Subject: [PATCH 095/313] Docs/admin-guide/mm/damon/usage: describe the rules of sysfs region directories Patch series "Docs/admin-buide/mm/damon/usage: minor fixes". DAMON usage document contains an unclear description and a wrong usage example. This patchset fixes the two minor problems. This patch (of 2): Target region directories of DAMON sysfs interface should contain no overlap and sorted by the address, but not clearly documented. Actually, a user had an issue[1] due to the poor documentation. Add clear description of it on the usage document. [1] https://lore.kernel.org/damon/CAEZ6=UNUcH2BvJj++OrT=XQLdkidU79wmCO=tantSOB36pPNTg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20221024174619.15600-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221024174619.15600-2-sj@kernel.org Signed-off-by: SeongJae Park Reported-by: Vinicius Petrucci Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index b47b0cbbd491..89d9a4f75a29 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -235,6 +235,9 @@ In each region directory, you will find two files (``start`` and ``end``). You can set and get the start and end addresses of the initial monitoring target region by writing to and reading from the files, respectively. +Each region should not overlap with others. ``end`` of directory ``N`` should +be equal or smaller than ``start`` of directory ``N+1``. + contexts//schemes/ --------------------- From 1b0166387586cae69d7da783f0a4521864534aad Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 24 Oct 2022 17:46:19 +0000 Subject: [PATCH 096/313] Docs/admin-guide/mm/damon/usage: fix wrong usage example of init_regions file DAMON debugfs interface assumes the users will write all inputs at once. However, redirecting a string of multiple lines sometimes end up writing line by line. Therefore, the example usage of 'init_regions' file, which writes input as a string of multiple lines can fail. Fix it to use a single line string instead. Also update the description of the usage to not assume users will write inputs in multiple lines. Link: https://lkml.kernel.org/r/20221024174619.15600-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Vinicius Petrucci Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 89d9a4f75a29..c17e02e1e426 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -468,8 +468,9 @@ regions in case of physical memory monitoring. Therefore, users should set the monitoring target regions by themselves. In such cases, users can explicitly set the initial monitoring target regions -as they want, by writing proper values to the ``init_regions`` file. Each line -of the input should represent one region in below form.:: +as they want, by writing proper values to the ``init_regions`` file. The input +should be a sequence of three integers separated by white spaces that represent +one region in below form.:: @@ -484,9 +485,9 @@ ranges, ``20-40`` and ``50-100`` as that of pid 4242, which is the second one # cd /damon # cat target_ids 42 4242 - # echo "0 1 100 - 0 100 200 - 1 20 40 + # echo "0 1 100 \ + 0 100 200 \ + 1 20 40 \ 1 50 100" > init_regions Note that this sets the initial monitoring target regions only. In case of From 57e9cc50f4dd926d6c38751799d25cad89fb2bd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 26 Oct 2022 14:01:33 -0400 Subject: [PATCH 097/313] mm: vmscan: split khugepaged stats from direct reclaim stats Direct reclaim stats are useful for identifying a potential source for application latency, as well as spotting issues with kswapd. However, khugepaged currently distorts the picture: as a kernel thread it doesn't impose allocation latencies on userspace, and it explicitly opts out of kswapd reclaim. Its activity showing up in the direct reclaim stats is misleading. Counting it as kswapd reclaim could also cause confusion when trying to understand actual kswapd behavior. Break out khugepaged from the direct reclaim counters into new pgsteal_khugepaged, pgdemote_khugepaged, pgscan_khugepaged counters. Test with a huge executable (CONFIG_READ_ONLY_THP_FOR_FS): pgsteal_kswapd 1342185 pgsteal_direct 0 pgsteal_khugepaged 3623 pgscan_kswapd 1345025 pgscan_direct 0 pgscan_khugepaged 3623 Link: https://lkml.kernel.org/r/20221026180133.377671-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Eric Bergen Cc: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 6 +++++ include/linux/khugepaged.h | 6 +++++ include/linux/vm_event_item.h | 3 +++ mm/khugepaged.c | 5 ++++ mm/memcontrol.c | 8 +++++-- mm/vmscan.c | 32 ++++++++++++++++++------- mm/vmstat.c | 3 +++ 7 files changed, 53 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index dc254a3cb956..74cec76be9f2 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1488,12 +1488,18 @@ PAGE_SIZE multiple when read back. pgscan_direct (npn) Amount of scanned pages directly (in an inactive LRU list) + pgscan_khugepaged (npn) + Amount of scanned pages by khugepaged (in an inactive LRU list) + pgsteal_kswapd (npn) Amount of reclaimed pages by kswapd pgsteal_direct (npn) Amount of reclaimed pages directly + pgsteal_khugepaged (npn) + Amount of reclaimed pages by khugepaged + pgfault (npn) Total number of page faults incurred diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 70162d707caf..f68865e19b0b 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,7 @@ extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); +extern bool current_is_khugepaged(void); #ifdef CONFIG_SHMEM extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd); @@ -57,6 +58,11 @@ static inline int collapse_pte_mapped_thp(struct mm_struct *mm, static inline void khugepaged_min_free_kbytes_update(void) { } + +static inline bool current_is_khugepaged(void) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3518dba1e02f..7f5d1caf5890 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -40,10 +40,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGREUSE, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, + PGDEMOTE_KHUGEPAGED, PGSCAN_KSWAPD, PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, PGSCAN_DIRECT_THROTTLE, PGSCAN_ANON, PGSCAN_FILE, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3703a56571c1..9c111273bbf9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2577,6 +2577,11 @@ void khugepaged_min_free_kbytes_update(void) mutex_unlock(&khugepaged_mutex); } +bool current_is_khugepaged(void) +{ + return kthread_func(current) == khugepaged; +} + static int madvise_collapse_errno(enum scan_result r) { /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c95e2ed6e7fd..23750cec0036 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -661,8 +661,10 @@ static const unsigned int memcg_vm_event_stat[] = { PGPGOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, PGFAULT, PGMAJFAULT, PGREFILL, @@ -1574,10 +1576,12 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) /* Accumulated memory events */ seq_buf_printf(&s, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT)); + memcg_events(memcg, PGSCAN_DIRECT) + + memcg_events(memcg, PGSCAN_KHUGEPAGED)); seq_buf_printf(&s, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT)); + memcg_events(memcg, PGSTEAL_DIRECT) + + memcg_events(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { if (memcg_vm_event_stat[i] == PGPGIN || diff --git a/mm/vmscan.c b/mm/vmscan.c index 55a5b5d66d68..d7c71be6417d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -1047,6 +1048,24 @@ void drop_slab(void) drop_slab_node(nid); } +static int reclaimer_offset(void) +{ + BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != + PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); + BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != + PGSCAN_DIRECT - PGSCAN_KSWAPD); + BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != + PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); + BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != + PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); + + if (current_is_kswapd()) + return 0; + if (current_is_khugepaged()) + return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; + return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; +} + static inline int is_page_cache_freeable(struct folio *folio) { /* @@ -1599,10 +1618,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); - if (current_is_kswapd()) - __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); - else - __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); return nr_succeeded; } @@ -2475,7 +2491,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + item = PGSCAN_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); @@ -2492,7 +2508,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, move_folios_to_lru(lruvec, &folio_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); @@ -4871,7 +4887,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, break; } - item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + item = PGSCAN_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) { __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); @@ -5049,7 +5065,7 @@ retry: if (walk && walk->batched) reset_batch_size(lruvec, walk); - item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); __count_memcg_events(memcg, item, reclaimed); diff --git a/mm/vmstat.c b/mm/vmstat.c index b2371d745e00..1ea6a5ce1c41 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1271,10 +1271,13 @@ const char * const vmstat_text[] = { "pgreuse", "pgsteal_kswapd", "pgsteal_direct", + "pgsteal_khugepaged", "pgdemote_kswapd", "pgdemote_direct", + "pgdemote_khugepaged", "pgscan_kswapd", "pgscan_direct", + "pgscan_khugepaged", "pgscan_direct_throttle", "pgscan_anon", "pgscan_file", From 6e7ba8b5e2380f941dda8a1025d70c5ce5b38982 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Fri, 28 Oct 2022 14:45:34 +0000 Subject: [PATCH 098/313] maple_tree: mte_set_full() and mte_clear_full() clang-analyzer clean up mte_set_full() and mte_clear_full() were incorrectly setting a pointer to a value without returning a result. Fix this by returning the modified pointer to be use as necessary. Also add a third function to return if the bit is set or not. Link: https://lore.kernel.org/lkml/20221026120029.12555-1-lukas.bulwahn@gmail.com/ Link: https://lkml.kernel.org/r/20221028144520.2776767-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Lukas Bulwahn Suggested-by: Dan Carpenter Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index df352f6ccc24..3fe1491d2bf9 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -323,14 +323,19 @@ static inline void *mte_safe_root(const struct maple_enode *node) return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } -static inline void mte_set_full(const struct maple_enode *node) +static inline void *mte_set_full(const struct maple_enode *node) { - node = (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); + return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } -static inline void mte_clear_full(const struct maple_enode *node) +static inline void *mte_clear_full(const struct maple_enode *node) { - node = (void *)((unsigned long)node | MAPLE_ENODE_NULL); + return (void *)((unsigned long)node | MAPLE_ENODE_NULL); +} + +static inline bool mte_has_null(const struct maple_enode *node) +{ + return (unsigned long)node & MAPLE_ENODE_NULL; } static inline bool ma_is_root(struct maple_node *node) From b2b23ba03cb9059d11b270cc280dcdfa6dbbdf53 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 28 Oct 2022 08:53:01 -0700 Subject: [PATCH 099/313] mempool: do not use ksize() for poisoning Nothing appears to be using ksize() within the kmalloc-backed mempools except the mempool poisoning logic. Use the actual pool size instead of the ksize() to avoid needing any special handling of the memory as needed by KASAN, UBSAN_BOUNDS, nor FORTIFY_SOURCE. [vbabka@suse.cz: for slab mempools pool_data is not object size] Link: https://lkml.kernel.org/r/13c4bd6e-09d3-efce-43a5-5a99be8bc96b@suse.cz Link: https://lkml.kernel.org/r/20221028154823.you.615-kees@kernel.org Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka Suggested-by: Vlastimil Babka Link: https://lore.kernel.org/lkml/f4fc52c4-7c18-1d76-0c7a-4058ea2486b9@suse.cz/ Acked-by: Vlastimil Babka Reviewed-by: Andrey Konovalov Cc: David Rientjes Cc: Marco Elver Cc: Vincenzo Frascino Reported-by: Anders Roxell Link: https://lore.kernel.org/all/20221031105514.GB69385@mutt/ Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/mempool.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 96488b13a1ef..734bcf5afbb7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -57,8 +57,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size) static void check_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->free == mempool_free_slab || pool->free == mempool_kfree) { - __check_element(pool, element, ksize(element)); + if (pool->free == mempool_kfree) { + __check_element(pool, element, (size_t)pool->pool_data); + } else if (pool->free == mempool_free_slab) { + __check_element(pool, element, kmem_cache_size(pool->pool_data)); } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; @@ -80,8 +82,10 @@ static void __poison_element(void *element, size_t size) static void poison_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) { - __poison_element(element, ksize(element)); + if (pool->alloc == mempool_kmalloc) { + __poison_element(element, (size_t)pool->pool_data); + } else if (pool->alloc == mempool_alloc_slab) { + __poison_element(element, kmem_cache_size(pool->pool_data)); } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; @@ -111,8 +115,10 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) static void kasan_unpoison_element(mempool_t *pool, void *element) { - if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_unpoison_range(element, __ksize(element)); + if (pool->alloc == mempool_kmalloc) + kasan_unpoison_range(element, (size_t)pool->pool_data); + else if (pool->alloc == mempool_alloc_slab) + kasan_unpoison_range(element, kmem_cache_size(pool->pool_data)); else if (pool->alloc == mempool_alloc_pages) kasan_unpoison_pages(element, (unsigned long)pool->pool_data, false); From a098c977722ca27d3b4bfeb966767af3cce45f85 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:51 -0700 Subject: [PATCH 100/313] mm/hugetlb_cgroup: convert __set_hugetlb_cgroup() to folios Patch series "convert hugetlb_cgroup helper functions to folios", v2. This patch series continues the conversion of hugetlb code from being managed in pages to folios by converting many of the hugetlb_cgroup helper functions to use folios. This allows the core hugetlb functions to pass in a folio to these helper functions. This patch (of 9); Change __set_hugetlb_cgroup() to use folios so it is explicit that the function operates on a head page. Link: https://lkml.kernel.org/r/20221101223059.460937-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221101223059.460937-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 14 +++++++------- mm/hugetlb_cgroup.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 630cd255d0cf..7576e9ed8afe 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -90,31 +90,31 @@ hugetlb_cgroup_from_page_rsvd(struct page *page) return __hugetlb_cgroup_from_page(page, true); } -static inline void __set_hugetlb_cgroup(struct page *page, +static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { - VM_BUG_ON_PAGE(!PageHuge(page), page); + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return; if (rsvd) - set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD, + set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD), (unsigned long)h_cg); else - set_page_private(page + SUBPAGE_INDEX_CGROUP, + set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP), (unsigned long)h_cg); } static inline void set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page, h_cg, false); + __set_hugetlb_cgroup(page_folio(page), h_cg, false); } static inline void set_hugetlb_cgroup_rsvd(struct page *page, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page, h_cg, true); + __set_hugetlb_cgroup(page_folio(page), h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f61d132df52b..b2316bcbf634 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -314,7 +314,7 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled() || !h_cg) return; - __set_hugetlb_cgroup(page, h_cg, rsvd); + __set_hugetlb_cgroup(page_folio(page), h_cg, rsvd); if (!rsvd) { unsigned long usage = h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; @@ -356,7 +356,7 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, h_cg = __hugetlb_cgroup_from_page(page, rsvd); if (unlikely(!h_cg)) return; - __set_hugetlb_cgroup(page, NULL, rsvd); + __set_hugetlb_cgroup(page_folio(page), NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), From f074732d599e19a2a5b12e54743ad5eaccbe6550 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:52 -0700 Subject: [PATCH 101/313] mm/hugetlb_cgroup: convert hugetlb_cgroup_from_page() to folios Introduce folios in __remove_hugetlb_page() by converting hugetlb_cgroup_from_page() to use folios. Also gets rid of unsed hugetlb_cgroup_from_page_resv() function. Link: https://lkml.kernel.org/r/20221101223059.460937-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 39 +++++++++++++++++----------------- mm/hugetlb.c | 5 +++-- mm/hugetlb_cgroup.c | 13 +++++++----- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 7576e9ed8afe..feb2edafc8b6 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -67,27 +67,34 @@ struct hugetlb_cgroup { }; static inline struct hugetlb_cgroup * -__hugetlb_cgroup_from_page(struct page *page, bool rsvd) +__hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { - VM_BUG_ON_PAGE(!PageHuge(page), page); + struct page *tail; - if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); + if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return NULL; - if (rsvd) - return (void *)page_private(page + SUBPAGE_INDEX_CGROUP_RSVD); - else - return (void *)page_private(page + SUBPAGE_INDEX_CGROUP); + + if (rsvd) { + tail = folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD); + return (void *)page_private(tail); + } + + else { + tail = folio_page(folio, SUBPAGE_INDEX_CGROUP); + return (void *)page_private(tail); + } } -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { - return __hugetlb_cgroup_from_page(page, false); + return __hugetlb_cgroup_from_folio(folio, false); } static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_rsvd(struct page *page) +hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { - return __hugetlb_cgroup_from_page(page, true); + return __hugetlb_cgroup_from_folio(folio, true); } static inline void __set_hugetlb_cgroup(struct folio *folio, @@ -181,19 +188,13 @@ static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, { } -static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) { return NULL; } static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_resv(struct page *page) -{ - return NULL; -} - -static inline struct hugetlb_cgroup * -hugetlb_cgroup_from_page_rsvd(struct page *page) +hugetlb_cgroup_from_folio_rsvd(struct folio *folio) { return NULL; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index be09678d0582..f86a61a73112 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1446,9 +1446,10 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page, bool demote) { int nid = page_to_nid(page); + struct folio *folio = page_folio(page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b2316bcbf634..8b95c1560f9c 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -191,8 +191,9 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page_counter *counter; struct hugetlb_cgroup *page_hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); + struct folio *folio = page_folio(page); - page_hcg = hugetlb_cgroup_from_page(page); + page_hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely @@ -349,14 +350,15 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page, bool rsvd) { struct hugetlb_cgroup *h_cg; + struct folio *folio = page_folio(page); if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); - h_cg = __hugetlb_cgroup_from_page(page, rsvd); + h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); if (unlikely(!h_cg)) return; - __set_hugetlb_cgroup(page_folio(page), NULL, rsvd); + __set_hugetlb_cgroup(folio, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), @@ -888,13 +890,14 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = page_hstate(oldhpage); + struct folio *old_folio = page_folio(oldhpage); if (hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); - h_cg = hugetlb_cgroup_from_page(oldhpage); - h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); + h_cg = hugetlb_cgroup_from_folio(old_folio); + h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); set_hugetlb_cgroup(oldhpage, NULL); set_hugetlb_cgroup_rsvd(oldhpage, NULL); From de656ed376c4cb47c5713fba52f8bbfbea44f387 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:53 -0700 Subject: [PATCH 102/313] mm/hugetlb_cgroup: convert set_hugetlb_cgroup*() to folios Allows __prep_new_huge_page() to operate on a folio by converting set_hugetlb_cgroup*() to take in a folio. Link: https://lkml.kernel.org/r/20221101223059.460937-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 12 ++++++------ mm/hugetlb.c | 33 +++++++++++++++++++-------------- mm/hugetlb_cgroup.c | 11 ++++++----- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index feb2edafc8b6..a7e3540f7f38 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -112,16 +112,16 @@ static inline void __set_hugetlb_cgroup(struct folio *folio, (unsigned long)h_cg); } -static inline void set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page_folio(page), h_cg, false); + __set_hugetlb_cgroup(folio, h_cg, false); } -static inline void set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { - __set_hugetlb_cgroup(page_folio(page), h_cg, true); + __set_hugetlb_cgroup(folio, h_cg, true); } static inline bool hugetlb_cgroup_disabled(void) @@ -199,12 +199,12 @@ hugetlb_cgroup_from_folio_rsvd(struct folio *folio) return NULL; } -static inline void set_hugetlb_cgroup(struct page *page, +static inline void set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg) { } -static inline void set_hugetlb_cgroup_rsvd(struct page *page, +static inline void set_hugetlb_cgroup_rsvd(struct folio *folio, struct hugetlb_cgroup *h_cg) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f86a61a73112..01ea43b22724 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1774,19 +1774,21 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_huge_page(struct hstate *h, struct page *page) +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { - hugetlb_vmemmap_optimize(h, page); - INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - hugetlb_set_page_subpool(page, NULL); - set_hugetlb_cgroup(page, NULL); - set_hugetlb_cgroup_rsvd(page, NULL); + hugetlb_vmemmap_optimize(h, &folio->page); + INIT_LIST_HEAD(&folio->lru); + folio->_folio_dtor = HUGETLB_PAGE_DTOR; + hugetlb_set_folio_subpool(folio, NULL); + set_hugetlb_cgroup(folio, NULL); + set_hugetlb_cgroup_rsvd(folio, NULL); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { - __prep_new_huge_page(h, page); + struct folio *folio = page_folio(page); + + __prep_new_hugetlb_folio(h, folio); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); @@ -2748,8 +2750,10 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - int nid = page_to_nid(old_page); + struct folio *old_folio = page_folio(old_page); + int nid = folio_nid(old_folio); struct page *new_page; + struct folio *new_folio; int ret = 0; /* @@ -2762,16 +2766,17 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; - __prep_new_huge_page(h, new_page); + new_folio = page_folio(new_page); + __prep_new_hugetlb_folio(h, new_folio); retry: spin_lock_irq(&hugetlb_lock); - if (!PageHuge(old_page)) { + if (!folio_test_hugetlb(old_folio)) { /* * Freed from under us. Drop new_page too. */ goto free_new; - } else if (page_count(old_page)) { + } else if (folio_ref_count(old_folio)) { /* * Someone has grabbed the page, try to isolate it here. * Fail with -EBUSY if not possible. @@ -2780,7 +2785,7 @@ retry: ret = isolate_hugetlb(old_page, list); spin_lock_irq(&hugetlb_lock); goto free_new; - } else if (!HPageFreed(old_page)) { + } else if (!folio_test_hugetlb_freed(old_folio)) { /* * Page's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if @@ -2818,7 +2823,7 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); /* Page has a zero ref count, but needs a ref to be freed */ - set_page_refcounted(new_page); + folio_ref_unfreeze(new_folio, 1); update_and_free_page(h, new_page, false); return ret; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 8b95c1560f9c..87a1125aa42d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -212,7 +212,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); - set_hugetlb_cgroup(page, parent); + set_hugetlb_cgroup(folio, parent); out: return; } @@ -891,6 +891,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = page_hstate(oldhpage); struct folio *old_folio = page_folio(oldhpage); + struct folio *new_folio = page_folio(newhpage); if (hugetlb_cgroup_disabled()) return; @@ -898,12 +899,12 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_folio(old_folio); h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); - set_hugetlb_cgroup(oldhpage, NULL); - set_hugetlb_cgroup_rsvd(oldhpage, NULL); + set_hugetlb_cgroup(old_folio, NULL); + set_hugetlb_cgroup_rsvd(old_folio, NULL); /* move the h_cg details to new cgroup */ - set_hugetlb_cgroup(newhpage, h_cg); - set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); + set_hugetlb_cgroup(new_folio, h_cg); + set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&newhpage->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); return; From 29f394304f624b06fafb3cc9c3da8779f71f4bee Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:54 -0700 Subject: [PATCH 103/313] mm/hugetlb_cgroup: convert hugetlb_cgroup_migrate to folios Cleans up intermediate page to folio conversion code in hugetlb_cgroup_migrate() by changing its arguments from pages to folios. Link: https://lkml.kernel.org/r/20221101223059.460937-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 8 ++++---- mm/hugetlb.c | 2 +- mm/hugetlb_cgroup.c | 8 +++----- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index a7e3540f7f38..789b6fef176d 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -177,8 +177,8 @@ extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, bool region_del); extern void hugetlb_cgroup_file_init(void) __init; -extern void hugetlb_cgroup_migrate(struct page *oldhpage, - struct page *newhpage); +extern void hugetlb_cgroup_migrate(struct folio *old_folio, + struct folio *new_folio); #else static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, @@ -286,8 +286,8 @@ static inline void hugetlb_cgroup_file_init(void) { } -static inline void hugetlb_cgroup_migrate(struct page *oldhpage, - struct page *newhpage) +static inline void hugetlb_cgroup_migrate(struct folio *old_folio, + struct folio *new_folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 01ea43b22724..05a832886a09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7325,7 +7325,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) { struct hstate *h = page_hstate(oldpage); - hugetlb_cgroup_migrate(oldpage, newpage); + hugetlb_cgroup_migrate(page_folio(oldpage), page_folio(newpage)); set_page_owner_migrate_reason(newpage, reason); /* diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 87a1125aa42d..b1b18337a56a 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -885,13 +885,11 @@ void __init hugetlb_cgroup_file_init(void) * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ -void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) +void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; - struct hstate *h = page_hstate(oldhpage); - struct folio *old_folio = page_folio(oldhpage); - struct folio *new_folio = page_folio(newhpage); + struct hstate *h = folio_hstate(old_folio); if (hugetlb_cgroup_disabled()) return; @@ -905,7 +903,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(new_folio, h_cg); set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); - list_move(&newhpage->lru, &h->hugepage_activelist); + list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); return; } From d5e33bd8c16b6f5f47665d378f078bee72b85225 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:55 -0700 Subject: [PATCH 104/313] mm/hugetlb: convert isolate_or_dissolve_huge_page to folios Removes a call to compound_head() by using a folio when operating on the head page of a hugetlb compound page. Link: https://lkml.kernel.org/r/20221101223059.460937-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- mm/hugetlb.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 05a832886a09..666a771c9a3d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2832,7 +2832,7 @@ free_new: int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { struct hstate *h; - struct page *head; + struct folio *folio = page_folio(page); int ret = -EBUSY; /* @@ -2841,9 +2841,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) * Return success when racing as if we dissolved the page ourselves. */ spin_lock_irq(&hugetlb_lock); - if (PageHuge(page)) { - head = compound_head(page); - h = page_hstate(head); + if (folio_test_hugetlb(folio)) { + h = folio_hstate(folio); } else { spin_unlock_irq(&hugetlb_lock); return 0; @@ -2858,10 +2857,10 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (page_count(head) && !isolate_hugetlb(head, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) ret = 0; - else if (!page_count(head)) - ret = alloc_and_dissolve_huge_page(h, head, list); + else if (!folio_ref_count(folio)) + ret = alloc_and_dissolve_huge_page(h, &folio->page, list); return ret; } From 0356c4b96f6890dd61af4c902f681764f4bdba09 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:56 -0700 Subject: [PATCH 105/313] mm/hugetlb: convert free_huge_page to folios Use folios inside free_huge_page(), this is in preparation for converting hugetlb_cgroup_uncharge_page() to take in a folio. Link: https://lkml.kernel.org/r/20221101223059.460937-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- mm/hugetlb.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 666a771c9a3d..9841fb0fcaf9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1704,21 +1704,22 @@ void free_huge_page(struct page *page) * Can't pass hstate in here because it is called from the * compound page destructor. */ - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - struct hugepage_subpool *spool = hugetlb_page_subpool(page); + struct folio *folio = page_folio(page); + struct hstate *h = folio_hstate(folio); + int nid = folio_nid(folio); + struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); bool restore_reserve; unsigned long flags; - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(page_mapcount(page), page); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); - hugetlb_set_page_subpool(page, NULL); - if (PageAnon(page)) - __ClearPageAnonExclusive(page); - page->mapping = NULL; - restore_reserve = HPageRestoreReserve(page); - ClearHPageRestoreReserve(page); + hugetlb_set_folio_subpool(folio, NULL); + if (folio_test_anon(folio)) + __ClearPageAnonExclusive(&folio->page); + folio->mapping = NULL; + restore_reserve = folio_test_hugetlb_restore_reserve(folio); + folio_clear_hugetlb_restore_reserve(folio); /* * If HPageRestoreReserve was set on page, page allocation consumed a @@ -1740,7 +1741,7 @@ void free_huge_page(struct page *page) } spin_lock_irqsave(&hugetlb_lock, flags); - ClearHPageMigratable(page); + folio_clear_hugetlb_migratable(folio); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), @@ -1748,7 +1749,7 @@ void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (HPageTemporary(page)) { + if (folio_test_hugetlb_temporary(folio)) { remove_hugetlb_page(h, page, false); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page, true); From d4ab0316cc33aeedf6dcb1c2c25e097a25766132 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:57 -0700 Subject: [PATCH 106/313] mm/hugetlb_cgroup: convert hugetlb_cgroup_uncharge_page() to folios Continue to use a folio inside free_huge_page() by converting hugetlb_cgroup_uncharge_page*() to folios. Link: https://lkml.kernel.org/r/20221101223059.460937-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 16 ++++++++-------- mm/hugetlb.c | 15 +++++++++------ mm/hugetlb_cgroup.c | 21 ++++++++++----------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 789b6fef176d..c70f92fe493e 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -158,10 +158,10 @@ extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page); -extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page); -extern void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, - struct page *page); +extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio); +extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, + struct folio *folio); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); @@ -254,14 +254,14 @@ hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, { } -static inline void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page) +static inline void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio) { } -static inline void hugetlb_cgroup_uncharge_page_rsvd(int idx, +static inline void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, - struct page *page) + struct folio *folio) { } static inline void hugetlb_cgroup_uncharge_cgroup(int idx, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9841fb0fcaf9..e1950fff6aa9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1742,10 +1742,10 @@ void free_huge_page(struct page *page) spin_lock_irqsave(&hugetlb_lock, flags); folio_clear_hugetlb_migratable(folio); - hugetlb_cgroup_uncharge_page(hstate_index(h), - pages_per_huge_page(h), page); - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_folio(hstate_index(h), + pages_per_huge_page(h), folio); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); if (restore_reserve) h->resv_huge_pages++; @@ -2872,6 +2872,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; + struct folio *folio; long map_chg, map_commit; long gbl_chg; int ret, idx; @@ -2935,6 +2936,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * a reservation exists for the allocation. */ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); + if (!page) { spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); @@ -2949,6 +2951,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); /* Fall through */ } + folio = page_folio(page); hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. @@ -2978,8 +2981,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); if (deferred_reserve) - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); } return page; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b1b18337a56a..4cd57f979245 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -346,11 +346,10 @@ void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, /* * Should be called with hugetlb_lock held */ -static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page, bool rsvd) +static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio, bool rsvd) { struct hugetlb_cgroup *h_cg; - struct folio *folio = page_folio(page); if (hugetlb_cgroup_disabled()) return; @@ -368,27 +367,27 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, css_put(&h_cg->css); else { unsigned long usage = - h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ - WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage - nr_pages); } } -void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page) +void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio) { - __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false); + __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); } -void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, - struct page *page) +void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, + struct folio *folio) { - __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true); + __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, From 541b7c7b3ec0555a09782b463bcbc2cb86d97085 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:58 -0700 Subject: [PATCH 107/313] mm/hugeltb_cgroup: convert hugetlb_cgroup_commit_charge*() to folios Convert hugetlb_cgroup_commit_charge*() to internally use folios to clean up the code after __set_hugetlb_cgroup() was changed to take a folio. Link: https://lkml.kernel.org/r/20221101223059.460937-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- mm/hugetlb_cgroup.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 4cd57f979245..d9e4425d81ac 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -310,21 +310,21 @@ int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, /* Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page, bool rsvd) + struct folio *folio, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; - __set_hugetlb_cgroup(page_folio(page), h_cg, rsvd); + __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = - h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ - WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage + nr_pages); } } @@ -333,14 +333,18 @@ void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false); + struct folio *folio = page_folio(page); + + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true); + struct folio *folio = page_folio(page); + + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } /* From 345c62d163496ae4b5c1ce530b1588067d8f5a8b Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 1 Nov 2022 15:30:59 -0700 Subject: [PATCH 108/313] mm/hugetlb: convert move_hugetlb_state() to folios Clean up unmap_and_move_huge_page() by converting move_hugetlb_state() to take in folios. [akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n build] Link: https://lkml.kernel.org/r/20221101223059.460937-10-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Aneesh Kumar K.V Cc: Bui Quang Minh Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mina Almasry Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 11 ++++++++--- mm/hugetlb.c | 22 ++++++++++++---------- mm/migrate.c | 4 ++-- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 65ea34022aa2..58a30938a9b1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -187,7 +187,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void putback_active_hugepage(struct page *page); -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -407,8 +407,8 @@ static inline void putback_active_hugepage(struct page *page) { } -static inline void move_hugetlb_state(struct page *oldpage, - struct page *newpage, int reason) +static inline void move_hugetlb_state(struct folio *old_folio, + struct folio *new_folio, int reason) { } @@ -991,6 +991,11 @@ void hugetlb_unregister_node(struct node *node); #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) +{ + return NULL; +} + static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { return NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e1950fff6aa9..76ebefe02827 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7324,15 +7324,15 @@ void putback_active_hugepage(struct page *page) put_page(page); } -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { - struct hstate *h = page_hstate(oldpage); + struct hstate *h = folio_hstate(old_folio); - hugetlb_cgroup_migrate(page_folio(oldpage), page_folio(newpage)); - set_page_owner_migrate_reason(newpage, reason); + hugetlb_cgroup_migrate(old_folio, new_folio); + set_page_owner_migrate_reason(&new_folio->page, reason); /* - * transfer temporary state of the new huge page. This is + * transfer temporary state of the new hugetlb folio. This is * reverse to other transitions because the newpage is going to * be final while the old one will be freed so it takes over * the temporary status. @@ -7341,12 +7341,14 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) * here as well otherwise the global surplus count will not match * the per-node's. */ - if (HPageTemporary(newpage)) { - int old_nid = page_to_nid(oldpage); - int new_nid = page_to_nid(newpage); + if (folio_test_hugetlb_temporary(new_folio)) { + int old_nid = folio_nid(old_folio); + int new_nid = folio_nid(new_folio); + + + folio_set_hugetlb_temporary(old_folio); + folio_clear_hugetlb_temporary(new_folio); - SetHPageTemporary(oldpage); - ClearHPageTemporary(newpage); /* * There is no need to transfer the per-node surplus state diff --git a/mm/migrate.c b/mm/migrate.c index f8c85b42e2bc..4aea647a0180 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1298,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ - if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) { + if (hugetlb_folio_subpool(src) && !folio_mapping(src)) { rc = -EBUSY; goto out_unlock; } @@ -1348,7 +1348,7 @@ put_anon: put_anon_vma(anon_vma); if (rc == MIGRATEPAGE_SUCCESS) { - move_hugetlb_state(hpage, new_hpage, reason); + move_hugetlb_state(src, dst, reason); put_new_page = NULL; } From 44467bbb7e81ebcef2a5bfc9d6546bf7cd015374 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:21 +0000 Subject: [PATCH 109/313] mm/damon/core: add a callback for scheme target regions check Patch series "efficiently expose damos action tried regions information". DAMON users can retrieve the monitoring results via 'after_aggregation' callbacks if the user is using the kernel API, or 'damon_aggregated' tracepoint if the user is in the user space. Those are useful if full monitoring results are necessary. However, if the user has interest in only a snapshot of the results for some regions having specific access pattern, the interfaces could be inefficient. For example, some users only want to know which memory regions are not accessed for more than a specific time at the moment. Also, some DAMOS users would want to know exactly to what memory regions the schemes' actions tried to be applied, for a debugging or a tuning. As DAMOS has its internal mechanism for quota and regions prioritization, the users would need to simulate DAMOS' mechanism against the monitoring results. That's unnecessarily complex. This patchset implements DAMON kernel API callbacks and sysfs directory for efficient exposure of the information for the use cases. The new callback will be called for each region when a DAMOS action is gonna tried to be applied to it. The sysfs directory will be called 'tried_regions' and placed under each scheme sysfs directory. Users can write a special keyworkd, 'update_schemes_regions', to the 'state' file of a kdamond sysfs directory. Then, DAMON sysfs interface will fill the directory with the information of regions that corresponding scheme action was tried to be applied for next one aggregation interval. Patches Sequence ---------------- The first one (patch 1) implements the callback for the kernel space users. Following two patches (patches 2 and 3) implements sysfs directories for the information and its sub directories. Two patches (patches 4 and 5) for implementing the special keywords for filling the data to and cleaning up the directories follow. Patch 6 adds a selftest for the new sysfs directory. Finally, two patches (patches 7 and 8) document the new feature in the administrator guide and the ABI document. This patch (of 8): Getting DAMON monitoring results of only specific access pattern (e.g., getting address ranges of memory that not accessed at all for two minutes) can be useful for efficient monitoring of the system. The information can also be helpful for deep level investigation of DAMON-based operation schemes. For that, users need to record (in case of the user space users) or iterate (in case of the kernel space users) full monitoring results and filter it out for the specific access pattern. In case of the DAMOS investigation, users will even need to simulate DAMOS' quota and prioritization mechanisms. It's inefficient and complex. Add a new DAMON callback that will be called before each scheme is applied to each region. DAMON kernel API users will be able to do the query-like monitoring results collection, or DAMOS investigation in an efficient and simple way using it. Commits for providing the capability to the user space users will follow. Link: https://lkml.kernel.org/r/20221101220328.95765-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ mm/damon/core.c | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 620ada094c3b..35630634d790 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -357,6 +357,7 @@ struct damon_operations { * @after_wmarks_check: Called after each schemes' watermarks check. * @after_sampling: Called after each sampling. * @after_aggregation: Called after each aggregation. + * @before_damos_apply: Called before applying DAMOS action. * @before_terminate: Called before terminating the monitoring. * @private: User private data. * @@ -385,6 +386,10 @@ struct damon_callback { int (*after_wmarks_check)(struct damon_ctx *context); int (*after_sampling)(struct damon_ctx *context); int (*after_aggregation)(struct damon_ctx *context); + int (*before_damos_apply)(struct damon_ctx *context, + struct damon_target *target, + struct damon_region *region, + struct damos *scheme); void (*before_terminate)(struct damon_ctx *context); }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 80d5937fe337..ceec75b88ef9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -772,6 +772,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; + int err = 0; if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { @@ -782,7 +783,10 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, damon_split_region_at(t, r, sz); } ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); + if (c->callback.before_damos_apply) + err = c->callback.before_damos_apply(c, t, r, s); + if (!err) + sz_applied = c->ops.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); From 5181b75f438d2e5b7f27bf48c6ea88a87c2882b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:22 +0000 Subject: [PATCH 110/313] mm/damon/sysfs-schemes: implement schemes/tried_regions directory For efficient and simple query-like DAMON monitoring results readings and deep level investigations of DAMOS, DAMON kernel API (include/linux/damon.h) users can use 'before_damos_apply' DAMON callback. However, DAMON sysfs interface users don't have such option. Add a directory, namely 'tried_regions', under each scheme directory to use it as the interface for the purpose. Note that this commit is implementing only the directory but the data filling. After the data filling change is made, users will be able to signal DAMON to fill the directory with the regions that corresponding scheme has tried to be applied. By setting the access pattern of the scheme, users could do the efficient query-like monitoring. Link: https://lkml.kernel.org/r/20221101220328.95765-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 57 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 9509d5c1e7fc..500759d8b20c 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -9,6 +9,36 @@ #include "sysfs-common.h" +/* + * scheme regions directory + */ + +struct damon_sysfs_scheme_regions { + struct kobject kobj; +}; + +static struct damon_sysfs_scheme_regions * +damon_sysfs_scheme_regions_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_scheme_regions), GFP_KERNEL); +} + +static void damon_sysfs_scheme_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); +} + +static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); + +static struct kobj_type damon_sysfs_scheme_regions_ktype = { + .release = damon_sysfs_scheme_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_regions_groups, +}; + /* * schemes/stats directory */ @@ -635,6 +665,7 @@ struct damon_sysfs_scheme { struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; struct damon_sysfs_stats *stats; + struct damon_sysfs_scheme_regions *tried_regions; }; /* This should match with enum damos_action */ @@ -743,6 +774,25 @@ static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) return err; } +static int damon_sysfs_scheme_set_tried_regions( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_regions *tried_regions = + damon_sysfs_scheme_regions_alloc(); + int err; + + if (!tried_regions) + return -ENOMEM; + err = kobject_init_and_add(&tried_regions->kobj, + &damon_sysfs_scheme_regions_ktype, &scheme->kobj, + "tried_regions"); + if (err) + kobject_put(&tried_regions->kobj); + else + scheme->tried_regions = tried_regions; + return err; +} + static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) { int err; @@ -759,8 +809,14 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) err = damon_sysfs_scheme_set_stats(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_tried_regions(scheme); + if (err) + goto put_tried_regions_out; return 0; +put_tried_regions_out: + kobject_put(&scheme->tried_regions->kobj); + scheme->tried_regions = NULL; put_watermarks_quotas_access_pattern_out: kobject_put(&scheme->watermarks->kobj); scheme->watermarks = NULL; @@ -781,6 +837,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); kobject_put(&scheme->stats->kobj); + kobject_put(&scheme->tried_regions->kobj); } static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, From 9277d0367ba18ef4bb98bafb1209e715844cdf7e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:23 +0000 Subject: [PATCH 111/313] mm/damon/sysfs-schemes: implement scheme region directory Implement region directories under 'tried_regions' directory of each scheme DAMON sysfs directory. This directory will provide the address range, the monitored access frequency ('nr_accesses'), and the age of each DAMON region that corresponding DAMON-based operation scheme has tried to be applied. Note that this commit doesn't implement the code for filling the data but only the sysfs directory. Link: https://lkml.kernel.org/r/20221101220328.95765-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 123 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 122 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 500759d8b20c..f0b5ad7e721d 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -9,18 +9,138 @@ #include "sysfs-common.h" +/* + * scheme region directory + */ + +struct damon_sysfs_scheme_region { + struct kobject kobj; + struct damon_addr_range ar; + unsigned int nr_accesses; + unsigned int age; + struct list_head list; +}; + +static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( + struct damon_region *region) +{ + struct damon_sysfs_scheme_region *sysfs_region = kmalloc( + sizeof(*sysfs_region), GFP_KERNEL); + + if (!sysfs_region) + return NULL; + sysfs_region->kobj = (struct kobject){}; + sysfs_region->ar = region->ar; + sysfs_region->nr_accesses = region->nr_accesses; + sysfs_region->age = region->age; + INIT_LIST_HEAD(&sysfs_region->list); + return sysfs_region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.start); +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.end); +} + +static ssize_t nr_accesses_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->nr_accesses); +} + +static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->age); +} + +static void damon_sysfs_scheme_region_release(struct kobject *kobj) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + list_del(®ion->list); + kfree(region); +} + +static struct kobj_attribute damon_sysfs_scheme_region_start_attr = + __ATTR_RO_MODE(start, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_end_attr = + __ATTR_RO_MODE(end, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = + __ATTR_RO_MODE(nr_accesses, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_age_attr = + __ATTR_RO_MODE(age, 0400); + +static struct attribute *damon_sysfs_scheme_region_attrs[] = { + &damon_sysfs_scheme_region_start_attr.attr, + &damon_sysfs_scheme_region_end_attr.attr, + &damon_sysfs_scheme_region_nr_accesses_attr.attr, + &damon_sysfs_scheme_region_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); + +static struct kobj_type damon_sysfs_scheme_region_ktype = { + .release = damon_sysfs_scheme_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_region_groups, +}; + /* * scheme regions directory */ struct damon_sysfs_scheme_regions { struct kobject kobj; + struct list_head regions_list; + int nr_regions; }; static struct damon_sysfs_scheme_regions * damon_sysfs_scheme_regions_alloc(void) { - return kzalloc(sizeof(struct damon_sysfs_scheme_regions), GFP_KERNEL); + struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), + GFP_KERNEL); + + regions->kobj = (struct kobject){}; + INIT_LIST_HEAD(®ions->regions_list); + regions->nr_regions = 0; + return regions; +} + +static void damon_sysfs_scheme_regions_rm_dirs( + struct damon_sysfs_scheme_regions *regions) +{ + struct damon_sysfs_scheme_region *r, *next; + + list_for_each_entry_safe(r, next, ®ions->regions_list, list) { + /* release function deletes it from the list */ + kobject_put(&r->kobj); + regions->nr_regions--; + } } static void damon_sysfs_scheme_regions_release(struct kobject *kobj) @@ -837,6 +957,7 @@ static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) kobject_put(&scheme->quotas->kobj); kobject_put(&scheme->watermarks->kobj); kobject_put(&scheme->stats->kobj); + damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); kobject_put(&scheme->tried_regions->kobj); } From f1d13cacabe140305844879e495ca67837e059cc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:24 +0000 Subject: [PATCH 112/313] mm/damon/sysfs: implement DAMOS tried regions update command Implement the code for filling the data of 'tried_regions' DAMON sysfs directory. With this commit, DAMON sysfs interface users can write a special keyword, 'update_schemes_tried_regions' to the corresponding 'state' file of the kdamond. Then, DAMON sysfs interface will collect the tried regions information using the 'before_damos_apply()' callback for one aggregation interval and populate scheme region directories with the values. [sj@kernel.org: skip tried regions update if the scheme directory was removed] Link: https://lkml.kernel.org/r/20221114182954.4745-2-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 6 +++ mm/damon/sysfs-schemes.c | 80 ++++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 57 +++++++++++++++++++++++++++- 3 files changed, 141 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 4626b2784404..634a6e7fca78 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -44,3 +44,9 @@ int damon_sysfs_set_schemes(struct damon_ctx *ctx, void damon_sysfs_schemes_update_stats( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index f0b5ad7e721d..5f14f18bcc49 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1244,3 +1244,83 @@ void damon_sysfs_schemes_update_stats( sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; } } + +/* + * damon_sysfs_schemes that need to update its schemes regions dir. Protected + * by damon_sysfs_lock + */ +static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; +static int damon_sysfs_schemes_region_idx; + +/* + * DAMON callback that called before damos apply. While this callback is + * registered, damon_sysfs_lock should be held to ensure the regions + * directories exist. + */ +static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s) +{ + struct damos *scheme; + struct damon_sysfs_scheme_regions *sysfs_regions; + struct damon_sysfs_scheme_region *region; + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + if (scheme == s) + break; + schemes_idx++; + } + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + return 0; + + sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + region = damon_sysfs_scheme_region_alloc(r); + list_add_tail(®ion->list, &sysfs_regions->regions_list); + sysfs_regions->nr_regions++; + if (kobject_init_and_add(®ion->kobj, + &damon_sysfs_scheme_region_ktype, + &sysfs_regions->kobj, "%d", + damon_sysfs_schemes_region_idx++)) { + kobject_put(®ion->kobj); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_scheme *sysfs_scheme; + + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + damon_sysfs_scheme_regions_rm_dirs( + sysfs_scheme->tried_regions); + } + + damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + return 0; +} + +/* + * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller + * should unlock damon_sysfs_lock which held before + * damon_sysfs_schemes_update_regions_start() + */ +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) +{ + damon_sysfs_schemes_for_damos_callback = NULL; + ctx->callback.before_damos_apply = NULL; + damon_sysfs_schemes_region_idx = 0; + return 0; +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 284daf274b3e..ffb5a84059d7 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -999,6 +999,11 @@ enum damon_sysfs_cmd { * files. */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried + * regions + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. */ @@ -1011,6 +1016,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_regions", }; /* @@ -1193,6 +1199,16 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; + struct damon_sysfs_kdamond *kdamond; + + /* damon_sysfs_schemes_update_regions_stop() might not yet called */ + kdamond = damon_sysfs_cmd_request.kdamond; + if (kdamond && damon_sysfs_cmd_request.cmd == + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && + ctx == kdamond->damon_ctx) { + damon_sysfs_schemes_update_regions_stop(ctx); + mutex_unlock(&damon_sysfs_lock); + } if (!damon_target_has_pid(ctx)) return; @@ -1225,6 +1241,27 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) return 0; } +static int damon_sysfs_upd_schemes_regions_start( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_start( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + +static int damon_sysfs_upd_schemes_regions_stop( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_stop(ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1277,10 +1314,12 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; + static bool damon_sysfs_schemes_regions_updating; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ - if (!mutex_trylock(&damon_sysfs_lock)) + if (!damon_sysfs_schemes_regions_updating && + !mutex_trylock(&damon_sysfs_lock)) return 0; kdamond = damon_sysfs_cmd_request.kdamond; if (!kdamond || kdamond->damon_ctx != c) @@ -1292,13 +1331,27 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + if (!damon_sysfs_schemes_regions_updating) { + err = damon_sysfs_upd_schemes_regions_start(kdamond); + if (!err) { + damon_sysfs_schemes_regions_updating = true; + goto keep_lock_out; + } + } else { + err = damon_sysfs_upd_schemes_regions_stop(kdamond); + damon_sysfs_schemes_regions_updating = false; + } + break; default: break; } /* Mark the request as invalid now. */ damon_sysfs_cmd_request.kdamond = NULL; out: - mutex_unlock(&damon_sysfs_lock); + if (!damon_sysfs_schemes_regions_updating) + mutex_unlock(&damon_sysfs_lock); +keep_lock_out: return err; } From 772c15e5adcb32a42dbbcdb905ec49f662312976 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:25 +0000 Subject: [PATCH 113/313] mm/damon/sysfs-schemes: implement DAMOS-tried regions clear command When there are huge number of DAMON regions that specific scheme actions are tried to be applied, directories and files under 'tried_regions' scheme directory could waste some memory. Add another special input keyword ('clear_schemes_tried_regions') for 'state' file of each kdamond sysfs directory that can be used for cleanup of the 'tried_regions' sub-directories. [sj@kernel.org: skip regions clearing if the scheme directory was removed] Link: https://lkml.kernel.org/r/20221114182954.4745-3-sj@kernel.org Link: https://lkml.kernel.org/r/20221101220328.95765-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 4 ++++ mm/damon/sysfs-schemes.c | 14 +++++++++++++- mm/damon/sysfs.c | 20 ++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index 634a6e7fca78..604a6cbc3ede 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -50,3 +50,7 @@ int damon_sysfs_schemes_update_regions_start( struct damon_ctx *ctx); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); + +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 5f14f18bcc49..81fc4d27f4e4 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1292,7 +1292,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, } /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ -int damon_sysfs_schemes_update_regions_start( +int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx) { @@ -1302,11 +1302,23 @@ int damon_sysfs_schemes_update_regions_start( damon_for_each_scheme(scheme, ctx) { struct damon_sysfs_scheme *sysfs_scheme; + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); } + return 0; +} +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; return 0; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ffb5a84059d7..aeb0beb1da91 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1004,6 +1004,11 @@ enum damon_sysfs_cmd { * regions */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, + /* + * @DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: Clear schemes tried + * regions + */ + DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands. */ @@ -1017,6 +1022,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "commit", "update_schemes_stats", "update_schemes_tried_regions", + "clear_schemes_tried_regions", }; /* @@ -1262,6 +1268,17 @@ static int damon_sysfs_upd_schemes_regions_stop( return damon_sysfs_schemes_update_regions_stop(ctx); } +static int damon_sysfs_clear_schemes_regions( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -1343,6 +1360,9 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) damon_sysfs_schemes_regions_updating = false; } break; + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + err = damon_sysfs_clear_schemes_regions(kdamond); + break; default: break; } From 2b3ee3f66c673312ea377bcfb54cb2b9abc8473b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:26 +0000 Subject: [PATCH 114/313] tools/selftets/damon/sysfs: test tried_regions directory existence Add a simple test case for ensuring tried_regions directory existence. Link: https://lkml.kernel.org/r/20221101220328.95765-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 89592c64462f..db4942383a50 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -80,6 +80,12 @@ test_range() ensure_file "$range_dir/max" "exist" 600 } +test_tried_regions() +{ + tried_regions_dir=$1 + ensure_dir "$tried_regions_dir" "exist" +} + test_stats() { stats_dir=$1 @@ -138,6 +144,7 @@ test_scheme() test_quotas "$scheme_dir/quotas" test_watermarks "$scheme_dir/watermarks" test_stats "$scheme_dir/stats" + test_tried_regions "$scheme_dir/tried_regions" } test_schemes() From 7f0a86f3c99bc9736445ef64aa65c9bd6161a47b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:27 +0000 Subject: [PATCH 115/313] Docs/admin-guide/mm/damon/usage: document schemes//tried_regions sysfs directory Document 'tried_regions' directory in DAMON sysfs interface usage in the administrator guide. Link: https://lkml.kernel.org/r/20221101220328.95765-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 45 ++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index c17e02e1e426..1a5b6b71efa1 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -88,6 +88,9 @@ comma (","). :: │ │ │ │ │ │ │ │ weights/sz_permil,nr_accesses_permil,age_permil │ │ │ │ │ │ │ watermarks/metric,interval_us,high,mid,low │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds + │ │ │ │ │ │ │ tried_regions/ + │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age + │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ ... │ │ ... @@ -125,7 +128,14 @@ in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the user inputs in the sysfs files except ``state`` file again. Writing ``update_schemes_stats`` to ``state`` file updates the contents of stats files for each DAMON-based operation scheme of the kdamond. For details of the -stats, please refer to :ref:`stats section `. +stats, please refer to :ref:`stats section `. Writing +``update_schemes_tried_regions`` to ``state`` file updates the DAMON-based +operation scheme action tried regions directory for each DAMON-based operation +scheme of the kdamond. Writing ``clear_schemes_tried_regions`` to ``state`` +file clears the DAMON-based operating scheme action tried regions directory for +each DAMON-based operation scheme of the kdamond. For details of the +DAMON-based operation scheme action tried regions directory, please refer to +:ref:tried_regions section `. If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread. @@ -166,6 +176,8 @@ You can set and get what type of monitoring operations DAMON will use for the context by writing one of the keywords listed in ``avail_operations`` file and reading from the ``operations`` file. +.. _sysfs_monitoring_attrs: + contexts//monitoring_attrs/ ------------------------------ @@ -255,8 +267,9 @@ to ``N-1``. Each directory represents each DAMON-based operation scheme. schemes// ------------ -In each scheme directory, four directories (``access_pattern``, ``quotas``, -``watermarks``, and ``stats``) and one file (``action``) exist. +In each scheme directory, five directories (``access_pattern``, ``quotas``, +``watermarks``, ``stats``, and ``tried_regions``) and one file (``action``) +exist. The ``action`` file is for setting and getting what action you want to apply to memory regions having specific access pattern of the interest. The keywords @@ -351,6 +364,32 @@ should ask DAMON sysfs interface to updte the content of the files for the stats by writing a special keyword, ``update_schemes_stats`` to the relevant ``kdamonds//state`` file. +.. _sysfs_schemes_tried_regions: + +schemes//tried_regions/ +-------------------------- + +When a special keyword, ``update_schemes_tried_regions``, is written to the +relevant ``kdamonds//state`` file, DAMON creates directories named integer +starting from ``0`` under this directory. Each directory contains files +exposing detailed information about each of the memory region that the +corresponding scheme's ``action`` has tried to be applied under this directory, +during next :ref:`aggregation interval `. The +information includes address range, ``nr_accesses``, , and ``age`` of the +region. + +The directories will be removed when another special keyword, +``clear_schemes_tried_regions``, is written to the relevant +``kdamonds//state`` file. + +tried_regions// +------------------ + +In each region directory, you will find four files (``start``, ``end``, +``nr_accesses``, and ``age``). Reading the files will show the start and end +addresses, ``nr_accesses``, and ``age`` of the region that corresponding +DAMON-based operation scheme ``action`` has tried to be applied. + Example ~~~~~~~ From 1b0006daa36f2ccb7f213007365d504bcd016312 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 1 Nov 2022 22:03:28 +0000 Subject: [PATCH 116/313] Docs/ABI/damon: document 'schemes//tried_regions' sysfs directory Update DAMON ABI document for the 'tried_regions' directory of DAMON sysfs interface. Link: https://lkml.kernel.org/r/20221101220328.95765-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 08b9df323560..13397b853692 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -27,6 +27,10 @@ Description: Writing 'on' or 'off' to this file makes the kdamond starts or makes the kdamond reads the user inputs in the sysfs files except 'state' again. Writing 'update_schemes_stats' to the file updates contents of schemes stats files of the kdamond. + Writing 'update_schemes_tried_regions' to the file updates + contents of 'tried_regions' directory of every scheme directory + of this kdamond. Writing 'clear_schemes_tried_regions' to the + file removes contents of the 'tried_regions' directory. What: /sys/kernel/mm/damon/admin/kdamonds//pid Date: Mar 2022 @@ -283,3 +287,31 @@ Date: Mar 2022 Contact: SeongJae Park Description: Reading this file returns the number of the exceed events of the scheme's quotas. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//start +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the start address of a memory region + that corresponding DAMON-based Operation Scheme's action has + tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//end +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the end address of a memory region + that corresponding DAMON-based Operation Scheme's action has + tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//nr_accesses +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the 'nr_accesses' of a memory region + that corresponding DAMON-based Operation Scheme's action has + tried to be applied. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//age +Date: Oct 2022 +Contact: SeongJae Park +Description: Reading this file returns the 'age' of a memory region that + corresponding DAMON-based Operation Scheme's action has tried + to be applied. From e6aff38b2e25e934e95471351c96d1410bb17561 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 1 Nov 2022 22:14:08 +0100 Subject: [PATCH 117/313] mm/damon: use kstrtobool() instead of strtobool() strtobool() is the same as kstrtobool(). However, the latter is more used within the kernel. In order to remove strtobool() and slightly simplify kstrtox.h, switch to the other function name. While at it, include the corresponding header file () Link: https://lkml.kernel.org/r/ed2b46489a513988688decb53850339cc228940c.1667336095.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 3 ++- mm/damon/reclaim.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 2a532e3983df..7b8fce2f67a8 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "damon-lru-sort: " fmt #include +#include #include #include "modules-common.h" @@ -241,7 +242,7 @@ static int damon_lru_sort_enabled_store(const char *val, bool enable; int err; - err = strtobool(val, &enable); + err = kstrtobool(val, &enable); if (err) return err; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e57604bec06d..e82631f39481 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "damon-reclaim: " fmt #include +#include #include #include "modules-common.h" @@ -187,7 +188,7 @@ static int damon_reclaim_enabled_store(const char *val, bool enable; int err; - err = strtobool(val, &enable); + err = kstrtobool(val, &enable); if (err) return err; From f15be1b8d449a8eebe82d77164bf760804753651 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 1 Nov 2022 22:14:09 +0100 Subject: [PATCH 118/313] mm: use kstrtobool() instead of strtobool() strtobool() is the same as kstrtobool(). However, the latter is more used within the kernel. In order to remove strtobool() and slightly simplify kstrtox.h, switch to the other function name. While at it, include the corresponding header file () Link: https://lkml.kernel.org/r/03f9401a6c8b87a1c786a2138d16b048f8d0eb53.1667336095.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Acked-by: Pasha Tatashin Signed-off-by: Andrew Morton --- mm/page_table_check.c | 3 ++- mm/usercopy.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 433dbce13fe1..93e633c1d587 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -4,6 +4,7 @@ * Copyright (c) 2021, Google LLC. * Pasha Tatashin */ +#include #include #include @@ -23,7 +24,7 @@ EXPORT_SYMBOL(page_table_check_disabled); static int __init early_page_table_check_param(char *buf) { - return strtobool(buf, &__page_table_check_enabled); + return kstrtobool(buf, &__page_table_check_enabled); } early_param("page_table_check", early_page_table_check_param); diff --git a/mm/usercopy.c b/mm/usercopy.c index c1ee15a98633..4c3164beacec 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -258,7 +259,7 @@ static bool enable_checks __initdata = true; static int __init parse_hardened_usercopy(char *str) { - if (strtobool(str, &enable_checks)) + if (kstrtobool(str, &enable_checks)) pr_warn("Invalid option string for hardened_usercopy: '%s'\n", str); return 1; From ca92ea3dc5a2b01f98e9f02b7a6bc03be06fe124 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Sun, 30 Oct 2022 17:41:50 -0400 Subject: [PATCH 119/313] mm: always compile in pte markers Patch series "mm: Use pte marker for swapin errors". This series uses the pte marker to replace the swapin error swap entry, then we save one more swap entry slot for swap devices. A new pte marker bit is defined. This patch (of 2): The PTE markers code is tiny and now it's enabled for most of the distributions. It's fine to keep it as-is, but to make a broader use of it (e.g. replacing read error swap entry) it needs to be there always otherwise we need special code path to take care of !PTE_MARKER case. It'll be easier just make pte marker always exist. Use this chance to extend its usage to anonymous too by simply touching up some of the old comments, because it'll be used for anonymous pages in the follow up patches. Link: https://lkml.kernel.org/r/20221030214151.402274-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221030214151.402274-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Huang Ying Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Naoya Horiguchi Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++------- include/linux/swapops.h | 31 ------------------------------- mm/Kconfig | 7 ------- mm/memory.c | 7 +++---- 4 files changed, 6 insertions(+), 49 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 369d7799205d..211aeca9bfa7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -60,17 +60,13 @@ static inline int current_is_kswapd(void) SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ SWP_PTE_MARKER_NUM) /* - * PTE markers are used to persist information onto PTEs that are mapped with - * file-backed memories. As its name "PTE" hints, it should only be applied to - * the leaves of pgtables. + * PTE markers are used to persist information onto PTEs that otherwise + * should be a none pte. As its name "PTE" hints, it should only be + * applied to the leaves of pgtables. */ -#ifdef CONFIG_PTE_MARKER #define SWP_PTE_MARKER_NUM 1 #define SWP_PTE_MARKER (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ SWP_MIGRATION_NUM + SWP_DEVICE_NUM) -#else -#define SWP_PTE_MARKER_NUM 0 -#endif /* * Unaddressable device memory support. See include/linux/hmm.h and diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3ba9bf56899d..35c1fe62d2e1 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -412,8 +412,6 @@ typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) #define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP) -#ifdef CONFIG_PTE_MARKER - static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { return swp_entry(SWP_PTE_MARKER, marker); @@ -434,32 +432,6 @@ static inline bool is_pte_marker(pte_t pte) return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); } -#else /* CONFIG_PTE_MARKER */ - -static inline swp_entry_t make_pte_marker_entry(pte_marker marker) -{ - /* This should never be called if !CONFIG_PTE_MARKER */ - WARN_ON_ONCE(1); - return swp_entry(0, 0); -} - -static inline bool is_pte_marker_entry(swp_entry_t entry) -{ - return false; -} - -static inline pte_marker pte_marker_get(swp_entry_t entry) -{ - return 0; -} - -static inline bool is_pte_marker(pte_t pte) -{ - return false; -} - -#endif /* CONFIG_PTE_MARKER */ - static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); @@ -477,9 +449,6 @@ static inline pte_t make_pte_marker(pte_marker marker) * memory, kernel-only memory (including when the system is during-boot), * non-ram based generic file-system. It's fine to be used even there, but the * extra pte marker check will be pure overhead. - * - * For systems configured with !CONFIG_PTE_MARKER this will be automatically - * optimized to pte_none(). */ static inline int pte_none_mostly(pte_t pte) { diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b505..4b28800d9be1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1107,17 +1107,10 @@ config HAVE_ARCH_USERFAULTFD_MINOR help Arch has userfaultfd minor fault support -config PTE_MARKER - bool - - help - Allows to create marker PTEs for file-backed memory. - config PTE_MARKER_UFFD_WP bool "Userfaultfd write protection support for shmem/hugetlbfs" default y depends on HAVE_ARCH_USERFAULTFD_WP - select PTE_MARKER help Allows to create marker PTEs for userfaultfd write protection diff --git a/mm/memory.c b/mm/memory.c index 659620b6770f..b79d27533722 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3662,11 +3662,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) unsigned long marker = pte_marker_get(entry); /* - * PTE markers should always be with file-backed memories, and the - * marker should never be empty. If anything weird happened, the best - * thing to do is to kill the process along with its mm. + * PTE markers should never be empty. If anything weird happened, + * the best thing to do is to kill the process along with its mm. */ - if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker)) + if (WARN_ON_ONCE(!marker)) return VM_FAULT_SIGBUS; if (pte_marker_entry_uffd_wp(entry)) From 15520a3f046998e3f57e695743e99b0875e2dae7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Sun, 30 Oct 2022 17:41:51 -0400 Subject: [PATCH 120/313] mm: use pte markers for swap errors PTE markers are ideal mechanism for things like SWP_SWAPIN_ERROR. Using a whole swap entry type for this purpose can be an overkill, especially if we already have PTE markers. Define a new bit for swapin error and replace it with pte markers. Then we can safely drop SWP_SWAPIN_ERROR and give one device slot back to swap. We used to have SWP_SWAPIN_ERROR taking the page pfn as part of the swap entry, but it's never used. Neither do I see how it can be useful because normally the swapin failure should not be caused by a bad page but bad swap device. Drop it alongside. Link: https://lkml.kernel.org/r/20221030214151.402274-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Huang Ying Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 +----- include/linux/swapops.h | 26 ++++++++++++++------------ mm/memory.c | 6 ++++-- mm/shmem.c | 2 +- mm/swapfile.c | 2 +- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 211aeca9bfa7..fec6647a289a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,10 +55,6 @@ static inline int current_is_kswapd(void) * actions on faults. */ -#define SWP_SWAPIN_ERROR_NUM 1 -#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ - SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ - SWP_PTE_MARKER_NUM) /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be @@ -121,7 +117,7 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ - SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM) + SWP_PTE_MARKER_NUM) /* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 35c1fe62d2e1..27ade4f22abb 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -162,16 +162,6 @@ static inline void *swp_to_radix_entry(swp_entry_t entry) return xa_mk_value(entry.val); } -static inline swp_entry_t make_swapin_error_entry(struct page *page) -{ - return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page)); -} - -static inline int is_swapin_error_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_SWAPIN_ERROR; -} - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -409,8 +399,9 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) typedef unsigned long pte_marker; -#define PTE_MARKER_UFFD_WP BIT(0) -#define PTE_MARKER_MASK (PTE_MARKER_UFFD_WP) +#define PTE_MARKER_UFFD_WP BIT(0) +#define PTE_MARKER_SWAPIN_ERROR BIT(1) +#define PTE_MARKER_MASK (BIT(2) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { @@ -437,6 +428,17 @@ static inline pte_t make_pte_marker(pte_marker marker) return swp_entry_to_pte(make_pte_marker_entry(marker)); } +static inline swp_entry_t make_swapin_error_entry(void) +{ + return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR); +} + +static inline int is_swapin_error_entry(swp_entry_t entry) +{ + return is_pte_marker_entry(entry) && + (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR); +} + /* * This is a special version to check pte_none() just to cover the case when * the pte is a pte marker. It existed because in many cases the pte marker diff --git a/mm/memory.c b/mm/memory.c index b79d27533722..142c4229549b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3668,6 +3668,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (WARN_ON_ONCE(!marker)) return VM_FAULT_SIGBUS; + /* Higher priority than uffd-wp when data corrupted */ + if (marker & PTE_MARKER_SWAPIN_ERROR) + return VM_FAULT_SIGBUS; + if (pte_marker_entry_uffd_wp(entry)) return pte_marker_handle_uffd_wp(vmf); @@ -3727,8 +3731,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; - } else if (is_swapin_error_entry(entry)) { - ret = VM_FAULT_SIGBUS; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { diff --git a/mm/shmem.c b/mm/shmem.c index 0a7c4a748811..7428ae3fa4b9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1682,7 +1682,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, swp_entry_t swapin_error; void *old; - swapin_error = make_swapin_error_entry(&folio->page); + swapin_error = make_swapin_error_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); diff --git a/mm/swapfile.c b/mm/swapfile.c index 72e481aacd5d..03fe0949f6b2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1781,7 +1781,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t pteval; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); - pteval = swp_entry_to_pte(make_swapin_error_entry(page)); + pteval = swp_entry_to_pte(make_swapin_error_entry()); set_pte_at(vma->vm_mm, addr, pte, pteval); swap_free(entry); ret = 0; From 65917b538bcc4d8c0d8e199a6f7b7426acf13d58 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Thu, 3 Nov 2022 22:38:18 -0400 Subject: [PATCH 121/313] zsmalloc: replace IS_ERR() with IS_ERR_VALUE() Avoid typecasts that are needed for IS_ERR() and use IS_ERR_VALUE() instead. Link: https://lkml.kernel.org/r/20221104023818.1728-1-wangdeming@inspur.com Signed-off-by: Deming Wang Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d03941cace2c..b52b7bb88b52 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -387,7 +387,7 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, { *handle = zs_malloc(pool, size, gfp); - if (IS_ERR((void *)(*handle))) + if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); return 0; } From 634ba645f9bc888227ca954ea643579268d1b6d8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 3 Nov 2022 18:16:32 -0700 Subject: [PATCH 122/313] selftests/vm: update hugetlb madvise Commit 8ebe0a5eaaeb ("mm,madvise,hugetlb: fix unexpected data loss with MADV_DONTNEED on hugetlbfs") changed how the passed length was interpreted for hugetlb mappings. It was changed from align up to align down. The hugetlb-madvise test explicitly tests this behavior. Change test to expect new behavior. Link: https://lkml.kernel.org/r/20221104011632.357049-1-mike.kravetz@oracle.com Link: https://lore.kernel.org/oe-lkp/202211040619.2ec447d7-oliver.sang@intel.com Signed-off-by: Mike Kravetz Reported-by: kernel test robot Cc: David Hildenbrand Cc: Rik van Riel Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hugetlb-madvise.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c index f96435b70986..a634f47d1e56 100644 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -195,7 +195,7 @@ int main(int argc, char **argv) exit(1); } - /* addr + length should be aligned up to huge page size */ + /* addr + length should be aligned down to huge page size */ if (madvise(addr, ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, MADV_DONTNEED)) { @@ -203,10 +203,11 @@ int main(int argc, char **argv) exit(1); } - /* should free all pages in mapping */ - validate_free_pages(free_hugepages); + /* should free all but last page in mapping */ + validate_free_pages(free_hugepages - 1); (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + validate_free_pages(free_hugepages); /* * Test MADV_DONTNEED on anonymous private mapping From dad6a5eb55564845aa17b8b20fa834af21e46c48 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Nov 2022 18:48:45 -0700 Subject: [PATCH 123/313] mm,hugetlb: use folio fields in second tail page Patch series "mm,huge,rmap: unify and speed up compound mapcounts". This patch (of 3): We want to declare one more int in the first tail of a compound page: that first tail page being valuable property, since every compound page has a first tail, but perhaps no more than that. No problem on 64-bit: there is already space for it. No problem with 32-bit THPs: 5.18 commit 5232c63f46fd ("mm: Make compound_pincount always available") kindly cleared the space for it, apparently not realizing that only 64-bit architectures enable CONFIG_THP_SWAP (whose use of tail page->private might conflict) - but make sure of that in its Kconfig. But hugetlb pages use tail page->private of the first tail page for a subpool pointer, which will conflict; and they also use page->private of the 2nd, 3rd and 4th tails. Undo "mm: add private field of first tail to struct page and struct folio"'s recent addition of private_1 to the folio tail: instead add hugetlb_subpool, hugetlb_cgroup, hugetlb_cgroup_rsvd, hugetlb_hwpoison to a second tail page of the folio: THP has long been using several fields of that tail, so make better use of it for hugetlb too. This is not how a generic folio should be declared in future, but it is an effective transitional way to make use of it. Delete the SUBPAGE_INDEX stuff, but keep __NR_USED_SUBPAGE: now 3. [hughd@google.com: prefix folio's page_1 and page_2 with double underscore, give folio's _flags_2 and _head_2 a line documentation each] Link: https://lkml.kernel.org/r/9e2cb6b-5b58-d3f2-b5ee-5f8a14e8f10@google.com Link: https://lkml.kernel.org/r/5f52de70-975-e94f-f141-543765736181@google.com Link: https://lkml.kernel.org/r/3818cc9a-9999-d064-d778-9c94c5911e6@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 23 +++-------- include/linux/hugetlb_cgroup.h | 31 +++++--------- include/linux/mm_types.h | 74 +++++++++++++++++++++++----------- mm/Kconfig | 2 +- mm/memory-failure.c | 5 +-- 5 files changed, 67 insertions(+), 68 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 58a30938a9b1..551834cd5299 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -33,22 +33,9 @@ typedef struct { unsigned long pd; } hugepd_t; /* * For HugeTLB page, there are more metadata to save in the struct page. But * the head struct page cannot meet our needs, so we have to abuse other tail - * struct page to store the metadata. In order to avoid conflicts caused by - * subsequent use of more tail struct pages, we gather these discrete indexes - * of tail struct page here. + * struct page to store the metadata. */ -enum { - SUBPAGE_INDEX_SUBPOOL = 1, /* reuse page->private */ -#ifdef CONFIG_CGROUP_HUGETLB - SUBPAGE_INDEX_CGROUP, /* reuse page->private */ - SUBPAGE_INDEX_CGROUP_RSVD, /* reuse page->private */ - __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD, -#endif -#ifdef CONFIG_MEMORY_FAILURE - SUBPAGE_INDEX_HWPOISON, -#endif - __NR_USED_SUBPAGE, -}; +#define __NR_USED_SUBPAGE 3 struct hugepage_subpool { spinlock_t lock; @@ -722,11 +709,11 @@ extern unsigned int default_hstate_idx; static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { - return (void *)folio_get_private_1(folio); + return folio->_hugetlb_subpool; } /* - * hugetlb page subpool pointer located in hpage[1].private + * hugetlb page subpool pointer located in hpage[2].hugetlb_subpool */ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) { @@ -736,7 +723,7 @@ static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) static inline void hugetlb_set_folio_subpool(struct folio *folio, struct hugepage_subpool *subpool) { - folio_set_private_1(folio, (unsigned long)subpool); + folio->_hugetlb_subpool = subpool; } static inline void hugetlb_set_page_subpool(struct page *hpage, diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index c70f92fe493e..f706626a8063 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -24,12 +24,10 @@ struct file_region; #ifdef CONFIG_CGROUP_HUGETLB /* * Minimum page order trackable by hugetlb cgroup. - * At least 4 pages are necessary for all the tracking information. - * The second tail page (hpage[SUBPAGE_INDEX_CGROUP]) is the fault - * usage cgroup. The third tail page (hpage[SUBPAGE_INDEX_CGROUP_RSVD]) - * is the reservation usage cgroup. + * At least 3 pages are necessary for all the tracking information. + * The second tail page contains all of the hugetlb-specific fields. */ -#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__MAX_CGROUP_SUBPAGE_INDEX + 1) +#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE) enum hugetlb_memory_event { HUGETLB_MAX, @@ -69,21 +67,13 @@ struct hugetlb_cgroup { static inline struct hugetlb_cgroup * __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { - struct page *tail; - VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return NULL; - - if (rsvd) { - tail = folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD); - return (void *)page_private(tail); - } - - else { - tail = folio_page(folio, SUBPAGE_INDEX_CGROUP); - return (void *)page_private(tail); - } + if (rsvd) + return folio->_hugetlb_cgroup_rsvd; + else + return folio->_hugetlb_cgroup; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio) @@ -101,15 +91,12 @@ static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) return; if (rsvd) - set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP_RSVD), - (unsigned long)h_cg); + folio->_hugetlb_cgroup_rsvd = h_cg; else - set_page_private(folio_page(folio, SUBPAGE_INDEX_CGROUP), - (unsigned long)h_cg); + folio->_hugetlb_cgroup = h_cg; } static inline void set_hugetlb_cgroup(struct folio *folio, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e86861ff5bbd..44f1f8b6be02 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -145,15 +145,22 @@ struct page { atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ - unsigned long _private_1; #endif }; - struct { /* Second tail page of compound page */ + struct { /* Second tail page of transparent huge page */ unsigned long _compound_pad_1; /* compound_head */ unsigned long _compound_pad_2; /* For both global and memcg */ struct list_head deferred_list; }; + struct { /* Second tail page of hugetlb page */ + unsigned long _hugetlb_pad_1; /* compound_head */ + void *hugetlb_subpool; + void *hugetlb_cgroup; + void *hugetlb_cgroup_rsvd; + void *hugetlb_hwpoison; + /* No more space on 32-bit: use third tail if more */ + }; struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ @@ -260,13 +267,18 @@ struct page { * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. * @_flags_1: For large folios, additional page flags. - * @__head: Points to the folio. Do not use. + * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). - * @_private_1: Do not use directly, call folio_get_private_1(). + * @_flags_2: For alignment. Do not use. + * @_head_2: Points to the folio. Do not use. + * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h. + * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h. + * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h. + * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head(). * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -305,16 +317,31 @@ struct folio { }; struct page page; }; - unsigned long _flags_1; - unsigned long __head; - unsigned char _folio_dtor; - unsigned char _folio_order; - atomic_t _total_mapcount; - atomic_t _pincount; + union { + struct { + unsigned long _flags_1; + unsigned long _head_1; + unsigned char _folio_dtor; + unsigned char _folio_order; + atomic_t _total_mapcount; + atomic_t _pincount; #ifdef CONFIG_64BIT - unsigned int _folio_nr_pages; + unsigned int _folio_nr_pages; #endif - unsigned long _private_1; + }; + struct page __page_1; + }; + union { + struct { + unsigned long _flags_2; + unsigned long _head_2; + void *_hugetlb_subpool; + void *_hugetlb_cgroup; + void *_hugetlb_cgroup_rsvd; + void *_hugetlb_hwpoison; + }; + struct page __page_2; + }; }; #define FOLIO_MATCH(pg, fl) \ @@ -335,16 +362,25 @@ FOLIO_MATCH(memcg_data, memcg_data); static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); -FOLIO_MATCH(compound_head, __head); +FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); FOLIO_MATCH(compound_mapcount, _total_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); -FOLIO_MATCH(_private_1, _private_1); #endif #undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + 2 * sizeof(struct page)) +FOLIO_MATCH(flags, _flags_2); +FOLIO_MATCH(compound_head, _head_2); +FOLIO_MATCH(hugetlb_subpool, _hugetlb_subpool); +FOLIO_MATCH(hugetlb_cgroup, _hugetlb_cgroup); +FOLIO_MATCH(hugetlb_cgroup_rsvd, _hugetlb_cgroup_rsvd); +FOLIO_MATCH(hugetlb_hwpoison, _hugetlb_hwpoison); +#undef FOLIO_MATCH static inline atomic_t *folio_mapcount_ptr(struct folio *folio) { @@ -388,16 +424,6 @@ static inline void *folio_get_private(struct folio *folio) return folio->private; } -static inline void folio_set_private_1(struct folio *folio, unsigned long private) -{ - folio->_private_1 = private; -} - -static inline unsigned long folio_get_private_1(struct folio *folio) -{ - return folio->_private_1; -} - struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/mm/Kconfig b/mm/Kconfig index 4b28800d9be1..c86b69aff7d4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -775,7 +775,7 @@ endchoice config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT help Swap transparent huge pages in one piece, without splitting. XXX: For now, swap cluster backing transparent huge page diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 779a426d2cab..63d8501001c6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1687,8 +1687,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs); #ifdef CONFIG_HUGETLB_PAGE /* * Struct raw_hwp_page represents information about "raw error page", - * constructing singly linked list originated from ->private field of - * SUBPAGE_INDEX_HWPOISON-th tail page. + * constructing singly linked list from ->_hugetlb_hwpoison field of folio. */ struct raw_hwp_page { struct llist_node node; @@ -1697,7 +1696,7 @@ struct raw_hwp_page { static inline struct llist_head *raw_hwp_list_head(struct page *hpage) { - return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON); + return (struct llist_head *)&page_folio(hpage)->_hugetlb_hwpoison; } static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) From cb67f4282bf9693658dbda934a441ddbbb1446df Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Nov 2022 18:51:38 -0700 Subject: [PATCH 124/313] mm,thp,rmap: simplify compound page mapcount handling Compound page (folio) mapcount calculations have been different for anon and file (or shmem) THPs, and involved the obscure PageDoubleMap flag. And each huge mapping and unmapping of a file (or shmem) THP involved atomically incrementing and decrementing the mapcount of every subpage of that huge page, dirtying many struct page cachelines. Add subpages_mapcount field to the struct folio and first tail page, so that the total of subpage mapcounts is available in one place near the head: then page_mapcount() and total_mapcount() and page_mapped(), and their folio equivalents, are so quick that anon and file and hugetlb don't need to be optimized differently. Delete the unloved PageDoubleMap. page_add and page_remove rmap functions must now maintain the subpages_mapcount as well as the subpage _mapcount, when dealing with pte mappings of huge pages; and correct maintenance of NR_ANON_MAPPED and NR_FILE_MAPPED statistics still needs reading through the subpages, using nr_subpages_unmapped() - but only when first or last pmd mapping finds subpages_mapcount raised (double-map case, not the common case). But are those counts (used to decide when to split an anon THP, and in vmscan's pagecache_reclaimable heuristic) correctly maintained? Not quite: since page_remove_rmap() (and also split_huge_pmd()) is often called without page lock, there can be races when a subpage pte mapcount 0<->1 while compound pmd mapcount 0<->1 is scanning - races which the previous implementation had prevented. The statistics might become inaccurate, and even drift down until they underflow through 0. That is not good enough, but is better dealt with in a followup patch. Update a few comments on first and second tail page overlaid fields. hugepage_add_new_anon_rmap() has to "increment" compound_mapcount, but subpages_mapcount and compound_pincount are already correctly at 0, so delete its reinitialization of compound_pincount. A simple 100 X munmap(mmap(2GB, MAP_SHARED|MAP_POPULATE, tmpfs), 2GB) took 18 seconds on small pages, and used to take 1 second on huge pages, but now takes 119 milliseconds on huge pages. Mapping by pmds a second time used to take 860ms and now takes 92ms; mapping by pmds after mapping by ptes (when the scan is needed) used to take 870ms and now takes 495ms. But there might be some benchmarks which would show a slowdown, because tail struct pages now fall out of cache until final freeing checks them. Link: https://lkml.kernel.org/r/47ad693-717-79c8-e1ba-46c3a6602e48@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 18 ----- include/linux/mm.h | 85 ++++++++++++++------ include/linux/mm_types.h | 21 ++++- include/linux/page-flags.h | 21 ----- include/linux/rmap.h | 2 + mm/debug.c | 5 +- mm/folio-compat.c | 6 -- mm/huge_memory.c | 36 ++------- mm/hugetlb.c | 2 + mm/khugepaged.c | 11 +-- mm/page_alloc.c | 27 ++++--- mm/rmap.c | 142 +++++++++++++++++++-------------- mm/util.c | 79 ------------------ 13 files changed, 194 insertions(+), 261 deletions(-) diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index 216db1d67d04..a560e0c01b16 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -125,24 +125,6 @@ pages: ->_mapcount of all sub-pages in order to have race-free detection of last unmap of subpages. -PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. - -For anonymous pages, PageDoubleMap() also indicates ->_mapcount in all -subpages is offset up by one. This additional reference is required to -get race-free detection of unmap of subpages when we have them mapped with -both PMDs and PTEs. - -This optimization is required to lower the overhead of per-subpage mapcount -tracking. The alternative is to alter ->_mapcount in all subpages on each -map/unmap of the whole compound page. - -For anonymous pages, we set PG_double_map when a PMD of the page is split -for the first time, but still have a PMD mapping. The additional references -go away with the last compound_mapcount. - -File pages get PG_double_map set on the first map of the page with PTE and -goes away when the page gets evicted from the page cache. - split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page structures. It can be done easily for refcounts taken by page table diff --git a/include/linux/mm.h b/include/linux/mm.h index 3950ef45b9a9..a904c2d60f12 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -818,8 +818,8 @@ static inline int is_vmalloc_or_module_addr(const void *x) /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for - * debugging purposes; look at folio_mapcount() or page_mapcount() - * instead. + * debugging purposes - it does not include PTE-mapped sub-pages; look + * at folio_mapcount() or page_mapcount() or total_mapcount() instead. */ static inline int folio_entire_mapcount(struct folio *folio) { @@ -829,12 +829,20 @@ static inline int folio_entire_mapcount(struct folio *folio) /* * Mapcount of compound page as a whole, does not include mapped sub-pages. - * - * Must be called only for compound pages. + * Must be called only on head of compound page. */ -static inline int compound_mapcount(struct page *page) +static inline int head_compound_mapcount(struct page *head) { - return folio_entire_mapcount(page_folio(page)); + return atomic_read(compound_mapcount_ptr(head)) + 1; +} + +/* + * Sum of mapcounts of sub-pages, does not include compound mapcount. + * Must be called only on head of compound page. + */ +static inline int head_subpages_mapcount(struct page *head) +{ + return atomic_read(subpages_mapcount_ptr(head)); } /* @@ -847,11 +855,9 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -int __page_mapcount(struct page *page); - /* * Mapcount of 0-order page; when compound sub-page, includes - * compound_mapcount(). + * compound_mapcount of compound_head of page. * * Result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). @@ -859,25 +865,61 @@ int __page_mapcount(struct page *page); */ static inline int page_mapcount(struct page *page) { - if (unlikely(PageCompound(page))) - return __page_mapcount(page); - return atomic_read(&page->_mapcount) + 1; + int mapcount = atomic_read(&page->_mapcount) + 1; + + if (likely(!PageCompound(page))) + return mapcount; + page = compound_head(page); + return head_compound_mapcount(page) + mapcount; } -int folio_mapcount(struct folio *folio); - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int total_mapcount(struct page *page) { - return folio_mapcount(page_folio(page)); + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + page = compound_head(page); + return head_compound_mapcount(page) + head_subpages_mapcount(page); } -#else -static inline int total_mapcount(struct page *page) +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped, + * even if this particular subpage is not itself mapped by any PTE or PMD. + */ +static inline bool page_mapped(struct page *page) { - return page_mapcount(page); + return total_mapcount(page) > 0; +} + +/** + * folio_mapcount() - Calculate the number of mappings of this folio. + * @folio: The folio. + * + * A large folio tracks both how many times the entire folio is mapped, + * and how many times each individual page in the folio is mapped. + * This function calculates the total number of times the folio is + * mapped. + * + * Return: The number of times this folio is mapped. + */ +static inline int folio_mapcount(struct folio *folio) +{ + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) + 1; + return atomic_read(folio_mapcount_ptr(folio)) + 1 + + atomic_read(folio_subpages_mapcount_ptr(folio)); +} + +/** + * folio_mapped - Is this folio mapped into userspace? + * @folio: The folio. + * + * Return: True if any page in this folio is referenced by user page tables. + */ +static inline bool folio_mapped(struct folio *folio) +{ + return folio_mapcount(folio) > 0; } -#endif static inline struct page *virt_to_head_page(const void *x) { @@ -1800,9 +1842,6 @@ static inline pgoff_t page_index(struct page *page) return page->index; } -bool page_mapped(struct page *page); -bool folio_mapped(struct folio *folio); - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 44f1f8b6be02..44a1a699b5ad 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -142,6 +142,7 @@ struct page { unsigned char compound_dtor; unsigned char compound_order; atomic_t compound_mapcount; + atomic_t subpages_mapcount; atomic_t compound_pincount; #ifdef CONFIG_64BIT unsigned int compound_nr; /* 1 << compound_order */ @@ -270,7 +271,8 @@ struct page { * @_head_1: Points to the folio. Do not use. * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). - * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_compound_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_subpages_mapcount: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * @_flags_2: For alignment. Do not use. @@ -323,7 +325,8 @@ struct folio { unsigned long _head_1; unsigned char _folio_dtor; unsigned char _folio_order; - atomic_t _total_mapcount; + atomic_t _compound_mapcount; + atomic_t _subpages_mapcount; atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; @@ -365,7 +368,8 @@ FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); FOLIO_MATCH(compound_dtor, _folio_dtor); FOLIO_MATCH(compound_order, _folio_order); -FOLIO_MATCH(compound_mapcount, _total_mapcount); +FOLIO_MATCH(compound_mapcount, _compound_mapcount); +FOLIO_MATCH(subpages_mapcount, _subpages_mapcount); FOLIO_MATCH(compound_pincount, _pincount); #ifdef CONFIG_64BIT FOLIO_MATCH(compound_nr, _folio_nr_pages); @@ -388,11 +392,22 @@ static inline atomic_t *folio_mapcount_ptr(struct folio *folio) return &tail->compound_mapcount; } +static inline atomic_t *folio_subpages_mapcount_ptr(struct folio *folio) +{ + struct page *tail = &folio->page + 1; + return &tail->subpages_mapcount; +} + static inline atomic_t *compound_mapcount_ptr(struct page *page) { return &page[1].compound_mapcount; } +static inline atomic_t *subpages_mapcount_ptr(struct page *page) +{ + return &page[1].subpages_mapcount; +} + static inline atomic_t *compound_pincount_ptr(struct page *page) { return &page[1].compound_pincount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0b0ae5084e60..e42c55a7e012 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -176,9 +176,6 @@ enum pageflags { /* SLOB */ PG_slob_free = PG_private, - /* Compound pages. Stored in first tail page's flags */ - PG_double_map = PG_workingset, - #ifdef CONFIG_MEMORY_FAILURE /* * Compound pages. Stored in first tail page's flags. @@ -874,29 +871,11 @@ static inline int PageTransTail(struct page *page) { return PageTail(page); } - -/* - * PageDoubleMap indicates that the compound page is mapped with PTEs as well - * as PMDs. - * - * This is required for optimization of rmap operations for THP: we can postpone - * per small page mapcount accounting (and its overhead from atomic operations) - * until the first PMD split. - * - * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up - * by one. This reference will go away with last compound_mapcount. - * - * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). - */ -PAGEFLAG(DoubleMap, double_map, PF_SECOND) - TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) TESTPAGEFLAG_FALSE(TransTail, transtail) -PAGEFLAG_FALSE(DoubleMap, double_map) - TESTSCFLAG_FALSE(DoubleMap, double_map) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bd3504d11b15..1973649e8f93 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -206,6 +206,8 @@ void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, static inline void __page_dup_rmap(struct page *page, bool compound) { + if (!compound && PageCompound(page)) + atomic_inc(subpages_mapcount_ptr(compound_head(page))); atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); } diff --git a/mm/debug.c b/mm/debug.c index 0fd15ba70d16..7f8e5f744e42 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,9 +94,10 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n", head, compound_order(head), - folio_entire_mapcount(folio), + head_compound_mapcount(head), + head_subpages_mapcount(head), head_compound_pincount(head)); } diff --git a/mm/folio-compat.c b/mm/folio-compat.c index bac2a366aada..cbfe51091c39 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -39,12 +39,6 @@ void wait_for_stable_page(struct page *page) } EXPORT_SYMBOL_GPL(wait_for_stable_page); -bool page_mapped(struct page *page) -{ - return folio_mapped(page_folio(page)); -} -EXPORT_SYMBOL(page_mapped); - void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b26998d1845f..7703169107c6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2142,6 +2142,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); + atomic_add(HPAGE_PMD_NR, subpages_mapcount_ptr(page)); /* * Without "freeze", we'll simply split the PMD, propagating the @@ -2225,33 +2226,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte); } - if (!pmd_migration) { - /* - * Set PG_double_map before dropping compound_mapcount to avoid - * false-negative page_mapped(). - */ - if (compound_mapcount(page) > 1 && - !TestSetPageDoubleMap(page)) { - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_inc(&page[i]._mapcount); - } - - lock_page_memcg(page); - if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { - /* Last compound_mapcount is gone. */ - __mod_lruvec_page_state(page, NR_ANON_THPS, - -HPAGE_PMD_NR); - if (TestClearPageDoubleMap(page)) { - /* No need in mapcount reference anymore */ - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_dec(&page[i]._mapcount); - } - } - unlock_page_memcg(page); - - /* Above is effectively page_remove_rmap(page, vma, true) */ - munlock_vma_page(page, vma, true); - } + if (!pmd_migration) + page_remove_rmap(page, vma, true); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); @@ -2453,7 +2429,7 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); - /* ->mapping in first tail page is compound_mapcount */ + /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); page_tail->mapping = head->mapping; @@ -2463,6 +2439,10 @@ static void __split_huge_page_tail(struct page *head, int tail, * page->private should not be set in tail pages with the exception * of swap cache pages that store the swp_entry_t in tail pages. * Fix up and warn once if private is unexpectedly set. + * + * What of 32-bit systems, on which head[1].compound_pincount overlays + * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and + * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. */ if (!folio_test_swapcache(page_folio(head))) { VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 76ebefe02827..4f1338d82aab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1333,6 +1333,7 @@ static void __destroy_compound_gigantic_page(struct page *page, struct page *p; atomic_set(compound_mapcount_ptr(page), 0); + atomic_set(subpages_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); for (i = 1; i < nr_pages; i++) { @@ -1852,6 +1853,7 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); + atomic_set(subpages_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); return true; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9c111273bbf9..0d8f548d9d7e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1238,15 +1238,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, /* * Check if the page has any GUP (or other external) pins. * - * Here the check is racy it may see total_mapcount > refcount - * in some cases. - * For example, one process with one forked child process. - * The parent has the PMD split due to MADV_DONTNEED, then - * the child is trying unmap the whole PMD, but khugepaged - * may be scanning the parent between the child has - * PageDoubleMap flag cleared and dec the mapcount. So - * khugepaged may see total_mapcount > refcount. - * + * Here the check may be racy: + * it may see total_mapcount > refcount in some cases? * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). Anyway the same check diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e60657875d3..0705917ddf54 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -798,6 +798,7 @@ static void prep_compound_head(struct page *page, unsigned int order) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); + atomic_set(subpages_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); } @@ -1324,11 +1325,19 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping may be compound_mapcount() */ - if (unlikely(compound_mapcount(page))) { + /* the first tail page: these may be in place of ->mapping */ + if (unlikely(head_compound_mapcount(head_page))) { bad_page(page, "nonzero compound_mapcount"); goto out; } + if (unlikely(head_subpages_mapcount(head_page))) { + bad_page(page, "nonzero subpages_mapcount"); + goto out; + } + if (unlikely(head_compound_pincount(head_page))) { + bad_page(page, "nonzero compound_pincount"); + goto out; + } break; case 2: /* @@ -1431,10 +1440,8 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - if (compound) { - ClearPageDoubleMap(page); + if (compound) ClearPageHasHWPoisoned(page); - } for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); @@ -6874,13 +6881,11 @@ static void __ref memmap_init_compound(struct page *head, set_page_count(page, 0); /* - * The first tail page stores compound_mapcount_ptr() and - * compound_order() and the second tail page stores - * compound_pincount_ptr(). Call prep_compound_head() after - * the first and second tail pages have been initialized to - * not have the data overwritten. + * The first tail page stores important compound page info. + * Call prep_compound_head() after the first tail page has + * been initialized, to not have the data overwritten. */ - if (pfn == head_pfn + 2) + if (pfn == head_pfn + 1) prep_compound_head(head, order); } } diff --git a/mm/rmap.c b/mm/rmap.c index 3b2d18bbdc44..f43339ea4970 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,6 +1085,24 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } +/* + * When mapping a THP's first pmd, or unmapping its last pmd, if that THP + * also has pte mappings, then those must be discounted: in order to maintain + * NR_ANON_MAPPED and NR_FILE_MAPPED statistics exactly, without any drift, + * and to decide when an anon THP should be put on the deferred split queue. + */ +static int nr_subpages_unmapped(struct page *head, int nr_subpages) +{ + int nr = nr_subpages; + int i; + + /* Discount those subpages mapped by pte */ + for (i = 0; i < nr_subpages; i++) + if (atomic_read(&head[i]._mapcount) >= 0) + nr--; + return nr; +} + /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma @@ -1194,6 +1212,7 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { + int nr, nr_pages; bool compound = flags & RMAP_COMPOUND; bool first; @@ -1202,28 +1221,32 @@ void page_add_anon_rmap(struct page *page, else VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound) { + if (compound && PageTransHuge(page)) { atomic_t *mapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); mapcount = compound_mapcount_ptr(page); first = atomic_inc_and_test(mapcount); + + nr = nr_pages = thp_nr_pages(page); + if (first && head_subpages_mapcount(page)) + nr = nr_subpages_unmapped(page, nr_pages); } else { + nr = 1; + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + atomic_inc(subpages_mapcount_ptr(head)); + nr = !head_compound_mapcount(head); + } first = atomic_inc_and_test(&page->_mapcount); } + VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) { - int nr = compound ? thp_nr_pages(page) : 1; - /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption - * disabled. - */ if (compound) - __mod_lruvec_page_state(page, NR_ANON_THPS, nr); + __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pages); __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } @@ -1265,8 +1288,6 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); - __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } else { /* increment count (starts at -1) */ @@ -1287,29 +1308,19 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - int i, nr = 0; + int nr = 0; VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); if (compound && PageTransHuge(page)) { - int nr_pages = thp_nr_pages(page); + int nr_pages; - for (i = 0; i < nr_pages; i++) { - if (atomic_inc_and_test(&page[i]._mapcount)) - nr++; - } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; - /* - * It is racy to ClearPageDoubleMap in page_remove_file_rmap(); - * but page lock is held by all page_add_file_rmap() compound - * callers, and SetPageDoubleMap below warns if !PageLocked: - * so here is a place that DoubleMap can be safely cleared. - */ - VM_WARN_ON_ONCE(!PageLocked(page)); - if (nr == nr_pages && PageDoubleMap(page)) - ClearPageDoubleMap(page); + nr = nr_pages = thp_nr_pages(page); + if (head_subpages_mapcount(page)) + nr = nr_subpages_unmapped(page, nr_pages); if (PageSwapBacked(page)) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, @@ -1318,11 +1329,15 @@ void page_add_file_rmap(struct page *page, __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, nr_pages); } else { - if (PageTransCompound(page) && page_mapping(page)) { - VM_WARN_ON_ONCE(!PageLocked(page)); - SetPageDoubleMap(compound_head(page)); + bool pmd_mapped = false; + + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + atomic_inc(subpages_mapcount_ptr(head)); + pmd_mapped = head_compound_mapcount(head); } - if (atomic_inc_and_test(&page->_mapcount)) + if (atomic_inc_and_test(&page->_mapcount) && !pmd_mapped) nr++; } out: @@ -1335,7 +1350,7 @@ out: static void page_remove_file_rmap(struct page *page, bool compound) { - int i, nr = 0; + int nr = 0; VM_BUG_ON_PAGE(compound && !PageHead(page), page); @@ -1348,14 +1363,15 @@ static void page_remove_file_rmap(struct page *page, bool compound) /* page still mapped by someone else? */ if (compound && PageTransHuge(page)) { - int nr_pages = thp_nr_pages(page); + int nr_pages; - for (i = 0; i < nr_pages; i++) { - if (atomic_add_negative(-1, &page[i]._mapcount)) - nr++; - } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - goto out; + return; + + nr = nr_pages = thp_nr_pages(page); + if (head_subpages_mapcount(page)) + nr = nr_subpages_unmapped(page, nr_pages); + if (PageSwapBacked(page)) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, -nr_pages); @@ -1363,17 +1379,25 @@ static void page_remove_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, -nr_pages); } else { - if (atomic_add_negative(-1, &page->_mapcount)) + bool pmd_mapped = false; + + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + atomic_dec(subpages_mapcount_ptr(head)); + pmd_mapped = head_compound_mapcount(head); + } + if (atomic_add_negative(-1, &page->_mapcount) && !pmd_mapped) nr++; } -out: + if (nr) __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); } static void page_remove_anon_compound_rmap(struct page *page) { - int i, nr; + int nr, nr_pages; if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) return; @@ -1385,27 +1409,19 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page)); + nr = nr_pages = thp_nr_pages(page); + __mod_lruvec_page_state(page, NR_ANON_THPS, -nr); - if (TestClearPageDoubleMap(page)) { - /* - * Subpages can be mapped with PTEs too. Check how many of - * them are still mapped. - */ - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { - if (atomic_add_negative(-1, &page[i]._mapcount)) - nr++; - } + if (head_subpages_mapcount(page)) { + nr = nr_subpages_unmapped(page, nr_pages); /* * Queue the page for deferred split if at least one small * page of the compound page is unmapped, but at least one * small page is still mapped. */ - if (nr && nr < thp_nr_pages(page)) + if (nr && nr < nr_pages) deferred_split_huge_page(page); - } else { - nr = thp_nr_pages(page); } if (nr) @@ -1423,6 +1439,8 @@ static void page_remove_anon_compound_rmap(struct page *page) void page_remove_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { + bool pmd_mapped = false; + lock_page_memcg(page); if (!PageAnon(page)) { @@ -1435,15 +1453,17 @@ void page_remove_rmap(struct page *page, goto out; } + if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + atomic_dec(subpages_mapcount_ptr(head)); + pmd_mapped = head_compound_mapcount(head); + } + /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount)) + if (!atomic_add_negative(-1, &page->_mapcount) || pmd_mapped) goto out; - /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption disabled. - */ __dec_lruvec_page_state(page, NR_ANON_MAPPED); if (PageTransCompound(page)) @@ -2569,8 +2589,8 @@ void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); ClearHPageRestoreReserve(page); __page_set_anon_rmap(page, vma, address, 1); } diff --git a/mm/util.c b/mm/util.c index 12984e76767e..b56c92fb910f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -717,32 +717,6 @@ void *page_rmapping(struct page *page) return folio_raw_mapping(page_folio(page)); } -/** - * folio_mapped - Is this folio mapped into userspace? - * @folio: The folio. - * - * Return: True if any page in this folio is referenced by user page tables. - */ -bool folio_mapped(struct folio *folio) -{ - long i, nr; - - if (!folio_test_large(folio)) - return atomic_read(&folio->_mapcount) >= 0; - if (atomic_read(folio_mapcount_ptr(folio)) >= 0) - return true; - if (folio_test_hugetlb(folio)) - return false; - - nr = folio_nr_pages(folio); - for (i = 0; i < nr; i++) { - if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0) - return true; - } - return false; -} -EXPORT_SYMBOL(folio_mapped); - struct anon_vma *folio_anon_vma(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; @@ -783,59 +757,6 @@ struct address_space *folio_mapping(struct folio *folio) } EXPORT_SYMBOL(folio_mapping); -/* Slow path of page_mapcount() for compound pages */ -int __page_mapcount(struct page *page) -{ - int ret; - - ret = atomic_read(&page->_mapcount) + 1; - /* - * For file THP page->_mapcount contains total number of mapping - * of the page: no need to look into compound_mapcount. - */ - if (!PageAnon(page) && !PageHuge(page)) - return ret; - page = compound_head(page); - ret += atomic_read(compound_mapcount_ptr(page)) + 1; - if (PageDoubleMap(page)) - ret--; - return ret; -} -EXPORT_SYMBOL_GPL(__page_mapcount); - -/** - * folio_mapcount() - Calculate the number of mappings of this folio. - * @folio: The folio. - * - * A large folio tracks both how many times the entire folio is mapped, - * and how many times each individual page in the folio is mapped. - * This function calculates the total number of times the folio is - * mapped. - * - * Return: The number of times this folio is mapped. - */ -int folio_mapcount(struct folio *folio) -{ - int i, compound, nr, ret; - - if (likely(!folio_test_large(folio))) - return atomic_read(&folio->_mapcount) + 1; - - compound = folio_entire_mapcount(folio); - if (folio_test_hugetlb(folio)) - return compound; - ret = compound; - nr = folio_nr_pages(folio); - for (i = 0; i < nr; i++) - ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1; - /* File pages has compound_mapcount included in _mapcount */ - if (!folio_test_anon(folio)) - return ret - compound * nr; - if (folio_test_double_map(folio)) - ret -= nr; - return ret; -} - /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. From 9bd3155ed83b723be719e522760f107229e2a61b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Nov 2022 18:53:45 -0700 Subject: [PATCH 125/313] mm,thp,rmap: lock_compound_mapcounts() on THP mapcounts Fix the races in maintaining compound_mapcount, subpages_mapcount and subpage _mapcount by using PG_locked in the first tail of any compound page for a bit_spin_lock() on such modifications; skipping the usual atomic operations on those fields in this case. Bring page_remove_file_rmap() and page_remove_anon_compound_rmap() back into page_remove_rmap() itself. Rearrange page_add_anon_rmap() and page_add_file_rmap() and page_remove_rmap() to follow the same "if (compound) {lock} else if (PageCompound) {lock} else {atomic}" pattern (with a PageTransHuge in the compound test, like before, to avoid BUG_ONs and optimize away that block when THP is not configured). Move all the stats updates outside, after the bit_spin_locked section, so that it is sure to be a leaf lock. Add page_dup_compound_rmap() to manage compound locking versus atomics in sync with the rest. In particular, hugetlb pages are still using the atomics: to avoid unnecessary interference there, and because they never have subpage mappings; but this exception can easily be changed. Conveniently, page_dup_compound_rmap() turns out to suit an anon THP's __split_huge_pmd_locked() too. bit_spin_lock() is not popular with PREEMPT_RT folks: but PREEMPT_RT sensibly excludes TRANSPARENT_HUGEPAGE already, so its only exposure is to the non-hugetlb non-THP pte-mapped compound pages (with large folios being currently dependent on TRANSPARENT_HUGEPAGE). There is never any scan of subpages in this case; but we have chosen to use PageCompound tests rather than PageTransCompound tests to gate the use of lock_compound_mapcounts(), so that page_mapped() is correct on all compound pages, whether or not TRANSPARENT_HUGEPAGE is enabled: could that be a problem for PREEMPT_RT, when there is contention on the lock - under heavy concurrent forking for example? If so, then it can be turned into a sleeping lock (like folio_lock()) when PREEMPT_RT. A simple 100 X munmap(mmap(2GB, MAP_SHARED|MAP_POPULATE, tmpfs), 2GB) took 18 seconds on small pages, and used to take 1 second on huge pages, but now takes 115 milliseconds on huge pages. Mapping by pmds a second time used to take 860ms and now takes 86ms; mapping by pmds after mapping by ptes (when the scan is needed) used to take 870ms and now takes 495ms. Mapping huge pages by ptes is largely unaffected but variable: between 5% faster and 5% slower in what I've recorded. Contention on the lock is likely to behave worse than contention on the atomics behaved. Link: https://lkml.kernel.org/r/1b42bd1a-8223-e827-602f-d466c2db7d3c@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 14 +- include/linux/rmap.h | 14 +- mm/huge_memory.c | 3 +- mm/rmap.c | 349 ++++++++++++++++++--------------- 4 files changed, 211 insertions(+), 169 deletions(-) diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index a560e0c01b16..1e2a637cc607 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -117,13 +117,15 @@ pages: - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeeds on tail pages. - - map/unmap of the pages with PTE entry increment/decrement ->_mapcount - on relevant sub-page of the compound page. + - map/unmap of PMD entry for the whole compound page increment/decrement + ->compound_mapcount, stored in the first tail page of the compound page. - - map/unmap of the whole compound page is accounted for in compound_mapcount - (stored in first tail page). For file huge pages, we also increment - ->_mapcount of all sub-pages in order to have race-free detection of - last unmap of subpages. + - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount + on relevant sub-page of the compound page, and also increment/decrement + ->subpages_mapcount, stored in first tail page of the compound page. + In order to have race-free accounting of sub-pages mapped, changes to + sub-page ->_mapcount, ->subpages_mapcount and ->compound_mapcount are + are all locked by bit_spin_lock of PG_locked in the first tail ->flags. split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 1973649e8f93..011a7530dc76 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -204,16 +204,14 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -static inline void __page_dup_rmap(struct page *page, bool compound) -{ - if (!compound && PageCompound(page)) - atomic_inc(subpages_mapcount_ptr(compound_head(page))); - atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); -} +void page_dup_compound_rmap(struct page *page, bool compound); static inline void page_dup_file_rmap(struct page *page, bool compound) { - __page_dup_rmap(page, compound); + if (PageCompound(page)) + page_dup_compound_rmap(page, compound); + else + atomic_inc(&page->_mapcount); } /** @@ -262,7 +260,7 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * the page R/O into both processes. */ dup: - __page_dup_rmap(page, compound); + page_dup_file_rmap(page, compound); return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7703169107c6..114517e8cbfc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2142,7 +2142,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); - atomic_add(HPAGE_PMD_NR, subpages_mapcount_ptr(page)); /* * Without "freeze", we'll simply split the PMD, propagating the @@ -2222,7 +2221,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); if (!pmd_migration) - atomic_inc(&page[i]._mapcount); + page_dup_compound_rmap(page + i, false); pte_unmap(pte); } diff --git a/mm/rmap.c b/mm/rmap.c index f43339ea4970..512e53cae2ca 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,11 +1085,66 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } +struct compound_mapcounts { + unsigned int compound_mapcount; + unsigned int subpages_mapcount; +}; + +/* + * lock_compound_mapcounts() first locks, then copies subpages_mapcount and + * compound_mapcount from head[1].compound_mapcount and subpages_mapcount, + * converting from struct page's internal representation to logical count + * (that is, adding 1 to compound_mapcount to hide its offset by -1). + */ +static void lock_compound_mapcounts(struct page *head, + struct compound_mapcounts *local) +{ + bit_spin_lock(PG_locked, &head[1].flags); + local->compound_mapcount = atomic_read(compound_mapcount_ptr(head)) + 1; + local->subpages_mapcount = atomic_read(subpages_mapcount_ptr(head)); +} + +/* + * After caller has updated subpage._mapcount, local subpages_mapcount and + * local compound_mapcount, as necessary, unlock_compound_mapcounts() converts + * and copies them back to the compound head[1] fields, and then unlocks. + */ +static void unlock_compound_mapcounts(struct page *head, + struct compound_mapcounts *local) +{ + atomic_set(compound_mapcount_ptr(head), local->compound_mapcount - 1); + atomic_set(subpages_mapcount_ptr(head), local->subpages_mapcount); + bit_spin_unlock(PG_locked, &head[1].flags); +} + +/* + * When acting on a compound page under lock_compound_mapcounts(), avoid the + * unnecessary overhead of an actual atomic operation on its subpage mapcount. + * Return true if this is the first increment or the last decrement + * (remembering that page->_mapcount -1 represents logical mapcount 0). + */ +static bool subpage_mapcount_inc(struct page *page) +{ + int orig_mapcount = atomic_read(&page->_mapcount); + + atomic_set(&page->_mapcount, orig_mapcount + 1); + return orig_mapcount < 0; +} + +static bool subpage_mapcount_dec(struct page *page) +{ + int orig_mapcount = atomic_read(&page->_mapcount); + + atomic_set(&page->_mapcount, orig_mapcount - 1); + return orig_mapcount == 0; +} + /* * When mapping a THP's first pmd, or unmapping its last pmd, if that THP * also has pte mappings, then those must be discounted: in order to maintain * NR_ANON_MAPPED and NR_FILE_MAPPED statistics exactly, without any drift, * and to decide when an anon THP should be put on the deferred split queue. + * This function must be called between lock_ and unlock_compound_mapcounts(). */ static int nr_subpages_unmapped(struct page *head, int nr_subpages) { @@ -1103,6 +1158,40 @@ static int nr_subpages_unmapped(struct page *head, int nr_subpages) return nr; } +/* + * page_dup_compound_rmap(), used when copying mm, or when splitting pmd, + * provides a simple example of using lock_ and unlock_compound_mapcounts(). + */ +void page_dup_compound_rmap(struct page *page, bool compound) +{ + struct compound_mapcounts mapcounts; + struct page *head; + + /* + * Hugetlb pages could use lock_compound_mapcounts(), like THPs do; + * but at present they are still being managed by atomic operations: + * which are likely to be somewhat faster, so don't rush to convert + * them over without evaluating the effect. + * + * Note that hugetlb does not call page_add_file_rmap(): + * here is where hugetlb shared page mapcount is raised. + */ + if (PageHuge(page)) { + atomic_inc(compound_mapcount_ptr(page)); + return; + } + + head = compound_head(page); + lock_compound_mapcounts(head, &mapcounts); + if (compound) { + mapcounts.compound_mapcount++; + } else { + mapcounts.subpages_mapcount++; + subpage_mapcount_inc(page); + } + unlock_compound_mapcounts(head, &mapcounts); +} + /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma @@ -1212,7 +1301,8 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { - int nr, nr_pages; + struct compound_mapcounts mapcounts; + int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; bool first; @@ -1222,33 +1312,37 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); if (compound && PageTransHuge(page)) { - atomic_t *mapcount; - VM_BUG_ON_PAGE(!PageLocked(page), page); - mapcount = compound_mapcount_ptr(page); - first = atomic_inc_and_test(mapcount); - - nr = nr_pages = thp_nr_pages(page); - if (first && head_subpages_mapcount(page)) - nr = nr_subpages_unmapped(page, nr_pages); - } else { - nr = 1; - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - atomic_inc(subpages_mapcount_ptr(head)); - nr = !head_compound_mapcount(head); + lock_compound_mapcounts(page, &mapcounts); + first = !mapcounts.compound_mapcount; + mapcounts.compound_mapcount++; + if (first) { + nr = nr_pmdmapped = thp_nr_pages(page); + if (mapcounts.subpages_mapcount) + nr = nr_subpages_unmapped(page, nr_pmdmapped); } + unlock_compound_mapcounts(page, &mapcounts); + + } else if (PageCompound(page)) { + struct page *head = compound_head(page); + + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount++; + first = subpage_mapcount_inc(page); + nr = first && !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + + } else { first = atomic_inc_and_test(&page->_mapcount); + nr = first; } VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); - if (first) { - if (compound) - __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pages); + if (nr_pmdmapped) + __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pmdmapped); + if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); - } if (unlikely(PageKsm(page))) unlock_page_memcg(page); @@ -1308,39 +1402,41 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - int nr = 0; + struct compound_mapcounts mapcounts; + int nr = 0, nr_pmdmapped = 0; + bool first; VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); + if (compound && PageTransHuge(page)) { - int nr_pages; - - if (!atomic_inc_and_test(compound_mapcount_ptr(page))) - goto out; - - nr = nr_pages = thp_nr_pages(page); - if (head_subpages_mapcount(page)) - nr = nr_subpages_unmapped(page, nr_pages); - - if (PageSwapBacked(page)) - __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, - nr_pages); - else - __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, - nr_pages); - } else { - bool pmd_mapped = false; - - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - atomic_inc(subpages_mapcount_ptr(head)); - pmd_mapped = head_compound_mapcount(head); + lock_compound_mapcounts(page, &mapcounts); + first = !mapcounts.compound_mapcount; + mapcounts.compound_mapcount++; + if (first) { + nr = nr_pmdmapped = thp_nr_pages(page); + if (mapcounts.subpages_mapcount) + nr = nr_subpages_unmapped(page, nr_pmdmapped); } - if (atomic_inc_and_test(&page->_mapcount) && !pmd_mapped) - nr++; + unlock_compound_mapcounts(page, &mapcounts); + + } else if (PageCompound(page)) { + struct page *head = compound_head(page); + + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount++; + first = subpage_mapcount_inc(page); + nr = first && !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + + } else { + first = atomic_inc_and_test(&page->_mapcount); + nr = first; } -out: + + if (nr_pmdmapped) + __mod_lruvec_page_state(page, PageSwapBacked(page) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); unlock_page_memcg(page); @@ -1348,86 +1444,6 @@ out: mlock_vma_page(page, vma, compound); } -static void page_remove_file_rmap(struct page *page, bool compound) -{ - int nr = 0; - - VM_BUG_ON_PAGE(compound && !PageHead(page), page); - - /* Hugepages are not counted in NR_FILE_MAPPED for now. */ - if (unlikely(PageHuge(page))) { - /* hugetlb pages are always mapped with pmds */ - atomic_dec(compound_mapcount_ptr(page)); - return; - } - - /* page still mapped by someone else? */ - if (compound && PageTransHuge(page)) { - int nr_pages; - - if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - return; - - nr = nr_pages = thp_nr_pages(page); - if (head_subpages_mapcount(page)) - nr = nr_subpages_unmapped(page, nr_pages); - - if (PageSwapBacked(page)) - __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, - -nr_pages); - else - __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, - -nr_pages); - } else { - bool pmd_mapped = false; - - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - - atomic_dec(subpages_mapcount_ptr(head)); - pmd_mapped = head_compound_mapcount(head); - } - if (atomic_add_negative(-1, &page->_mapcount) && !pmd_mapped) - nr++; - } - - if (nr) - __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); -} - -static void page_remove_anon_compound_rmap(struct page *page) -{ - int nr, nr_pages; - - if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - return; - - /* Hugepages are not counted in NR_ANON_PAGES for now. */ - if (unlikely(PageHuge(page))) - return; - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - return; - - nr = nr_pages = thp_nr_pages(page); - __mod_lruvec_page_state(page, NR_ANON_THPS, -nr); - - if (head_subpages_mapcount(page)) { - nr = nr_subpages_unmapped(page, nr_pages); - - /* - * Queue the page for deferred split if at least one small - * page of the compound page is unmapped, but at least one - * small page is still mapped. - */ - if (nr && nr < nr_pages) - deferred_split_huge_page(page); - } - - if (nr) - __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); -} - /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from @@ -1439,46 +1455,73 @@ static void page_remove_anon_compound_rmap(struct page *page) void page_remove_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - bool pmd_mapped = false; + struct compound_mapcounts mapcounts; + int nr = 0, nr_pmdmapped = 0; + bool last; + + VM_BUG_ON_PAGE(compound && !PageHead(page), page); + + /* Hugetlb pages are not counted in NR_*MAPPED */ + if (unlikely(PageHuge(page))) { + /* hugetlb pages are always mapped with pmds */ + atomic_dec(compound_mapcount_ptr(page)); + return; + } lock_page_memcg(page); - if (!PageAnon(page)) { - page_remove_file_rmap(page, compound); - goto out; - } + /* page still mapped by someone else? */ + if (compound && PageTransHuge(page)) { + lock_compound_mapcounts(page, &mapcounts); + mapcounts.compound_mapcount--; + last = !mapcounts.compound_mapcount; + if (last) { + nr = nr_pmdmapped = thp_nr_pages(page); + if (mapcounts.subpages_mapcount) + nr = nr_subpages_unmapped(page, nr_pmdmapped); + } + unlock_compound_mapcounts(page, &mapcounts); - if (compound) { - page_remove_anon_compound_rmap(page); - goto out; - } - - if (PageTransCompound(page)) { + } else if (PageCompound(page)) { struct page *head = compound_head(page); - atomic_dec(subpages_mapcount_ptr(head)); - pmd_mapped = head_compound_mapcount(head); + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount--; + last = subpage_mapcount_dec(page); + nr = last && !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + + } else { + last = atomic_add_negative(-1, &page->_mapcount); + nr = last; } - /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount) || pmd_mapped) - goto out; - - __dec_lruvec_page_state(page, NR_ANON_MAPPED); - - if (PageTransCompound(page)) - deferred_split_huge_page(compound_head(page)); + if (nr_pmdmapped) { + __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_THPS : + (PageSwapBacked(page) ? NR_SHMEM_PMDMAPPED : + NR_FILE_PMDMAPPED), -nr_pmdmapped); + } + if (nr) { + __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_MAPPED : + NR_FILE_MAPPED, -nr); + /* + * Queue anon THP for deferred split if at least one small + * page of the compound page is unmapped, but at least one + * small page is still mapped. + */ + if (PageTransCompound(page) && PageAnon(page)) + if (!compound || nr < nr_pmdmapped) + deferred_split_huge_page(compound_head(page)); + } /* - * It would be tidy to reset the PageAnon mapping here, + * It would be tidy to reset PageAnon mapping when fully unmapped, * but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_unref_page, + * before us: so leave the reset to free_pages_prepare, * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. */ -out: + unlock_page_memcg(page); munlock_vma_page(page, vma, compound); From d8dd5e979d09c7463618853fb4aedd88e3efc8ae Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 9 Nov 2022 18:18:49 -0800 Subject: [PATCH 126/313] mm,thp,rmap: handle the normal !PageCompound case first Commit ("mm,thp,rmap: lock_compound_mapcounts() on THP mapcounts") propagated the "if (compound) {lock} else if (PageCompound) {lock} else {atomic}" pattern throughout; but Linus hated the way that gives primacy to the uncommon case: switch to "if (!PageCompound) {atomic} else if (compound) {lock} else {lock}" throughout. Linus has a bigger idea for how to improve it all, but here just make that rearrangement. Link: https://lkml.kernel.org/r/fca2f694-2098-b0ef-d4e-f1d8b94d318c@google.com Signed-off-by: Hugh Dickins Cc: David Hildenbrand Cc: James Houghton Cc: John Hubbard Cc: Kirill A. Shutemov Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/rmap.c | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 512e53cae2ca..4833d28c5e1a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1311,7 +1311,11 @@ void page_add_anon_rmap(struct page *page, else VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound && PageTransHuge(page)) { + if (likely(!PageCompound(page))) { + first = atomic_inc_and_test(&page->_mapcount); + nr = first; + + } else if (compound && PageTransHuge(page)) { lock_compound_mapcounts(page, &mapcounts); first = !mapcounts.compound_mapcount; mapcounts.compound_mapcount++; @@ -1321,8 +1325,7 @@ void page_add_anon_rmap(struct page *page, nr = nr_subpages_unmapped(page, nr_pmdmapped); } unlock_compound_mapcounts(page, &mapcounts); - - } else if (PageCompound(page)) { + } else { struct page *head = compound_head(page); lock_compound_mapcounts(head, &mapcounts); @@ -1330,10 +1333,6 @@ void page_add_anon_rmap(struct page *page, first = subpage_mapcount_inc(page); nr = first && !mapcounts.compound_mapcount; unlock_compound_mapcounts(head, &mapcounts); - - } else { - first = atomic_inc_and_test(&page->_mapcount); - nr = first; } VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); @@ -1373,20 +1372,23 @@ void page_add_anon_rmap(struct page *page, void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - const bool compound = PageCompound(page); - int nr = compound ? thp_nr_pages(page) : 1; + int nr; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); __SetPageSwapBacked(page); - if (compound) { + + if (likely(!PageCompound(page))) { + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); + nr = 1; + } else { VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); + nr = thp_nr_pages(page); __mod_lruvec_page_state(page, NR_ANON_THPS, nr); - } else { - /* increment count (starts at -1) */ - atomic_set(&page->_mapcount, 0); } + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1409,7 +1411,11 @@ void page_add_file_rmap(struct page *page, VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); - if (compound && PageTransHuge(page)) { + if (likely(!PageCompound(page))) { + first = atomic_inc_and_test(&page->_mapcount); + nr = first; + + } else if (compound && PageTransHuge(page)) { lock_compound_mapcounts(page, &mapcounts); first = !mapcounts.compound_mapcount; mapcounts.compound_mapcount++; @@ -1419,8 +1425,7 @@ void page_add_file_rmap(struct page *page, nr = nr_subpages_unmapped(page, nr_pmdmapped); } unlock_compound_mapcounts(page, &mapcounts); - - } else if (PageCompound(page)) { + } else { struct page *head = compound_head(page); lock_compound_mapcounts(head, &mapcounts); @@ -1428,10 +1433,6 @@ void page_add_file_rmap(struct page *page, first = subpage_mapcount_inc(page); nr = first && !mapcounts.compound_mapcount; unlock_compound_mapcounts(head, &mapcounts); - - } else { - first = atomic_inc_and_test(&page->_mapcount); - nr = first; } if (nr_pmdmapped) @@ -1471,7 +1472,11 @@ void page_remove_rmap(struct page *page, lock_page_memcg(page); /* page still mapped by someone else? */ - if (compound && PageTransHuge(page)) { + if (likely(!PageCompound(page))) { + last = atomic_add_negative(-1, &page->_mapcount); + nr = last; + + } else if (compound && PageTransHuge(page)) { lock_compound_mapcounts(page, &mapcounts); mapcounts.compound_mapcount--; last = !mapcounts.compound_mapcount; @@ -1481,8 +1486,7 @@ void page_remove_rmap(struct page *page, nr = nr_subpages_unmapped(page, nr_pmdmapped); } unlock_compound_mapcounts(page, &mapcounts); - - } else if (PageCompound(page)) { + } else { struct page *head = compound_head(page); lock_compound_mapcounts(head, &mapcounts); @@ -1490,10 +1494,6 @@ void page_remove_rmap(struct page *page, last = subpage_mapcount_dec(page); nr = last && !mapcounts.compound_mapcount; unlock_compound_mapcounts(head, &mapcounts); - - } else { - last = atomic_add_negative(-1, &page->_mapcount); - nr = last; } if (nr_pmdmapped) { From d7ec8f421ade2817983963a106b0085cc478c17b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 7 Nov 2022 16:50:01 +0000 Subject: [PATCH 127/313] selftests/damon: test non-context inputs to rm_contexts file There was a bug[1] that triggered by writing non-context DAMON debugfs file names to the 'rm_contexts' DAMON debugfs file. Add a selftest for the bug to avoid it happen again. [1] https://lore.kernel.org/damon/000000000000ede3ac05ec4abf8e@google.com/ Link: https://lkml.kernel.org/r/20221107165001.5717-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + .../damon/debugfs_rm_non_contexts.sh | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_rm_non_contexts.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index af490acc5348..838a8e49f77b 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -7,6 +7,7 @@ TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh +TEST_PROGS += debugfs_rm_non_contexts.sh TEST_PROGS += sysfs.sh TEST_PROGS += reclaim.sh lru_sort.sh diff --git a/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh new file mode 100644 index 000000000000..48b7af6b022c --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_rm_non_contexts.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test putting non-ctx files/dirs to rm_contexts file +# =================================================== + +dmesg -C + +for file in "$DBGFS/"* +do + echo "$(basename "$f")" > "$DBGFS/rm_contexts" + if dmesg | grep -q BUG + then + dmesg + exit 1 + fi +done From 11aad2631bf74b3c811dee76154702aab855a323 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Mon, 7 Nov 2022 15:39:22 +0000 Subject: [PATCH 128/313] mm/hugetlb_vmemmap: remap head page to newly allocated page Today with `hugetlb_free_vmemmap=on` the struct page memory that is freed back to page allocator is as following: for a 2M hugetlb page it will reuse the first 4K vmemmap page to remap the remaining 7 vmemmap pages, and for a 1G hugetlb it will remap the remaining 4095 vmemmap pages. Essentially, that means that it breaks the first 4K of a potentially contiguous chunk of memory of 32K (for 2M hugetlb pages) or 16M (for 1G hugetlb pages). For this reason the memory that it's free back to page allocator cannot be used for hugetlb to allocate huge pages of the same size, but rather only of a smaller huge page size: Trying to assign a 64G node to hugetlb (on a 128G 2node guest, each node having 64G): * Before allocation: Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 ... Node 0, zone Normal, type Movable 340 100 32 15 1 2 0 0 0 1 15558 $ echo 32768 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages $ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages 31987 * After: Node 0, zone Normal, type Movable 30893 32006 31515 7 0 0 0 0 0 0 0 Notice how the memory freed back are put back into 4K / 8K / 16K page pools. And it allocates a total of 31987 pages (63974M). To fix this behaviour rather than remapping second vmemmap page (thus breaking the contiguous block of memory backing the struct pages) repopulate the first vmemmap page with a new one. We allocate and copy from the currently mapped vmemmap page, and then remap it later on. The same algorithm works if there's a pre initialized walk::reuse_page and the head page doesn't need to be skipped and instead we remap it when the @addr being changed is the @reuse_addr. The new head page is allocated in vmemmap_remap_free() given that on restore there's no need for functional change. Note that, because right now one hugepage is remapped at a time, thus only one free 4K page at a time is needed to remap the head page. Should it fail to allocate said new page, it reuses the one that's already mapped just like before. As a result, for every 64G of contiguous hugepages it can give back 1G more of contiguous memory per 64G, while needing in total 128M new 4K pages (for 2M hugetlb) or 256k (for 1G hugetlb). After the changes, try to assign a 64G node to hugetlb (on a 128G 2node guest, each node with 64G): * Before allocation Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 ... Node 0, zone Normal, type Movable 1 1 1 0 0 1 0 0 1 1 15564 $ echo 32768 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages $ cat /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages 32394 * After: Node 0, zone Normal, type Movable 0 50 97 108 96 81 70 46 18 0 0 In the example above, 407 more hugeltb 2M pages are allocated i.e. 814M out of the 32394 (64788M) allocated. So the memory freed back is indeed being used back in hugetlb and there's no massive order-0..order-2 pages accumulated unused. [joao.m.martins@oracle.com: v3] Link: https://lkml.kernel.org/r/20221109200623.96867-1-joao.m.martins@oracle.com [joao.m.martins@oracle.com: add smp_wmb() to ensure page contents are visible prior to PTE write] Link: https://lkml.kernel.org/r/20221110121214.6297-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20221107153922.77094-1-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 7898c2c75e35..45e93a545dd7 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -203,12 +203,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; } while (pgd++, addr = next, addr != end); - /* - * We only change the mapping of the vmemmap virtual address range - * [@start + PAGE_SIZE, end), so we only need to flush the TLB which - * belongs to the range. - */ - flush_tlb_kernel_range(start + PAGE_SIZE, end); + flush_tlb_kernel_range(start, end); return 0; } @@ -244,9 +239,23 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; - pte_t entry = mk_pte(walk->reuse_page, pgprot); struct page *page = pte_page(*pte); + pte_t entry; + /* Remapping the head page requires r/w */ + if (unlikely(addr == walk->reuse_addr)) { + pgprot = PAGE_KERNEL; + list_del(&walk->reuse_page->lru); + + /* + * Makes sure that preceding stores to the page contents from + * vmemmap_remap_free() become visible before the set_pte_at() + * write. + */ + smp_wmb(); + } + + entry = mk_pte(walk->reuse_page, pgprot); list_add_tail(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } @@ -315,6 +324,24 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, }; + int nid = page_to_nid((struct page *)start); + gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY | + __GFP_NOWARN; + + /* + * Allocate a new head vmemmap page to avoid breaking a contiguous + * block of struct page memory when freeing it back to page allocator + * in free_vmemmap_page_list(). This will allow the likely contiguous + * struct page backing memory to be kept contiguous and allowing for + * more allocations of hugepages. Fallback to the currently + * mapped head page in case should it fail to allocate. + */ + walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); + if (walk.reuse_page) { + copy_page(page_to_virt(walk.reuse_page), + (void *)walk.reuse_addr); + list_add(&walk.reuse_page->lru, &vmemmap_pages); + } /* * In order to make remapping routine most efficient for the huge pages, From be5ef2d9b006bbd93b1a03e1da2dbd19fb0b9f14 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Nov 2022 01:42:04 -0800 Subject: [PATCH 129/313] mm,thp,rmap: subpages_mapcount of PTE-mapped subpages Patch series "mm,thp,rmap: rework the use of subpages_mapcount", v2. This patch (of 3): Following suggestion from Linus, instead of counting every PTE map of a compound page in subpages_mapcount, just count how many of its subpages are PTE-mapped: this yields the exact number needed for NR_ANON_MAPPED and NR_FILE_MAPPED stats, without any need for a locked scan of subpages; and requires updating the count less often. This does then revert total_mapcount() and folio_mapcount() to needing a scan of subpages; but they are inherently racy, and need no locking, so Linus is right that the scans are much better done there. Plus (unlike in 6.1 and previous) subpages_mapcount lets us avoid the scan in the common case of no PTE maps. And page_mapped() and folio_mapped() remain scanless and just as efficient with the new meaning of subpages_mapcount: those are the functions which I most wanted to remove the scan from. The updated page_dup_compound_rmap() is no longer suitable for use by anon THP's __split_huge_pmd_locked(); but page_add_anon_rmap() can be used for that, so long as its VM_BUG_ON_PAGE(!PageLocked) is deleted. Evidence is that this way goes slightly faster than the previous implementation for most cases; but significantly faster in the (now scanless) pmds after ptes case, which started out at 870ms and was brought down to 495ms by the previous series, now takes around 105ms. Link: https://lkml.kernel.org/r/a5849eca-22f1-3517-bf29-95d982242742@google.com Link: https://lkml.kernel.org/r/eec17e16-4e1-7c59-f1bc-5bca90dac919@google.com Signed-off-by: Hugh Dickins Suggested-by: Linus Torvalds Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: David Hildenbrand Cc: James Houghton Cc: Johannes Weiner Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 3 +- include/linux/mm.h | 52 ++++++----- include/linux/rmap.h | 9 +- mm/huge_memory.c | 2 +- mm/rmap.c | 160 ++++++++++++++------------------- 5 files changed, 107 insertions(+), 119 deletions(-) diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index 1e2a637cc607..af4c9d70321d 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -122,7 +122,8 @@ pages: - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount on relevant sub-page of the compound page, and also increment/decrement - ->subpages_mapcount, stored in first tail page of the compound page. + ->subpages_mapcount, stored in first tail page of the compound page, when + _mapcount goes from -1 to 0 or 0 to -1: counting sub-pages mapped by PTE. In order to have race-free accounting of sub-pages mapped, changes to sub-page ->_mapcount, ->subpages_mapcount and ->compound_mapcount are are all locked by bit_spin_lock of PG_locked in the first tail ->flags. diff --git a/include/linux/mm.h b/include/linux/mm.h index a904c2d60f12..84fb91f6f56e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -837,7 +837,7 @@ static inline int head_compound_mapcount(struct page *head) } /* - * Sum of mapcounts of sub-pages, does not include compound mapcount. + * Number of sub-pages mapped by PTE, does not include compound mapcount. * Must be called only on head of compound page. */ static inline int head_subpages_mapcount(struct page *head) @@ -873,23 +873,7 @@ static inline int page_mapcount(struct page *page) return head_compound_mapcount(page) + mapcount; } -static inline int total_mapcount(struct page *page) -{ - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) + 1; - page = compound_head(page); - return head_compound_mapcount(page) + head_subpages_mapcount(page); -} - -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped, - * even if this particular subpage is not itself mapped by any PTE or PMD. - */ -static inline bool page_mapped(struct page *page) -{ - return total_mapcount(page) > 0; -} +int total_compound_mapcount(struct page *head); /** * folio_mapcount() - Calculate the number of mappings of this folio. @@ -906,8 +890,20 @@ static inline int folio_mapcount(struct folio *folio) { if (likely(!folio_test_large(folio))) return atomic_read(&folio->_mapcount) + 1; - return atomic_read(folio_mapcount_ptr(folio)) + 1 + - atomic_read(folio_subpages_mapcount_ptr(folio)); + return total_compound_mapcount(&folio->page); +} + +static inline int total_mapcount(struct page *page) +{ + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + return total_compound_mapcount(compound_head(page)); +} + +static inline bool folio_large_is_mapped(struct folio *folio) +{ + return atomic_read(folio_mapcount_ptr(folio)) + + atomic_read(folio_subpages_mapcount_ptr(folio)) >= 0; } /** @@ -918,7 +914,21 @@ static inline int folio_mapcount(struct folio *folio) */ static inline bool folio_mapped(struct folio *folio) { - return folio_mapcount(folio) > 0; + if (likely(!folio_test_large(folio))) + return atomic_read(&folio->_mapcount) >= 0; + return folio_large_is_mapped(folio); +} + +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any sub-page of compound page is mapped, + * even if this particular sub-page is not itself mapped by any PTE or PMD. + */ +static inline bool page_mapped(struct page *page) +{ + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + return folio_large_is_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 011a7530dc76..5dadb9a3e010 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -204,14 +204,15 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -void page_dup_compound_rmap(struct page *page, bool compound); +void page_dup_compound_rmap(struct page *page); static inline void page_dup_file_rmap(struct page *page, bool compound) { - if (PageCompound(page)) - page_dup_compound_rmap(page, compound); - else + /* Is page being mapped by PTE? */ + if (likely(!compound)) atomic_inc(&page->_mapcount); + else + page_dup_compound_rmap(page); } /** diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 114517e8cbfc..681253adf529 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2221,7 +2221,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); if (!pmd_migration) - page_dup_compound_rmap(page + i, false); + page_add_anon_rmap(page + i, vma, addr, false); pte_unmap(pte); } diff --git a/mm/rmap.c b/mm/rmap.c index 4833d28c5e1a..e813785da613 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1117,55 +1117,36 @@ static void unlock_compound_mapcounts(struct page *head, bit_spin_unlock(PG_locked, &head[1].flags); } -/* - * When acting on a compound page under lock_compound_mapcounts(), avoid the - * unnecessary overhead of an actual atomic operation on its subpage mapcount. - * Return true if this is the first increment or the last decrement - * (remembering that page->_mapcount -1 represents logical mapcount 0). - */ -static bool subpage_mapcount_inc(struct page *page) +int total_compound_mapcount(struct page *head) { - int orig_mapcount = atomic_read(&page->_mapcount); - - atomic_set(&page->_mapcount, orig_mapcount + 1); - return orig_mapcount < 0; -} - -static bool subpage_mapcount_dec(struct page *page) -{ - int orig_mapcount = atomic_read(&page->_mapcount); - - atomic_set(&page->_mapcount, orig_mapcount - 1); - return orig_mapcount == 0; -} - -/* - * When mapping a THP's first pmd, or unmapping its last pmd, if that THP - * also has pte mappings, then those must be discounted: in order to maintain - * NR_ANON_MAPPED and NR_FILE_MAPPED statistics exactly, without any drift, - * and to decide when an anon THP should be put on the deferred split queue. - * This function must be called between lock_ and unlock_compound_mapcounts(). - */ -static int nr_subpages_unmapped(struct page *head, int nr_subpages) -{ - int nr = nr_subpages; + int mapcount = head_compound_mapcount(head); + int nr_subpages; int i; - /* Discount those subpages mapped by pte */ + /* In the common case, avoid the loop when no subpages mapped by PTE */ + if (head_subpages_mapcount(head) == 0) + return mapcount; + /* + * Add all the PTE mappings of those subpages mapped by PTE. + * Limit the loop, knowing that only subpages_mapcount are mapped? + * Perhaps: given all the raciness, that may be a good or a bad idea. + */ + nr_subpages = thp_nr_pages(head); for (i = 0; i < nr_subpages; i++) - if (atomic_read(&head[i]._mapcount) >= 0) - nr--; - return nr; + mapcount += atomic_read(&head[i]._mapcount); + + /* But each of those _mapcounts was based on -1 */ + mapcount += nr_subpages; + return mapcount; } /* - * page_dup_compound_rmap(), used when copying mm, or when splitting pmd, + * page_dup_compound_rmap(), used when copying mm, * provides a simple example of using lock_ and unlock_compound_mapcounts(). */ -void page_dup_compound_rmap(struct page *page, bool compound) +void page_dup_compound_rmap(struct page *head) { struct compound_mapcounts mapcounts; - struct page *head; /* * Hugetlb pages could use lock_compound_mapcounts(), like THPs do; @@ -1176,20 +1157,15 @@ void page_dup_compound_rmap(struct page *page, bool compound) * Note that hugetlb does not call page_add_file_rmap(): * here is where hugetlb shared page mapcount is raised. */ - if (PageHuge(page)) { - atomic_inc(compound_mapcount_ptr(page)); - return; - } + if (PageHuge(head)) { + atomic_inc(compound_mapcount_ptr(head)); + } else if (PageTransHuge(head)) { + /* That test is redundant: it's for safety or to optimize out */ - head = compound_head(page); - lock_compound_mapcounts(head, &mapcounts); - if (compound) { + lock_compound_mapcounts(head, &mapcounts); mapcounts.compound_mapcount++; - } else { - mapcounts.subpages_mapcount++; - subpage_mapcount_inc(page); + unlock_compound_mapcounts(head, &mapcounts); } - unlock_compound_mapcounts(head, &mapcounts); } /** @@ -1304,35 +1280,34 @@ void page_add_anon_rmap(struct page *page, struct compound_mapcounts mapcounts; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; - bool first; + bool first = true; if (unlikely(PageKsm(page))) lock_page_memcg(page); - else - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (likely(!PageCompound(page))) { + /* Is page being mapped by PTE? Is this its first map to be added? */ + if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; + if (first && PageCompound(page)) { + struct page *head = compound_head(page); + + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount++; + nr = !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + } + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ - } else if (compound && PageTransHuge(page)) { lock_compound_mapcounts(page, &mapcounts); first = !mapcounts.compound_mapcount; mapcounts.compound_mapcount++; if (first) { - nr = nr_pmdmapped = thp_nr_pages(page); - if (mapcounts.subpages_mapcount) - nr = nr_subpages_unmapped(page, nr_pmdmapped); + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - mapcounts.subpages_mapcount; } unlock_compound_mapcounts(page, &mapcounts); - } else { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount++; - first = subpage_mapcount_inc(page); - nr = first && !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); } VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); @@ -1411,28 +1386,29 @@ void page_add_file_rmap(struct page *page, VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); - if (likely(!PageCompound(page))) { + /* Is page being mapped by PTE? Is this its first map to be added? */ + if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); nr = first; + if (first && PageCompound(page)) { + struct page *head = compound_head(page); + + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount++; + nr = !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + } + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ - } else if (compound && PageTransHuge(page)) { lock_compound_mapcounts(page, &mapcounts); first = !mapcounts.compound_mapcount; mapcounts.compound_mapcount++; if (first) { - nr = nr_pmdmapped = thp_nr_pages(page); - if (mapcounts.subpages_mapcount) - nr = nr_subpages_unmapped(page, nr_pmdmapped); + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - mapcounts.subpages_mapcount; } unlock_compound_mapcounts(page, &mapcounts); - } else { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount++; - first = subpage_mapcount_inc(page); - nr = first && !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); } if (nr_pmdmapped) @@ -1471,29 +1447,29 @@ void page_remove_rmap(struct page *page, lock_page_memcg(page); - /* page still mapped by someone else? */ - if (likely(!PageCompound(page))) { + /* Is page being unmapped by PTE? Is this its last map to be removed? */ + if (likely(!compound)) { last = atomic_add_negative(-1, &page->_mapcount); nr = last; + if (last && PageCompound(page)) { + struct page *head = compound_head(page); + + lock_compound_mapcounts(head, &mapcounts); + mapcounts.subpages_mapcount--; + nr = !mapcounts.compound_mapcount; + unlock_compound_mapcounts(head, &mapcounts); + } + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ - } else if (compound && PageTransHuge(page)) { lock_compound_mapcounts(page, &mapcounts); mapcounts.compound_mapcount--; last = !mapcounts.compound_mapcount; if (last) { - nr = nr_pmdmapped = thp_nr_pages(page); - if (mapcounts.subpages_mapcount) - nr = nr_subpages_unmapped(page, nr_pmdmapped); + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - mapcounts.subpages_mapcount; } unlock_compound_mapcounts(page, &mapcounts); - } else { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount--; - last = subpage_mapcount_dec(page); - nr = last && !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); } if (nr_pmdmapped) { From 4b51634cd16a01b2be0f6b69cc0dae63de4751f2 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Nov 2022 01:49:36 -0800 Subject: [PATCH 130/313] mm,thp,rmap: subpages_mapcount COMPOUND_MAPPED if PMD-mapped Can the lock_compound_mapcount() bit_spin_lock apparatus be removed now? Yes. Not by atomic64_t or cmpxchg games, those get difficult on 32-bit; but if we slightly abuse subpages_mapcount by additionally demanding that one bit be set there when the compound page is PMD-mapped, then a cascade of two atomic ops is able to maintain the stats without bit_spin_lock. This is harder to reason about than when bit_spin_locked, but I believe safe; and no drift in stats detected when testing. When there are racing removes and adds, of course the sequence of operations is less well- defined; but each operation on subpages_mapcount is atomically good. What might be disastrous, is if subpages_mapcount could ever fleetingly appear negative: but the pte lock (or pmd lock) these rmap functions are called under, ensures that a last remove cannot race ahead of a first add. Continue to make an exception for hugetlb (PageHuge) pages, though that exception can be easily removed by a further commit if necessary: leave subpages_mapcount 0, don't bother with COMPOUND_MAPPED in its case, just carry on checking compound_mapcount too in folio_mapped(), page_mapped(). Evidence is that this way goes slightly faster than the previous implementation in all cases (pmds after ptes now taking around 103ms); and relieves us of worrying about contention on the bit_spin_lock. Link: https://lkml.kernel.org/r/3978f3ca-5473-55a7-4e14-efea5968d892@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: David Hildenbrand Cc: James Houghton Cc: Johannes Weiner Cc: John Hubbard Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- Documentation/mm/transhuge.rst | 7 +- include/linux/mm.h | 19 +++++- include/linux/rmap.h | 13 ++-- mm/page_alloc.c | 2 +- mm/rmap.c | 121 +++++++-------------------------- 5 files changed, 51 insertions(+), 111 deletions(-) diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst index af4c9d70321d..ec3dc5b04226 100644 --- a/Documentation/mm/transhuge.rst +++ b/Documentation/mm/transhuge.rst @@ -118,15 +118,14 @@ pages: succeeds on tail pages. - map/unmap of PMD entry for the whole compound page increment/decrement - ->compound_mapcount, stored in the first tail page of the compound page. + ->compound_mapcount, stored in the first tail page of the compound page; + and also increment/decrement ->subpages_mapcount (also in the first tail) + by COMPOUND_MAPPED when compound_mapcount goes from -1 to 0 or 0 to -1. - map/unmap of sub-pages with PTE entry increment/decrement ->_mapcount on relevant sub-page of the compound page, and also increment/decrement ->subpages_mapcount, stored in first tail page of the compound page, when _mapcount goes from -1 to 0 or 0 to -1: counting sub-pages mapped by PTE. - In order to have race-free accounting of sub-pages mapped, changes to - sub-page ->_mapcount, ->subpages_mapcount and ->compound_mapcount are - are all locked by bit_spin_lock of PG_locked in the first tail ->flags. split_huge_page internally has to distribute the refcounts in the head page to the tail pages before clearing all PG_head/tail bits from the page diff --git a/include/linux/mm.h b/include/linux/mm.h index 84fb91f6f56e..d33639be3db3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -836,13 +836,22 @@ static inline int head_compound_mapcount(struct page *head) return atomic_read(compound_mapcount_ptr(head)) + 1; } +/* + * If a 16GB hugetlb page were mapped by PTEs of all of its 4kB sub-pages, + * its subpages_mapcount would be 0x400000: choose the COMPOUND_MAPPED bit + * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE). Hugetlb currently + * leaves subpages_mapcount at 0, but avoid surprise if it participates later. + */ +#define COMPOUND_MAPPED 0x800000 +#define SUBPAGES_MAPPED (COMPOUND_MAPPED - 1) + /* * Number of sub-pages mapped by PTE, does not include compound mapcount. * Must be called only on head of compound page. */ static inline int head_subpages_mapcount(struct page *head) { - return atomic_read(subpages_mapcount_ptr(head)); + return atomic_read(subpages_mapcount_ptr(head)) & SUBPAGES_MAPPED; } /* @@ -902,8 +911,12 @@ static inline int total_mapcount(struct page *page) static inline bool folio_large_is_mapped(struct folio *folio) { - return atomic_read(folio_mapcount_ptr(folio)) + - atomic_read(folio_subpages_mapcount_ptr(folio)) >= 0; + /* + * Reading folio_mapcount_ptr() below could be omitted if hugetlb + * participated in incrementing subpages_mapcount when compound mapped. + */ + return atomic_read(folio_subpages_mapcount_ptr(folio)) > 0 || + atomic_read(folio_mapcount_ptr(folio)) >= 0; } /** diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5dadb9a3e010..bd3504d11b15 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -204,15 +204,14 @@ void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address); -void page_dup_compound_rmap(struct page *page); +static inline void __page_dup_rmap(struct page *page, bool compound) +{ + atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); +} static inline void page_dup_file_rmap(struct page *page, bool compound) { - /* Is page being mapped by PTE? */ - if (likely(!compound)) - atomic_inc(&page->_mapcount); - else - page_dup_compound_rmap(page); + __page_dup_rmap(page, compound); } /** @@ -261,7 +260,7 @@ static inline int page_try_dup_anon_rmap(struct page *page, bool compound, * the page R/O into both processes. */ dup: - page_dup_file_rmap(page, compound); + __page_dup_rmap(page, compound); return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0705917ddf54..c33b6963c2d7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1330,7 +1330,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) bad_page(page, "nonzero compound_mapcount"); goto out; } - if (unlikely(head_subpages_mapcount(head_page))) { + if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) { bad_page(page, "nonzero subpages_mapcount"); goto out; } diff --git a/mm/rmap.c b/mm/rmap.c index e813785da613..459dc1c44d8a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,38 +1085,6 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -struct compound_mapcounts { - unsigned int compound_mapcount; - unsigned int subpages_mapcount; -}; - -/* - * lock_compound_mapcounts() first locks, then copies subpages_mapcount and - * compound_mapcount from head[1].compound_mapcount and subpages_mapcount, - * converting from struct page's internal representation to logical count - * (that is, adding 1 to compound_mapcount to hide its offset by -1). - */ -static void lock_compound_mapcounts(struct page *head, - struct compound_mapcounts *local) -{ - bit_spin_lock(PG_locked, &head[1].flags); - local->compound_mapcount = atomic_read(compound_mapcount_ptr(head)) + 1; - local->subpages_mapcount = atomic_read(subpages_mapcount_ptr(head)); -} - -/* - * After caller has updated subpage._mapcount, local subpages_mapcount and - * local compound_mapcount, as necessary, unlock_compound_mapcounts() converts - * and copies them back to the compound head[1] fields, and then unlocks. - */ -static void unlock_compound_mapcounts(struct page *head, - struct compound_mapcounts *local) -{ - atomic_set(compound_mapcount_ptr(head), local->compound_mapcount - 1); - atomic_set(subpages_mapcount_ptr(head), local->subpages_mapcount); - bit_spin_unlock(PG_locked, &head[1].flags); -} - int total_compound_mapcount(struct page *head) { int mapcount = head_compound_mapcount(head); @@ -1140,34 +1108,6 @@ int total_compound_mapcount(struct page *head) return mapcount; } -/* - * page_dup_compound_rmap(), used when copying mm, - * provides a simple example of using lock_ and unlock_compound_mapcounts(). - */ -void page_dup_compound_rmap(struct page *head) -{ - struct compound_mapcounts mapcounts; - - /* - * Hugetlb pages could use lock_compound_mapcounts(), like THPs do; - * but at present they are still being managed by atomic operations: - * which are likely to be somewhat faster, so don't rush to convert - * them over without evaluating the effect. - * - * Note that hugetlb does not call page_add_file_rmap(): - * here is where hugetlb shared page mapcount is raised. - */ - if (PageHuge(head)) { - atomic_inc(compound_mapcount_ptr(head)); - } else if (PageTransHuge(head)) { - /* That test is redundant: it's for safety or to optimize out */ - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.compound_mapcount++; - unlock_compound_mapcounts(head, &mapcounts); - } -} - /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma @@ -1277,7 +1217,7 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { - struct compound_mapcounts mapcounts; + atomic_t *mapped; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; bool first = true; @@ -1290,24 +1230,20 @@ void page_add_anon_rmap(struct page *page, first = atomic_inc_and_test(&page->_mapcount); nr = first; if (first && PageCompound(page)) { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount++; - nr = !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_inc_return_relaxed(mapped); + nr = !(nr & COMPOUND_MAPPED); } } else if (PageTransHuge(page)) { /* That test is redundant: it's for safety or to optimize out */ - lock_compound_mapcounts(page, &mapcounts); - first = !mapcounts.compound_mapcount; - mapcounts.compound_mapcount++; + first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - mapcounts.subpages_mapcount; + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); } - unlock_compound_mapcounts(page, &mapcounts); } VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); @@ -1360,6 +1296,7 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); + atomic_set(subpages_mapcount_ptr(page), COMPOUND_MAPPED); nr = thp_nr_pages(page); __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } @@ -1379,7 +1316,7 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - struct compound_mapcounts mapcounts; + atomic_t *mapped; int nr = 0, nr_pmdmapped = 0; bool first; @@ -1391,24 +1328,20 @@ void page_add_file_rmap(struct page *page, first = atomic_inc_and_test(&page->_mapcount); nr = first; if (first && PageCompound(page)) { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount++; - nr = !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_inc_return_relaxed(mapped); + nr = !(nr & COMPOUND_MAPPED); } } else if (PageTransHuge(page)) { /* That test is redundant: it's for safety or to optimize out */ - lock_compound_mapcounts(page, &mapcounts); - first = !mapcounts.compound_mapcount; - mapcounts.compound_mapcount++; + first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - mapcounts.subpages_mapcount; + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); } - unlock_compound_mapcounts(page, &mapcounts); } if (nr_pmdmapped) @@ -1432,7 +1365,7 @@ void page_add_file_rmap(struct page *page, void page_remove_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - struct compound_mapcounts mapcounts; + atomic_t *mapped; int nr = 0, nr_pmdmapped = 0; bool last; @@ -1452,24 +1385,20 @@ void page_remove_rmap(struct page *page, last = atomic_add_negative(-1, &page->_mapcount); nr = last; if (last && PageCompound(page)) { - struct page *head = compound_head(page); - - lock_compound_mapcounts(head, &mapcounts); - mapcounts.subpages_mapcount--; - nr = !mapcounts.compound_mapcount; - unlock_compound_mapcounts(head, &mapcounts); + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_dec_return_relaxed(mapped); + nr = !(nr & COMPOUND_MAPPED); } } else if (PageTransHuge(page)) { /* That test is redundant: it's for safety or to optimize out */ - lock_compound_mapcounts(page, &mapcounts); - mapcounts.compound_mapcount--; - last = !mapcounts.compound_mapcount; + last = atomic_add_negative(-1, compound_mapcount_ptr(page)); if (last) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - mapcounts.subpages_mapcount; + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); } - unlock_compound_mapcounts(page, &mapcounts); } if (nr_pmdmapped) { From 96d82deb743ab42c8f0b911eb49db83f0e6db311 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 22 Nov 2022 01:51:50 -0800 Subject: [PATCH 131/313] mm,thp,rmap: clean up the end of __split_huge_pmd_locked() It's hard to add a page_add_anon_rmap() into __split_huge_pmd_locked()'s HPAGE_PMD_NR set_pte_at() loop, without wincing at the "freeze" case's HPAGE_PMD_NR page_remove_rmap() loop below it. It's just a mistake to add rmaps in the "freeze" (insert migration entries prior to splitting huge page) case: the pmd_migration case already avoids doing that, so just follow its lead. page_add_ref() versus put_page() likewise. But why is one more put_page() needed in the "freeze" case? Because it's removing the pmd rmap, already removed when pmd_migration (and freeze and pmd_migration are mutually exclusive cases). Link: https://lkml.kernel.org/r/d43748aa-fece-e0b9-c4ab-f23c9ebc9011@google.com Signed-off-by: Hugh Dickins Acked-by: Kirill A. Shutemov Cc: Dan Carpenter Cc: David Hildenbrand Cc: James Houghton Cc: Johannes Weiner Cc: John Hubbard Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 681253adf529..aba340684285 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2141,7 +2141,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, uffd_wp = pmd_uffd_wp(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); - page_ref_add(page, HPAGE_PMD_NR - 1); /* * Without "freeze", we'll simply split the PMD, propagating the @@ -2161,6 +2160,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) freeze = false; + if (!freeze) + page_ref_add(page, HPAGE_PMD_NR - 1); } /* @@ -2216,27 +2217,21 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mksoft_dirty(entry); if (uffd_wp) entry = pte_mkuffd_wp(entry); + page_add_anon_rmap(page + i, vma, addr, false); } pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); - if (!pmd_migration) - page_add_anon_rmap(page + i, vma, addr, false); pte_unmap(pte); } if (!pmd_migration) page_remove_rmap(page, vma, true); + if (freeze) + put_page(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - - if (freeze) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, vma, false); - put_page(page + i); - } - } } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, From 1a1af17ea81115914c8efc1177fd94719c84fc11 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Tue, 8 Nov 2022 19:53:48 +0800 Subject: [PATCH 132/313] tools/vm/page_owner: ignore page_owner_sort binary page_owner_sort was introduced since commit 48c96a368579 ("mm/page_owner: keep track of page owners"), and we should ignore it. Link: https://lkml.kernel.org/r/tencent_F6CAC0ABE16839E2B2419BD07316DA65BB06@qq.com Signed-off-by: Rong Tao Signed-off-by: Andrew Morton --- tools/vm/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/vm/.gitignore b/tools/vm/.gitignore index 79bb92ae1bb3..922879f93fc8 100644 --- a/tools/vm/.gitignore +++ b/tools/vm/.gitignore @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only slabinfo page-types +page_owner_sort From d84887739d5c982afa50b155aad628bb8ff206c5 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 8 Nov 2022 18:46:46 +0100 Subject: [PATCH 133/313] mm/mprotect: allow clean exclusive anon pages to be writable Patch series "mm/autonuma: replace savedwrite infrastructure", v2. As discussed in my talk at LPC, we can reuse the same mechanism for deciding whether to map a pte writable when upgrading permissions via mprotect() -- e.g., PROT_READ -> PROT_READ|PROT_WRITE -- to replace the savedwrite infrastructure used for NUMA hinting faults (e.g., PROT_NONE -> PROT_READ|PROT_WRITE). Instead of maintaining previous write permissions for a pte/pmd, we re-determine if the pte/pmd can be writable. The big benefit is that we have a common logic for deciding whether we can map a pte/pmd writable on protection changes. For private mappings, there should be no difference -- from what I understand, that is what autonuma benchmarks care about. I ran autonumabench for v1 on a system with 2 NUMA nodes, 96 GiB each via: perf stat --null --repeat 10 The numa01 benchmark is quite noisy in my environment and I failed to reduce the noise so far. numa01: mm-unstable: 146.88 +- 6.54 seconds time elapsed ( +- 4.45% ) mm-unstable++: 147.45 +- 13.39 seconds time elapsed ( +- 9.08% ) numa02: mm-unstable: 16.0300 +- 0.0624 seconds time elapsed ( +- 0.39% ) mm-unstable++: 16.1281 +- 0.0945 seconds time elapsed ( +- 0.59% ) It is worth noting that for shared writable mappings that require writenotify, we will only avoid write faults if the pte/pmd is dirty (inherited from the older mprotect logic). If we ever care about optimizing that further, we'd need a different mechanism to identify whether the FS still needs to get notified on the next write access. In any case, such an optimization will then not be autonuma-specific, but mprotect() permission upgrades would similarly benefit from it. This patch (of 7): Anonymous pages might have the dirty bit clear, but this should not prevent mprotect from making them writable if they are exclusive. Therefore, skip the test whether the page is dirty in this case. Note that there are already other ways to get a writable PTE mapping an anonymous page that is clean: for example, via MADV_FREE. In an ideal world, we'd have a different indication from the FS whether writenotify is still required. [david@redhat.com: return directly; update description] Link: https://lkml.kernel.org/r/20221108174652.198904-1-david@redhat.com Link: https://lkml.kernel.org/r/20221108174652.198904-2-david@redhat.com Signed-off-by: Nadav Amit Signed-off-by: David Hildenbrand Cc: Linus Torvalds Cc: Mel Gorman Cc: Dave Chinner Cc: Peter Xu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Vlastimil Babka Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Mike Rapoport Cc: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/mprotect.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 8d770855b591..86a28c0e190f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -46,7 +46,7 @@ static inline bool can_change_pte_writable(struct vm_area_struct *vma, VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte)); - if (pte_protnone(pte) || !pte_dirty(pte)) + if (pte_protnone(pte)) return false; /* Do we need write faults for softdirty tracking? */ @@ -65,11 +65,10 @@ static inline bool can_change_pte_writable(struct vm_area_struct *vma, * the PT lock. */ page = vm_normal_page(vma, addr, pte); - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) - return false; + return page && PageAnon(page) && PageAnonExclusive(page); } - return true; + return pte_dirty(pte); } static unsigned long change_pte_range(struct mmu_gather *tlb, From 7ea7e333842ed50fe0a0b256c270b54f8ec2353e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:47 +0100 Subject: [PATCH 134/313] mm/mprotect: minor can_change_pte_writable() cleanups We want to replicate this code for handling PMDs soon. (1) No need to crash the kernel, warning and rejecting is good enough. As this will no longer get optimized out, drop the pte_write() check: no harm would be done. (2) Add a comment why PROT_NONE mapped pages are excluded. (3) Add a comment regarding MAP_SHARED handling and why we rely on the dirty bit in the PTE. Link: https://lkml.kernel.org/r/20221108174652.198904-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mprotect.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 86a28c0e190f..72aabffb7871 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -44,8 +44,10 @@ static inline bool can_change_pte_writable(struct vm_area_struct *vma, { struct page *page; - VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte)); + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) + return false; + /* Don't touch entries that are not even readable. */ if (pte_protnone(pte)) return false; @@ -59,15 +61,22 @@ static inline bool can_change_pte_writable(struct vm_area_struct *vma, if (!(vma->vm_flags & VM_SHARED)) { /* - * We can only special-case on exclusive anonymous pages, - * because we know that our write-fault handler similarly would - * map them writable without any additional checks while holding - * the PT lock. + * Writable MAP_PRIVATE mapping: We can only special-case on + * exclusive anonymous pages, because we know that our + * write-fault handler similarly would map them writable without + * any additional checks while holding the PT lock. */ page = vm_normal_page(vma, addr, pte); return page && PageAnon(page) && PageAnonExclusive(page); } + /* + * Writable MAP_SHARED mapping: "clean" might indicate that the FS still + * needs a real write-fault for writenotify + * (see vma_wants_writenotify()). If "dirty", the assumption is that the + * FS was already notified and we can simply mark the PTE writable + * just like the write-fault handler would do. + */ return pte_dirty(pte); } From c27f479ef5428f691787fb6fe3703a70e931ae8c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:48 +0100 Subject: [PATCH 135/313] mm/huge_memory: try avoiding write faults when changing PMD protection Let's replicate what we have for PTEs in can_change_pte_writable() also for PMDs. While this might look like a pure performance improvement, we'll us this to get rid of savedwrite handling in do_huge_pmd_numa_page() next. Place do_huge_pmd_numa_page() strategically good for that purpose. Note that MM_CP_TRY_CHANGE_WRITABLE is currently only set when we come via mprotect_fixup(). Link: https://lkml.kernel.org/r/20221108174652.198904-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/huge_memory.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index aba340684285..fac917b78102 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1390,6 +1390,36 @@ fallback: return VM_FAULT_FALLBACK; } +static inline bool can_change_pmd_writable(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + struct page *page; + + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) + return false; + + /* Don't touch entries that are not even readable (NUMA hinting). */ + if (pmd_protnone(pmd)) + return false; + + /* Do we need write faults for softdirty tracking? */ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_huge_pmd_wp(vma, pmd)) + return false; + + if (!(vma->vm_flags & VM_SHARED)) { + /* See can_change_pte_writable(). */ + page = vm_normal_page_pmd(vma, addr, pmd); + return page && PageAnon(page) && PageAnonExclusive(page); + } + + /* See can_change_pte_writable(). */ + return pmd_dirty(pmd); +} + /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, struct vm_area_struct *vma, @@ -1893,13 +1923,17 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, */ entry = pmd_clear_uffd_wp(entry); } + + /* See change_pte_range(). */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && + can_change_pmd_writable(vma, addr, entry)) + entry = pmd_mkwrite(entry); + ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); if (huge_pmd_needs_flush(oldpmd, entry)) tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); - - BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); unlock: spin_unlock(ptl); return ret; From eb309ec89953d6a3e8e35a3a577bab13893858d8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:49 +0100 Subject: [PATCH 136/313] mm/mprotect: factor out check whether manual PTE write upgrades are required Let's factor the check out into vma_wants_manual_pte_write_upgrade(), to be reused in NUMA hinting fault context soon. Link: https://lkml.kernel.org/r/20221108174652.198904-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++-- mm/mprotect.c | 17 ++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d33639be3db3..e203e8a83e2d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2088,6 +2088,20 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) +int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); +static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) +{ + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. + */ + if (vma->vm_flags & VM_SHARED) + return vma_wants_writenotify(vma, vma->vm_page_prot); + return !!(vma->vm_flags & VM_WRITE); + +} extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, @@ -2227,8 +2241,6 @@ static inline int pte_devmap(pte_t pte) } #endif -int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); - extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, diff --git a/mm/mprotect.c b/mm/mprotect.c index 72aabffb7871..fe22db2c9cdd 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -558,8 +558,8 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; + unsigned int mm_cp_flags = 0; unsigned long charged = 0; - bool try_change_writable; pgoff_t pgoff; int error; @@ -637,20 +637,11 @@ success: * held in write mode. */ vma->vm_flags = newflags; - /* - * We want to check manually if we can change individual PTEs writable - * if we can't do that automatically for all PTEs in a mapping. For - * private mappings, that's always the case when we have write - * permissions as we properly have to handle COW. - */ - if (vma->vm_flags & VM_SHARED) - try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot); - else - try_change_writable = !!(vma->vm_flags & VM_WRITE); + if (vma_wants_manual_pte_write_upgrade(vma)) + mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); - change_protection(tlb, vma, start, end, vma->vm_page_prot, - try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0); + change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major From 6a56ccbcf6c69538b152644107a1d7383c876ca7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:50 +0100 Subject: [PATCH 137/313] mm/autonuma: use can_change_(pte|pmd)_writable() to replace savedwrite commit b191f9b106ea ("mm: numa: preserve PTE write permissions across a NUMA hinting fault") added remembering write permissions using ordinary pte_write() for PROT_NONE mapped pages to avoid write faults when remapping the page !PROT_NONE on NUMA hinting faults. That commit noted: The patch looks hacky but the alternatives looked worse. The tidest was to rewalk the page tables after a hinting fault but it was more complex than this approach and the performance was worse. It's not generally safe to just mark the page writable during the fault if it's a write fault as it may have been read-only for COW so that approach was discarded. Later, commit 288bc54949fc ("mm/autonuma: let architecture override how the write bit should be stashed in a protnone pte.") introduced a family of savedwrite PTE functions that didn't necessarily improve the whole situation. One confusing thing is that nowadays, if a page is pte_protnone() and pte_savedwrite() then also pte_write() is true. Another source of confusion is that there is only a single pte_mk_savedwrite() call in the kernel. All other write-protection code seems to silently rely on pte_wrprotect(). Ever since PageAnonExclusive was introduced and we started using it in mprotect context via commit 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection"), we do have machinery in place to avoid write faults when changing protection, which is exactly what we want to do here. Let's similarly do what ordinary mprotect() does nowadays when upgrading write permissions and reuse can_change_pte_writable() and can_change_pmd_writable() to detect if we can upgrade PTE permissions to be writable. For anonymous pages there should be absolutely no change: if an anonymous page is not exclusive, it could not have been mapped writable -- because only exclusive anonymous pages can be mapped writable. However, there *might* be a change for writable shared mappings that require writenotify: if they are not dirty, we cannot map them writable. While it might not matter in practice, we'd need a different way to identify whether writenotify is actually required -- and ordinary mprotect would benefit from that as well. Note that we don't optimize for the actual migration case: (1) When migration succeeds the new PTE will not be writable because the source PTE was not writable (protnone); in the future we might just optimize that case similarly by reusing can_change_pte_writable()/can_change_pmd_writable() when removing migration PTEs. (2) When migration fails, we'd have to recalculate the "writable" flag because we temporarily dropped the PT lock; for now keep it simple and set "writable=false". We'll remove all savedwrite leftovers next. Link: https://lkml.kernel.org/r/20221108174652.198904-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/huge_memory.c | 26 +++++++++++++++----------- mm/ksm.c | 9 ++++----- mm/memory.c | 16 +++++++++++++--- mm/mprotect.c | 7 ++----- 5 files changed, 36 insertions(+), 24 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e203e8a83e2d..8597ef676fc3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2102,6 +2102,8 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma return !!(vma->vm_flags & VM_WRITE); } +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); extern unsigned long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fac917b78102..29102e3ddf84 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1511,8 +1511,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = NUMA_NO_NODE; int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); - bool migrated = false; - bool was_writable = pmd_savedwrite(oldpmd); + bool migrated = false, writable = false; int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1522,12 +1521,22 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } pmd = pmd_modify(oldpmd, vma->vm_page_prot); + + /* + * Detect now whether the PMD could be writable; this information + * is only valid while holding the PT lock. + */ + writable = pmd_write(pmd); + if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + can_change_pmd_writable(vma, vmf->address, pmd)) + writable = true; + page = vm_normal_page_pmd(vma, haddr, pmd); if (!page) goto out_map; /* See similar comment in do_numa_page for explanation */ - if (!was_writable) + if (!writable) flags |= TNF_NO_GROUP; page_nid = page_to_nid(page); @@ -1546,6 +1555,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } spin_unlock(vmf->ptl); + writable = false; migrated = migrate_misplaced_page(page, vma, target_nid); if (migrated) { @@ -1572,7 +1582,7 @@ out_map: /* Restore the PMD */ pmd = pmd_modify(oldpmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); - if (was_writable) + if (writable) pmd = pmd_mkwrite(pmd); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); @@ -1813,11 +1823,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t oldpmd, entry; - bool preserve_write; - int ret; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + int ret = 1; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); @@ -1828,9 +1837,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; - #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); @@ -1910,8 +1916,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); if (uffd_wp) { entry = pmd_wrprotect(entry); entry = pmd_mkuffd_wp(entry); diff --git a/mm/ksm.c b/mm/ksm.c index 7ba97f86d831..a71245241d22 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1041,7 +1041,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, anon_exclusive = PageAnonExclusive(page); if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || - (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || anon_exclusive || mm_tlb_flush_pending(mm)) { pte_t entry; @@ -1079,11 +1078,11 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (pte_dirty(entry)) set_page_dirty(page); + entry = pte_mkclean(entry); + + if (pte_write(entry)) + entry = pte_wrprotect(entry); - if (pte_protnone(entry)) - entry = pte_mkclean(pte_clear_savedwrite(entry)); - else - entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = *pvmw.pte; diff --git a/mm/memory.c b/mm/memory.c index 142c4229549b..1749c638734f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4675,10 +4675,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = NUMA_NO_NODE; + bool writable = false; int last_cpupid; int target_nid; pte_t pte, old_pte; - bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; /* @@ -4697,6 +4697,15 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) old_pte = ptep_get(vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); + /* + * Detect now whether the PTE could be writable; this information + * is only valid while holding the PT lock. + */ + writable = pte_write(pte); + if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + can_change_pte_writable(vma, vmf->address, pte)) + writable = true; + page = vm_normal_page(vma, vmf->address, pte); if (!page || is_zone_device_page(page)) goto out_map; @@ -4713,7 +4722,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. */ - if (!was_writable) + if (!writable) flags |= TNF_NO_GROUP; /* @@ -4740,6 +4749,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) goto out_map; } pte_unmap_unlock(vmf->pte, vmf->ptl); + writable = false; /* Migrate to the requested node */ if (migrate_misplaced_page(page, vma, target_nid)) { @@ -4768,7 +4778,7 @@ out_map: old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); - if (was_writable) + if (writable) pte = pte_mkwrite(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); diff --git a/mm/mprotect.c b/mm/mprotect.c index fe22db2c9cdd..093cb50f2fc4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -39,8 +39,8 @@ #include "internal.h" -static inline bool can_change_pte_writable(struct vm_area_struct *vma, - unsigned long addr, pte_t pte) +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { struct page *page; @@ -121,7 +121,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool preserve_write = prot_numa && pte_write(oldpte); /* * Avoid trapping faults against the zero or KSM @@ -177,8 +176,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_modify(oldpte, newprot); - if (preserve_write) - ptent = pte_mk_savedwrite(ptent); if (uffd_wp) { ptent = pte_wrprotect(ptent); From d6379159f47630813f06f97535cc82ce7b9eed49 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:51 +0100 Subject: [PATCH 138/313] mm: remove unused savedwrite infrastructure NUMA hinting no longer uses savedwrite, let's rip it out. ... and while at it, drop __pte_write() and __pmd_write() on ppc64. Link: https://lkml.kernel.org/r/20221108174652.198904-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/pgtable.h | 80 +------------------- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- include/linux/pgtable.h | 24 ------ mm/debug_vm_pgtable.c | 32 -------- 4 files changed, 5 insertions(+), 133 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c436d8422654..cb4c67bf45d7 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -401,35 +401,9 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #define pmdp_clear_flush_young pmdp_test_and_clear_young -static inline int __pte_write(pte_t pte) -{ - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE)); -} - -#ifdef CONFIG_NUMA_BALANCING -#define pte_savedwrite pte_savedwrite -static inline bool pte_savedwrite(pte_t pte) -{ - /* - * Saved write ptes are prot none ptes that doesn't have - * privileged bit sit. We mark prot none as one which has - * present and pviliged bit set and RWX cleared. To mark - * protnone which used to have _PAGE_WRITE set we clear - * the privileged bit. - */ - return !(pte_raw(pte) & cpu_to_be64(_PAGE_RWX | _PAGE_PRIVILEGED)); -} -#else -#define pte_savedwrite pte_savedwrite -static inline bool pte_savedwrite(pte_t pte) -{ - return false; -} -#endif - static inline int pte_write(pte_t pte) { - return __pte_write(pte) || pte_savedwrite(pte); + return !!(pte_raw(pte) & cpu_to_be64(_PAGE_WRITE)); } static inline int pte_read(pte_t pte) @@ -441,24 +415,16 @@ static inline int pte_read(pte_t pte) static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if (__pte_write(*ptep)) + if (pte_write(*ptep)) pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); - else if (unlikely(pte_savedwrite(*ptep))) - pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 0); } #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - /* - * We should not find protnone for hugetlb, but this complete the - * interface. - */ - if (__pte_write(*ptep)) + if (pte_write(*ptep)) pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); - else if (unlikely(pte_savedwrite(*ptep))) - pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 1); } #define __HAVE_ARCH_PTEP_GET_AND_CLEAR @@ -535,36 +501,6 @@ static inline int pte_protnone(pte_t pte) return (pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE | _PAGE_RWX)) == cpu_to_be64(_PAGE_PRESENT | _PAGE_PTE); } - -#define pte_mk_savedwrite pte_mk_savedwrite -static inline pte_t pte_mk_savedwrite(pte_t pte) -{ - /* - * Used by Autonuma subsystem to preserve the write bit - * while marking the pte PROT_NONE. Only allow this - * on PROT_NONE pte - */ - VM_BUG_ON((pte_raw(pte) & cpu_to_be64(_PAGE_PRESENT | _PAGE_RWX | _PAGE_PRIVILEGED)) != - cpu_to_be64(_PAGE_PRESENT | _PAGE_PRIVILEGED)); - return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED)); -} - -#define pte_clear_savedwrite pte_clear_savedwrite -static inline pte_t pte_clear_savedwrite(pte_t pte) -{ - /* - * Used by KSM subsystem to make a protnone pte readonly. - */ - VM_BUG_ON(!pte_protnone(pte)); - return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED)); -} -#else -#define pte_clear_savedwrite pte_clear_savedwrite -static inline pte_t pte_clear_savedwrite(pte_t pte) -{ - VM_WARN_ON(1); - return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE)); -} #endif /* CONFIG_NUMA_BALANCING */ static inline bool pte_hw_valid(pte_t pte) @@ -641,8 +577,6 @@ static inline unsigned long pte_pfn(pte_t pte) /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { - if (unlikely(pte_savedwrite(pte))) - return pte_clear_savedwrite(pte); return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_WRITE)); } @@ -1139,8 +1073,6 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) -#define pmd_mk_savedwrite(pmd) pte_pmd(pte_mk_savedwrite(pmd_pte(pmd))) -#define pmd_clear_savedwrite(pmd) pte_pmd(pte_clear_savedwrite(pmd_pte(pmd))) #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #define pmd_soft_dirty(pmd) pte_soft_dirty(pmd_pte(pmd)) @@ -1162,8 +1094,6 @@ static inline int pmd_protnone(pmd_t pmd) #endif /* CONFIG_NUMA_BALANCING */ #define pmd_write(pmd) pte_write(pmd_pte(pmd)) -#define __pmd_write(pmd) __pte_write(pmd_pte(pmd)) -#define pmd_savedwrite(pmd) pte_savedwrite(pmd_pte(pmd)) #define pmd_access_permitted pmd_access_permitted static inline bool pmd_access_permitted(pmd_t pmd, bool write) @@ -1241,10 +1171,8 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - if (__pmd_write((*pmdp))) + if (pmd_write(*pmdp)) pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); - else if (unlikely(pmd_savedwrite(*pmdp))) - pmd_hugepage_update(mm, addr, pmdp, 0, _PAGE_PRIVILEGED); } /* diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5a05953ae13f..9182324dbef9 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -265,7 +265,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } pte = kvmppc_read_update_linux_pte(ptep, writing); if (pte_present(pte) && !pte_protnone(pte)) { - if (writing && !__pte_write(pte)) + if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); is_ci = pte_ci(pte); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5f0d7d0b9471..c74cce67eec8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -503,30 +503,6 @@ static inline pte_t pte_sw_mkyoung(pte_t pte) #define pte_sw_mkyoung pte_sw_mkyoung #endif -#ifndef pte_savedwrite -#define pte_savedwrite pte_write -#endif - -#ifndef pte_mk_savedwrite -#define pte_mk_savedwrite pte_mkwrite -#endif - -#ifndef pte_clear_savedwrite -#define pte_clear_savedwrite pte_wrprotect -#endif - -#ifndef pmd_savedwrite -#define pmd_savedwrite pmd_write -#endif - -#ifndef pmd_mk_savedwrite -#define pmd_mk_savedwrite pmd_mkwrite -#endif - -#ifndef pmd_clear_savedwrite -#define pmd_clear_savedwrite pmd_wrprotect -#endif - #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 2b61fde8c38c..c631ade3f1d2 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -171,18 +171,6 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); } -static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) -{ - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); - - if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) - return; - - pr_debug("Validating PTE saved write\n"); - WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte)))); - WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte)))); -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { @@ -302,22 +290,6 @@ static void __init pmd_leaf_tests(struct pgtable_debug_args *args) WARN_ON(!pmd_leaf(pmd)); } -static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) -{ - pmd_t pmd; - - if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) - return; - - if (!has_transparent_hugepage()) - return; - - pr_debug("Validating PMD saved write\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none); - WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); - WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); -} - #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { @@ -451,7 +423,6 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } -static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP @@ -1288,9 +1259,6 @@ static int __init debug_vm_pgtable(void) pmd_leaf_tests(&args); pud_leaf_tests(&args); - pte_savedwrite_tests(&args); - pmd_savedwrite_tests(&args); - pte_special_tests(&args); pte_protnone_tests(&args); pmd_protnone_tests(&args); From 07f8bac4982f98fc4b5ae05679d76fccc15079ea Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 8 Nov 2022 18:46:52 +0100 Subject: [PATCH 139/313] selftests/vm: anon_cow: add mprotect() optimization tests Let's extend the test to cover the possible mprotect() optimization when removing write-protection. mprotect() must not allow write-access to a COW-shared page by accident. Link: https://lkml.kernel.org/r/20221108174652.198904-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Dave Chinner Cc: Hugh Dickins Cc: Linus Torvalds Cc: Mel Gorman Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/anon_cow.c | 49 +++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/anon_cow.c b/tools/testing/selftests/vm/anon_cow.c index 705bd0b3db11..bbb251eb5025 100644 --- a/tools/testing/selftests/vm/anon_cow.c +++ b/tools/testing/selftests/vm/anon_cow.c @@ -190,7 +190,8 @@ static int child_vmsplice_memcmp_fn(char *mem, size_t size, typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); -static void do_test_cow_in_parent(char *mem, size_t size, child_fn fn) +static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, + child_fn fn) { struct comm_pipes comm_pipes; char buf; @@ -212,6 +213,22 @@ static void do_test_cow_in_parent(char *mem, size_t size, child_fn fn) while (read(comm_pipes.child_ready[0], &buf, 1) != 1) ; + + if (do_mprotect) { + /* + * mprotect() optimizations might try avoiding + * write-faults by directly mapping pages writable. + */ + ret = mprotect(mem, size, PROT_READ); + ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + } + /* Modify the page. */ memset(mem, 0xff, size); write(comm_pipes.parent_ready[1], "0", 1); @@ -229,12 +246,22 @@ close_comm_pipes: static void test_cow_in_parent(char *mem, size_t size) { - do_test_cow_in_parent(mem, size, child_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_memcmp_fn); +} + +static void test_cow_in_parent_mprotect(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, true, child_memcmp_fn); } static void test_vmsplice_in_child(char *mem, size_t size) { - do_test_cow_in_parent(mem, size, child_vmsplice_memcmp_fn); + do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); +} + +static void test_vmsplice_in_child_mprotect(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); } static void do_test_vmsplice_in_parent(char *mem, size_t size, @@ -969,6 +996,14 @@ static const struct test_case test_cases[] = { "Basic COW after fork()", test_cow_in_parent, }, + /* + * Basic test, but do an additional mprotect(PROT_READ)+ + * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. + */ + { + "Basic COW after fork() with mprotect() optimization", + test_cow_in_parent_mprotect, + }, /* * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If * we miss to break COW, the child observes modifications by the parent. @@ -978,6 +1013,14 @@ static const struct test_case test_cases[] = { "vmsplice() + unmap in child", test_vmsplice_in_child }, + /* + * vmsplice() test, but do an additional mprotect(PROT_READ)+ + * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. + */ + { + "vmsplice() + unmap in child with mprotect() optimization", + test_vmsplice_in_child_mprotect + }, /* * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after * fork(); modify in the child. If we miss to break COW, the parent From 70fb4fdff5826a48886152fd5c5db04eb6c59a40 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:48 -0800 Subject: [PATCH 140/313] mm: introduce 'encoded' page pointers with embedded extra bits We already have this notion in parts of the MM code (see the mlock code with the LRU_PAGE and NEW_PAGE bits), but I'm going to introduce a new case, and I refuse to do the same thing we've done before where we just put bits in the raw pointer and say it's still a normal pointer. So this introduces a 'struct encoded_page' pointer that cannot be used for anything else than to encode a real page pointer and a couple of extra bits in the low bits. That way the compiler can trivially track the state of the pointer and you just explicitly encode and decode the extra bits. Note that this makes the alignment of 'struct page' explicit even for the case where CONFIG_HAVE_ALIGNED_STRUCT_PAGE is not set. That is entirely redundant in almost all cases, since the page structure already contains several word-sized entries. However, on m68k, the alignment of even 32-bit data is just 16 bits, and as such in theory the alignment of 'struct page' could be too. So let's just make it very very explicit that the alignment needs to be at least 32 bits, giving us a guarantee of two unused low bits in the pointer. Now, in practice, our page struct array is aligned much more than that anyway, even on m68k, and our existing code in mm/mlock.c obviously already depended on that. But since the whole point of this change is to be careful about the type system when hiding extra bits in the pointer, let's also be explicit about the assumptions we make. NOTE! This is being very careful in another way too: it has a build-time assertion that the 'flags' added to the page pointer actually fit in the two bits. That means that this helper must be inlined, and can only be used in contexts where the compiler can statically determine that the value fits in the available bits. [akpm@linux-foundation.org: kerneldoc on a forward-declared struct confuses htmldocs] Link: https://lore.kernel.org/all/Y2tKixpO4RO6DgW5@tuxmaker.boeblingen.de.ibm.com/ Link: https://lkml.kernel.org/r/20221109203051.1835763-1-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Reviewed-by: David Hildenbrand Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Christian Borntraeger Cc: Gerald Schaefer Cc: Heiko Carstens [s390] Cc: Nadav Amit Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 44a1a699b5ad..6b0009e7d4ae 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -68,7 +68,7 @@ struct mem_cgroup; #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) #else -#define _struct_page_alignment +#define _struct_page_alignment __aligned(sizeof(unsigned long)) #endif struct page { @@ -251,6 +251,38 @@ struct page { #endif } _struct_page_alignment; +/* + * struct encoded_page - a nonexistent type marking this pointer + * + * An 'encoded_page' pointer is a pointer to a regular 'struct page', but + * with the low bits of the pointer indicating extra context-dependent + * information. Not super-common, but happens in mmu_gather and mlock + * handling, and this acts as a type system check on that use. + * + * We only really have two guaranteed bits in general, although you could + * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + * for more. + * + * Use the supplied helper functions to endcode/decode the pointer and bits. + */ +struct encoded_page; +#define ENCODE_PAGE_BITS 3ul +static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags) +{ + BUILD_BUG_ON(flags > ENCODE_PAGE_BITS); + return (struct encoded_page *)(flags | (unsigned long)page); +} + +static inline unsigned long encoded_page_flags(struct encoded_page *page) +{ + return ENCODE_PAGE_BITS & (unsigned long)page; +} + +static inline struct page *encoded_page_ptr(struct encoded_page *page) +{ + return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); +} + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. From 449c796768c9a1c738d1fa8671fb01663380b8a7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:49 -0800 Subject: [PATCH 141/313] mm: teach release_pages() to take an array of encoded page pointers too release_pages() already could take either an array of page pointers, or an array of folio pointers. Expand it to also accept an array of encoded page pointers, which is what both the existing mlock() use and the upcoming mmu_gather use of encoded page pointers wants. Note that release_pages() won't actually use, or react to, any extra encoded bits. Instead, this is very much a case of "I have walked the array of encoded pages and done everything the extra bits tell me to do, now release it all". Also, while the "either page or folio pointers" dual use was handled with a cast of the pointer in "release_folios()", this takes a slightly different approach and uses the "transparent union" attribute to describe the set of arguments to the function: https://gcc.gnu.org/onlinedocs/gcc/Common-Type-Attributes.html which has been supported by gcc forever, but the kernel hasn't used before. That allows us to avoid using various wrappers with casts, and just use the same function regardless of use. Link: https://lkml.kernel.org/r/20221109203051.1835763-2-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/mm.h | 21 +++++++++++++++++++-- mm/swap.c | 16 ++++++++++++---- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8597ef676fc3..f873441303b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1245,7 +1245,24 @@ static inline void folio_put_refs(struct folio *folio, int refs) __folio_put(folio); } -void release_pages(struct page **pages, int nr); +/** + * release_pages - release an array of pages or folios + * + * This just releases a simple array of multiple pages, and + * accepts various different forms of said page array: either + * a regular old boring array of pages, an array of folios, or + * an array of encoded page pointers. + * + * The transparent union syntax for this kind of "any of these + * argument types" is all kinds of ugly, so look away. + */ +typedef union { + struct page **pages; + struct folio **folios; + struct encoded_page **encoded_pages; +} release_pages_arg __attribute__ ((__transparent_union__)); + +void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. @@ -1261,7 +1278,7 @@ void release_pages(struct page **pages, int nr); */ static inline void folios_put(struct folio **folios, unsigned int nr) { - release_pages((struct page **)folios, nr); + release_pages(folios, nr); } static inline void put_page(struct page *page) diff --git a/mm/swap.c b/mm/swap.c index b9a6817e07ff..70e2063ef43a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -981,22 +981,30 @@ void lru_cache_disable(void) /** * release_pages - batched put_page() - * @pages: array of pages to release + * @arg: array of pages to release * @nr: number of pages * - * Decrement the reference count on all the pages in @pages. If it + * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. */ -void release_pages(struct page **pages, int nr) +void release_pages(release_pages_arg arg, int nr) { int i; + struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; unsigned int lock_batch; for (i = 0; i < nr; i++) { - struct folio *folio = page_folio(pages[i]); + struct folio *folio; + + /* Turn any of the argument types into a folio */ + folio = page_folio(encoded_page_ptr(encoded[i])); /* * Make sure the IRQ-safe lock-holding time does not get From 7cc8f9c7146a5c2dad6e71653c4f69972e73df6b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:50 -0800 Subject: [PATCH 142/313] mm: mmu_gather: prepare to gather encoded page pointers with flags This is purely a preparatory patch that makes all the data structures ready for encoding flags with the mmu_gather page pointers. The code currently always sets the flag to zero and doesn't use it yet, but now it's tracking the type state along. The next step will be to actually start using it. Link: https://lkml.kernel.org/r/20221109203051.1835763-3-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 8 +++++--- include/asm-generic/tlb.h | 9 +++++---- include/linux/swap.h | 2 +- mm/mmu_gather.c | 8 ++++---- mm/swap_state.c | 11 ++++------- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 3a5c8fb590e5..05142226d65d 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -25,7 +25,8 @@ void __tlb_remove_table(void *_table); static inline void tlb_flush(struct mmu_gather *tlb); static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size); + struct encoded_page *page, + int page_size); #define tlb_flush tlb_flush #define pte_free_tlb pte_free_tlb @@ -42,9 +43,10 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, * has already been freed, so just do free_page_and_swap_cache. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) + struct encoded_page *page, + int page_size) { - free_page_and_swap_cache(page); + free_page_and_swap_cache(encoded_page_ptr(page)); return false; } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index cab7cfebf40b..54d03d1e712e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -246,7 +246,7 @@ struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; - struct page *pages[]; + struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ @@ -260,7 +260,8 @@ struct mmu_gather_batch { */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) -extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, +extern bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct encoded_page *page, int page_size); #endif @@ -435,13 +436,13 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, page_size)) + if (__tlb_remove_page_size(tlb, encode_page(page, 0), page_size)) tlb_flush_mmu(tlb); } static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - return __tlb_remove_page_size(tlb, page, PAGE_SIZE); + return __tlb_remove_page_size(tlb, encode_page(page, 0), PAGE_SIZE); } /* tlb_remove_page diff --git a/include/linux/swap.h b/include/linux/swap.h index fec6647a289a..b61e2007d156 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -463,7 +463,7 @@ static inline unsigned long total_swapcache_pages(void) extern void free_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *); -extern void free_pages_and_swap_cache(struct page **, int); +extern void free_pages_and_swap_cache(struct encoded_page **, int); /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 3a2c3f8cad2f..382581c4a9f6 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -48,7 +48,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb) struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - struct page **pages = batch->pages; + struct encoded_page **pages = batch->encoded_pages; do { /* @@ -77,7 +77,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size) { struct mmu_gather_batch *batch; @@ -92,13 +92,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ * Add the page and check if we are full. If so * force a flush. */ - batch->pages[batch->nr++] = page; + batch->encoded_pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, page); + VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page)); return false; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 40fe6f23e105..2927507b43d8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -303,15 +303,12 @@ void free_page_and_swap_cache(struct page *page) * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. */ -void free_pages_and_swap_cache(struct page **pages, int nr) +void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { - struct page **pagep = pages; - int i; - lru_add_drain(); - for (i = 0; i < nr; i++) - free_swap_cache(pagep[i]); - release_pages(pagep, nr); + for (int i = 0; i < nr; i++) + free_swap_cache(encoded_page_ptr(pages[i])); + release_pages(pages, nr); } static inline bool swap_use_vma_readahead(void) From 5df397dec7c4c08c23bd14f162f1228836faa4ce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2022 12:30:51 -0800 Subject: [PATCH 143/313] mm: delay page_remove_rmap() until after the TLB has been flushed When we remove a page table entry, we are very careful to only free the page after we have flushed the TLB, because other CPUs could still be using the page through stale TLB entries until after the flush. However, we have removed the rmap entry for that page early, which means that functions like folio_mkclean() would end up not serializing with the page table lock because the page had already been made invisible to rmap. And that is a problem, because while the TLB entry exists, we could end up with the following situation: (a) one CPU could come in and clean it, never seeing our mapping of the page (b) another CPU could continue to use the stale and dirty TLB entry and continue to write to said page resulting in a page that has been dirtied, but then marked clean again, all while another CPU might have dirtied it some more. End result: possibly lost dirty data. This extends our current TLB gather infrastructure to optionally track a "should I do a delayed page_remove_rmap() for this page after flushing the TLB". It uses the newly introduced 'encoded page pointer' to do that without having to keep separate data around. Note, this is complicated by a couple of issues: - we want to delay the rmap removal, but not past the page table lock, because that simplifies the memcg accounting - only SMP configurations want to delay TLB flushing, since on UP there are obviously no remote TLBs to worry about, and the page table lock means there are no preemption issues either - s390 has its own mmu_gather model that doesn't delay TLB flushing, and as a result also does not want the delayed rmap. As such, we can treat S390 like the UP case and use a common fallback for the "no delays" case. - we can track an enormous number of pages in our mmu_gather structure, with MAX_GATHER_BATCH_COUNT batches of MAX_TABLE_BATCH pages each, all set up to be approximately 10k pending pages. We do not want to have a huge number of batched pages that we then need to check for delayed rmap handling inside the page table lock. Particularly that last point results in a noteworthy detail, where the normal page batch gathering is limited once we have delayed rmaps pending, in such a way that only the last batch (the so-called "active batch") in the mmu_gather structure can have any delayed entries. NOTE! While the "possibly lost dirty data" sounds catastrophic, for this all to happen you need to have a user thread doing either madvise() with MADV_DONTNEED or a full re-mmap() of the area concurrently with another thread continuing to use said mapping. So arguably this is about user space doing crazy things, but from a VM consistency standpoint it's better if we track the dirty bit properly even when user space goes off the rails. [akpm@linux-foundation.org: fix UP build, per Linus] Link: https://lore.kernel.org/all/B88D3073-440A-41C7-95F4-895D3F657EF2@gmail.com/ Link: https://lkml.kernel.org/r/20221109203051.1835763-4-torvalds@linux-foundation.org Signed-off-by: Linus Torvalds Acked-by: Johannes Weiner Acked-by: Hugh Dickins Reported-by: Nadav Amit Tested-by: Nadav Amit Signed-off-by: Andrew Morton --- arch/s390/include/asm/tlb.h | 3 +++ include/asm-generic/tlb.h | 31 +++++++++++++++++++++++++++++-- mm/memory.c | 23 +++++++++++++++++------ mm/mmu_gather.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 05142226d65d..b91f4a9b044c 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -41,6 +41,9 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, * Release the page cache reference for a pte removed by * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. + * + * s390 doesn't delay rmap removal, so there is nothing encoded in + * the page pointer. */ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 54d03d1e712e..b46617207c93 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -263,6 +263,28 @@ struct mmu_gather_batch { extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size); + +#ifdef CONFIG_SMP +/* + * This both sets 'delayed_rmap', and returns true. It would be an inline + * function, except we define it before the 'struct mmu_gather'. + */ +#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) +extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); +#endif + +#endif + +/* + * We have a no-op version of the rmap removal that doesn't + * delay anything. That is used on S390, which flushes remote + * TLBs synchronously, and on UP, which doesn't have any + * remote TLBs to flush and is not preemptible due to this + * all happening under the page table lock. + */ +#ifndef tlb_delay_rmap +#define tlb_delay_rmap(tlb) (false) +static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* @@ -295,6 +317,11 @@ struct mmu_gather { */ unsigned int freed_tables : 1; + /* + * Do we have pending delayed rmap removals? + */ + unsigned int delayed_rmap : 1; + /* * at which levels have we cleared entries? */ @@ -440,9 +467,9 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, tlb_flush_mmu(tlb); } -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static __always_inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags) { - return __tlb_remove_page_size(tlb, encode_page(page, 0), PAGE_SIZE); + return __tlb_remove_page_size(tlb, encode_page(page, flags), PAGE_SIZE); } /* tlb_remove_page diff --git a/mm/memory.c b/mm/memory.c index 1749c638734f..6c85cba02113 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1374,6 +1374,8 @@ again: break; if (pte_present(ptent)) { + unsigned int delay_rmap; + page = vm_normal_page(vma, addr, ptent); if (unlikely(!should_zap_page(details, page))) continue; @@ -1385,20 +1387,26 @@ again: if (unlikely(!page)) continue; + delay_rmap = 0; if (!PageAnon(page)) { if (pte_dirty(ptent)) { - force_flush = 1; set_page_dirty(page); + if (tlb_delay_rmap(tlb)) { + delay_rmap = 1; + force_flush = 1; + } } if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, vma, false); - if (unlikely(page_mapcount(page) < 0)) - print_bad_pte(vma, addr, ptent, page); - if (unlikely(__tlb_remove_page(tlb, page))) { + if (!delay_rmap) { + page_remove_rmap(page, vma, false); + if (unlikely(page_mapcount(page) < 0)) + print_bad_pte(vma, addr, ptent, page); + } + if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { force_flush = 1; addr += PAGE_SIZE; break; @@ -1455,8 +1463,11 @@ again: arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ - if (force_flush) + if (force_flush) { tlb_flush_mmu_tlbonly(tlb); + if (tlb->delayed_rmap) + tlb_flush_rmaps(tlb, vma); + } pte_unmap_unlock(start_pte, ptl); /* diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 382581c4a9f6..1de1cf9ba581 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; + /* No more batching if we have delayed rmaps pending */ + if (tlb->delayed_rmap) + return false; + batch = tlb->active; if (batch->next) { tlb->active = batch->next; @@ -43,6 +48,33 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } +#ifdef CONFIG_SMP +/** + * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB + * @tlb: the current mmu_gather + * + * Note that because of how tlb_next_batch() above works, we will + * never start new batches with pending delayed rmaps, so we only + * need to walk through the current active batch. + */ +void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + struct mmu_gather_batch *batch; + + batch = tlb->active; + for (int i = 0; i < batch->nr; i++) { + struct encoded_page *enc = batch->encoded_pages[i]; + + if (encoded_page_flags(enc)) { + struct page *page = encoded_page_ptr(enc); + page_remove_rmap(page, vma, false); + } + } + + tlb->delayed_rmap = 0; +} +#endif + static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; @@ -284,6 +316,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, tlb->active = &tlb->local; tlb->batch_count = 0; #endif + tlb->delayed_rmap = 0; tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE From f036c8184f8b6750fa642485fb01eb6ff036a86b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 16 Nov 2022 08:49:30 +0100 Subject: [PATCH 144/313] mm: mmu_gather: do not expose delayed_rmap flag Flag delayed_rmap of 'struct mmu_gather' is rather a private member, but it is still accessed directly. Instead, let the TLB gather code access the flag. Link: https://lkml.kernel.org/r/Y3SWCu6NRaMQ5dbD@li-4a3a4a4c-28e5-11b2-a85c-a8d192c6f089.ibm.com Signed-off-by: Alexander Gordeev Acked-by: Linus Torvalds Signed-off-by: Andrew Morton --- mm/memory.c | 3 +-- mm/mmu_gather.c | 3 +++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 6c85cba02113..086cb3dd8608 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1465,8 +1465,7 @@ again: /* Do the actual TLB flush before dropping ptl */ if (force_flush) { tlb_flush_mmu_tlbonly(tlb); - if (tlb->delayed_rmap) - tlb_flush_rmaps(tlb, vma); + tlb_flush_rmaps(tlb, vma); } pte_unmap_unlock(start_pte, ptl); diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 1de1cf9ba581..dd1f8ca40cb5 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -61,6 +61,9 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { struct mmu_gather_batch *batch; + if (!tlb->delayed_rmap) + return; + batch = tlb->active; for (int i = 0; i < batch->nr; i++) { struct encoded_page *enc = batch->encoded_pages[i]; From 7ac07a26dea79c3892436bce41cce03dcbd3c4c7 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:35 +0900 Subject: [PATCH 145/313] zram: preparation for multi-zcomp support Patch series "zram: Support multiple compression streams", v5. This series adds support for multiple compression streams. The main idea is that different compression algorithms have different characteristics and zram may benefit when it uses a combination of algorithms: a default algorithm that is faster but have lower compression rate and a secondary algorithm that can use higher compression rate at a price of slower compression/decompression. There are several use-case for this functionality: - huge pages re-compression: zstd or deflate can successfully compress huge pages (~50% of huge pages on my synthetic ChromeOS tests), IOW pages that lzo was not able to compress. - idle pages re-compression: idle/cold pages sit in the memory and we may reduce zsmalloc memory usage if we recompress those idle pages. Userspace has a number of ways to control the behavior and impact of zram recompression: what type of pages should be recompressed, size watermarks, etc. Please refer to documentation patch. This patch (of 13): The patch turns compression streams and compressor algorithm name struct zram members into arrays, so that we can have multiple compression streams support (in the next patches). The patch uses a rather explicit API for compressor selection: - Get primary (default) compression stream zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]) - Get secondary compression stream zcomp_stream_get(zram->comps[ZRAM_SECONDARY_COMP]) We use similar API for compression streams put(). At this point we always have just one compression stream, since CONFIG_ZRAM_MULTI_COMP is not yet defined. Link: https://lkml.kernel.org/r/20221109115047.2921851-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20221109115047.2921851-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Minchan Kim Cc: Nitin Gupta Cc: Suleiman Souhlal Cc: Nhat Pham Cc: Alexey Romanov Signed-off-by: Andrew Morton --- drivers/block/zram/zcomp.c | 6 +-- drivers/block/zram/zcomp.h | 2 +- drivers/block/zram/zram_drv.c | 90 +++++++++++++++++++++++++---------- drivers/block/zram/zram_drv.h | 14 +++++- 4 files changed, 80 insertions(+), 32 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 0916de952e09..55af4efd7983 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -206,7 +206,7 @@ void zcomp_destroy(struct zcomp *comp) * case of allocation error, or any other error potentially * returned by zcomp_init(). */ -struct zcomp *zcomp_create(const char *compress) +struct zcomp *zcomp_create(const char *alg) { struct zcomp *comp; int error; @@ -216,14 +216,14 @@ struct zcomp *zcomp_create(const char *compress) * is not loaded yet. We must do it here, otherwise we are about to * call /sbin/modprobe under CPU hot-plug lock. */ - if (!zcomp_available_algorithm(compress)) + if (!zcomp_available_algorithm(alg)) return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) return ERR_PTR(-ENOMEM); - comp->name = compress; + comp->name = alg; error = zcomp_init(comp); if (error) { kfree(comp); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index 40f6420f4b2e..cdefdef93da8 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -27,7 +27,7 @@ int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node); ssize_t zcomp_available_show(const char *comp, char *buf); bool zcomp_available_algorithm(const char *comp); -struct zcomp *zcomp_create(const char *comp); +struct zcomp *zcomp_create(const char *alg); void zcomp_destroy(struct zcomp *comp); struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 87711ddf4b54..fbe46c6177fd 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1004,36 +1004,53 @@ static ssize_t comp_algorithm_show(struct device *dev, struct zram *zram = dev_to_zram(dev); down_read(&zram->init_lock); - sz = zcomp_available_show(zram->compressor, buf); + sz = zcomp_available_show(zram->comp_algs[ZRAM_PRIMARY_COMP], buf); up_read(&zram->init_lock); return sz; } +static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg) +{ + /* Do not kfree() algs that we didn't allocate, IOW the default ones */ + if (zram->comp_algs[prio] != default_compressor) + kfree(zram->comp_algs[prio]); + zram->comp_algs[prio] = alg; +} + static ssize_t comp_algorithm_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); - char compressor[ARRAY_SIZE(zram->compressor)]; + char *compressor; size_t sz; - strscpy(compressor, buf, sizeof(compressor)); + sz = strlen(buf); + if (sz >= CRYPTO_MAX_ALG_NAME) + return -E2BIG; + + compressor = kstrdup(buf, GFP_KERNEL); + if (!compressor) + return -ENOMEM; + /* ignore trailing newline */ - sz = strlen(compressor); if (sz > 0 && compressor[sz - 1] == '\n') compressor[sz - 1] = 0x00; - if (!zcomp_available_algorithm(compressor)) + if (!zcomp_available_algorithm(compressor)) { + kfree(compressor); return -EINVAL; + } down_write(&zram->init_lock); if (init_done(zram)) { up_write(&zram->init_lock); + kfree(compressor); pr_info("Can't change algorithm for initialized device\n"); return -EBUSY; } - strcpy(zram->compressor, compressor); + comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, compressor); up_write(&zram->init_lock); return len; } @@ -1281,7 +1298,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, size = zram_get_obj_size(zram, index); if (size != PAGE_SIZE) - zstrm = zcomp_stream_get(zram->comp); + zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { @@ -1293,7 +1310,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, dst = kmap_atomic(page); ret = zcomp_decompress(zstrm, src, size, dst); kunmap_atomic(dst); - zcomp_stream_put(zram->comp); + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); } zs_unmap_object(zram->mem_pool, handle); zram_slot_unlock(zram, index); @@ -1360,13 +1377,13 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, kunmap_atomic(mem); compress_again: - zstrm = zcomp_stream_get(zram->comp); + zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); src = kmap_atomic(page); ret = zcomp_compress(zstrm, src, &comp_len); kunmap_atomic(src); if (unlikely(ret)) { - zcomp_stream_put(zram->comp); + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); pr_err("Compression failed! err=%d\n", ret); zs_free(zram->mem_pool, handle); return ret; @@ -1394,7 +1411,7 @@ compress_again: __GFP_HIGHMEM | __GFP_MOVABLE); if (IS_ERR((void *)handle)) { - zcomp_stream_put(zram->comp); + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); atomic64_inc(&zram->stats.writestall); handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | @@ -1411,14 +1428,14 @@ compress_again: * zstrm buffer back. It is necessary that the dereferencing * of the zstrm variable below occurs correctly. */ - zstrm = zcomp_stream_get(zram->comp); + zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); } alloced_pages = zs_get_total_pages(zram->mem_pool); update_used_max(zram, alloced_pages); if (zram->limit_pages && alloced_pages > zram->limit_pages) { - zcomp_stream_put(zram->comp); + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); zs_free(zram->mem_pool, handle); return -ENOMEM; } @@ -1432,7 +1449,7 @@ compress_again: if (comp_len == PAGE_SIZE) kunmap_atomic(src); - zcomp_stream_put(zram->comp); + zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); zs_unmap_object(zram->mem_pool, handle); atomic64_add(comp_len, &zram->stats.compr_data_size); out: @@ -1707,6 +1724,20 @@ out: return ret; } +static void zram_destroy_comps(struct zram *zram) +{ + u32 prio; + + for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) { + struct zcomp *comp = zram->comps[prio]; + + zram->comps[prio] = NULL; + if (!comp) + continue; + zcomp_destroy(comp); + } +} + static void zram_reset_device(struct zram *zram) { down_write(&zram->init_lock); @@ -1724,11 +1755,11 @@ static void zram_reset_device(struct zram *zram) /* I/O operation under all of CPU are done so let's free */ zram_meta_free(zram, zram->disksize); zram->disksize = 0; + zram_destroy_comps(zram); memset(&zram->stats, 0, sizeof(zram->stats)); - zcomp_destroy(zram->comp); - zram->comp = NULL; reset_bdev(zram); + comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); up_write(&zram->init_lock); } @@ -1739,6 +1770,7 @@ static ssize_t disksize_store(struct device *dev, struct zcomp *comp; struct zram *zram = dev_to_zram(dev); int err; + u32 prio; disksize = memparse(buf, NULL); if (!disksize) @@ -1757,22 +1789,28 @@ static ssize_t disksize_store(struct device *dev, goto out_unlock; } - comp = zcomp_create(zram->compressor); - if (IS_ERR(comp)) { - pr_err("Cannot initialise %s compressing backend\n", - zram->compressor); - err = PTR_ERR(comp); - goto out_free_meta; - } + for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) { + if (!zram->comp_algs[prio]) + continue; - zram->comp = comp; + comp = zcomp_create(zram->comp_algs[prio]); + if (IS_ERR(comp)) { + pr_err("Cannot initialise %s compressing backend\n", + zram->comp_algs[prio]); + err = PTR_ERR(comp); + goto out_free_comps; + } + + zram->comps[prio] = comp; + } zram->disksize = disksize; set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT); up_write(&zram->init_lock); return len; -out_free_meta: +out_free_comps: + zram_destroy_comps(zram); zram_meta_free(zram, disksize); out_unlock: up_write(&zram->init_lock); @@ -1959,7 +1997,7 @@ static int zram_add(void) if (ret) goto out_cleanup_disk; - strscpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram->comp_algs[ZRAM_PRIMARY_COMP] = default_compressor; zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index a2bda53020fd..7a643c8c38ec 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -89,10 +89,20 @@ struct zram_stats { #endif }; +#ifdef CONFIG_ZRAM_MULTI_COMP +#define ZRAM_PRIMARY_COMP 0U +#define ZRAM_SECONDARY_COMP 1U +#define ZRAM_MAX_COMPS 4U +#else +#define ZRAM_PRIMARY_COMP 0U +#define ZRAM_SECONDARY_COMP 0U +#define ZRAM_MAX_COMPS 1U +#endif + struct zram { struct zram_table_entry *table; struct zs_pool *mem_pool; - struct zcomp *comp; + struct zcomp *comps[ZRAM_MAX_COMPS]; struct gendisk *disk; /* Prevent concurrent execution of device init */ struct rw_semaphore init_lock; @@ -107,7 +117,7 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - char compressor[CRYPTO_MAX_ALG_NAME]; + const char *comp_algs[ZRAM_MAX_COMPS]; /* * zram is claimed so open request will be failed */ From 001d9273570115b2eb360d5452bbc46f6cc063a1 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:36 +0900 Subject: [PATCH 146/313] zram: add recompression algorithm sysfs knob Introduce recomp_algorithm sysfs knob that controls secondary algorithm selection used for recompression. We will support up to 3 secondary compression algorithms which are sorted in order of their priority. To select an algorithm user has to provide its name and priority: echo "algo=zstd priority=1" > /sys/block/zramX/recomp_algorithm echo "algo=deflate priority=2" > /sys/block/zramX/recomp_algorithm During recompression zram iterates through the list of registered secondary algorithms in order of their priorities. We also have a short version for cases when there is only one secondary compression algorithm: echo "algo=zstd" > /sys/block/zramX/recomp_algorithm This will register zstd as the secondary algorithm with priority 1. Link: https://lkml.kernel.org/r/20221109115047.2921851-3-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 124 ++++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 19 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index fbe46c6177fd..8ea4908f5961 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -997,31 +997,28 @@ static ssize_t max_comp_streams_store(struct device *dev, return len; } -static ssize_t comp_algorithm_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg) { - size_t sz; - struct zram *zram = dev_to_zram(dev); + /* Do not free statically defined compression algorithms */ + if (zram->comp_algs[prio] != default_compressor) + kfree(zram->comp_algs[prio]); + + zram->comp_algs[prio] = alg; +} + +static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, char *buf) +{ + ssize_t sz; down_read(&zram->init_lock); - sz = zcomp_available_show(zram->comp_algs[ZRAM_PRIMARY_COMP], buf); + sz = zcomp_available_show(zram->comp_algs[prio], buf); up_read(&zram->init_lock); return sz; } -static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg) +static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf) { - /* Do not kfree() algs that we didn't allocate, IOW the default ones */ - if (zram->comp_algs[prio] != default_compressor) - kfree(zram->comp_algs[prio]); - zram->comp_algs[prio] = alg; -} - -static ssize_t comp_algorithm_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - struct zram *zram = dev_to_zram(dev); char *compressor; size_t sz; @@ -1050,11 +1047,94 @@ static ssize_t comp_algorithm_store(struct device *dev, return -EBUSY; } - comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, compressor); + comp_algorithm_set(zram, prio, compressor); up_write(&zram->init_lock); - return len; + return 0; } +static ssize_t comp_algorithm_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf); +} + +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t len) +{ + struct zram *zram = dev_to_zram(dev); + int ret; + + ret = __comp_algorithm_store(zram, ZRAM_PRIMARY_COMP, buf); + return ret ? ret : len; +} + +#ifdef CONFIG_ZRAM_MULTI_COMP +static ssize_t recomp_algorithm_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct zram *zram = dev_to_zram(dev); + ssize_t sz = 0; + u32 prio; + + for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { + if (!zram->comp_algs[prio]) + continue; + + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "#%d: ", prio); + sz += __comp_algorithm_show(zram, prio, buf + sz); + } + + return sz; +} + +static ssize_t recomp_algorithm_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t len) +{ + struct zram *zram = dev_to_zram(dev); + int prio = ZRAM_SECONDARY_COMP; + char *args, *param, *val; + char *alg = NULL; + int ret; + + args = skip_spaces(buf); + while (*args) { + args = next_arg(args, ¶m, &val); + + if (!*val) + return -EINVAL; + + if (!strcmp(param, "algo")) { + alg = val; + continue; + } + + if (!strcmp(param, "priority")) { + ret = kstrtoint(val, 10, &prio); + if (ret) + return ret; + continue; + } + } + + if (!alg) + return -EINVAL; + + if (prio < ZRAM_SECONDARY_COMP || prio >= ZRAM_MAX_COMPS) + return -EINVAL; + + ret = __comp_algorithm_store(zram, prio, alg); + return ret ? ret : len; +} +#endif + static ssize_t compact_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -1895,6 +1975,9 @@ static DEVICE_ATTR_WO(writeback); static DEVICE_ATTR_RW(writeback_limit); static DEVICE_ATTR_RW(writeback_limit_enable); #endif +#ifdef CONFIG_ZRAM_MULTI_COMP +static DEVICE_ATTR_RW(recomp_algorithm); +#endif static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1918,6 +2001,9 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_bd_stat.attr, #endif &dev_attr_debug_stat.attr, +#ifdef CONFIG_ZRAM_MULTI_COMP + &dev_attr_recomp_algorithm.attr, +#endif NULL, }; @@ -1997,7 +2083,7 @@ static int zram_add(void) if (ret) goto out_cleanup_disk; - zram->comp_algs[ZRAM_PRIMARY_COMP] = default_compressor; + comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); From 5561347aa598b6b12fb6069788ccec9b5e5ebec1 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:37 +0900 Subject: [PATCH 147/313] zram: factor out WB and non-WB zram read functions We will use non-WB variant in ZRAM page recompression path. Link: https://lkml.kernel.org/r/20221109115047.2921851-4-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 72 ++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 23 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8ea4908f5961..135fb946f83e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1336,8 +1336,29 @@ out: ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB)); } -static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, - struct bio *bio, bool partial_io) +/* + * Reads a page from the writeback devices. Corresponding ZRAM slot + * should be unlocked. + */ +static int zram_bvec_read_from_bdev(struct zram *zram, struct page *page, + u32 index, struct bio *bio, bool partial_io) +{ + struct bio_vec bvec = { + .bv_page = page, + .bv_len = PAGE_SIZE, + .bv_offset = 0, + }; + + return read_from_bdev(zram, &bvec, zram_get_element(zram, index), bio, + partial_io); +} + +/* + * Reads (decompresses if needed) a page from zspool (zsmalloc). + * Corresponding ZRAM slot should be locked. + */ +static int zram_read_from_zspool(struct zram *zram, struct page *page, + u32 index) { struct zcomp_strm *zstrm; unsigned long handle; @@ -1345,23 +1366,6 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, void *src, *dst; int ret; - zram_slot_lock(zram, index); - if (zram_test_flag(zram, index, ZRAM_WB)) { - struct bio_vec bvec; - - zram_slot_unlock(zram, index); - /* A null bio means rw_page was used, we must fallback to bio */ - if (!bio) - return -EOPNOTSUPP; - - bvec.bv_page = page; - bvec.bv_len = PAGE_SIZE; - bvec.bv_offset = 0; - return read_from_bdev(zram, &bvec, - zram_get_element(zram, index), - bio, partial_io); - } - handle = zram_get_handle(zram, index); if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) { unsigned long value; @@ -1371,7 +1375,6 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, mem = kmap_atomic(page); zram_fill_page(mem, PAGE_SIZE, value); kunmap_atomic(mem); - zram_slot_unlock(zram, index); return 0; } @@ -1393,17 +1396,40 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); } zs_unmap_object(zram->mem_pool, handle); - zram_slot_unlock(zram, index); + return ret; +} + +static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, + struct bio *bio, bool partial_io) +{ + int ret; + + zram_slot_lock(zram, index); + if (!zram_test_flag(zram, index, ZRAM_WB)) { + /* Slot should be locked through out the function call */ + ret = zram_read_from_zspool(zram, page, index); + zram_slot_unlock(zram, index); + } else { + /* Slot should be unlocked before the function call */ + zram_slot_unlock(zram, index); + + /* A null bio means rw_page was used, we must fallback to bio */ + if (!bio) + return -EOPNOTSUPP; + + ret = zram_bvec_read_from_bdev(zram, page, index, bio, + partial_io); + } /* Should NEVER happen. Return bio error if it does. */ - if (WARN_ON(ret)) + if (WARN_ON(ret < 0)) pr_err("Decompression failed! err=%d, page=%u\n", ret, index); return ret; } static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) + u32 index, int offset, struct bio *bio) { int ret; struct page *page; From 84b33bf7888975d28c0e57011b75c445279c60ec Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:38 +0900 Subject: [PATCH 148/313] zram: introduce recompress sysfs knob Allow zram to recompress (using secondary compression streams) pages. Re-compression algorithms (we support up to 3 at this stage) are selected via recomp_algorithm: echo "algo=zstd priority=1" > /sys/block/zramX/recomp_algorithm Please read documentation for more details. We support several recompression modes: 1) IDLE pages recompression is activated by `idle` mode echo "type=idle" > /sys/block/zram0/recompress 2) Since there may be many idle pages user-space may pass a size threshold value (in bytes) and we will recompress pages only of equal or greater size: echo "threshold=888" > /sys/block/zram0/recompress 3) HUGE pages recompression is activated by `huge` mode echo "type=huge" > /sys/block/zram0/recompress 4) HUGE_IDLE pages recompression is activated by `huge_idle` mode echo "type=huge_idle" > /sys/block/zram0/recompress [senozhatsky@chromium.org: we should always zero out err variable in recompress loop[ Link: https://lkml.kernel.org/r/20221110143423.3250790-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20221109115047.2921851-5-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nathan Chancellor Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/Kconfig | 9 ++ drivers/block/zram/zram_drv.c | 264 +++++++++++++++++++++++++++++++++- drivers/block/zram/zram_drv.h | 7 + 3 files changed, 277 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index d4100b0c083e..0386b7da02aa 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -78,3 +78,12 @@ config ZRAM_MEMORY_TRACKING /sys/kernel/debug/zram/zramX/block_state. See Documentation/admin-guide/blockdev/zram.rst for more information. + +config ZRAM_MULTI_COMP + bool "Enable multiple compression streams" + depends on ZRAM + help + This will enable multi-compression streams, so that ZRAM can + re-compress pages using a potentially slower but more effective + compression algorithm. Note, that IDLE page recompression + requires ZRAM_MEMORY_TRACKING. diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 135fb946f83e..97300b3a83c3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -155,6 +155,25 @@ static inline bool is_partial_io(struct bio_vec *bvec) } #endif +static inline void zram_set_priority(struct zram *zram, u32 index, u32 prio) +{ + prio &= ZRAM_COMP_PRIORITY_MASK; + /* + * Clear previous priority value first, in case if we recompress + * further an already recompressed page + */ + zram->table[index].flags &= ~(ZRAM_COMP_PRIORITY_MASK << + ZRAM_COMP_PRIORITY_BIT1); + zram->table[index].flags |= (prio << ZRAM_COMP_PRIORITY_BIT1); +} + +static inline u32 zram_get_priority(struct zram *zram, u32 index) +{ + u32 prio = zram->table[index].flags >> ZRAM_COMP_PRIORITY_BIT1; + + return prio & ZRAM_COMP_PRIORITY_MASK; +} + /* * Check if request is within bounds and aligned on zram logical blocks. */ @@ -1304,6 +1323,11 @@ static void zram_free_page(struct zram *zram, size_t index) atomic64_dec(&zram->stats.huge_pages); } + if (zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE); + + zram_set_priority(zram, index, 0); + if (zram_test_flag(zram, index, ZRAM_WB)) { zram_clear_flag(zram, index, ZRAM_WB); free_block_bdev(zram, zram_get_element(zram, index)); @@ -1364,6 +1388,7 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page, unsigned long handle; unsigned int size; void *src, *dst; + u32 prio; int ret; handle = zram_get_handle(zram, index); @@ -1380,8 +1405,10 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page, size = zram_get_obj_size(zram, index); - if (size != PAGE_SIZE) - zstrm = zcomp_stream_get(zram->comps[ZRAM_PRIMARY_COMP]); + if (size != PAGE_SIZE) { + prio = zram_get_priority(zram, index); + zstrm = zcomp_stream_get(zram->comps[prio]); + } src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); if (size == PAGE_SIZE) { @@ -1393,7 +1420,7 @@ static int zram_read_from_zspool(struct zram *zram, struct page *page, dst = kmap_atomic(page); ret = zcomp_decompress(zstrm, src, size, dst); kunmap_atomic(dst); - zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); + zcomp_stream_put(zram->comps[prio]); } zs_unmap_object(zram->mem_pool, handle); return ret; @@ -1624,6 +1651,235 @@ out: return ret; } +#ifdef CONFIG_ZRAM_MULTI_COMP +/* + * This function will decompress (unless it's ZRAM_HUGE) the page and then + * attempt to compress it using provided compression algorithm priority + * (which is potentially more effective). + * + * Corresponding ZRAM slot should be locked. + */ +static int zram_recompress(struct zram *zram, u32 index, struct page *page, + u32 threshold, u32 prio, u32 prio_max) +{ + struct zcomp_strm *zstrm = NULL; + unsigned long handle_old; + unsigned long handle_new; + unsigned int comp_len_old; + unsigned int comp_len_new; + void *src, *dst; + int ret; + + handle_old = zram_get_handle(zram, index); + if (!handle_old) + return -EINVAL; + + comp_len_old = zram_get_obj_size(zram, index); + /* + * Do not recompress objects that are already "small enough". + */ + if (comp_len_old < threshold) + return 0; + + ret = zram_read_from_zspool(zram, page, index); + if (ret) + return ret; + + /* + * Iterate the secondary comp algorithms list (in order of priority) + * and try to recompress the page. + */ + for (; prio < prio_max; prio++) { + if (!zram->comps[prio]) + continue; + + /* + * Skip if the object is already re-compressed with a higher + * priority algorithm (or same algorithm). + */ + if (prio <= zram_get_priority(zram, index)) + continue; + + zstrm = zcomp_stream_get(zram->comps[prio]); + src = kmap_atomic(page); + ret = zcomp_compress(zstrm, src, &comp_len_new); + kunmap_atomic(src); + + if (ret) { + zcomp_stream_put(zram->comps[prio]); + return ret; + } + + /* Continue until we make progress */ + if (comp_len_new >= huge_class_size || + comp_len_new >= comp_len_old || + (threshold && comp_len_new >= threshold)) { + zcomp_stream_put(zram->comps[prio]); + continue; + } + + /* Recompression was successful so break out */ + break; + } + + /* + * We did not try to recompress, e.g. when we have only one + * secondary algorithm and the page is already recompressed + * using that algorithm + */ + if (!zstrm) + return 0; + + /* + * All secondary algorithms failed to re-compress the page in a way + * that would save memory, mark the object as incompressible so that + * we will not try to compress it again. + */ + if (comp_len_new >= huge_class_size || comp_len_new >= comp_len_old) { + zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE); + return 0; + } + + /* Successful recompression but above threshold */ + if (threshold && comp_len_new >= threshold) + return 0; + + /* + * No direct reclaim (slow path) for handle allocation and no + * re-compression attempt (unlike in __zram_bvec_write()) since + * we already have stored that object in zsmalloc. If we cannot + * alloc memory for recompressed object then we bail out and + * simply keep the old (existing) object in zsmalloc. + */ + handle_new = zs_malloc(zram->mem_pool, comp_len_new, + __GFP_KSWAPD_RECLAIM | + __GFP_NOWARN | + __GFP_HIGHMEM | + __GFP_MOVABLE); + if (IS_ERR_VALUE(handle_new)) { + zcomp_stream_put(zram->comps[prio]); + return PTR_ERR((void *)handle_new); + } + + dst = zs_map_object(zram->mem_pool, handle_new, ZS_MM_WO); + memcpy(dst, zstrm->buffer, comp_len_new); + zcomp_stream_put(zram->comps[prio]); + + zs_unmap_object(zram->mem_pool, handle_new); + + zram_free_page(zram, index); + zram_set_handle(zram, index, handle_new); + zram_set_obj_size(zram, index, comp_len_new); + zram_set_priority(zram, index, prio); + + atomic64_add(comp_len_new, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.pages_stored); + + return 0; +} + +#define RECOMPRESS_IDLE (1 << 0) +#define RECOMPRESS_HUGE (1 << 1) + +static ssize_t recompress_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + u32 mode = 0, threshold = 0, prio = ZRAM_SECONDARY_COMP; + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + char *args, *param, *val; + unsigned long index; + struct page *page; + ssize_t ret; + + args = skip_spaces(buf); + while (*args) { + args = next_arg(args, ¶m, &val); + + if (!*val) + return -EINVAL; + + if (!strcmp(param, "type")) { + if (!strcmp(val, "idle")) + mode = RECOMPRESS_IDLE; + if (!strcmp(val, "huge")) + mode = RECOMPRESS_HUGE; + if (!strcmp(val, "huge_idle")) + mode = RECOMPRESS_IDLE | RECOMPRESS_HUGE; + continue; + } + + if (!strcmp(param, "threshold")) { + /* + * We will re-compress only idle objects equal or + * greater in size than watermark. + */ + ret = kstrtouint(val, 10, &threshold); + if (ret) + return ret; + continue; + } + } + + if (threshold >= PAGE_SIZE) + return -EINVAL; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + ret = -EINVAL; + goto release_init_lock; + } + + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto release_init_lock; + } + + ret = len; + for (index = 0; index < nr_pages; index++) { + int err = 0; + + zram_slot_lock(zram, index); + + if (!zram_allocated(zram, index)) + goto next; + + if (mode & RECOMPRESS_IDLE && + !zram_test_flag(zram, index, ZRAM_IDLE)) + goto next; + + if (mode & RECOMPRESS_HUGE && + !zram_test_flag(zram, index, ZRAM_HUGE)) + goto next; + + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_UNDER_WB) || + zram_test_flag(zram, index, ZRAM_SAME) || + zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + goto next; + + err = zram_recompress(zram, index, page, threshold, + prio, ZRAM_MAX_COMPS); +next: + zram_slot_unlock(zram, index); + if (err) { + ret = err; + break; + } + + cond_resched(); + } + + __free_page(page); + +release_init_lock: + up_read(&zram->init_lock); + return ret; +} +#endif + /* * zram_bio_discard - handler on discard request * @index: physical block index in PAGE_SIZE units @@ -2003,6 +2259,7 @@ static DEVICE_ATTR_RW(writeback_limit_enable); #endif #ifdef CONFIG_ZRAM_MULTI_COMP static DEVICE_ATTR_RW(recomp_algorithm); +static DEVICE_ATTR_WO(recompress); #endif static struct attribute *zram_disk_attrs[] = { @@ -2029,6 +2286,7 @@ static struct attribute *zram_disk_attrs[] = { &dev_attr_debug_stat.attr, #ifdef CONFIG_ZRAM_MULTI_COMP &dev_attr_recomp_algorithm.attr, + &dev_attr_recompress.attr, #endif NULL, }; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 7a643c8c38ec..b80faae76835 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -40,6 +40,9 @@ */ #define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1) +/* Only 2 bits are allowed for comp priority index */ +#define ZRAM_COMP_PRIORITY_MASK 0x3 + /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { /* zram slot is locked */ @@ -49,6 +52,10 @@ enum zram_pageflags { ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ ZRAM_IDLE, /* not accessed page since last idle marking */ + ZRAM_INCOMPRESSIBLE, /* none of the algorithms could compress it */ + + ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */ + ZRAM_COMP_PRIORITY_BIT2, /* Second bit of comp priority index */ __NR_ZRAM_PAGEFLAGS, }; From 60e9b39ebec56467c36c3da76eee28083196cdf1 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:39 +0900 Subject: [PATCH 149/313] zram: add recompress flag to read_block_state() Add a new flag to zram block state that shows if the page was recompressed (using alternative compression algorithm). Link: https://lkml.kernel.org/r/20221109115047.2921851-6-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 9 ++++++--- drivers/block/zram/zram_drv.c | 5 +++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index c73b16930449..177a142c3146 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -411,9 +411,10 @@ pages of the process with*pagemap. If you enable the feature, you could see block state via /sys/kernel/debug/zram/zram0/block_state". The output is as follows:: - 300 75.033841 .wh. - 301 63.806904 s... - 302 63.806919 ..hi + 300 75.033841 .wh.. + 301 63.806904 s.... + 302 63.806919 ..hi. + 303 62.801919 ....r First column zram's block index. @@ -430,6 +431,8 @@ Third column huge page i: idle page + r: + recompressed page (secondary compression algorithm) First line of above example says 300th block is accessed at 75.033841sec and the block's state is huge so it is written back to the backing diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 97300b3a83c3..ddbfa70ef9a3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -936,13 +936,14 @@ static ssize_t read_block_state(struct file *file, char __user *buf, ts = ktime_to_timespec64(zram->table[index].ac_time); copied = snprintf(kbuf + written, count, - "%12zd %12lld.%06lu %c%c%c%c\n", + "%12zd %12lld.%06lu %c%c%c%c%c\n", index, (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC, zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', - zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.'); + zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.', + zram_get_priority(zram, index) ? 'r' : '.'); if (count <= copied) { zram_slot_unlock(zram, index); From 9fda785dbd14cfc7d874d00d2b007cb143aa48d0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:40 +0900 Subject: [PATCH 150/313] zram: clarify writeback_store() comment Re-phrase writeback BIO error comment. Link: https://lkml.kernel.org/r/20221109115047.2921851-7-senozhatsky@chromium.org Reported-by: Andrew Morton Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index ddbfa70ef9a3..0ca0bf330d8f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -769,8 +769,12 @@ static ssize_t writeback_store(struct device *dev, zram_clear_flag(zram, index, ZRAM_IDLE); zram_slot_unlock(zram, index); /* - * Return last IO error unless every IO were - * not suceeded. + * BIO errors are not fatal, we continue and simply + * attempt to writeback the remaining objects (pages). + * At the same time we need to signal user-space that + * some writes (at least one, but also could be all of + * them) were not successful and we do so by returning + * the most recent BIO error. */ ret = err; continue; From f24ee92cbe13242758635e654b2422dbf4912e4b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:41 +0900 Subject: [PATCH 151/313] zram: use IS_ERR_VALUE() to check for zs_malloc() errors Avoid typecasts that are needed for IS_ERR() and use IS_ERR_VALUE() instead. Link: https://lkml.kernel.org/r/20221109115047.2921851-8-senozhatsky@chromium.org Suggested-by: Andrew Morton Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0ca0bf330d8f..473fc5eb71a1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1542,19 +1542,19 @@ compress_again: * if we have a 'non-null' handle here then we are coming * from the slow path and handle has already been allocated. */ - if (IS_ERR((void *)handle)) + if (IS_ERR_VALUE(handle)) handle = zs_malloc(zram->mem_pool, comp_len, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | __GFP_HIGHMEM | __GFP_MOVABLE); - if (IS_ERR((void *)handle)) { + if (IS_ERR_VALUE(handle)) { zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP]); atomic64_inc(&zram->stats.writestall); handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE); - if (IS_ERR((void *)handle)) + if (IS_ERR_VALUE(handle)) return PTR_ERR((void *)handle); if (comp_len != PAGE_SIZE) From 7c2af309abd24ff4e313246bf9b68f398d95c871 Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Wed, 9 Nov 2022 20:50:42 +0900 Subject: [PATCH 152/313] zram: add size class equals check into recompression It makes no sense for us to recompress the object if it will be in the same size class. We anyway don't get any memory gain. But, at the same time, we get a CPU time overhead when inserting this object into zspage and decompressing it afterwards. [senozhatsky: rebased and fixed conflicts] Link: https://lkml.kernel.org/r/20221109115047.2921851-9-senozhatsky@chromium.org Signed-off-by: Alexey Romanov Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 11 ++++++++++- include/linux/zsmalloc.h | 2 ++ mm/zsmalloc.c | 21 +++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 473fc5eb71a1..66659f16f6c8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1672,6 +1672,8 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, unsigned long handle_new; unsigned int comp_len_old; unsigned int comp_len_new; + unsigned int class_index_old; + unsigned int class_index_new; void *src, *dst; int ret; @@ -1690,6 +1692,7 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, if (ret) return ret; + class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old); /* * Iterate the secondary comp algorithms list (in order of priority) * and try to recompress the page. @@ -1715,9 +1718,13 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, return ret; } + class_index_new = zs_lookup_class_index(zram->mem_pool, + comp_len_new); + /* Continue until we make progress */ if (comp_len_new >= huge_class_size || comp_len_new >= comp_len_old || + class_index_new >= class_index_old || (threshold && comp_len_new >= threshold)) { zcomp_stream_put(zram->comps[prio]); continue; @@ -1740,7 +1747,9 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, * that would save memory, mark the object as incompressible so that * we will not try to compress it again. */ - if (comp_len_new >= huge_class_size || comp_len_new >= comp_len_old) { + if (comp_len_new >= huge_class_size || + comp_len_new >= comp_len_old || + class_index_new >= class_index_old) { zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE); return 0; } diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 2a430e713ce5..a48cd0ffe57d 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -55,5 +55,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle); unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); + void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b52b7bb88b52..78feda34ad9a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1205,6 +1205,27 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage) return get_zspage_inuse(zspage) == class->objs_per_zspage; } +/** + * zs_lookup_class_index() - Returns index of the zsmalloc &size_class + * that hold objects of the provided size. + * @pool: zsmalloc pool to use + * @size: object size + * + * Context: Any context. + * + * Return: the index of the zsmalloc &size_class that hold objects of the + * provided size. + */ +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size) +{ + struct size_class *class; + + class = pool->size_class[get_size_class_index(size)]; + + return class->index; +} +EXPORT_SYMBOL_GPL(zs_lookup_class_index); + unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); From 4942cf6ad07c487d24112ffbb27362f4e6b409b8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:43 +0900 Subject: [PATCH 153/313] zram: remove redundant checks from zram_recompress() Size class index comparison is powerful enough so we can remove object size comparisons. Link: https://lkml.kernel.org/r/20221109115047.2921851-10-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 66659f16f6c8..72beb33366fb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1722,9 +1722,7 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, comp_len_new); /* Continue until we make progress */ - if (comp_len_new >= huge_class_size || - comp_len_new >= comp_len_old || - class_index_new >= class_index_old || + if (class_index_new >= class_index_old || (threshold && comp_len_new >= threshold)) { zcomp_stream_put(zram->comps[prio]); continue; @@ -1747,9 +1745,7 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, * that would save memory, mark the object as incompressible so that * we will not try to compress it again. */ - if (comp_len_new >= huge_class_size || - comp_len_new >= comp_len_old || - class_index_new >= class_index_old) { + if (class_index_new >= class_index_old) { zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE); return 0; } From a55cf9648d3de486ccf0eca980a02f0faff8ec45 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:44 +0900 Subject: [PATCH 154/313] zram: add algo parameter support to zram_recompress() Recompression iterates through all the registered secondary compression algorithms in order of their priorities so that we have higher chances of finding the algorithm that compresses a particular page. This, however, may not always be best approach and sometimes we may want to limit recompression to only one particular algorithm. For instance, when a higher priority algorithm uses too much power and device has a relatively low battery level we may want to limit recompression to use only a lower priority algorithm, which uses less power. Introduce algo= parameter support to recompression sysfs knob so that user-sapce can request recompression with particular algorithm only: echo "type=idle algo=zstd" > /sys/block/zramX/recompress Link: https://lkml.kernel.org/r/20221109115047.2921851-11-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 54 +++++++++++++++++++++++++++++------ drivers/block/zram/zram_drv.h | 1 + 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 72beb33366fb..798c421fdd36 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1674,6 +1674,7 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, unsigned int comp_len_new; unsigned int class_index_old; unsigned int class_index_new; + u32 num_recomps = 0; void *src, *dst; int ret; @@ -1708,6 +1709,7 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, if (prio <= zram_get_priority(zram, index)) continue; + num_recomps++; zstrm = zcomp_stream_get(zram->comps[prio]); src = kmap_atomic(page); ret = zcomp_compress(zstrm, src, &comp_len_new); @@ -1740,13 +1742,19 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, if (!zstrm) return 0; - /* - * All secondary algorithms failed to re-compress the page in a way - * that would save memory, mark the object as incompressible so that - * we will not try to compress it again. - */ if (class_index_new >= class_index_old) { - zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE); + /* + * Secondary algorithms failed to re-compress the page + * in a way that would save memory, mark the object as + * incompressible so that we will not try to compress + * it again. + * + * We need to make sure that all secondary algorithms have + * failed, so we test if the number of recompressions matches + * the number of active secondary algorithms. + */ + if (num_recomps == zram->num_active_comps - 1) + zram_set_flag(zram, index, ZRAM_INCOMPRESSIBLE); return 0; } @@ -1795,10 +1803,11 @@ static ssize_t recompress_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS; struct zram *zram = dev_to_zram(dev); - u32 mode = 0, threshold = 0, prio = ZRAM_SECONDARY_COMP; unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; - char *args, *param, *val; + char *args, *param, *val, *algo = NULL; + u32 mode = 0, threshold = 0; unsigned long index; struct page *page; ssize_t ret; @@ -1830,6 +1839,11 @@ static ssize_t recompress_store(struct device *dev, return ret; continue; } + + if (!strcmp(param, "algo")) { + algo = val; + continue; + } } if (threshold >= PAGE_SIZE) @@ -1841,6 +1855,26 @@ static ssize_t recompress_store(struct device *dev, goto release_init_lock; } + if (algo) { + bool found = false; + + for (; prio < ZRAM_MAX_COMPS; prio++) { + if (!zram->comp_algs[prio]) + continue; + + if (!strcmp(zram->comp_algs[prio], algo)) { + prio_max = min(prio + 1, ZRAM_MAX_COMPS); + found = true; + break; + } + } + + if (!found) { + ret = -EINVAL; + goto release_init_lock; + } + } + page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; @@ -1871,7 +1905,7 @@ static ssize_t recompress_store(struct device *dev, goto next; err = zram_recompress(zram, index, page, threshold, - prio, ZRAM_MAX_COMPS); + prio, prio_max); next: zram_slot_unlock(zram, index); if (err) { @@ -2107,6 +2141,7 @@ static void zram_destroy_comps(struct zram *zram) if (!comp) continue; zcomp_destroy(comp); + zram->num_active_comps--; } } @@ -2174,6 +2209,7 @@ static ssize_t disksize_store(struct device *dev, } zram->comps[prio] = comp; + zram->num_active_comps++; } zram->disksize = disksize; set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index b80faae76835..473325415a74 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -125,6 +125,7 @@ struct zram { */ u64 disksize; /* bytes */ const char *comp_algs[ZRAM_MAX_COMPS]; + s8 num_active_comps; /* * zram is claimed so open request will be failed */ From 443dd798062c1549e790539e572cbda4b7a8df30 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:45 +0900 Subject: [PATCH 155/313] documentation: add zram recompression documentation Document user-space visible device attributes that are enabled by ZRAM_MULTI_COMP. Link: https://lkml.kernel.org/r/20221109115047.2921851-12-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 81 +++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 177a142c3146..d898b7ace33d 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -401,6 +401,87 @@ budget in next setting is user's job. If admin wants to measure writeback count in a certain period, they could know it via /sys/block/zram0/bd_stat's 3rd column. +recompression +------------- + +With CONFIG_ZRAM_MULTI_COMP, zram can recompress pages using alternative +(secondary) compression algorithms. The basic idea is that alternative +compression algorithm can provide better compression ratio at a price of +(potentially) slower compression/decompression speeds. Alternative compression +algorithm can, for example, be more successful compressing huge pages (those +that default algorithm failed to compress). Another application is idle pages +recompression - pages that are cold and sit in the memory can be recompressed +using more effective algorithm and, hence, reduce zsmalloc memory usage. + +With CONFIG_ZRAM_MULTI_COMP, zram supports up to 4 compression algorithms: +one primary and up to 3 secondary ones. Primary zram compressor is explained +in "3) Select compression algorithm", secondary algorithms are configured +using recomp_algorithm device attribute. + +Example::: + + #show supported recompression algorithms + cat /sys/block/zramX/recomp_algorithm + #1: lzo lzo-rle lz4 lz4hc [zstd] + #2: lzo lzo-rle lz4 [lz4hc] zstd + +Alternative compression algorithms are sorted by priority. In the example +above, zstd is used as the first alternative algorithm, which has priority +of 1, while lz4hc is configured as a compression algorithm with priority 2. +Alternative compression algorithm's priority is provided during algorithms +configuration::: + + #select zstd recompression algorithm, priority 1 + echo "algo=zstd priority=1" > /sys/block/zramX/recomp_algorithm + + #select deflate recompression algorithm, priority 2 + echo "algo=deflate priority=2" > /sys/block/zramX/recomp_algorithm + +Another device attribute that CONFIG_ZRAM_MULTI_COMP enables is recompress, +which controls recompression. + +Examples::: + + #IDLE pages recompression is activated by `idle` mode + echo "type=idle" > /sys/block/zramX/recompress + + #HUGE pages recompression is activated by `huge` mode + echo "type=huge" > /sys/block/zram0/recompress + + #HUGE_IDLE pages recompression is activated by `huge_idle` mode + echo "type=huge_idle" > /sys/block/zramX/recompress + +The number of idle pages can be significant, so user-space can pass a size +threshold (in bytes) to the recompress knob: zram will recompress only pages +of equal or greater size::: + + #recompress all pages larger than 3000 bytes + echo "threshold=3000" > /sys/block/zramX/recompress + + #recompress idle pages larger than 2000 bytes + echo "type=idle threshold=2000" > /sys/block/zramX/recompress + +Recompression of idle pages requires memory tracking. + +During re-compression for every page, that matches re-compression criteria, +ZRAM iterates the list of registered alternative compression algorithms in +order of their priorities. ZRAM stops either when re-compression was +successful (re-compressed object is smaller in size than the original one) +and matches re-compression criteria (e.g. size threshold) or when there are +no secondary algorithms left to try. If none of the secondary algorithms can +successfully re-compressed the page such a page is marked as incompressible, +so ZRAM will not attempt to re-compress it in the future. + +This re-compression behaviour, when it iterates through the list of +registered compression algorithms, increases our chances of finding the +algorithm that successfully compresses a particular page. Sometimes, however, +it is convenient (and sometimes even necessary) to limit recompression to +only one particular algorithm so that it will not try any other algorithms. +This can be achieved by providing a algo=NAME parameter::: + + #use zstd algorithm only (if registered) + echo "type=huge algo=zstd" > /sys/block/zramX/recompress + memory tracking =============== From b46f9ea3cb351587b2cfc68f7211f7a7cc5b6673 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:46 +0900 Subject: [PATCH 156/313] zram: add incompressible writeback Add support for incompressible pages writeback: echo incompressible > /sys/block/zramX/writeback Link: https://lkml.kernel.org/r/20221109115047.2921851-13-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 7 ++++++- drivers/block/zram/zram_drv.c | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index d898b7ace33d..f14c8c2e42f3 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -348,8 +348,13 @@ this can be accomplished with:: echo huge_idle > /sys/block/zramX/writeback +If a user chooses to writeback only incompressible pages (pages that none of +algorithms can compress) this can be accomplished with:: + + echo incompressible > /sys/block/zramX/writeback + If an admin wants to write a specific page in zram device to the backing device, -they could write a page index into the interface. +they could write a page index into the interface:: echo "page_index=1251" > /sys/block/zramX/writeback diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 798c421fdd36..25b7ff2b56bf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -645,10 +645,10 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec, #define PAGE_WB_SIG "page_index=" -#define PAGE_WRITEBACK 0 -#define HUGE_WRITEBACK (1<<0) -#define IDLE_WRITEBACK (1<<1) - +#define PAGE_WRITEBACK 0 +#define HUGE_WRITEBACK (1<<0) +#define IDLE_WRITEBACK (1<<1) +#define INCOMPRESSIBLE_WRITEBACK (1<<2) static ssize_t writeback_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -669,6 +669,8 @@ static ssize_t writeback_store(struct device *dev, mode = HUGE_WRITEBACK; else if (sysfs_streq(buf, "huge_idle")) mode = IDLE_WRITEBACK | HUGE_WRITEBACK; + else if (sysfs_streq(buf, "incompressible")) + mode = INCOMPRESSIBLE_WRITEBACK; else { if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) return -EINVAL; @@ -731,11 +733,15 @@ static ssize_t writeback_store(struct device *dev, goto next; if (mode & IDLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_IDLE)) + !zram_test_flag(zram, index, ZRAM_IDLE)) goto next; if (mode & HUGE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_HUGE)) + !zram_test_flag(zram, index, ZRAM_HUGE)) goto next; + if (mode & INCOMPRESSIBLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + goto next; + /* * Clearing ZRAM_UNDER_WB is duty of caller. * IOW, zram_free_page never clear it. From 77db7bb56bd711586243924a5582727f7a93fb7f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Nov 2022 20:50:47 +0900 Subject: [PATCH 157/313] zram: add incompressible flag to read_block_state() Add a new flag to zram block state that shows if the page is incompressible: that none of the algorithm (including secondary ones) could compress it. Link: https://lkml.kernel.org/r/20221109115047.2921851-14-senozhatsky@chromium.org Suggested-by: Minchan Kim Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Cc: Nhat Pham Cc: Nitin Gupta Cc: Suleiman Souhlal Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 11 +++++++---- drivers/block/zram/zram_drv.c | 6 ++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index f14c8c2e42f3..e4551579cb12 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -497,10 +497,11 @@ pages of the process with*pagemap. If you enable the feature, you could see block state via /sys/kernel/debug/zram/zram0/block_state". The output is as follows:: - 300 75.033841 .wh.. - 301 63.806904 s.... - 302 63.806919 ..hi. - 303 62.801919 ....r + 300 75.033841 .wh... + 301 63.806904 s..... + 302 63.806919 ..hi.. + 303 62.801919 ....r. + 304 146.781902 ..hi.n First column zram's block index. @@ -519,6 +520,8 @@ Third column idle page r: recompressed page (secondary compression algorithm) + n: + none (including secondary) of algorithms could compress it First line of above example says 300th block is accessed at 75.033841sec and the block's state is huge so it is written back to the backing diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 25b7ff2b56bf..9d33801e8ba8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -946,14 +946,16 @@ static ssize_t read_block_state(struct file *file, char __user *buf, ts = ktime_to_timespec64(zram->table[index].ac_time); copied = snprintf(kbuf + written, count, - "%12zd %12lld.%06lu %c%c%c%c%c\n", + "%12zd %12lld.%06lu %c%c%c%c%c%c\n", index, (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC, zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.', zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.', - zram_get_priority(zram, index) ? 'r' : '.'); + zram_get_priority(zram, index) ? 'r' : '.', + zram_test_flag(zram, index, + ZRAM_INCOMPRESSIBLE) ? 'n' : '.'); if (count <= copied) { zram_slot_unlock(zram, index); From c959a0e8de2c4db6ca6cc8f490223e2e1e58934b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 15 Nov 2022 11:03:14 +0900 Subject: [PATCH 158/313] Docs/ABI/zram: document zram recompress sysfs knobs Document zram re-compression sysfs knobs. Link: https://lkml.kernel.org/r/20221115020314.386235-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-block-zram | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram index 14b2bf2e5105..628a00fb20a9 100644 --- a/Documentation/ABI/testing/sysfs-block-zram +++ b/Documentation/ABI/testing/sysfs-block-zram @@ -137,3 +137,17 @@ Description: The writeback_limit file is read-write and specifies the maximum amount of writeback ZRAM can do. The limit could be changed in run time. + +What: /sys/block/zram/recomp_algorithm +Date: November 2022 +Contact: Sergey Senozhatsky +Description: + The recomp_algorithm file is read-write and allows to set + or show secondary compression algorithms. + +What: /sys/block/zram/recompress +Date: November 2022 +Contact: Sergey Senozhatsky +Description: + The recompress file is write-only and triggers re-compression + with secondary compression algorithms. From c66b6ead74ffdad8659eb829468343a88afc2f2c Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Wed, 9 Nov 2022 14:56:46 +0530 Subject: [PATCH 159/313] mm/kfence: remove hung_task cruft commit fdf756f71271 ("sched: Fix more TASK_state comparisons") makes hung_task not to monitor TASK_IDLE tasks. The special handling to workaround hung_task warnings is not required anymore. Link: https://lkml.kernel.org/r/1667986006-25420-1-git-send-email-quic_pkondeti@quicinc.com Signed-off-by: Pavankumar Kondeti Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- mm/kfence/core.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 141788858b70..08f5bd6fc36d 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -799,16 +798,7 @@ static void toggle_allocation_gate(struct work_struct *work) /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - if (sysctl_hung_task_timeout_secs) { - /* - * During low activity with no allocations we might wait a - * while; let's avoid the hung task warning. - */ - wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), - sysctl_hung_task_timeout_secs * HZ / 2); - } else { - wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); - } + wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); From 16fd6b31dd9b24acf83d439a73a41c4138199424 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 9 Nov 2022 16:40:27 +0800 Subject: [PATCH 160/313] Revert "mm: migration: fix the FOLL_GET failure on following huge page" Revert commit 831568214883 ("mm: migration: fix the FOLL_GET failure on following huge page"), since after commit 1a6baaa0db73 ("s390/hugetlb: switch to generic version of follow_huge_pud()") and commit 57a196a58421 ("hugetlb: simplify hugetlb handling in follow_page_mask") were merged, now all the following huge page routines can support FOLL_GET operation. Link: https://lkml.kernel.org/r/496786039852aba90ffa68f10d0df3f4236a990b.1667983080.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Haiyue Wang Cc: Baolin Wang Cc: "Huang, Ying" Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/migrate.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 4aea647a0180..4eccf4e1da2c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1899,7 +1899,6 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, for (i = 0; i < nr_pages; i++) { unsigned long addr = (unsigned long)(*pages); - unsigned int foll_flags = FOLL_DUMP; struct vm_area_struct *vma; struct page *page; int err = -EFAULT; @@ -1908,12 +1907,8 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!vma) goto set_status; - /* Not all huge page follow APIs support 'FOLL_GET' */ - if (!is_vm_hugetlb_page(vma)) - foll_flags |= FOLL_GET; - /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, foll_flags); + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1926,8 +1921,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!is_zone_device_page(page)) err = page_to_nid(page); - if (foll_flags & FOLL_GET) - put_page(page); + put_page(page); set_status: *status = err; From 49f51859221a3dfee27488eaeaff800459cac6a9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 9 Nov 2022 09:23:47 +0800 Subject: [PATCH 161/313] migrate: convert unmap_and_move() to use folios Patch series "migrate: convert migrate_pages()/unmap_and_move() to use folios", v2. The conversion is quite straightforward, just replace the page API to the corresponding folio API. migrate_pages() and unmap_and_move() mostly work with folios (head pages) only. This patch (of 2): Quite straightforward, the page functions are converted to corresponding folio functions. Same for comments. Link: https://lkml.kernel.org/r/20221109012348.93849-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20221109012348.93849-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Yang Shi Reviewed-by: Zi Yan Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Baolin Wang Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/migrate.c | 54 ++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 4eccf4e1da2c..e41a3cd24a59 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1150,79 +1150,79 @@ out: } /* - * Obtain the lock on page, remove all ptes and migrate the page - * to the newly allocated page in newpage. + * Obtain the lock on folio, remove all ptes and migrate the folio + * to the newly allocated folio in dst. */ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct page *page, + unsigned long private, struct folio *src, int force, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) { - struct folio *dst, *src = page_folio(page); + struct folio *dst; int rc = MIGRATEPAGE_SUCCESS; struct page *newpage = NULL; - if (!thp_migration_supported() && PageTransHuge(page)) + if (!thp_migration_supported() && folio_test_transhuge(src)) return -ENOSYS; - if (page_count(page) == 1) { - /* Page was freed from under us. So we are done. */ - ClearPageActive(page); - ClearPageUnevictable(page); + if (folio_ref_count(src) == 1) { + /* Folio was freed from under us. So we are done. */ + folio_clear_active(src); + folio_clear_unevictable(src); /* free_pages_prepare() will clear PG_isolated. */ goto out; } - newpage = get_new_page(page, private); + newpage = get_new_page(&src->page, private); if (!newpage) return -ENOMEM; dst = page_folio(newpage); - newpage->private = 0; + dst->private = 0; rc = __unmap_and_move(src, dst, force, mode); if (rc == MIGRATEPAGE_SUCCESS) - set_page_owner_migrate_reason(newpage, reason); + set_page_owner_migrate_reason(&dst->page, reason); out: if (rc != -EAGAIN) { /* - * A page that has been migrated has all references - * removed and will be freed. A page that has not been + * A folio that has been migrated has all references + * removed and will be freed. A folio that has not been * migrated will have kept its references and be restored. */ - list_del(&page->lru); + list_del(&src->lru); } /* * If migration is successful, releases reference grabbed during - * isolation. Otherwise, restore the page to right list unless + * isolation. Otherwise, restore the folio to right list unless * we want to retry. */ if (rc == MIGRATEPAGE_SUCCESS) { /* - * Compaction can migrate also non-LRU pages which are + * Compaction can migrate also non-LRU folios which are * not accounted to NR_ISOLATED_*. They can be recognized - * as __PageMovable + * as __folio_test_movable */ - if (likely(!__PageMovable(page))) - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_lru(page), -thp_nr_pages(page)); + if (likely(!__folio_test_movable(src))) + mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + + folio_is_file_lru(src), -folio_nr_pages(src)); if (reason != MR_MEMORY_FAILURE) /* - * We release the page in page_handle_poison. + * We release the folio in page_handle_poison. */ - put_page(page); + folio_put(src); } else { if (rc != -EAGAIN) - list_add_tail(&page->lru, ret); + list_add_tail(&src->lru, ret); if (put_new_page) - put_new_page(newpage, private); + put_new_page(&dst->page, private); else - put_page(newpage); + folio_put(dst); } return rc; @@ -1459,7 +1459,7 @@ thp_subpage_migration: &ret_pages); else rc = unmap_and_move(get_new_page, put_new_page, - private, page, pass > 2, mode, + private, page_folio(page), pass > 2, mode, reason, &ret_pages); /* * The rules are: From eaec4e639f11413ce75fbf38affd1aa5c40979e9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 9 Nov 2022 09:23:48 +0800 Subject: [PATCH 162/313] migrate: convert migrate_pages() to use folios Quite straightforward, the page functions are converted to corresponding folio functions. Same for comments. THP specific code are converted to be large folio. Link: https://lkml.kernel.org/r/20221109012348.93849-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Zi Yan Cc: Yang Shi Cc: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 210 +++++++++++++++++++++++++++------------------------ 1 file changed, 112 insertions(+), 98 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index e41a3cd24a59..4aa3b6d4f67c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1373,231 +1373,245 @@ out: return rc; } -static inline int try_split_thp(struct page *page, struct list_head *split_pages) +static inline int try_split_folio(struct folio *folio, struct list_head *split_folios) { int rc; - lock_page(page); - rc = split_huge_page_to_list(page, split_pages); - unlock_page(page); + folio_lock(folio); + rc = split_folio_to_list(folio, split_folios); + folio_unlock(folio); if (!rc) - list_move_tail(&page->lru, split_pages); + list_move_tail(&folio->lru, split_folios); return rc; } /* - * migrate_pages - migrate the pages specified in a list, to the free pages + * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration * - * @from: The list of pages to be migrated. - * @get_new_page: The function used to allocate free pages to be used - * as the target of the page migration. - * @put_new_page: The function used to free target pages if migration + * @from: The list of folios to be migrated. + * @get_new_page: The function used to allocate free folios to be used + * as the target of the folio migration. + * @put_new_page: The function used to free target folios if migration * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for - * page migration, if any. - * @reason: The reason for page migration. - * @ret_succeeded: Set to the number of normal pages migrated successfully if + * folio migration, if any. + * @reason: The reason for folio migration. + * @ret_succeeded: Set to the number of folios migrated successfully if * the caller passes a non-NULL pointer. * - * The function returns after 10 attempts or if no pages are movable any more - * because the list has become empty or no retryable pages exist any more. - * It is caller's responsibility to call putback_movable_pages() to return pages + * The function returns after 10 attempts or if no folios are movable any more + * because the list has become empty or no retryable folios exist any more. + * It is caller's responsibility to call putback_movable_pages() to return folios * to the LRU or free list only if ret != 0. * - * Returns the number of {normal page, THP, hugetlb} that were not migrated, or - * an error code. The number of THP splits will be considered as the number of - * non-migrated THP, no matter how many subpages of the THP are migrated successfully. + * Returns the number of {normal folio, large folio, hugetlb} that were not + * migrated, or an error code. The number of large folio splits will be + * considered as the number of non-migrated large folio, no matter how many + * split folios of the large folio are migrated successfully. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int retry = 1; + int large_retry = 1; int thp_retry = 1; int nr_failed = 0; int nr_failed_pages = 0; int nr_retry_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; + int nr_large_failed = 0; int nr_thp_failed = 0; int nr_thp_split = 0; int pass = 0; + bool is_large = false; bool is_thp = false; - struct page *page; - struct page *page2; - int rc, nr_subpages; - LIST_HEAD(ret_pages); - LIST_HEAD(thp_split_pages); + struct folio *folio, *folio2; + int rc, nr_pages; + LIST_HEAD(ret_folios); + LIST_HEAD(split_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); - bool no_subpage_counting = false; + bool no_split_folio_counting = false; trace_mm_migrate_pages_start(mode, reason); -thp_subpage_migration: - for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { +split_folio_migration: + for (pass = 0; pass < 10 && (retry || large_retry); pass++) { retry = 0; + large_retry = 0; thp_retry = 0; nr_retry_pages = 0; - list_for_each_entry_safe(page, page2, from, lru) { + list_for_each_entry_safe(folio, folio2, from, lru) { /* - * THP statistics is based on the source huge page. - * Capture required information that might get lost - * during migration. + * Large folio statistics is based on the source large + * folio. Capture required information that might get + * lost during migration. */ - is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = compound_nr(page); + is_large = folio_test_large(folio) && !folio_test_hugetlb(folio); + is_thp = is_large && folio_test_pmd_mappable(folio); + nr_pages = folio_nr_pages(folio); cond_resched(); - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) rc = unmap_and_move_huge_page(get_new_page, - put_new_page, private, page, - pass > 2, mode, reason, - &ret_pages); + put_new_page, private, + &folio->page, pass > 2, mode, + reason, + &ret_folios); else rc = unmap_and_move(get_new_page, put_new_page, - private, page_folio(page), pass > 2, mode, - reason, &ret_pages); + private, folio, pass > 2, mode, + reason, &ret_folios); /* * The rules are: - * Success: non hugetlb page will be freed, hugetlb - * page will be put back + * Success: non hugetlb folio will be freed, hugetlb + * folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * -ENOSYS: stay on the from list - * Other errno: put on ret_pages list then splice to + * Other errno: put on ret_folios list then splice to * from list */ switch(rc) { /* - * THP migration might be unsupported or the - * allocation could've failed so we should - * retry on the same page with the THP split - * to base pages. + * Large folio migration might be unsupported or + * the allocation could've failed so we should retry + * on the same folio with the large folio split + * to normal folios. * - * Sub-pages are put in thp_split_pages, and + * Split folios are put in split_folios, and * we will migrate them after the rest of the * list is processed. */ case -ENOSYS: - /* THP migration is unsupported */ - if (is_thp) { - nr_thp_failed++; - if (!try_split_thp(page, &thp_split_pages)) { - nr_thp_split++; + /* Large folio migration is unsupported */ + if (is_large) { + nr_large_failed++; + nr_thp_failed += is_thp; + if (!try_split_folio(folio, &split_folios)) { + nr_thp_split += is_thp; break; } /* Hugetlb migration is unsupported */ - } else if (!no_subpage_counting) { + } else if (!no_split_folio_counting) { nr_failed++; } - nr_failed_pages += nr_subpages; - list_move_tail(&page->lru, &ret_pages); + nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, &ret_folios); break; case -ENOMEM: /* * When memory is low, don't bother to try to migrate - * other pages, just exit. + * other folios, just exit. */ - if (is_thp) { - nr_thp_failed++; - /* THP NUMA faulting doesn't split THP to retry. */ + if (is_large) { + nr_large_failed++; + nr_thp_failed += is_thp; + /* Large folio NUMA faulting doesn't split to retry. */ if (!nosplit) { - int ret = try_split_thp(page, &thp_split_pages); + int ret = try_split_folio(folio, &split_folios); if (!ret) { - nr_thp_split++; + nr_thp_split += is_thp; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { /* - * Try again to split THP to mitigate - * the failure of longterm pinning. + * Try again to split large folio to + * mitigate the failure of longterm pinning. */ - thp_retry++; - nr_retry_pages += nr_subpages; + large_retry++; + thp_retry += is_thp; + nr_retry_pages += nr_pages; break; } } - } else if (!no_subpage_counting) { + } else if (!no_split_folio_counting) { nr_failed++; } - nr_failed_pages += nr_subpages + nr_retry_pages; + nr_failed_pages += nr_pages + nr_retry_pages; /* - * There might be some subpages of fail-to-migrate THPs - * left in thp_split_pages list. Move them back to migration + * There might be some split folios of fail-to-migrate large + * folios left in split_folios list. Move them back to migration * list so that they could be put back to the right list by - * the caller otherwise the page refcnt will be leaked. + * the caller otherwise the folio refcnt will be leaked. */ - list_splice_init(&thp_split_pages, from); + list_splice_init(&split_folios, from); /* nr_failed isn't updated for not used */ + nr_large_failed += large_retry; nr_thp_failed += thp_retry; goto out; case -EAGAIN: - if (is_thp) - thp_retry++; - else if (!no_subpage_counting) + if (is_large) { + large_retry++; + thp_retry += is_thp; + } else if (!no_split_folio_counting) { retry++; - nr_retry_pages += nr_subpages; + } + nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: - nr_succeeded += nr_subpages; - if (is_thp) - nr_thp_succeeded++; + nr_succeeded += nr_pages; + nr_thp_succeeded += is_thp; break; default: /* * Permanent failure (-EBUSY, etc.): - * unlike -EAGAIN case, the failed page is - * removed from migration page list and not + * unlike -EAGAIN case, the failed folio is + * removed from migration folio list and not * retried in the next outer loop. */ - if (is_thp) - nr_thp_failed++; - else if (!no_subpage_counting) + if (is_large) { + nr_large_failed++; + nr_thp_failed += is_thp; + } else if (!no_split_folio_counting) { nr_failed++; + } - nr_failed_pages += nr_subpages; + nr_failed_pages += nr_pages; break; } } } nr_failed += retry; + nr_large_failed += large_retry; nr_thp_failed += thp_retry; nr_failed_pages += nr_retry_pages; /* - * Try to migrate subpages of fail-to-migrate THPs, no nr_failed - * counting in this round, since all subpages of a THP is counted - * as 1 failure in the first round. + * Try to migrate split folios of fail-to-migrate large folios, no + * nr_failed counting in this round, since all split folios of a + * large folio is counted as 1 failure in the first round. */ - if (!list_empty(&thp_split_pages)) { + if (!list_empty(&split_folios)) { /* - * Move non-migrated pages (after 10 retries) to ret_pages + * Move non-migrated folios (after 10 retries) to ret_folios * to avoid migrating them again. */ - list_splice_init(from, &ret_pages); - list_splice_init(&thp_split_pages, from); - no_subpage_counting = true; + list_splice_init(from, &ret_folios); + list_splice_init(&split_folios, from); + no_split_folio_counting = true; retry = 1; - goto thp_subpage_migration; + goto split_folio_migration; } - rc = nr_failed + nr_thp_failed; + rc = nr_failed + nr_large_failed; out: /* - * Put the permanent failure page back to migration list, they + * Put the permanent failure folio back to migration list, they * will be put back to the right list by the caller. */ - list_splice(&ret_pages, from); + list_splice(&ret_folios, from); /* - * Return 0 in case all subpages of fail-to-migrate THPs are - * migrated successfully. + * Return 0 in case all split folios of fail-to-migrate large folios + * are migrated successfully. */ if (list_empty(from)) rc = 0; From 4a625ceee8a0ab0273534cb6b432ce6b331db5ee Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 10 Nov 2022 07:07:51 +0400 Subject: [PATCH 163/313] mm/demotion: fix NULL vs IS_ERR checking in memory_tier_init alloc_memory_type() returns error pointers on error instead of NULL. Use IS_ERR() to check the return value to fix this. Link: https://lkml.kernel.org/r/20221110030751.1627266-1-linmq006@gmail.com Fixes: 7b88bda3761b ("mm/demotion/dax/kmem: set node's abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE") Signed-off-by: Miaoqian Lin Reviewed-by: "Huang, Ying" Cc: Aneesh Kumar K.V Cc: Wei Xu Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 939e200c283b..c734658c6242 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -645,7 +645,7 @@ static int __init memory_tier_init(void) * than default DRAM tier. */ default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM); - if (!default_dram_type) + if (IS_ERR(default_dram_type)) panic("%s() failed to allocate default DRAM tier\n", __func__); /* From 9b34a307f39497198645de5e43f3f00b5e873249 Mon Sep 17 00:00:00 2001 From: Jian Wen Date: Fri, 11 Nov 2022 11:46:39 +0800 Subject: [PATCH 164/313] docs: admin-guide: cgroup-v1: update description of inactive_file MADV_FREE pages have been moved into the LRU_INACTIVE_FILE list by commit f7ad2a6cb9f7 ("mm: move MADV_FREE pages into LRU_INACTIVE_FILE list"). Link: https://lkml.kernel.org/r/20221111034639.3593380-1-wenjian1@xiaomi.com Signed-off-by: Jian Wen Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 5b86245450bd..60370f2c67b9 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -543,7 +543,8 @@ inactive_anon # of bytes of anonymous and swap cache memory on inactive LRU list. active_anon # of bytes of anonymous and swap cache memory on active LRU list. -inactive_file # of bytes of file-backed memory on inactive LRU list. +inactive_file # of bytes of file-backed memory and MADV_FREE anonymous memory( + LazyFree pages) on inactive LRU list. active_file # of bytes of file-backed memory on active LRU list. unevictable # of bytes of memory that cannot be reclaimed (mlocked etc). =============== =============================================================== From 25e9fa22fbfec1e58c955d2670bf9a18f4ebe9ef Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Tue, 15 Nov 2022 01:14:26 +0800 Subject: [PATCH 165/313] mm/kmemleak.c: fix a comment I noticed a typo in a code comment and I fixed it. Link: https://lkml.kernel.org/r/20221114171426.91745-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Signed-off-by: Andrew Morton --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 646e2979641f..267332904354 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1461,7 +1461,7 @@ static void scan_gray_list(void) } /* - * Conditionally call resched() in a object iteration loop while making sure + * Conditionally call resched() in an object iteration loop while making sure * that the given object won't go away without RCU read lock by performing a * get_object() if !pinned. * From c2da319c2e2789dccb20fdafe520ac61c9df84f7 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Sun, 13 Nov 2022 19:04:47 -0500 Subject: [PATCH 166/313] mm/uffd: sanity check write bit for uffd-wp protected ptes Let's add one sanity check for CONFIG_DEBUG_VM on the write bit in whatever chance we have when walking through the pgtables. It can bring the error earlier even before the app notices the data was corrupted on the snapshot. Also it helps us to identify this is a wrong pgtable setup, so hopefully a great information to have for debugging too. Link: https://lkml.kernel.org/r/20221114000447.1681003-3-peterx@redhat.com Signed-off-by: Peter Xu Cc: Andrea Arcangeli Cc: Alistair Popple Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 286a71810f9e..0564edd24ffb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -292,7 +292,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP static inline int pte_uffd_wp(pte_t pte) { - return pte_flags(pte) & _PAGE_UFFD_WP; + bool wp = pte_flags(pte) & _PAGE_UFFD_WP; + +#ifdef CONFIG_DEBUG_VM + /* + * Having write bit for wr-protect-marked present ptes is fatal, + * because it means the uffd-wp bit will be ignored and write will + * just go through. + * + * Use any chance of pgtable walking to verify this (e.g., when + * page swapped out or being migrated for all purposes). It means + * something is already wrong. Tell the admin even before the + * process crashes. We also nail it with wrong pgtable setup. + */ + WARN_ON_ONCE(wp && pte_write(pte)); +#endif + + return wp; } static inline pte_t pte_mkuffd_wp(pte_t pte) From 369258ce41c6d7663a7b6d509356fecad577378d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 14 Nov 2022 15:55:07 -0800 Subject: [PATCH 167/313] hugetlb: remove duplicate mmu notifications The common hugetlb unmap routine __unmap_hugepage_range performs mmu notification calls. However, in the case where __unmap_hugepage_range is called via __unmap_hugepage_range_final, mmu notification calls are performed earlier in other calling routines. Remove mmu notification calls from __unmap_hugepage_range. Add notification calls to the only other caller: unmap_hugepage_range. unmap_hugepage_range is called for truncation and hole punch, so change notification type from UNMAP to CLEAR as this is more appropriate. Link: https://lkml.kernel.org/r/20221114235507.294320-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: Peter Xu Cc: Wei Chen Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Mina Almasry Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/hugetlb.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4f1338d82aab..3fd4570fb8b0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5076,7 +5076,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - struct mmu_notifier_range range; unsigned long last_addr_mask; bool force_flush = false; @@ -5091,13 +5090,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); - /* - * If sharing possible, alert mmu notifiers of worst case. - */ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, - end); - adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); - mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { @@ -5182,7 +5174,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct if (ref_page) break; } - mmu_notifier_invalidate_range_end(&range); tlb_end_vma(tlb, vma); /* @@ -5210,6 +5201,7 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); + /* mmu notification performed in caller */ __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ @@ -5234,10 +5226,18 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { + struct mmu_notifier_range range; struct mmu_gather tlb; + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + start, end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); + + mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } From b7217a0bbe00a98a8f4b15ebc2a8355a31a59e1e Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Mon, 14 Nov 2022 23:59:49 +0000 Subject: [PATCH 168/313] mm: shrinkers: add missing includes for undeclared types The shrinker.h header depends on a user including other headers before it for types used by shrinker.h. Fix this by including the appropriate headers in shrinker.h. ./include/linux/shrinker.h:13:9: error: unknown type name `gfp_t' 13 | gfp_t gfp_mask; | ^~~~~ ./include/linux/shrinker.h:71:26: error: field `list' has incomplete type 71 | struct list_head list; | ^~~~ ./include/linux/shrinker.h:82:9: error: unknown type name `atomic_long_t' 82 | atomic_long_t *nr_deferred; | Link: https://lkml.kernel.org/r/20221114235949.201749-1-tjmercier@google.com Fixes: 83aeeada7c69 ("vmscan: use atomic-long for shrinker batching") Fixes: b0d40c92adaf ("superblock: introduce per-sb cache shrinker infrastructure") Signed-off-by: T.J. Mercier Cc: Al Viro Cc: Dave Chinner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 08e6054e061f..71310efe2fab 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -2,6 +2,9 @@ #ifndef _LINUX_SHRINKER_H #define _LINUX_SHRINKER_H +#include +#include + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. From d09e8ca6cb93bb4b97517a18fbbf7eccb0e9ff43 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 15 Nov 2022 02:06:01 +0000 Subject: [PATCH 169/313] mm: anonymous shared memory naming Since commit 9a10064f5625 ("mm: add a field to store names for private anonymous memory"), name for private anonymous memory, but not shared anonymous, can be set. However, naming shared anonymous memory just as useful for tracking purposes. Extend the functionality to be able to set names for shared anon. There are two ways to create anonymous shared memory, using memfd or directly via mmap(): 1. fd = memfd_create(...) mem = mmap(..., MAP_SHARED, fd, ...) 2. mem = mmap(..., MAP_SHARED | MAP_ANONYMOUS, -1, ...) In both cases the anonymous shared memory is created the same way by mapping an unlinked file on tmpfs. The memfd way allows to give a name for anonymous shared memory, but not useful when parts of shared memory require to have distinct names. Example use case: The VMM maps VM memory as anonymous shared memory (not private because VMM is sandboxed and drivers are running in their own processes). However, the VM tells back to the VMM how parts of the memory are actually used by the guest, how each of the segments should be backed (i.e. 4K pages, 2M pages), and some other information about the segments. The naming allows us to monitor the effective memory footprint for each of these segments from the host without looking inside the guest. Sample output: /* Create shared anonymous segmenet */ anon_shmem = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); /* Name the segment: "MY-NAME" */ rv = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, anon_shmem, SIZE, "MY-NAME"); cat /proc//maps (and smaps): 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 [anon_shmem:MY-NAME] If the segment is not named, the output is: 7fc8e2b4c000-7fc8f2b4c000 rw-s 00000000 00:01 1024 /dev/zero (deleted) Link: https://lkml.kernel.org/r/20221115020602.804224-1-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Acked-by: David Hildenbrand Cc: Arnd Bergmann Cc: Bagas Sanjaya Cc: Colin Cross Cc: Hugh Dickins Cc: Johannes Weiner Cc: Jonathan Corbet Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Paul Gortmaker Cc: Peter Xu Cc: Sean Christopherson Cc: Vincent Whitchurch Cc: Vlastimil Babka Cc: xu xin Cc: Yang Shi Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 8 +++++--- fs/proc/task_mmu.c | 15 +++++++++++---- include/linux/mm.h | 2 ++ include/linux/mm_types.h | 26 ++++++++++++-------------- mm/madvise.c | 7 ++----- mm/shmem.c | 29 +++++++++++++++++++++++++---- 6 files changed, 57 insertions(+), 30 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 898c99eae8e4..b8f175ae4853 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -426,14 +426,16 @@ with the memory region, as the case would be with BSS (uninitialized data). The "pathname" shows the name associated file for this mapping. If the mapping is not associated with a file: - ============= ==================================== + =================== =========================================== [heap] the heap of the program [stack] the stack of the main process [vdso] the "virtual dynamic shared object", the kernel system call handler - [anon:] an anonymous mapping that has been + [anon:] a private anonymous mapping that has been named by userspace - ============= ==================================== + [anon_shmem:] an anonymous shared memory mapping that has + been named by userspace + =================== =========================================== or if empty, the mapping is anonymous. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a74cdcc9af0..89338950afd3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -274,6 +274,7 @@ static void show_vma_header_prefix(struct seq_file *m, static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { + struct anon_vma_name *anon_name = NULL; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; vm_flags_t flags = vma->vm_flags; @@ -293,6 +294,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); + if (mm) + anon_name = anon_vma_name(vma); /* * Print the dentry name for named mappings, and a @@ -300,7 +303,14 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) */ if (file) { seq_pad(m, ' '); - seq_file_path(m, file, "\n"); + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ..., use the provided name. + */ + if (anon_name) + seq_printf(m, "[anon_shmem:%s]", anon_name->name); + else + seq_file_path(m, file, "\n"); goto done; } @@ -312,8 +322,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { - struct anon_vma_name *anon_name; - if (!mm) { name = "[vdso]"; goto done; @@ -330,7 +338,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - anon_name = anon_vma_name(vma); if (anon_name) { seq_pad(m, ' '); seq_printf(m, "[anon:%s]", anon_name->name); diff --git a/include/linux/mm.h b/include/linux/mm.h index f873441303b7..686879dbb0bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -700,8 +700,10 @@ static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); +bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b0009e7d4ae..157c2e22cc7f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -549,21 +549,11 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. * - * For private anonymous mappings, a pointer to a null terminated string - * containing the name given to the vma, or NULL if unnamed. */ - - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* - * Serialized by mmap_sem. Never use directly because it is - * valid only when vm_file is NULL. Use anon_vma_name instead. - */ - struct anon_vma_name *anon_name; - }; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -584,6 +574,14 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_sem. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif diff --git a/mm/madvise.c b/mm/madvise.c index b913ba6efc10..83b0c91a126b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -95,9 +95,6 @@ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); - if (vma->vm_file) - return NULL; - return vma->anon_name; } @@ -183,7 +180,7 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; - if (!vma->vm_file) { + if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; @@ -1273,7 +1270,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, int error; /* Only anonymous mappings can be named */ - if (vma->vm_file) + if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, diff --git a/mm/shmem.c b/mm/shmem.c index 7428ae3fa4b9..f418d21205be 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -237,11 +237,17 @@ static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; +bool vma_is_anon_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_anon_vm_ops; +} + bool vma_is_shmem(struct vm_area_struct *vma) { - return vma->vm_ops == &shmem_vm_ops; + return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); @@ -2263,7 +2269,8 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { - struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); int ret; ret = seal_check_future_write(info->seals, vma); @@ -2274,7 +2281,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_MTE_ALLOWED; file_accessed(file); - vma->vm_ops = &shmem_vm_ops; + /* This is anonymous shared memory if it is unlinked at the time of mmap */ + if (inode->i_nlink) + vma->vm_ops = &shmem_vm_ops; + else + vma->vm_ops = &shmem_anon_vm_ops; return 0; } @@ -3988,6 +3999,15 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; +static const struct vm_operations_struct shmem_anon_vm_ops = { + .fault = shmem_fault, + .map_pages = filemap_map_pages, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif +}; + int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; @@ -4163,6 +4183,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops +#define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 @@ -4268,7 +4289,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; - vma->vm_ops = &shmem_vm_ops; + vma->vm_ops = &shmem_anon_vm_ops; return 0; } From e83b39d6bbdb6d25bd6f5c258832774635d29b47 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Nov 2022 13:32:55 +0100 Subject: [PATCH 170/313] mm: make drop_caches keep reclaiming on all nodes Currently, drop_caches are reclaiming node-by-node, looping on each node until reclaim could not make progress. This can however leave quite some slab entries (such as filesystem inodes) unreclaimed if objects say on node 1 keep objects on node 0 pinned. So move the "loop until no progress" loop to the node-by-node iteration to retry reclaim also on other nodes if reclaim on some nodes made progress. This fixes problem when drop_caches was not reclaiming lots of otherwise perfectly fine to reclaim inodes. Link: https://lkml.kernel.org/r/20221115123255.12559-1-jack@suse.cz Signed-off-by: Jan Kara Reported-by: You Zhou Reported-by: Pengfei Xu Tested-by: Pengfei Xu Reviewed-by: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton --- mm/vmscan.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index d7c71be6417d..82f32c929b11 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1021,31 +1021,34 @@ out: return freed; } -static void drop_slab_node(int nid) +static unsigned long drop_slab_node(int nid) { - unsigned long freed; - int shift = 0; + unsigned long freed = 0; + struct mem_cgroup *memcg = NULL; + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct mem_cgroup *memcg = NULL; + freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - if (fatal_signal_pending(current)) - return; - - freed = 0; - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - } while ((freed >> shift++) > 1); + return freed; } void drop_slab(void) { int nid; + int shift = 0; + unsigned long freed; - for_each_online_node(nid) - drop_slab_node(nid); + do { + freed = 0; + for_each_online_node(nid) { + if (fatal_signal_pending(current)) + return; + + freed += drop_slab_node(nid); + } + } while ((freed >> shift++) > 1); } static int reclaimer_offset(void) From dbaf7dc97ab8d526a20d3477419bc14b4890a82c Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 7 Nov 2022 09:56:59 +0800 Subject: [PATCH 171/313] hugetlbfs: inode: remove unnecessary (void*) conversions The ei pointer does not need to cast the type. Link: https://lkml.kernel.org/r/20221107015659.3221-1-zeming@nfschina.com Signed-off-by: Li zeming Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3ee84604e36d..790d2727141a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1279,7 +1279,7 @@ static const struct address_space_operations hugetlbfs_aops = { static void init_once(void *foo) { - struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + struct hugetlbfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } From eff6aa17aa7c06c25c0df80060cd0fe621dac276 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Sun, 13 Nov 2022 08:38:45 +0800 Subject: [PATCH 172/313] selftests/damon: fix unnecessary compilation warnings When testing overflow and overread, there is no need to keep unnecessary compilation warnings, we should simply ignore them. The motivation for this patch is to eliminate the compilation warning, maybe one day we will compile the kernel with "-Werror -Wall", at which point this compilation warning will turn into a compilation error, we should fix this error in advance. How to reproduce the problem (with gcc-11.3.1): $ make -C tools/testing/selftests/ ... warning: `write' reading 4294967295 bytes from a region of size 1 [-Wstringop-overread] warning: `read' writing 4294967295 bytes into a region of size 25 overflows the destination [-Wstringop-overflow=] "-Wno-stringop-overread" is supported at least in gcc-11.1.0. Link: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=d14c547abd484d3540b692bb8048c4a6efe92c8b Link: https://lkml.kernel.org/r/tencent_51C4ACA8CB3895C2D7F35178440283602107@qq.com Signed-off-by: Rong Tao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/huge_count_read_write.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c index ad7a6b4cf338..a6fe0689f88d 100644 --- a/tools/testing/selftests/damon/huge_count_read_write.c +++ b/tools/testing/selftests/damon/huge_count_read_write.c @@ -8,6 +8,13 @@ #include #include +#pragma GCC diagnostic push +#if __GNUC__ >= 11 && __GNUC_MINOR__ >= 1 +/* Ignore read(2) overflow and write(2) overread compile warnings */ +#pragma GCC diagnostic ignored "-Wstringop-overread" +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif + void write_read_with_huge_count(char *file) { int filedesc = open(file, O_RDWR); @@ -27,6 +34,8 @@ void write_read_with_huge_count(char *file) close(filedesc); } +#pragma GCC diagnostic pop + int main(int argc, char *argv[]) { if (argc != 2) { From 53b2d09bdd12092a7341c08b6b863560db62fa57 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 16 Nov 2022 16:07:09 -0400 Subject: [PATCH 173/313] mm/gup: remove the restriction on locked with FOLL_LONGTERM This restriction was created because FOLL_LONGTERM used to scan the vma list, so it could not tolerate becoming unlocked. That was fixed in commit 52650c8b466b ("mm/gup: remove the vma allocation from gup_longterm_locked()") and the restriction on !vma was removed. However, the locked restriction remained, even though it isn't necessary anymore. Adjust __gup_longterm_locked() so it can handle the mmap_read_lock() becoming unlocked while it is looping for migration. Migration does not require the mmap_read_sem because it is only handling struct pages. If we had to unlock then ensure the whole thing returns unlocked. Remove __get_user_pages_remote() and __gup_longterm_unlocked(). These cases can now just directly call other functions. Link: https://lkml.kernel.org/r/0-v1-b9ae39aa8884+14dbb-gup_longterm_locked_jgg@nvidia.com Signed-off-by: Jason Gunthorpe Reviewed-by: John Hubbard Cc: Alistair Popple Cc: John Hubbard Signed-off-by: Andrew Morton --- mm/gup.c | 109 ++++++++++++++----------------------------------------- 1 file changed, 27 insertions(+), 82 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 6b16aecf5d2c..2500d00db51b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2049,14 +2049,19 @@ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, + int *locked, unsigned int gup_flags) { + bool must_unlock = false; unsigned int flags; long rc, nr_pinned_pages; + if (locked && WARN_ON_ONCE(!*locked)) + return -EINVAL; + if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, gup_flags); + locked, gup_flags); /* * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM @@ -2070,8 +2075,13 @@ static long __gup_longterm_locked(struct mm_struct *mm, return -EINVAL; flags = memalloc_pin_save(); do { + if (locked && !*locked) { + mmap_read_lock(mm); + must_unlock = true; + *locked = 1; + } nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, NULL, + pages, vmas, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; @@ -2081,6 +2091,10 @@ static long __gup_longterm_locked(struct mm_struct *mm, } while (rc == -EAGAIN); memalloc_pin_restore(flags); + if (locked && *locked && must_unlock) { + mmap_read_unlock(mm); + *locked = 0; + } return rc ? rc : nr_pinned_pages; } @@ -2104,35 +2118,6 @@ static bool is_valid_gup_flags(unsigned int gup_flags) } #ifdef CONFIG_MMU -static long __get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) -{ - /* - * Parts of FOLL_LONGTERM behavior are incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. However, this only comes up if locked is set, and there are - * callers that do request FOLL_LONGTERM, but do not set locked. So, - * allow what we can. - */ - if (gup_flags & FOLL_LONGTERM) { - if (WARN_ON_ONCE(locked)) - return -EINVAL; - /* - * This will check the vmas (even if our vmas arg is NULL) - * and return -ENOTSUPP if DAX isn't allowed in this case: - */ - return __gup_longterm_locked(mm, start, nr_pages, pages, - vmas, gup_flags | FOLL_TOUCH | - FOLL_REMOTE); - } - - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); -} - /** * get_user_pages_remote() - pin user pages in memory * @mm: mm_struct of target mm @@ -2201,8 +2186,8 @@ long get_user_pages_remote(struct mm_struct *mm, if (!is_valid_gup_flags(gup_flags)) return -EINVAL; - return __get_user_pages_remote(mm, start, nr_pages, gup_flags, - pages, vmas, locked); + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2214,14 +2199,6 @@ long get_user_pages_remote(struct mm_struct *mm, { return 0; } - -static long __get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) -{ - return 0; -} #endif /* !CONFIG_MMU */ /** @@ -2248,7 +2225,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, return -EINVAL; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, gup_flags | FOLL_TOUCH); + pages, vmas, NULL, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -2274,18 +2251,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - mmap_read_lock(mm); - ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); + ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked, + gup_flags | FOLL_TOUCH); if (locked) mmap_read_unlock(mm); return ret; @@ -2879,29 +2847,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -static int __gup_longterm_unlocked(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - int ret; - - /* - * FIXME: FOLL_LONGTERM does not work with - * get_user_pages_unlocked() (see comments in that function) - */ - if (gup_flags & FOLL_LONGTERM) { - mmap_read_lock(current->mm); - ret = __gup_longterm_locked(current->mm, - start, nr_pages, - pages, NULL, gup_flags); - mmap_read_unlock(current->mm); - } else { - ret = get_user_pages_unlocked(start, nr_pages, - pages, gup_flags); - } - - return ret; -} - static unsigned long lockless_pages_from_mm(unsigned long start, unsigned long end, unsigned int gup_flags, @@ -2985,8 +2930,8 @@ static int internal_get_user_pages_fast(unsigned long start, /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; - ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, - pages); + ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages, + gup_flags); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so @@ -3185,9 +3130,9 @@ long pin_user_pages_remote(struct mm_struct *mm, if (WARN_ON_ONCE(!pages)) return -EINVAL; - gup_flags |= FOLL_PIN; - return __get_user_pages_remote(mm, start, nr_pages, gup_flags, - pages, vmas, locked); + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_PIN | FOLL_TOUCH | + FOLL_REMOTE); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -3221,7 +3166,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, gup_flags |= FOLL_PIN; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, gup_flags); + pages, vmas, NULL, gup_flags); } EXPORT_SYMBOL(pin_user_pages); From 749477244b05be0d9b6dcc10c161bfa4c4749d78 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 16 Nov 2022 14:19:22 +0100 Subject: [PATCH 174/313] mm: Kconfig: make config SECRETMEM visible with EXPERT Commit 6a108a14fa35 ("kconfig: rename CONFIG_EMBEDDED to CONFIG_EXPERT") introduces CONFIG_EXPERT to carry the previous intent of CONFIG_EMBEDDED and just gives that intent a much better name. That has been clearly a good and long overdue renaming, and it is clearly an improvement to the kernel build configuration that has shown to help managing the kernel build configuration in the last decade. However, rather than bravely and radically just deleting CONFIG_EMBEDDED, this commit gives CONFIG_EMBEDDED a new intended semantics, but keeps it open for future contributors to implement that intended semantics: A new CONFIG_EMBEDDED option is added that automatically selects CONFIG_EXPERT when enabled and can be used in the future to isolate options that should only be considered for embedded systems (RISC architectures, SLOB, etc). Since then, this CONFIG_EMBEDDED implicitly had two purposes: - It can make even more options visible beyond what CONFIG_EXPERT makes visible. In other words, it may introduce another level of enabling the visibility of configuration options: always visible, visible with CONFIG_EXPERT and visible with CONFIG_EMBEDDED. - Set certain default values of some configurations differently, following the assumption that configuring a kernel build for an embedded system generally starts with a different set of default values compared to kernel builds for all other kind of systems. Considering the second purpose, note that already probably arguing that a kernel build for an embedded system would choose some values differently is already tricky: the set of embedded systems with Linux kernels is already quite diverse. Many embedded system have powerful CPUs and it would not be clear that all embedded systems just optimize towards one specific aspect, e.g., a smaller kernel image size. So, it is unclear if starting with "one set of default configuration" that is induced by CONFIG_EMBEDDED is a good offer for developers configuring their kernels. Also, the differences of needed user-space features in an embedded system compared to a non-embedded system are probably difficult or even impossible to name in some generic way. So it is not surprising that in the last decade hardly anyone has contributed changes to make something default differently in case of CONFIG_EMBEDDED=y. Currently, in v6.0-rc4, SECRETMEM is the only config switched off if CONFIG_EMBEDDED=y. As long as that is actually the only option that currently is selected or deselected, it is better to just make SECRETMEM configurable at build time by experts using menuconfig instead. Make SECRETMEM configurable when EXPERT is set and otherwise default to yes. Further, SECRETMEM needs ARCH_HAS_SET_DIRECT_MAP. This allows us to remove CONFIG_EMBEDDED in the close future. Link: https://lkml.kernel.org/r/20221116131922.25533-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Acked-by: Mike Rapoport Acked-by: Arnd Bergmann Reviewed-by: Masahiro Yamada Signed-off-by: Andrew Morton --- mm/Kconfig | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index c86b69aff7d4..4e8a2697f28d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1074,7 +1074,13 @@ config IO_MAPPING bool config SECRETMEM - def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + default y + bool "Enable memfd_secret() system call" if EXPERT + depends on ARCH_HAS_SET_DIRECT_MAP + help + Enable the memfd_secret() system call with the ability to create + memory areas visible only in the context of the owning process and + not mapped to other processes and other kernel page tables. config ANON_VMA_NAME bool "Anonymous VMA name support" From 7aca5ca154930a06612f4d7b81f710f3e1027e04 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:40 +0100 Subject: [PATCH 175/313] selftests/vm: anon_cow: prepare for non-anonymous COW tests Patch series "mm/gup: remove FOLL_FORCE usage from drivers (reliable R/O long-term pinning)". For now, we did not support reliable R/O long-term pinning in COW mappings. That means, if we would trigger R/O long-term pinning in MAP_PRIVATE mapping, we could end up pinning the (R/O-mapped) shared zeropage or a pagecache page. The next write access would trigger a write fault and replace the pinned page by an exclusive anonymous page in the process page table; whatever the process would write to that private page copy would not be visible by the owner of the previous page pin: for example, RDMA could read stale data. The end result is essentially an unexpected and hard-to-debug memory corruption. Some drivers tried working around that limitation by using "FOLL_FORCE|FOLL_WRITE|FOLL_LONGTERM" for R/O long-term pinning for now. FOLL_WRITE would trigger a write fault, if required, and break COW before pinning the page. FOLL_FORCE is required because the VMA might lack write permissions, and drivers wanted to make that working as well, just like one would expect (no write access, but still triggering a write access to break COW). However, that is not a practical solution, because (1) Drivers that don't stick to that undocumented and debatable pattern would still run into that issue. For example, VFIO only uses FOLL_LONGTERM for R/O long-term pinning. (2) Using FOLL_WRITE just to work around a COW mapping + page pinning limitation is unintuitive. FOLL_WRITE would, for example, mark the page softdirty or trigger uffd-wp, even though, there actually isn't going to be any write access. (3) The purpose of FOLL_FORCE is debug access, not access without lack of VMA permissions by arbitrarty drivers. So instead, make R/O long-term pinning work as expected, by breaking COW in a COW mapping early, such that we can remove any FOLL_FORCE usage from drivers and make FOLL_FORCE ptrace-specific (renaming it to FOLL_PTRACE). More details in patch #8. This patch (of 19): Originally, the plan was to have a separate tests for testing COW of non-anonymous (e.g., shared zeropage) pages. Turns out, that we'd need a lot of similar functionality and that there isn't a really good reason to separate it. So let's prepare for non-anon tests by renaming to "cow". Link: https://lkml.kernel.org/r/20221116102659.70287-1-david@redhat.com Link: https://lkml.kernel.org/r/20221116102659.70287-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Alexander Shishkin Cc: Alexander Viro Cc: Alex Williamson Cc: Andrea Arcangeli Cc: Andy Walls Cc: Anton Ivanov Cc: Arnaldo Carvalho de Melo Cc: Arnd Bergmann Cc: Bernard Metzler Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Benvenuti Cc: Christian Gmeiner Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Hansen Cc: David Airlie Cc: David S. Miller Cc: Dennis Dalessandro Cc: "Eric W . Biederman" Cc: Greg Kroah-Hartman Cc: Hans Verkuil Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Inki Dae Cc: Ivan Kokshaysky Cc: James Morris Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: Johannes Berg Cc: John Hubbard Cc: Kees Cook Cc: Kentaro Takeda Cc: Krzysztof Kozlowski Cc: Kyungmin Park Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Linus Torvalds Cc: Lucas Stach Cc: Marek Szyprowski Cc: Mark Rutland Cc: Matthew Wilcox Cc: Matt Turner Cc: Mauro Carvalho Chehab Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Namhyung Kim Cc: Nelson Escobar Cc: Nicholas Piggin Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Moore Cc: Peter Xu Cc: Peter Zijlstra Cc: Richard Henderson Cc: Richard Weinberger Cc: Russell King Cc: Serge Hallyn Cc: Seung-Woo Kim Cc: Shuah Khan Cc: Tetsuo Handa Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tomasz Figa Cc: Will Deacon Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/.gitignore | 2 +- tools/testing/selftests/vm/Makefile | 10 ++++---- tools/testing/selftests/vm/check_config.sh | 4 +-- .../selftests/vm/{anon_cow.c => cow.c} | 25 +++++++++++-------- tools/testing/selftests/vm/run_vmtests.sh | 2 +- 5 files changed, 24 insertions(+), 19 deletions(-) rename tools/testing/selftests/vm/{anon_cow.c => cow.c} (97%) diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore index 8a536c731e3c..ee8c41c998e6 100644 --- a/tools/testing/selftests/vm/.gitignore +++ b/tools/testing/selftests/vm/.gitignore @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -anon_cow +cow hugepage-mmap hugepage-mremap hugepage-shm diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 00920cb8b499..a4d764efd6e3 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -27,7 +27,7 @@ MAKEFLAGS += --no-builtin-rules CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) LDLIBS = -lrt -lpthread -TEST_GEN_FILES = anon_cow +TEST_GEN_FILES = cow TEST_GEN_FILES += compaction_test TEST_GEN_FILES += gup_test TEST_GEN_FILES += hmm-tests @@ -98,7 +98,7 @@ TEST_FILES += va_128TBswitch.sh include ../lib.mk -$(OUTPUT)/anon_cow: vm_util.c +$(OUTPUT)/cow: vm_util.c $(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c @@ -154,8 +154,8 @@ warn_32bit_failure: endif endif -# ANON_COW_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. -$(OUTPUT)/anon_cow: LDLIBS += $(ANON_COW_EXTRA_LIBS) +# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. +$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS) $(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap @@ -168,7 +168,7 @@ local_config.mk local_config.h: check_config.sh EXTRA_CLEAN += local_config.mk local_config.h -ifeq ($(ANON_COW_EXTRA_LIBS),) +ifeq ($(COW_EXTRA_LIBS),) all: warn_missing_liburing warn_missing_liburing: diff --git a/tools/testing/selftests/vm/check_config.sh b/tools/testing/selftests/vm/check_config.sh index 9a44c6520925..bcba3af0acea 100644 --- a/tools/testing/selftests/vm/check_config.sh +++ b/tools/testing/selftests/vm/check_config.sh @@ -21,11 +21,11 @@ $CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 if [ -f $tmpfile_o ]; then echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE - echo "ANON_COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE + echo "COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE else echo "// No liburing support found" > $OUTPUT_H_FILE echo "# No liburing support found, so:" > $OUTPUT_MKFILE - echo "ANON_COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE + echo "COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE fi rm ${tmpname}.* diff --git a/tools/testing/selftests/vm/anon_cow.c b/tools/testing/selftests/vm/cow.c similarity index 97% rename from tools/testing/selftests/vm/anon_cow.c rename to tools/testing/selftests/vm/cow.c index bbb251eb5025..d202bfd63585 100644 --- a/tools/testing/selftests/vm/anon_cow.c +++ b/tools/testing/selftests/vm/cow.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * COW (Copy On Write) tests for anonymous memory. + * COW (Copy On Write) tests. * * Copyright 2022, Red Hat, Inc. * @@ -986,7 +986,11 @@ struct test_case { test_fn fn; }; -static const struct test_case test_cases[] = { +/* + * Test cases that are specific to anonymous pages: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_test_cases[] = { /* * Basic COW tests for fork() without any GUP. If we miss to break COW, * either the child can observe modifications by the parent or the @@ -1104,7 +1108,7 @@ static const struct test_case test_cases[] = { }, }; -static void run_test_case(struct test_case const *test_case) +static void run_anon_test_case(struct test_case const *test_case) { int i; @@ -1125,15 +1129,17 @@ static void run_test_case(struct test_case const *test_case) hugetlbsizes[i]); } -static void run_test_cases(void) +static void run_anon_test_cases(void) { int i; - for (i = 0; i < ARRAY_SIZE(test_cases); i++) - run_test_case(&test_cases[i]); + ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); + + for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) + run_anon_test_case(&anon_test_cases[i]); } -static int tests_per_test_case(void) +static int tests_per_anon_test_case(void) { int tests = 2 + nr_hugetlbsizes; @@ -1144,7 +1150,6 @@ static int tests_per_test_case(void) int main(int argc, char **argv) { - int nr_test_cases = ARRAY_SIZE(test_cases); int err; pagesize = getpagesize(); @@ -1152,14 +1157,14 @@ int main(int argc, char **argv) detect_hugetlbsizes(); ksft_print_header(); - ksft_set_plan(nr_test_cases * tests_per_test_case()); + ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case()); gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) ksft_exit_fail_msg("opening pagemap failed\n"); - run_test_cases(); + run_anon_test_cases(); err = ksft_get_fail_cnt(); if (err) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 1fa783732296..54d7a822c2ce 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -186,6 +186,6 @@ fi run_test ./soft-dirty # COW tests for anonymous memory -run_test ./anon_cow +run_test ./cow exit $exitcode From f8664f3c4a08f799122e8f0a8093056a7b3fbc8d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:41 +0100 Subject: [PATCH 176/313] selftests/vm: cow: basic COW tests for non-anonymous pages Let's add basic tests for COW with non-anonymous pages in private mappings: write access should properly trigger COW and result in the private changes not being visible through other page mappings. Especially, add tests for: * Zeropage * Huge zeropage * Ordinary pagecache pages via memfd and tmpfile() * Hugetlb pages via memfd Fortunately, all tests pass. Link: https://lkml.kernel.org/r/20221116102659.70287-3-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/cow.c | 338 ++++++++++++++++++++++++++++++- 1 file changed, 337 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c index d202bfd63585..fb07bd44529c 100644 --- a/tools/testing/selftests/vm/cow.c +++ b/tools/testing/selftests/vm/cow.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "local_config.h" #ifdef LOCAL_CONFIG_HAVE_LIBURING @@ -35,6 +36,7 @@ static size_t thpsize; static int nr_hugetlbsizes; static size_t hugetlbsizes[10]; static int gup_fd; +static bool has_huge_zeropage; static void detect_thpsize(void) { @@ -64,6 +66,31 @@ static void detect_thpsize(void) close(fd); } +static void detect_huge_zeropage(void) +{ + int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", + O_RDONLY); + size_t enabled = 0; + char buf[15]; + int ret; + + if (fd < 0) + return; + + ret = pread(fd, buf, sizeof(buf), 0); + if (ret > 0 && ret < sizeof(buf)) { + buf[ret] = 0; + + enabled = strtoul(buf, NULL, 10); + if (enabled == 1) { + has_huge_zeropage = true; + ksft_print_msg("[INFO] huge zeropage is enabled\n"); + } + } + + close(fd); +} + static void detect_hugetlbsizes(void) { DIR *dir = opendir("/sys/kernel/mm/hugepages/"); @@ -1148,6 +1175,312 @@ static int tests_per_anon_test_case(void) return tests; } +typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); + +static void test_cow(char *mem, const char *smem, size_t size) +{ + char *old = malloc(size); + + /* Backup the original content. */ + memcpy(old, smem, size); + + /* Modify the page. */ + memset(mem, 0xff, size); + + /* See if we still read the old values via the other mapping. */ + ksft_test_result(!memcmp(smem, old, size), + "Other mapping not modified\n"); + free(old); +} + +static void run_with_zeropage(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, tmp; + + ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc); + + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Read from the page to populate the shared zeropage. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, pagesize); +munmap: + munmap(mem, pagesize); + if (smem != MAP_FAILED) + munmap(smem, pagesize); +} + +static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, *mmap_mem, *mmap_smem, tmp; + size_t mmap_size; + int ret; + + ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc); + + if (!has_huge_zeropage) { + ksft_test_result_skip("Huge zeropage not enabled\n"); + return; + } + + /* For alignment purposes, we need twice the thp size. */ + mmap_size = 2 * thpsize; + mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + mmap_smem = mmap(NULL, mmap_size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_smem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* We need a THP-aligned memory area. */ + mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); + smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1)); + + ret = madvise(mem, thpsize, MADV_HUGEPAGE); + ret |= madvise(smem, thpsize, MADV_HUGEPAGE); + if (ret) { + ksft_test_result_fail("MADV_HUGEPAGE failed\n"); + goto munmap; + } + + /* + * Read from the memory to populate the huge shared zeropage. Read from + * the first sub-page and test if we get another sub-page populated + * automatically. + */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + if (!pagemap_is_populated(pagemap_fd, mem + pagesize) || + !pagemap_is_populated(pagemap_fd, smem + pagesize)) { + ksft_test_result_skip("Did not get THPs populated\n"); + goto munmap; + } + + fn(mem, smem, thpsize); +munmap: + munmap(mmap_mem, mmap_size); + if (mmap_smem != MAP_FAILED) + munmap(mmap_smem, mmap_size); +} + +static void run_with_memfd(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, tmp; + int fd; + + ksft_print_msg("[RUN] %s ... with memfd\n", desc); + + fd = memfd_create("test", 0); + if (fd < 0) { + ksft_test_result_fail("memfd_create() failed\n"); + return; + } + + /* File consists of a single page filled with zeroes. */ + if (fallocate(fd, 0, 0, pagesize)) { + ksft_test_result_fail("fallocate() failed\n"); + goto close; + } + + /* Create a private mapping of the memfd. */ + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto close; + } + smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Fault the page in. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, pagesize); +munmap: + munmap(mem, pagesize); + if (smem != MAP_FAILED) + munmap(smem, pagesize); +close: + close(fd); +} + +static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) +{ + char *mem, *smem, tmp; + FILE *file; + int fd; + + ksft_print_msg("[RUN] %s ... with tmpfile\n", desc); + + file = tmpfile(); + if (!file) { + ksft_test_result_fail("tmpfile() failed\n"); + return; + } + + fd = fileno(file); + if (fd < 0) { + ksft_test_result_skip("fileno() failed\n"); + return; + } + + /* File consists of a single page filled with zeroes. */ + if (fallocate(fd, 0, 0, pagesize)) { + ksft_test_result_fail("fallocate() failed\n"); + goto close; + } + + /* Create a private mapping of the memfd. */ + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto close; + } + smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Fault the page in. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, pagesize); +munmap: + munmap(mem, pagesize); + if (smem != MAP_FAILED) + munmap(smem, pagesize); +close: + fclose(file); +} + +static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, + size_t hugetlbsize) +{ + int flags = MFD_HUGETLB; + char *mem, *smem, tmp; + int fd; + + ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc, + hugetlbsize / 1024); + + flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT; + + fd = memfd_create("test", flags); + if (fd < 0) { + ksft_test_result_skip("memfd_create() failed\n"); + return; + } + + /* File consists of a single page filled with zeroes. */ + if (fallocate(fd, 0, 0, hugetlbsize)) { + ksft_test_result_skip("need more free huge pages\n"); + goto close; + } + + /* Create a private mapping of the memfd. */ + mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, + 0); + if (mem == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + goto close; + } + smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + + /* Fault the page in. */ + tmp = *mem + *smem; + asm volatile("" : "+r" (tmp)); + + fn(mem, smem, hugetlbsize); +munmap: + munmap(mem, hugetlbsize); + if (mem != MAP_FAILED) + munmap(smem, hugetlbsize); +close: + close(fd); +} + +struct non_anon_test_case { + const char *desc; + non_anon_test_fn fn; +}; + +/* + * Test cases that target any pages in private mappings that are non anonymous: + * pages that may get shared via COW ndependent of fork(). This includes + * the shared zeropage(s), pagecache pages, ... + */ +static const struct non_anon_test_case non_anon_test_cases[] = { + /* + * Basic COW test without any GUP. If we miss to break COW, changes are + * visible via other private/shared mappings. + */ + { + "Basic COW", + test_cow, + }, +}; + +static void run_non_anon_test_case(struct non_anon_test_case const *test_case) +{ + int i; + + run_with_zeropage(test_case->fn, test_case->desc); + run_with_memfd(test_case->fn, test_case->desc); + run_with_tmpfile(test_case->fn, test_case->desc); + if (thpsize) + run_with_huge_zeropage(test_case->fn, test_case->desc); + for (i = 0; i < nr_hugetlbsizes; i++) + run_with_memfd_hugetlb(test_case->fn, test_case->desc, + hugetlbsizes[i]); +} + +static void run_non_anon_test_cases(void) +{ + int i; + + ksft_print_msg("[RUN] Non-anonymous memory tests in private mappings\n"); + + for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++) + run_non_anon_test_case(&non_anon_test_cases[i]); +} + +static int tests_per_non_anon_test_case(void) +{ + int tests = 3 + nr_hugetlbsizes; + + if (thpsize) + tests += 1; + return tests; +} + int main(int argc, char **argv) { int err; @@ -1155,9 +1488,11 @@ int main(int argc, char **argv) pagesize = getpagesize(); detect_thpsize(); detect_hugetlbsizes(); + detect_huge_zeropage(); ksft_print_header(); - ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case()); + ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() + + ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case()); gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); @@ -1165,6 +1500,7 @@ int main(int argc, char **argv) ksft_exit_fail_msg("opening pagemap failed\n"); run_anon_test_cases(); + run_non_anon_test_cases(); err = ksft_get_fail_cnt(); if (err) From 97713a3abe338bb6c968e77264edbb68eb8d932a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:42 +0100 Subject: [PATCH 177/313] selftests/vm: cow: R/O long-term pinning reliability tests for non-anon pages Let's test whether R/O long-term pinning is reliable for non-anonymous memory: when R/O long-term pinning a page, the expectation is that we break COW early before pinning, such that actual write access via the page tables won't break COW later and end up replacing the R/O-pinned page in the page table. Consequently, R/O long-term pinning in private mappings would only target exclusive anonymous pages. For now, all tests fail: # [RUN] R/O longterm GUP pin ... with shared zeropage not ok 151 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd not ok 152 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with tmpfile not ok 153 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with huge zeropage not ok 154 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (2048 kB) not ok 155 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (1048576 kB) not ok 156 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with shared zeropage not ok 157 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd not ok 158 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with tmpfile not ok 159 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with huge zeropage not ok 160 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (2048 kB) not ok 161 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (1048576 kB) not ok 162 Longterm R/O pin is reliable Link: https://lkml.kernel.org/r/20221116102659.70287-4-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/cow.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c index fb07bd44529c..73e05b52c49e 100644 --- a/tools/testing/selftests/vm/cow.c +++ b/tools/testing/selftests/vm/cow.c @@ -561,6 +561,7 @@ static void test_iouring_fork(char *mem, size_t size) #endif /* LOCAL_CONFIG_HAVE_LIBURING */ enum ro_pin_test { + RO_PIN_TEST, RO_PIN_TEST_SHARED, RO_PIN_TEST_PREVIOUSLY_SHARED, RO_PIN_TEST_RO_EXCLUSIVE, @@ -593,6 +594,8 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, } switch (test) { + case RO_PIN_TEST: + break; case RO_PIN_TEST_SHARED: case RO_PIN_TEST_PREVIOUSLY_SHARED: /* @@ -1193,6 +1196,16 @@ static void test_cow(char *mem, const char *smem, size_t size) free(old); } +static void test_ro_pin(char *mem, const char *smem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST, false); +} + +static void test_ro_fast_pin(char *mem, const char *smem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST, true); +} + static void run_with_zeropage(non_anon_test_fn fn, const char *desc) { char *mem, *smem, tmp; @@ -1433,7 +1446,7 @@ struct non_anon_test_case { }; /* - * Test cases that target any pages in private mappings that are non anonymous: + * Test cases that target any pages in private mappings that are not anonymous: * pages that may get shared via COW ndependent of fork(). This includes * the shared zeropage(s), pagecache pages, ... */ @@ -1446,6 +1459,19 @@ static const struct non_anon_test_case non_anon_test_cases[] = { "Basic COW", test_cow, }, + /* + * Take a R/O longterm pin. When modifying the page via the page table, + * the page content change must be visible via the pin. + */ + { + "R/O longterm GUP pin", + test_ro_pin, + }, + /* Same as above, but using GUP-fast. */ + { + "R/O longterm GUP-fast pin", + test_ro_fast_pin, + }, }; static void run_non_anon_test_case(struct non_anon_test_case const *test_case) From cdc5021cda194112bc0962d6a0e90b379968c504 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:43 +0100 Subject: [PATCH 178/313] mm: add early FAULT_FLAG_UNSHARE consistency checks For now, FAULT_FLAG_UNSHARE only applies to anonymous pages, which implies a COW mapping. Let's hide FAULT_FLAG_UNSHARE early if we're not dealing with a COW mapping, such that we treat it like a read fault as documented and don't have to worry about the flag throughout all fault handlers. While at it, centralize the check for mutual exclusion of FAULT_FLAG_UNSHARE and FAULT_FLAG_WRITE and just drop the check that either flag is set in the WP handler. Link: https://lkml.kernel.org/r/20221116102659.70287-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/huge_memory.c | 3 --- mm/hugetlb.c | 5 ----- mm/memory.c | 23 ++++++++++++++++++++--- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 29102e3ddf84..5eb702726a0e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1313,9 +1313,6 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); - if (is_huge_zero_pmd(orig_pmd)) goto fallback; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3fd4570fb8b0..3d381b26d553 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5316,9 +5316,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; - VM_BUG_ON(unshare && (flags & FOLL_WRITE)); - VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); - /* * hugetlb does not support FOLL_FORCE-style write faults that keep the * PTE mapped R/O such as maybe_mkwrite() would do. @@ -5328,8 +5325,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { - if (unlikely(unshare)) - return 0; set_huge_ptep_writable(vma, haddr, ptep); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 086cb3dd8608..07380ef935ac 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3343,9 +3343,6 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct folio *folio; - VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); - if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -5161,6 +5158,22 @@ static void lru_gen_exit_fault(void) } #endif /* CONFIG_LRU_GEN */ +static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, + unsigned int *flags) +{ + if (unlikely(*flags & FAULT_FLAG_UNSHARE)) { + if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE)) + return VM_FAULT_SIGSEGV; + /* + * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's + * just treat it like an ordinary read-fault otherwise. + */ + if (!is_cow_mapping(vma->vm_flags)) + *flags &= ~FAULT_FLAG_UNSHARE; + } + return 0; +} + /* * By the time we get here, we already hold the mm semaphore * @@ -5177,6 +5190,10 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, count_vm_event(PGFAULT); count_memcg_event_mm(vma->vm_mm, PGFAULT); + ret = sanitize_fault_flags(vma, &flags); + if (ret) + return ret; + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, flags & FAULT_FLAG_REMOTE)) From 79881fed6052a9ce00cfb63297832b9faacf8cf3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:44 +0100 Subject: [PATCH 179/313] mm: add early FAULT_FLAG_WRITE consistency checks Let's catch abuse of FAULT_FLAG_WRITE early, such that we don't have to care in all other handlers and might get "surprises" if we forget to do so. Write faults without VM_MAYWRITE don't make any sense, and our maybe_mkwrite() logic could have hidden such abuse for now. Write faults without VM_WRITE on something that is not a COW mapping is similarly broken, and e.g., do_wp_page() could end up placing an anonymous page into a shared mapping, which would be bad. This is a preparation for reliable R/O long-term pinning of pages in private mappings, whereby we want to make sure that we will never break COW in a read-only private mapping. Link: https://lkml.kernel.org/r/20221116102659.70287-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index 07380ef935ac..5e4df6b87016 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5170,6 +5170,14 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, */ if (!is_cow_mapping(vma->vm_flags)) *flags &= ~FAULT_FLAG_UNSHARE; + } else if (*flags & FAULT_FLAG_WRITE) { + /* Write faults on read-only mappings are impossible ... */ + if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) + return VM_FAULT_SIGSEGV; + /* ... and FOLL_FORCE only applies to COW mappings. */ + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && + !is_cow_mapping(vma->vm_flags))) + return VM_FAULT_SIGSEGV; } return 0; } From b9086fde6d44e8a95dc95b822bd87386129b832d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:45 +0100 Subject: [PATCH 180/313] mm: rework handling in do_wp_page() based on private vs. shared mappings We want to extent FAULT_FLAG_UNSHARE support to anything mapped into a COW mapping (pagecache page, zeropage, PFN, ...), not just anonymous pages. Let's prepare for that by handling shared mappings first such that we can handle private mappings last. While at it, use folio-based functions instead of page-based functions where we touch the code either way. Link: https://lkml.kernel.org/r/20221116102659.70287-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5e4df6b87016..5d4b42f1a8d6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3341,7 +3341,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; - struct folio *folio; + struct folio *folio = NULL; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) { @@ -3359,13 +3359,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) } vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); - if (!vmf->page) { - if (unlikely(unshare)) { - /* No anonymous page -> nothing to do. */ - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + /* + * Shared mapping: we are guaranteed to have VM_WRITE and + * FAULT_FLAG_WRITE set at this point. + */ + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -3373,20 +3372,19 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ - if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED)) + if (!vmf->page) return wp_pfn_shared(vmf); - - pte_unmap_unlock(vmf->pte, vmf->ptl); - return wp_page_copy(vmf); + return wp_page_shared(vmf); } + if (vmf->page) + folio = page_folio(vmf->page); + /* - * Take out anonymous pages first, anonymous shared vmas are - * not dirty accountable. + * Private mapping: create an exclusive anonymous page copy if reuse + * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. */ - folio = page_folio(vmf->page); - if (folio_test_anon(folio)) { + if (folio && folio_test_anon(folio)) { /* * If the page is exclusive to this process we must reuse the * page without further checks. @@ -3437,19 +3435,17 @@ reuse: /* No anonymous page -> nothing to do. */ pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; - } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED))) { - return wp_page_shared(vmf); } copy: /* * Ok, we need to copy. Oh, well.. */ - get_page(vmf->page); + if (folio) + folio_get(folio); pte_unmap_unlock(vmf->pte, vmf->ptl); #ifdef CONFIG_KSM - if (PageKsm(vmf->page)) + if (folio && folio_test_ksm(folio)) count_vm_event(COW_KSM); #endif return wp_page_copy(vmf); From aea06577a9005ca81c35196d6171cac346d3b251 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:46 +0100 Subject: [PATCH 181/313] mm: don't call vm_ops->huge_fault() in wp_huge_pmd()/wp_huge_pud() for private mappings If we already have a PMD/PUD mapped write-protected in a private mapping and we want to break COW either due to FAULT_FLAG_WRITE or FAULT_FLAG_UNSHARE, there is no need to inform the file system just like on the PTE path. Let's just split (->zap) + fallback in that case. This is a preparation for more generic FAULT_FLAG_UNSHARE support in COW mappings. Link: https://lkml.kernel.org/r/20221116102659.70287-8-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5d4b42f1a8d6..6cec0adab37f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4802,6 +4802,7 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + vm_fault_t ret; if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) && @@ -4809,11 +4810,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_WP); return do_huge_pmd_wp_page(vmf); } - if (vmf->vma->vm_ops->huge_fault) { - vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; + if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vmf->vma->vm_ops->huge_fault) { + ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } } /* COW or write-notify handled on pte level: split pmd. */ @@ -4839,14 +4842,17 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + vm_fault_t ret; + /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vmf->vma)) goto split; - if (vmf->vma->vm_ops->huge_fault) { - vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); - - if (!(ret & VM_FAULT_FALLBACK)) - return ret; + if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vmf->vma->vm_ops->huge_fault) { + ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } } split: /* COW or write-notify not handled on PUD level: split pud.*/ From 8d6a0ac09a16c026e1e2a03a61e12e95c48a25a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:47 +0100 Subject: [PATCH 182/313] mm: extend FAULT_FLAG_UNSHARE support to anything in a COW mapping Extend FAULT_FLAG_UNSHARE to break COW on anything mapped into a COW (i.e., private writable) mapping and adjust the documentation accordingly. FAULT_FLAG_UNSHARE will now also break COW when encountering the shared zeropage, a pagecache page, a PFNMAP, ... inside a COW mapping, by properly replacing the mapped page/pfn by a private copy (an exclusive anonymous page). Note that only do_wp_page() needs care: hugetlb_wp() already handles FAULT_FLAG_UNSHARE correctly. wp_huge_pmd()/wp_huge_pud() also handles it correctly, for example, splitting the huge zeropage on FAULT_FLAG_UNSHARE such that we can handle FAULT_FLAG_UNSHARE on the PTE level. This change is a requirement for reliable long-term R/O pinning in COW mappings. Link: https://lkml.kernel.org/r/20221116102659.70287-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 ++++---- mm/memory.c | 4 ---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 157c2e22cc7f..018b1c098173 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1039,9 +1039,9 @@ typedef struct { * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to unshare (and mark - * exclusive) a possibly shared anonymous page that is - * mapped R/O. + * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a + * COW mapping, making sure that an exclusive anon page is + * mapped after the fault. * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. * We should only access orig_pte if this flag set. * @@ -1066,7 +1066,7 @@ typedef struct { * * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal. * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when - * no existing R/O-mapped anonymous page is encountered. + * applied to mappings that are not COW mappings. */ enum fault_flag { FAULT_FLAG_WRITE = 1 << 0, diff --git a/mm/memory.c b/mm/memory.c index 6cec0adab37f..815d2ff05c62 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3431,10 +3431,6 @@ reuse: } wp_page_reuse(vmf); return VM_FAULT_WRITE; - } else if (unshare) { - /* No anonymous page -> nothing to do. */ - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; } copy: /* From 84209e87c6963f928194a890399e24e8ad299db1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:48 +0100 Subject: [PATCH 183/313] mm/gup: reliable R/O long-term pinning in COW mappings We already support reliable R/O pinning of anonymous memory. However, assume we end up pinning (R/O long-term) a pagecache page or the shared zeropage inside a writable private ("COW") mapping. The next write access will trigger a write-fault and replace the pinned page by an exclusive anonymous page in the process page tables to break COW: the pinned page no longer corresponds to the page mapped into the process' page table. Now that FAULT_FLAG_UNSHARE can break COW on anything mapped into a COW mapping, let's properly break COW first before R/O long-term pinning something that's not an exclusive anon page inside a COW mapping. FAULT_FLAG_UNSHARE will break COW and map an exclusive anon page instead that can get pinned safely. With this change, we can stop using FOLL_FORCE|FOLL_WRITE for reliable R/O long-term pinning in COW mappings. With this change, the new R/O long-term pinning tests for non-anonymous memory succeed: # [RUN] R/O longterm GUP pin ... with shared zeropage ok 151 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd ok 152 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with tmpfile ok 153 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with huge zeropage ok 154 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (2048 kB) ok 155 Longterm R/O pin is reliable # [RUN] R/O longterm GUP pin ... with memfd hugetlb (1048576 kB) ok 156 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with shared zeropage ok 157 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd ok 158 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with tmpfile ok 159 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with huge zeropage ok 160 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (2048 kB) ok 161 Longterm R/O pin is reliable # [RUN] R/O longterm GUP-fast pin ... with memfd hugetlb (1048576 kB) ok 162 Longterm R/O pin is reliable Note 1: We don't care about short-term R/O-pinning, because they have snapshot semantics: they are not supposed to observe modifications that happen after pinning. As one example, assume we start direct I/O to read from a page and store page content into a file: modifications to page content after starting direct I/O are not guaranteed to end up in the file. So even if we'd pin the shared zeropage, the end result would be as expected -- getting zeroes stored to the file. Note 2: For shared mappings we'll now always fallback to the slow path to lookup the VMA when R/O long-term pining. While that's the necessary price we have to pay right now, it's actually not that bad in practice: most FOLL_LONGTERM users already specify FOLL_WRITE, for example, along with FOLL_FORCE because they tried dealing with COW mappings correctly ... Note 3: For users that use FOLL_LONGTERM right now without FOLL_WRITE, such as VFIO, we'd now no longer pin the shared zeropage. Instead, we'd populate exclusive anon pages that we can pin. There was a concern that this could affect the memlock limit of existing setups. For example, a VM running with VFIO could run into the memlock limit and fail to run. However, we essentially had the same behavior already in commit 17839856fd58 ("gup: document and work around "COW can break either way" issue") which got merged into some enterprise distros, and there were not any such complaints. So most probably, we're fine. Link: https://lkml.kernel.org/r/20221116102659.70287-10-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Daniel Vetter Reviewed-by: Vlastimil Babka Reviewed-by: John Hubbard Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 ++++++++++++++++++++++++--- mm/gup.c | 10 +++++----- mm/huge_memory.c | 2 +- mm/hugetlb.c | 7 ++++--- 4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 686879dbb0bd..d8363ac34a7c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3149,8 +3149,12 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * Must be called with the (sub)page that's actually referenced via the * page table entry, which might not necessarily be the head page for a * PTE-mapped THP. + * + * If the vma is NULL, we're coming from the GUP-fast path and might have + * to fallback to the slow path just to lookup the vma. */ -static inline bool gup_must_unshare(unsigned int flags, struct page *page) +static inline bool gup_must_unshare(struct vm_area_struct *vma, + unsigned int flags, struct page *page) { /* * FOLL_WRITE is implicitly handled correctly as the page table entry @@ -3163,8 +3167,25 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) * Note: PageAnon(page) is stable until the page is actually getting * freed. */ - if (!PageAnon(page)) - return false; + if (!PageAnon(page)) { + /* + * We only care about R/O long-term pining: R/O short-term + * pinning does not have the semantics to observe successive + * changes through the process page tables. + */ + if (!(flags & FOLL_LONGTERM)) + return false; + + /* We really need the vma ... */ + if (!vma) + return true; + + /* + * ... because we only care about writable private ("COW") + * mappings where we have to break COW early. + */ + return is_cow_mapping(vma->vm_flags); + } /* Paired with a memory barrier in page_try_share_anon_rmap(). */ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP)) diff --git a/mm/gup.c b/mm/gup.c index 2500d00db51b..39c84a200f06 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -603,7 +603,7 @@ retry: } } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } @@ -2380,7 +2380,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, goto pte_unmap; } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2566,7 +2566,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } - if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2632,7 +2632,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2672,7 +2672,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5eb702726a0e..86a30041a2e1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1480,7 +1480,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) return NULL; - if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) + if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3d381b26d553..9d97c9a2a15d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6197,7 +6197,8 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, } } -static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, +static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma, + unsigned int flags, pte_t *pte, bool *unshare) { pte_t pteval = huge_ptep_get(pte); @@ -6209,7 +6210,7 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, return false; if (flags & FOLL_WRITE) return true; - if (gup_must_unshare(flags, pte_page(pteval))) { + if (gup_must_unshare(vma, flags, pte_page(pteval))) { *unshare = true; return true; } @@ -6338,7 +6339,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * directly from any kind of swap entries. */ if (absent || - __follow_hugetlb_must_fault(flags, pte, &unshare)) { + __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) { vm_fault_t ret; unsigned int fault_flags = 0; From b40656aa7d559adc1fe689396dc58b92a9a27286 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:49 +0100 Subject: [PATCH 184/313] RDMA/umem: remove FOLL_FORCE usage GUP now supports reliable R/O long-term pinning in COW mappings, such that we break COW early. MAP_SHARED VMAs only use the shared zeropage so far in one corner case (DAXFS file with holes), which can be ignored because GUP does not support long-term pinning in fsdax (see check_vma_flags()). Consequently, FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM is no longer required for reliable R/O long-term pinning: FOLL_LONGTERM is sufficient. So stop using FOLL_FORCE, which is really only for ptrace access. Link: https://lkml.kernel.org/r/20221116102659.70287-11-david@redhat.com Tested-by: Leon Romanovsky [over mlx4 and mlx5] Signed-off-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Leon Romanovsky Signed-off-by: Andrew Morton --- drivers/infiniband/core/umem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 86d479772fbc..755a9c57db6f 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -156,7 +156,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, struct mm_struct *mm; unsigned long npages; int pinned, ret; - unsigned int gup_flags = FOLL_WRITE; + unsigned int gup_flags = FOLL_LONGTERM; /* * If the combination of the addr and size requested for this memory @@ -210,8 +210,8 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, cur_base = addr & PAGE_MASK; - if (!umem->writable) - gup_flags |= FOLL_FORCE; + if (umem->writable) + gup_flags |= FOLL_WRITE; while (npages) { cond_resched(); @@ -219,7 +219,7 @@ struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - gup_flags | FOLL_LONGTERM, page_list); + gup_flags, page_list); if (pinned < 0) { ret = pinned; goto umem_release; From a9d0284033e974a355b806fdb5fbabf8301bcd16 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:50 +0100 Subject: [PATCH 185/313] RDMA/usnic: remove FOLL_FORCE usage GUP now supports reliable R/O long-term pinning in COW mappings, such that we break COW early. MAP_SHARED VMAs only use the shared zeropage so far in one corner case (DAXFS file with holes), which can be ignored because GUP does not support long-term pinning in fsdax (see check_vma_flags()). Consequently, FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM is no longer required for reliable R/O long-term pinning: FOLL_LONGTERM is sufficient. So stop using FOLL_FORCE, which is really only for ptrace access. Link: https://lkml.kernel.org/r/20221116102659.70287-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Christian Benvenuti Cc: Nelson Escobar Cc: Leon Romanovsky Signed-off-by: Andrew Morton --- drivers/infiniband/hw/usnic/usnic_uiom.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 67923ced6e2d..c301b3be9f30 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -85,6 +85,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, int dmasync, struct usnic_uiom_reg *uiomr) { struct list_head *chunk_list = &uiomr->chunk_list; + unsigned int gup_flags = FOLL_LONGTERM; struct page **page_list; struct scatterlist *sg; struct usnic_uiom_chunk *chunk; @@ -96,7 +97,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, int off; int i; dma_addr_t pa; - unsigned int gup_flags; struct mm_struct *mm; /* @@ -131,8 +131,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, goto out; } - gup_flags = FOLL_WRITE; - gup_flags |= (writable) ? 0 : FOLL_FORCE; + if (writable) + gup_flags |= FOLL_WRITE; cur_base = addr & PAGE_MASK; ret = 0; @@ -140,8 +140,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = pin_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), - gup_flags | FOLL_LONGTERM, - page_list, NULL); + gup_flags, page_list, NULL); if (ret < 0) goto out; From 129e636fe9837fcfea68bfd368a07548d9880726 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:51 +0100 Subject: [PATCH 186/313] RDMA/siw: remove FOLL_FORCE usage GUP now supports reliable R/O long-term pinning in COW mappings, such that we break COW early. MAP_SHARED VMAs only use the shared zeropage so far in one corner case (DAXFS file with holes), which can be ignored because GUP does not support long-term pinning in fsdax (see check_vma_flags()). Consequently, FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM is no longer required for reliable R/O long-term pinning: FOLL_LONGTERM is sufficient. So stop using FOLL_FORCE, which is really only for ptrace access. Link: https://lkml.kernel.org/r/20221116102659.70287-13-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Bernard Metzler Cc: Leon Romanovsky Signed-off-by: Andrew Morton --- drivers/infiniband/sw/siw/siw_mem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index 61c17db70d65..b2b33dd3b4fa 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -368,7 +368,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) struct mm_struct *mm_s; u64 first_page_va; unsigned long mlock_limit; - unsigned int foll_flags = FOLL_WRITE; + unsigned int foll_flags = FOLL_LONGTERM; int num_pages, num_chunks, i, rv = 0; if (!can_do_mlock()) @@ -391,8 +391,8 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) mmgrab(mm_s); - if (!writable) - foll_flags |= FOLL_FORCE; + if (writable) + foll_flags |= FOLL_WRITE; mmap_read_lock(mm_s); @@ -423,8 +423,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) while (nents) { struct page **plist = &umem->page_chunk[i].plist[got]; - rv = pin_user_pages(first_page_va, nents, - foll_flags | FOLL_LONGTERM, + rv = pin_user_pages(first_page_va, nents, foll_flags, plist, NULL); if (rv < 0) goto out_sem_up; From 3298de2c66e0276abe6b95041fd3605a377523fc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:52 +0100 Subject: [PATCH 187/313] media: videobuf-dma-sg: remove FOLL_FORCE usage GUP now supports reliable R/O long-term pinning in COW mappings, such that we break COW early. MAP_SHARED VMAs only use the shared zeropage so far in one corner case (DAXFS file with holes), which can be ignored because GUP does not support long-term pinning in fsdax (see check_vma_flags()). Consequently, FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM is no longer required for reliable R/O long-term pinning: FOLL_LONGTERM is sufficient. So stop using FOLL_FORCE, which is really only for ptrace access. Link: https://lkml.kernel.org/r/20221116102659.70287-14-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Daniel Vetter Acked-by: Hans Verkuil Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton --- drivers/media/v4l2-core/videobuf-dma-sg.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index f75e5eedeee0..234e9f647c96 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -151,17 +151,16 @@ static void videobuf_dma_init(struct videobuf_dmabuf *dma) static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, int direction, unsigned long data, unsigned long size) { + unsigned int gup_flags = FOLL_LONGTERM; unsigned long first, last; - int err, rw = 0; - unsigned int flags = FOLL_FORCE; + int err; dma->direction = direction; switch (dma->direction) { case DMA_FROM_DEVICE: - rw = READ; + gup_flags |= FOLL_WRITE; break; case DMA_TO_DEVICE: - rw = WRITE; break; default: BUG(); @@ -177,14 +176,11 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, if (NULL == dma->pages) return -ENOMEM; - if (rw == READ) - flags |= FOLL_WRITE; - dprintk(1, "init user [0x%lx+0x%lx => %lu pages]\n", data, size, dma->nr_pages); - err = pin_user_pages(data & PAGE_MASK, dma->nr_pages, - flags | FOLL_LONGTERM, dma->pages, NULL); + err = pin_user_pages(data & PAGE_MASK, dma->nr_pages, gup_flags, + dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; From 7d96eb6a9164607df09c9589ed3ba9ef4e9cc2a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:53 +0100 Subject: [PATCH 188/313] drm/etnaviv: remove FOLL_FORCE usage GUP now supports reliable R/O long-term pinning in COW mappings, such that we break COW early. MAP_SHARED VMAs only use the shared zeropage so far in one corner case (DAXFS file with holes), which can be ignored because GUP does not support long-term pinning in fsdax (see check_vma_flags()). commit cd5297b0855f ("drm/etnaviv: Use FOLL_FORCE for userptr") documents that FOLL_FORCE | FOLL_WRITE was really only used for reliable R/O pinning. Consequently, FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM is no longer required for reliable R/O long-term pinning: FOLL_LONGTERM is sufficient. So stop using FOLL_FORCE, which is really only for ptrace access. Link: https://lkml.kernel.org/r/20221116102659.70287-15-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Daniel Vetter Cc: Lucas Stach Cc: Russell King Cc: Christian Gmeiner Cc: David Airlie Signed-off-by: Andrew Morton --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index cc386f8a7116..efe2240945d0 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -638,6 +638,7 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj) struct page **pvec = NULL; struct etnaviv_gem_userptr *userptr = &etnaviv_obj->userptr; int ret, pinned = 0, npages = etnaviv_obj->base.size >> PAGE_SHIFT; + unsigned int gup_flags = FOLL_LONGTERM; might_lock_read(¤t->mm->mmap_lock); @@ -648,14 +649,15 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj) if (!pvec) return -ENOMEM; + if (!userptr->ro) + gup_flags |= FOLL_WRITE; + do { unsigned num_pages = npages - pinned; uint64_t ptr = userptr->ptr + pinned * PAGE_SIZE; struct page **pages = pvec + pinned; - ret = pin_user_pages_fast(ptr, num_pages, - FOLL_WRITE | FOLL_FORCE | FOLL_LONGTERM, - pages); + ret = pin_user_pages_fast(ptr, num_pages, gup_flags, pages); if (ret < 0) { unpin_user_pages(pvec, pinned); kvfree(pvec); From 70b96f24a441e5a7e0853e3893edd9dc58b67996 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:54 +0100 Subject: [PATCH 189/313] media: pci/ivtv: remove FOLL_FORCE usage FOLL_FORCE is really only for ptrace access. R/O pinning a page is supposed to fail if the VMA misses proper access permissions (no VM_READ). Let's just remove FOLL_FORCE usage here; there would have to be a pretty good reason to allow arbitrary drivers to R/O pin pages in a PROT_NONE VMA. Most probably, FOLL_FORCE usage is just some legacy leftover. Link: https://lkml.kernel.org/r/20221116102659.70287-16-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Hans Verkuil Cc: Andy Walls Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton --- drivers/media/pci/ivtv/ivtv-udma.c | 2 +- drivers/media/pci/ivtv/ivtv-yuv.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/media/pci/ivtv/ivtv-udma.c b/drivers/media/pci/ivtv/ivtv-udma.c index 210be8290f24..99b9f55ca829 100644 --- a/drivers/media/pci/ivtv/ivtv-udma.c +++ b/drivers/media/pci/ivtv/ivtv-udma.c @@ -115,7 +115,7 @@ int ivtv_udma_setup(struct ivtv *itv, unsigned long ivtv_dest_addr, /* Pin user pages for DMA Xfer */ err = pin_user_pages_unlocked(user_dma.uaddr, user_dma.page_count, - dma->map, FOLL_FORCE); + dma->map, 0); if (user_dma.page_count != err) { IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n", diff --git a/drivers/media/pci/ivtv/ivtv-yuv.c b/drivers/media/pci/ivtv/ivtv-yuv.c index 4ba10c34a16a..582146f8d70d 100644 --- a/drivers/media/pci/ivtv/ivtv-yuv.c +++ b/drivers/media/pci/ivtv/ivtv-yuv.c @@ -63,12 +63,11 @@ static int ivtv_yuv_prep_user_dma(struct ivtv *itv, struct ivtv_user_dma *dma, /* Pin user pages for DMA Xfer */ y_pages = pin_user_pages_unlocked(y_dma.uaddr, - y_dma.page_count, &dma->map[0], FOLL_FORCE); + y_dma.page_count, &dma->map[0], 0); uv_pages = 0; /* silence gcc. value is set and consumed only if: */ if (y_pages == y_dma.page_count) { uv_pages = pin_user_pages_unlocked(uv_dma.uaddr, - uv_dma.page_count, &dma->map[y_pages], - FOLL_FORCE); + uv_dma.page_count, &dma->map[y_pages], 0); } if (y_pages != y_dma.page_count || uv_pages != uv_dma.page_count) { From cb78a634f3f7ff743e19fbffcb72d794e4bd7f73 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:55 +0100 Subject: [PATCH 190/313] mm/frame-vector: remove FOLL_FORCE usage FOLL_FORCE is really only for ptrace access. According to commit 707947247e95 ("media: videobuf2-vmalloc: get_userptr: buffers are always writable"), get_vaddr_frames() currently pins all pages writable as a workaround for issues with read-only buffers. FOLL_FORCE, however, seems to be a legacy leftover as it predates commit 707947247e95 ("media: videobuf2-vmalloc: get_userptr: buffers are always writable"). Let's just remove it. Once the read-only buffer issue has been resolved, FOLL_WRITE could again be set depending on the DMA direction. Link: https://lkml.kernel.org/r/20221116102659.70287-17-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Daniel Vetter Acked-by: Hans Verkuil Acked-by: Tomasz Figa Cc: Marek Szyprowski Cc: Marek Szyprowski Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton --- drivers/media/common/videobuf2/frame_vector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/common/videobuf2/frame_vector.c b/drivers/media/common/videobuf2/frame_vector.c index 542dde9d2609..062e98148c53 100644 --- a/drivers/media/common/videobuf2/frame_vector.c +++ b/drivers/media/common/videobuf2/frame_vector.c @@ -50,7 +50,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, start = untagged_addr(start); ret = pin_user_pages_fast(start, nr_frames, - FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM, + FOLL_WRITE | FOLL_LONGTERM, (struct page **)(vec->ptrs)); if (ret > 0) { vec->got_ref = true; From c098ce73c247a0e36d8a6b8cfdc7d05e4bc81bd0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:56 +0100 Subject: [PATCH 191/313] drm/exynos: remove FOLL_FORCE usage FOLL_FORCE is really only for ptrace access. As we unpin the pinned pages using unpin_user_pages_dirty_lock(true), the assumption is that all these pages are writable. FOLL_FORCE in this case seems to be a legacy leftover. Let's just remove it. Link: https://lkml.kernel.org/r/20221116102659.70287-18-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Daniel Vetter Cc: Inki Dae Cc: Seung-Woo Kim Cc: Kyungmin Park Cc: David Airlie Cc: Krzysztof Kozlowski Signed-off-by: Andrew Morton --- drivers/gpu/drm/exynos/exynos_drm_g2d.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 471fd6c8135f..e19c2ceb3759 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -477,7 +477,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct g2d_data *g2d, } ret = pin_user_pages_fast(start, npages, - FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM, + FOLL_WRITE | FOLL_LONGTERM, g2d_userptr->pages); if (ret != npages) { DRM_DEV_ERROR(g2d->dev, From 20ea7783236c374ddb7f201132a5fb9624563a77 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:57 +0100 Subject: [PATCH 192/313] RDMA/hw/qib/qib_user_pages: remove FOLL_FORCE usage FOLL_FORCE is really only for ptrace access. As we unpin the pinned pages using unpin_user_pages_dirty_lock(true), the assumption is that all these pages are writable. FOLL_FORCE in this case seems to be a legacy leftover. Let's just remove it. Link: https://lkml.kernel.org/r/20221116102659.70287-19-david@redhat.com Signed-off-by: David Hildenbrand Cc: Dennis Dalessandro Cc: Jason Gunthorpe Cc: Leon Romanovsky Signed-off-by: Andrew Morton --- drivers/infiniband/hw/qib/qib_user_pages.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index f4b5f05058e4..f693bc753b6b 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -110,7 +110,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, for (got = 0; got < num_pages; got += ret) { ret = pin_user_pages(start_page + got * PAGE_SIZE, num_pages - got, - FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE, + FOLL_LONGTERM | FOLL_WRITE, p + got, NULL); if (ret < 0) { mmap_read_unlock(current->mm); From 052d9b0f7ae1200b4a0783cf934ee4a987d37fd7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 16 Nov 2022 11:26:58 +0100 Subject: [PATCH 193/313] habanalabs: remove FOLL_FORCE usage FOLL_FORCE is really only for ptrace access. As we unpin the pinned pages using unpin_user_pages_dirty_lock(true), the assumption is that all these pages are writable. FOLL_FORCE in this case seems to be due to copy-and-past from other drivers. Let's just remove it. Link: https://lkml.kernel.org/r/20221116102659.70287-20-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Oded Gabbay Cc: Oded Gabbay Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- drivers/misc/habanalabs/common/memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index ef28f3b37b93..e35cca96bbef 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -2312,8 +2312,7 @@ static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size, if (!userptr->pages) return -ENOMEM; - rc = pin_user_pages_fast(start, npages, - FOLL_FORCE | FOLL_WRITE | FOLL_LONGTERM, + rc = pin_user_pages_fast(start, npages, FOLL_WRITE | FOLL_LONGTERM, userptr->pages); if (rc != npages) { From f347454d034184b4f0a2caf6e14daf7848cea01c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 31 Oct 2022 16:25:24 +0100 Subject: [PATCH 194/313] mm/gup: disallow FOLL_FORCE|FOLL_WRITE on hugetlb mappings hugetlb does not support fake write-faults (write faults without write permissions). However, we are currently able to trigger a FAULT_FLAG_WRITE fault on a VMA without VM_WRITE. If we'd ever want to support FOLL_FORCE|FOLL_WRITE, we'd have to teach hugetlb to: (1) Leave the page mapped R/O after the fake write-fault, like maybe_mkwrite() does. (2) Allow writing to an exclusive anon page that's mapped R/O when FOLL_FORCE is set, like can_follow_write_pte(). E.g., __follow_hugetlb_must_fault() needs adjustment. For now, it's not clear if that added complexity is really required. History tolds us that FOLL_FORCE is dangerous and that we better limit its use to a bare minimum. -------------------------------------------------------------------------- #include #include #include #include #include #include #include #include int main(int argc, char **argv) { char *map; int mem_fd; map = mmap(NULL, 2 * 1024 * 1024u, PROT_READ, MAP_PRIVATE|MAP_ANON|MAP_HUGETLB|MAP_HUGE_2MB, -1, 0); if (map == MAP_FAILED) { fprintf(stderr, "mmap() failed: %d\n", errno); return 1; } mem_fd = open("/proc/self/mem", O_RDWR); if (mem_fd < 0) { fprintf(stderr, "open(/proc/self/mem) failed: %d\n", errno); return 1; } if (pwrite(mem_fd, "0", 1, (uintptr_t) map) == 1) { fprintf(stderr, "write() succeeded, which is unexpected\n"); return 1; } printf("write() failed as expected: %d\n", errno); return 0; } -------------------------------------------------------------------------- Fortunately, we have a sanity check in hugetlb_wp() in place ever since commit 1d8d14641fd9 ("mm/hugetlb: support write-faults in shared mappings"), that bails out instead of silently mapping a page writable in a !PROT_WRITE VMA. Consequently, above reproducer triggers a warning, similar to the one reported by szsbot: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 3612 at mm/hugetlb.c:5313 hugetlb_wp+0x20a/0x1af0 mm/hugetlb.c:5313 Modules linked in: CPU: 1 PID: 3612 Comm: syz-executor250 Not tainted 6.1.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/11/2022 RIP: 0010:hugetlb_wp+0x20a/0x1af0 mm/hugetlb.c:5313 Code: ea 03 80 3c 02 00 0f 85 31 14 00 00 49 8b 5f 20 31 ff 48 89 dd 83 e5 02 48 89 ee e8 70 ab b7 ff 48 85 ed 75 5b e8 76 ae b7 ff <0f> 0b 41 bd 40 00 00 00 e8 69 ae b7 ff 48 b8 00 00 00 00 00 fc ff RSP: 0018:ffffc90003caf620 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000008640070 RCX: 0000000000000000 RDX: ffff88807b963a80 RSI: ffffffff81c4ed2a RDI: 0000000000000007 RBP: 0000000000000000 R08: 0000000000000007 R09: 0000000000000000 R10: 0000000000000000 R11: 000000000008c07e R12: ffff888023805800 R13: 0000000000000000 R14: ffffffff91217f38 R15: ffff88801d4b0360 FS: 0000555555bba300(0000) GS:ffff8880b9b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fff7a47a1b8 CR3: 000000002378d000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: hugetlb_no_page mm/hugetlb.c:5755 [inline] hugetlb_fault+0x19cc/0x2060 mm/hugetlb.c:5874 follow_hugetlb_page+0x3f3/0x1850 mm/hugetlb.c:6301 __get_user_pages+0x2cb/0xf10 mm/gup.c:1202 __get_user_pages_locked mm/gup.c:1434 [inline] __get_user_pages_remote+0x18f/0x830 mm/gup.c:2187 get_user_pages_remote+0x84/0xc0 mm/gup.c:2260 __access_remote_vm+0x287/0x6b0 mm/memory.c:5517 ptrace_access_vm+0x181/0x1d0 kernel/ptrace.c:61 generic_ptrace_pokedata kernel/ptrace.c:1323 [inline] ptrace_request+0xb46/0x10c0 kernel/ptrace.c:1046 arch_ptrace+0x36/0x510 arch/x86/kernel/ptrace.c:828 __do_sys_ptrace kernel/ptrace.c:1296 [inline] __se_sys_ptrace kernel/ptrace.c:1269 [inline] __x64_sys_ptrace+0x178/0x2a0 kernel/ptrace.c:1269 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [...] So let's silence that warning by teaching GUP code that FOLL_FORCE -- so far -- does not apply to hugetlb. Note that FOLL_FORCE for read-access seems to be working as expected. The assumption is that this has been broken forever, only ever since above commit, we actually detect the wrong handling and WARN_ON_ONCE(). I assume this has been broken at least since 2014, when mm/gup.c came to life. I failed to come up with a suitable Fixes tag quickly. Link: https://lkml.kernel.org/r/20221031152524.173644-1-david@redhat.com Fixes: 1d8d14641fd9 ("mm/hugetlb: support write-faults in shared mappings") Signed-off-by: David Hildenbrand Reported-by: Cc: Mike Kravetz Cc: Peter Xu Cc: John Hubbard Cc: Jason Gunthorpe Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index 39c84a200f06..90ae44f24870 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1009,6 +1009,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; + /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ + if (is_vm_hugetlb_page(vma)) + return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could From 931b6a8b36a2de3985eca27e758900e70cd99779 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Tue, 15 Nov 2022 18:38:08 -0700 Subject: [PATCH 195/313] mm: multi-gen LRU: remove NULL checks on NODE_DATA() NODE_DATA() is preallocated for all possible nodes after commit 09f49dca570a ("mm: handle uninitialized numa nodes gracefully"). Checking its return value against NULL is now unnecessary. Link: https://lkml.kernel.org/r/20221116013808.3995280-2-yuzhao@google.com Signed-off-by: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 82f32c929b11..805fa51b175c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3166,7 +3166,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) if (memcg) { struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; - /* for hotadd_new_pgdat() */ + /* see the comment in mem_cgroup_lruvec() */ if (!lruvec->pgdat) lruvec->pgdat = pgdat; @@ -3175,7 +3175,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) #endif VM_WARN_ON_ONCE(!mem_cgroup_disabled()); - return pgdat ? &pgdat->__lruvec : NULL; + return &pgdat->__lruvec; } static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) @@ -3239,9 +3239,6 @@ void lru_gen_add_mm(struct mm_struct *mm) for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); - if (!lruvec) - continue; - /* the first addition since the last iteration */ if (lruvec->mm_state.tail == &mm_list->fifo) lruvec->mm_state.tail = &mm->lru_gen.list; @@ -3271,9 +3268,6 @@ void lru_gen_del_mm(struct mm_struct *mm) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - if (!lruvec) - continue; - /* where the last iteration ended (exclusive) */ if (lruvec->mm_state.tail == &mm->lru_gen.list) lruvec->mm_state.tail = lruvec->mm_state.tail->next; @@ -5348,9 +5342,6 @@ static void lru_gen_change_state(bool enabled) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - if (!lruvec) - continue; - spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); From 4c74b65f478dc9353780a6be17fc82f1b06cea80 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 16 Nov 2022 09:23:45 +0800 Subject: [PATCH 196/313] mm/migrate.c: stop using 0 as NULL pointer mm/migrate.c:1198:24: warning: Using plain integer as NULL pointer Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3080 Link: https://lkml.kernel.org/r/20221116012345.84870-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 4aa3b6d4f67c..3be90351ad1d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1180,7 +1180,7 @@ static int unmap_and_move(new_page_t get_new_page, return -ENOMEM; dst = page_folio(newpage); - dst->private = 0; + dst->private = NULL; rc = __unmap_and_move(src, dst, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(&dst->page, reason); From 47939359add5242d27ee6a30e8bcb0cef15ba45c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 17 Nov 2022 23:13:26 +0900 Subject: [PATCH 197/313] zram: remove unused stats fields We don't show num_reads and num_writes since we removed corresponding sysfs nodes in 2017. Block layer stats are exposed via /sys/block/zramX/stat file. However, we still increment those atomic vars and store them in zram stats. Remove leftovers. Link: https://lkml.kernel.org/r/20221117141326.1105181-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 -- drivers/block/zram/zram_drv.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9d33801e8ba8..e290d6d97047 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1981,11 +1981,9 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, int ret; if (!op_is_write(op)) { - atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset, bio); flush_dcache_page(bvec->bv_page); } else { - atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset, bio); } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 473325415a74..c5254626f051 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -76,8 +76,6 @@ struct zram_table_entry { struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ - atomic64_t num_reads; /* failed + successful */ - atomic64_t num_writes; /* --do-- */ atomic64_t failed_reads; /* can happen when memory is too low */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ From 91a99f1d1248bdde674b66e20ac472ec76f6a202 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 17 Nov 2022 16:29:15 -0500 Subject: [PATCH 198/313] selftests/vm: use memfd for hugepage-mmap test This test was overlooked with a hard-coded mntpoint path in test when we're removing the hugetlb mntpoint in commit 0796c7b8be84. Fix it up so the test can keep running. Link: https://lkml.kernel.org/r/Y3aojfUC2nSwbCzB@x1n Fixes: 0796c7b8be84 ("selftests/vm: drop mnt point for hugetlb in run_vmtests.sh") Signed-off-by: Peter Xu Reported-by: Joel Savitz Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hugepage-mmap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c index 93f9e7b81331..955ef87f382c 100644 --- a/tools/testing/selftests/vm/hugepage-mmap.c +++ b/tools/testing/selftests/vm/hugepage-mmap.c @@ -16,14 +16,13 @@ * range. * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. */ - +#define _GNU_SOURCE #include #include #include #include #include -#define FILE_NAME "huge/hugepagefile" #define LENGTH (256UL*1024*1024) #define PROTECTION (PROT_READ | PROT_WRITE) @@ -67,16 +66,16 @@ int main(void) void *addr; int fd, ret; - fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); + fd = memfd_create("hugepage-mmap", MFD_HUGETLB); if (fd < 0) { - perror("Open failed"); + perror("memfd_create() failed"); exit(1); } addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); if (addr == MAP_FAILED) { perror("mmap"); - unlink(FILE_NAME); + close(fd); exit(1); } @@ -87,7 +86,6 @@ int main(void) munmap(addr, LENGTH); close(fd); - unlink(FILE_NAME); return ret; } From c3e58a70425ac6ddaae1529c8146e88b4f7252bb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 18 Nov 2022 10:17:13 +0000 Subject: [PATCH 199/313] mm/page_alloc: always remove pages from temporary list Patch series "Leave IRQs enabled for per-cpu page allocations", v3. This patch (of 2): free_unref_page_list() has neglected to remove pages properly from the list of pages to free since forever. It works by coincidence because list_add happened to do the right thing adding the pages to just the PCP lists. However, a later patch added pages to either the PCP list or the zone list but only properly deleted the page from the list in one path leading to list corruption and a subsequent failure. As a preparation patch, always delete the pages from one list properly before adding to another. On its own, this fixes nothing although it adds a fractional amount of overhead but is critical to the next patch. Link: https://lkml.kernel.org/r/20221118101714.19590-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20221118101714.19590-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Reported-by: Hugh Dickins Reviewed-by: Vlastimil Babka Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c33b6963c2d7..ca889fb53cbb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3553,6 +3553,8 @@ void free_unref_page_list(struct list_head *list) list_for_each_entry_safe(page, next, list, lru) { struct zone *zone = page_zone(page); + list_del(&page->lru); + /* Different zone, different pcp lock. */ if (zone != locked_zone) { if (pcp) From 5749077415994eb02d660b2559b9d8278521e73d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 18 Nov 2022 10:17:14 +0000 Subject: [PATCH 200/313] mm/page_alloc: leave IRQs enabled for per-cpu page allocations The pcp_spin_lock_irqsave protecting the PCP lists is IRQ-safe as a task allocating from the PCP must not re-enter the allocator from IRQ context. In each instance where IRQ-reentrancy is possible, the lock is acquired using pcp_spin_trylock_irqsave() even though IRQs are disabled and re-entrancy is impossible. Demote the lock to pcp_spin_lock avoids an IRQ disable/enable in the common case at the cost of some IRQ allocations taking a slower path. If the PCP lists need to be refilled, the zone lock still needs to disable IRQs but that will only happen on PCP refill and drain. If an IRQ is raised when a PCP allocation is in progress, the trylock will fail and fallback to using the buddy lists directly. Note that this may not be a universal win if an interrupt-intensive workload also allocates heavily from interrupt context and contends heavily on the zone->lock as a result. [mgorman@techsingularity.net: migratetype might be wrong if a PCP was locked] Link: https://lkml.kernel.org/r/20221122131229.5263-2-mgorman@techsingularity.net [yuzhao@google.com: reported lockdep issue on IO completion from softirq] [hughd@google.com: fix list corruption, lock improvements, micro-optimsations] Link: https://lkml.kernel.org/r/20221118101714.19590-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Reviewed-by: Vlastimil Babka Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_alloc.c | 124 +++++++++++++++++++++--------------------------- 1 file changed, 54 insertions(+), 70 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca889fb53cbb..d9d83254c485 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -170,21 +170,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock); _ret; \ }) -#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \ +#define pcpu_spin_trylock(type, member, ptr) \ ({ \ type *_ret; \ pcpu_task_pin(); \ _ret = this_cpu_ptr(ptr); \ - spin_lock_irqsave(&_ret->member, flags); \ - _ret; \ -}) - -#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \ -({ \ - type *_ret; \ - pcpu_task_pin(); \ - _ret = this_cpu_ptr(ptr); \ - if (!spin_trylock_irqsave(&_ret->member, flags)) { \ + if (!spin_trylock(&_ret->member)) { \ pcpu_task_unpin(); \ _ret = NULL; \ } \ @@ -197,27 +188,16 @@ static DEFINE_MUTEX(pcp_batch_high_lock); pcpu_task_unpin(); \ }) -#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \ -({ \ - spin_unlock_irqrestore(&ptr->member, flags); \ - pcpu_task_unpin(); \ -}) - /* struct per_cpu_pages specific helpers. */ #define pcp_spin_lock(ptr) \ pcpu_spin_lock(struct per_cpu_pages, lock, ptr) -#define pcp_spin_lock_irqsave(ptr, flags) \ - pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags) - -#define pcp_spin_trylock_irqsave(ptr, flags) \ - pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags) +#define pcp_spin_trylock(ptr) \ + pcpu_spin_trylock(struct per_cpu_pages, lock, ptr) #define pcp_spin_unlock(ptr) \ pcpu_spin_unlock(lock, ptr) -#define pcp_spin_unlock_irqrestore(ptr, flags) \ - pcpu_spin_unlock_irqrestore(lock, ptr, flags) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -1554,6 +1534,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp, int pindex) { + unsigned long flags; int min_pindex = 0; int max_pindex = NR_PCP_LISTS - 1; unsigned int order; @@ -1569,8 +1550,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Ensure requested pindex is drained first. */ pindex = pindex - 1; - /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); while (count > 0) { @@ -1618,7 +1598,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, } while (count > 0 && !list_empty(list)); } - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void free_one_page(struct zone *zone, @@ -3132,10 +3112,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + unsigned long flags; int i, allocated = 0; - /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, alloc_flags); @@ -3169,7 +3149,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); return allocated; } @@ -3186,16 +3166,9 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { - unsigned long flags; - - /* - * free_pcppages_bulk expects IRQs disabled for zone->lock - * so even though pcp->lock is not intended to be IRQ-safe, - * it's needed in this context. - */ - spin_lock_irqsave(&pcp->lock, flags); + spin_lock(&pcp->lock); free_pcppages_bulk(zone, to_drain, pcp, 0); - spin_unlock_irqrestore(&pcp->lock, flags); + spin_unlock(&pcp->lock); } } #endif @@ -3209,12 +3182,9 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) { - unsigned long flags; - - /* See drain_zone_pages on why this is disabling IRQs */ - spin_lock_irqsave(&pcp->lock, flags); + spin_lock(&pcp->lock); free_pcppages_bulk(zone, pcp->count, pcp, 0); - spin_unlock_irqrestore(&pcp->lock, flags); + spin_unlock(&pcp->lock); } } @@ -3480,7 +3450,6 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, */ void free_unref_page(struct page *page, unsigned int order) { - unsigned long flags; unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; struct zone *zone; @@ -3508,10 +3477,10 @@ void free_unref_page(struct page *page, unsigned int order) zone = page_zone(page); pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { free_unref_page_commit(zone, pcp, page, migratetype, order); - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); } @@ -3523,10 +3492,10 @@ void free_unref_page(struct page *page, unsigned int order) */ void free_unref_page_list(struct list_head *list) { + unsigned long __maybe_unused UP_flags; struct page *page, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - unsigned long flags; int batch_count = 0; int migratetype; @@ -3554,21 +3523,36 @@ void free_unref_page_list(struct list_head *list) struct zone *zone = page_zone(page); list_del(&page->lru); + migratetype = get_pcppage_migratetype(page); /* Different zone, different pcp lock. */ if (zone != locked_zone) { - if (pcp) - pcp_spin_unlock_irqrestore(pcp, flags); + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + } + /* + * trylock is necessary as pages may be getting freed + * from IRQ or SoftIRQ context after an IO completion. + */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); + free_one_page(zone, page, page_to_pfn(page), + 0, migratetype, FPI_NONE); + locked_zone = NULL; + continue; + } locked_zone = zone; - pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); + batch_count = 0; } /* * Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. */ - migratetype = get_pcppage_migratetype(page); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) migratetype = MIGRATE_MOVABLE; @@ -3576,18 +3560,23 @@ void free_unref_page_list(struct list_head *list) free_unref_page_commit(zone, pcp, page, migratetype, 0); /* - * Guard against excessive IRQ disabled times when we get - * a large list of pages to free. + * Guard against excessive lock hold times when freeing + * a large list of pages. Lock will be reacquired if + * necessary on the next iteration. */ if (++batch_count == SWAP_CLUSTER_MAX) { - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); batch_count = 0; - pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); + pcp = NULL; + locked_zone = NULL; } } - if (pcp) - pcp_spin_unlock_irqrestore(pcp, flags); + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + } } /* @@ -3788,15 +3777,11 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct per_cpu_pages *pcp; struct list_head *list; struct page *page; - unsigned long flags; unsigned long __maybe_unused UP_flags; - /* - * spin_trylock may fail due to a parallel drain. In the future, the - * trylock will also protect against IRQ reentrancy. - */ + /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) { pcp_trylock_finish(UP_flags); return NULL; @@ -3810,7 +3795,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp->free_factor >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); @@ -5381,7 +5366,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct page **page_array) { struct page *page; - unsigned long flags; unsigned long __maybe_unused UP_flags; struct zone *zone; struct zoneref *z; @@ -5463,9 +5447,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!zone)) goto failed; - /* Is a parallel drain in progress? */ + /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */ pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) goto failed_irq; @@ -5484,7 +5468,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) { - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); goto failed_irq; } break; @@ -5499,7 +5483,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); From a4bafffb5dc5be6c7a3b77b2de0cbaf6776a3c8b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 22 Nov 2022 13:12:29 +0000 Subject: [PATCH 201/313] mm/page_alloc: simplify locking during free_unref_page_list While freeing a large list, the zone lock will be released and reacquired to avoid long hold times since commit c24ad77d962c ("mm/page_alloc.c: avoid excessive IRQ disabled times in free_unref_page_list()"). As suggested by Vlastimil Babka, the lockrelease/reacquire logic can be simplified by reusing the logic that acquires a different lock when changing zones. Link: https://lkml.kernel.org/r/20221122131229.5263-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Reviewed-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9d83254c485..5ab9dd29ef7e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3525,13 +3525,19 @@ void free_unref_page_list(struct list_head *list) list_del(&page->lru); migratetype = get_pcppage_migratetype(page); - /* Different zone, different pcp lock. */ - if (zone != locked_zone) { + /* + * Either different zone requiring a different pcp lock or + * excessive lock hold times when freeing a large list of + * pages. + */ + if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } + batch_count = 0; + /* * trylock is necessary as pages may be getting freed * from IRQ or SoftIRQ context after an IO completion. @@ -3546,7 +3552,6 @@ void free_unref_page_list(struct list_head *list) continue; } locked_zone = zone; - batch_count = 0; } /* @@ -3558,19 +3563,7 @@ void free_unref_page_list(struct list_head *list) trace_mm_page_free_batched(page); free_unref_page_commit(zone, pcp, page, migratetype, 0); - - /* - * Guard against excessive lock hold times when freeing - * a large list of pages. Lock will be reacquired if - * necessary on the next iteration. - */ - if (++batch_count == SWAP_CLUSTER_MAX) { - pcp_spin_unlock(pcp); - pcp_trylock_finish(UP_flags); - batch_count = 0; - pcp = NULL; - locked_zone = NULL; - } + batch_count++; } if (pcp) { From 6dd8fe86fa84729538d8bed3149faf9c5886bb5b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:52 -0800 Subject: [PATCH 202/313] ext4: convert move_extent_per_page() to use folios Patch series "Removing the try_to_release_page() wrapper", v3. This patchset replaces the remaining calls of try_to_release_page() with the folio equivalent: filemap_release_folio(). This allows us to remove the wrapper. This patch (of 4): Convert move_extent_per_page() to use folios. This change removes 5 calls to compound_head() and is in preparation for the removal of the try_to_release_page() wrapper. Link: https://lkml.kernel.org/r/20221118073055.55694-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221118073055.55694-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/move_extent.c | 52 ++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 044e34cd835c..8dbb87edf24c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -253,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; + struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; @@ -313,6 +314,13 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ + folio[0] = page_folio(pagep[0]); + folio[1] = page_folio(pagep[1]); + + VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); + VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); + VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); + if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -331,10 +339,10 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((page_has_private(pagep[0]) && - !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && - !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; goto drop_data_sem; } @@ -344,19 +352,21 @@ again: block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); - goto unlock_pages; + goto unlock_folios; } data_copy: - *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); + *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); if (*err) - goto unlock_pages; + goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ - if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || - (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { + if ((folio_has_private(folio[0]) && + !filemap_release_folio(folio[0], 0)) || + (folio_has_private(folio[1]) && + !filemap_release_folio(folio[1], 0))) { *err = -EBUSY; - goto unlock_pages; + goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, @@ -369,13 +379,13 @@ data_copy: replaced_size = block_len_in_page << orig_inode->i_blkbits; } else - goto unlock_pages; + goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!page_has_buffers(pagep[0])) - create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); - bh = page_buffers(pagep[0]); + if (!folio_buffers(folio[0])) + create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); + bh = folio_buffers(folio[0]); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { @@ -385,7 +395,7 @@ data_copy: bh = bh->b_this_page; } if (!*err) - *err = block_commit_write(pagep[0], from, from + replaced_size); + *err = block_commit_write(&folio[0]->page, from, from + replaced_size); if (unlikely(*err < 0)) goto repair_branches; @@ -395,11 +405,11 @@ data_copy: *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); -unlock_pages: - unlock_page(pagep[0]); - put_page(pagep[0]); - unlock_page(pagep[1]); - put_page(pagep[1]); +unlock_folios: + folio_unlock(folio[0]); + folio_put(folio[0]); + folio_unlock(folio[1]); + folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && @@ -430,7 +440,7 @@ repair_branches: *err = -EIO; } replaced_count = 0; - goto unlock_pages; + goto unlock_folios; } /** From 64ab3195ea077eaeedc8b382939c3dc5ca56f369 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:53 -0800 Subject: [PATCH 203/313] khugepage: replace try_to_release_page() with filemap_release_folio() Replace some calls with their folio equivalents. This change removes 4 calls to compound_head() and is in preparation for the removal of the try_to_release_page() wrapper. Link: https://lkml.kernel.org/r/20221118073055.55694-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- mm/khugepaged.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0d8f548d9d7e..913b0f489352 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1789,6 +1789,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_set(&xas, start); for (index = start; index < end; index++) { struct page *page = xas_next(&xas); + struct folio *folio; VM_BUG_ON(index != xas.xa_index); if (is_shmem) { @@ -1815,8 +1816,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } if (xa_is_value(page) || !PageUptodate(page)) { - struct folio *folio; - xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, @@ -1904,13 +1903,15 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (page_mapping(page) != mapping) { + folio = page_folio(page); + + if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } - if (!is_shmem && (PageDirty(page) || - PageWriteback(page))) { + if (!is_shmem && (folio_test_dirty(folio) || + folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this * page is dirty because it hasn't been flushed @@ -1920,20 +1921,20 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (isolate_lru_page(page)) { + if (folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) { + if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; - putback_lru_page(page); + folio_putback_lru(folio); goto out_unlock; } - if (page_mapped(page)) - try_to_unmap(page_folio(page), + if (folio_mapped(folio)) + try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); From ac5efa782041670b63a05c36d92d02a80e50bb63 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:54 -0800 Subject: [PATCH 204/313] memory-failure: convert truncate_error_page() to use folio Replace try_to_release_page() with filemap_release_folio(). This change is in preparation for the removal of the try_to_release_page() wrapper. Link: https://lkml.kernel.org/r/20221118073055.55694-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Naoya Horiguchi Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- mm/memory-failure.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 63d8501001c6..2e62940c7bae 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -840,12 +840,13 @@ static int truncate_error_page(struct page *p, unsigned long pfn, int ret = MF_FAILED; if (mapping->a_ops->error_remove_page) { + struct folio *folio = page_folio(p); int err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { pr_info("%#lx: Failed to punch page: %d\n", pfn, err); - } else if (page_has_private(p) && - !try_to_release_page(p, GFP_NOIO)) { + } else if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_NOIO)) { pr_info("%#lx: failed to release buffers\n", pfn); } else { ret = MF_RECOVERED; From 7438899b0b8df16a73d9a5d49b2a345d165adfe8 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Thu, 17 Nov 2022 23:30:55 -0800 Subject: [PATCH 205/313] folio-compat: remove try_to_release_page() There are no more callers of try_to_release_page(), so remove it. This saves 85 bytes of kernel text. Link: https://lkml.kernel.org/r/20221118073055.55694-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - mm/folio-compat.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b33ab86d5dca..2ec0ca1f3d38 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1105,7 +1105,6 @@ void __filemap_remove_folio(struct folio *folio, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); -int try_to_release_page(struct page *page, gfp_t gfp); bool filemap_release_folio(struct folio *folio, gfp_t gfp); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, int whence); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index cbfe51091c39..86933fa8f3e1 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -118,12 +118,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, } EXPORT_SYMBOL(grab_cache_page_write_begin); -int try_to_release_page(struct page *page, gfp_t gfp) -{ - return filemap_release_folio(page_folio(page), gfp); -} -EXPORT_SYMBOL(try_to_release_page); - int isolate_lru_page(struct page *page) { if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) From f6fbb8b23b8155a6f7af1349b4595d0373167636 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 10 Nov 2022 12:35:41 +0100 Subject: [PATCH 206/313] Revert "kmsan: unpoison @tlb in arch_tlb_gather_mmu()" This reverts commit ac801e7e252c5588325e3c983c7d4167fc68c024. The patch in question was picked to -mm from the KMSAN v6 patch series (https://lore.kernel.org/linux-mm/20220905122452.2258262-1-glider@google.com/) and sneaked into mainline despite its removal from the v7 series (https://lore.kernel.org/linux-mm/20220915150417.722975-1-glider@google.com/) Currently KMSAN does not warn about origin chains hitting the maximum depth, so keeping @tlb poisoned won't result in any inconveniences. Link: https://lkml.kernel.org/r/20221110113541.1844156-1-glider@google.com Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Marco Elver Cc: Peter Zijlstra (Intel) Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmu_gather.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index dd1f8ca40cb5..8247553a69c2 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -1,7 +1,6 @@ #include #include #include -#include #include #include #include @@ -299,15 +298,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb) static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { - /* - * struct mmu_gather contains 7 1-bit fields packed into a 32-bit - * unsigned int value. The remaining 25 bits remain uninitialized - * and are never used, but KMSAN updates the origin for them in - * zap_pXX_range() in mm/memory.c, thus creating very long origin - * chains. This is technically correct, but consumes too much memory. - * Unpoisoning the whole structure will prevent creating such chains. - */ - kmsan_unpoison_memory(tlb, sizeof(*tlb)); tlb->mm = mm; tlb->fullmm = fullmm; From 845aad0aa038507c166cdc48fbf2e5d863fe73dc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 18 Nov 2022 21:51:17 -0800 Subject: [PATCH 207/313] maple_tree: allow TEST_MAPLE_TREE only when DEBUG_KERNEL is set Prevent a kconfig warning that is caused by TEST_MAPLE_TREE by adding a "depends on" clause for TEST_MAPLE_TREE since 'select' does not follow any kconfig dependencies. WARNING: unmet direct dependencies detected for DEBUG_MAPLE_TREE Depends on [n]: DEBUG_KERNEL [=n] Selected by [y]: - TEST_MAPLE_TREE [=y] && RUNTIME_TESTING_MENU [=y] Link: https://lkml.kernel.org/r/20221119055117.14094-1-rdunlap@infradead.org Fixes: 120b116208a0 ("maple_tree: reorganize testing to restore module testing") Signed-off-by: Randy Dunlap Reported-by: Geert Uytterhoeven Reported-by: kernel test robot Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 580e453e284e..a0dc28fdc567 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2244,6 +2244,7 @@ config TEST_XARRAY tristate "Test the XArray code at runtime" config TEST_MAPLE_TREE + depends on DEBUG_KERNEL select DEBUG_MAPLE_TREE tristate "Test the Maple Tree code at runtime" From 8e9d5ead865a1a7af74a444d2f00f1ef4539bfba Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:56 -0800 Subject: [PATCH 208/313] mm: add bdi_set_strict_limit() function Patch series "mm/block: add bdi sysfs knobs", v4. At meta network block devices (nbd) are used to implement remote block storage. In testing and during production it has been observed that these network block devices can consume a huge portion of the dirty writeback cache and writeback can take a considerable time. To be able to give stricter limits, I'm proposing the following changes: 1) introduce strictlimit knob Currently the max_ratio knob exists to limit the dirty_memory. However this knob only applies once (dirty_ratio + dirty_background_ratio) / 2 has been reached. With the BDI_CAP_STRICTLIMIT flag, the max_ratio can be applied without reaching that limit. This change exposes that knob. This knob can also be useful for NFS, fuse filesystems and USB devices. 2) Use part of 1000000 internal calculation The max_ratio is based on percentage. With the current machine sizes percentage values can be very high (1% of a 256GB main memory is already 2.5GB). This change uses part of 1000000 instead of percentages for the internal calculations. 3) Introduce two new sysfs knobs: min_bytes and max_bytes. Currently all calculations are based on ratio, but for a user it often more convenient to specify a limit in bytes. The new knobs will not store bytes values, instead they will translate the byte value to a corresponding ratio. As the internal values are now part of 1000, the ratio is closer to the specified value. However the value should be more seen as an approximation as it can fluctuate over time. 3) Introduce two new sysfs knobs: min_ratio_fine and max_ratio_fine. The granularity for the existing sysfs bdi knobs min_ratio and max_ratio is based on percentage values. The new sysfs bdi knobs min_ratio_fine and max_ratio_fine allow to specify the ratio as part of 1 million. This patch (of 20): This adds the bdi_set_strict_limit function to be able to set/unset the BDI_CAP_STRICTLIMIT flag. Link: https://lkml.kernel.org/r/20221119005215.3052436-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20221119005215.3052436-2-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Jens Axboe Cc: Chris Mason Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 439815cc1ab9..9c984ffc8a0a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -104,6 +104,7 @@ static inline unsigned long wb_stat_error(void) int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e9d8d857ecc..3745b886722f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -698,6 +698,21 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) +{ + if (strict_limit > 1) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (strict_limit) + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + else + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + spin_unlock_bh(&bdi_lock); + + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { From 27bbe9d48d4e298864e18b39f091342c68b81637 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:57 -0800 Subject: [PATCH 209/313] mm: add knob /sys/class/bdi//strict_limit Add a new knob to /sys/class/bdi//strict_limit. This new knob allows to set/unset the flag BDI_CAP_STRICTLIMIT in the bdi capabilities. Link: https://lkml.kernel.org/r/20221119005215.3052436-3-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/backing-dev.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c30419a5e119..a0899cce72ef 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -209,11 +209,40 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strict_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int strict_limit; + ssize_t ret; + + ret = kstrtouint(buf, 10, &strict_limit); + if (ret < 0) + return ret; + + ret = bdi_set_strict_limit(bdi, strict_limit); + if (!ret) + ret = count; + + return ret; +} + +static ssize_t strict_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strict_limit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); From 16b837eb84e6948f92411eb32e97a05f89733ddc Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:58 -0800 Subject: [PATCH 210/313] mm: document /sys/class/bdi//strict_limit knob This documents the new /sys/class/bdi//strict_limit knob. Link: https://lkml.kernel.org/r/20221119005215.3052436-4-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-class-bdi | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index 6d2a2fc189dd..68b5d4018c2f 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -55,6 +55,17 @@ Description: mount that is prone to get stuck, or a FUSE mount which cannot be trusted to play fair. + (read-write) +What: /sys/class/bdi//strict_limit +Date: October 2022 +Contact: Stefan Roesch +Description: + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. + (read-write) What: /sys/class/bdi//stable_pages_required Date: January 2008 From ae82291e9ca47c3d6da6b77a00f427754aca413e Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:51:59 -0800 Subject: [PATCH 211/313] mm: use part per 1000000 for bdi ratios To get finer granularity for ratio calculations use part per million instead of percentiles. This is especially important if we want to automatically convert byte values to ratios. Otherwise the values that are actually used can be quite different. This is also important for machines with more main memory (1% of 256GB is already 2.5GB). Link: https://lkml.kernel.org/r/20221119005215.3052436-5-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 3 +++ mm/backing-dev.c | 6 +++--- mm/page-writeback.c | 15 +++++++++------ 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 9c984ffc8a0a..1b50c028e5ad 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -102,6 +102,9 @@ static inline unsigned long wb_stat_error(void) #endif } +/* BDI ratio is expressed as part per 1000000 for finer granularity. */ +#define BDI_RATIO_SCALE 10000 + int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a0899cce72ef..90fa517123dc 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -178,7 +178,7 @@ static ssize_t min_ratio_store(struct device *dev, return ret; } -BDI_SHOW(min_ratio, bdi->min_ratio) +BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -197,7 +197,7 @@ static ssize_t max_ratio_store(struct device *dev, return ret; } -BDI_SHOW(max_ratio, bdi->max_ratio) +BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, @@ -809,7 +809,7 @@ int bdi_init(struct backing_dev_info *bdi) kref_init(&bdi->refcnt); bdi->min_ratio = 0; - bdi->max_ratio = 100; + bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3745b886722f..dd98b2654302 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -197,7 +197,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, min *= this_bw; min = div64_ul(min, tot_bw); } - if (max < 100) { + if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; max = div64_ul(max, tot_bw); } @@ -655,6 +655,8 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) unsigned int delta; int ret = 0; + min_ratio *= BDI_RATIO_SCALE; + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; @@ -665,7 +667,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) bdi->min_ratio = min_ratio; } else { delta = min_ratio - bdi->min_ratio; - if (bdi_min_ratio + delta < 100) { + if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { bdi_min_ratio += delta; bdi->min_ratio = min_ratio; } else { @@ -684,6 +686,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) if (max_ratio > 100) return -EINVAL; + max_ratio *= BDI_RATIO_SCALE; spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { @@ -775,15 +778,15 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); - wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); - wb_thresh += (thresh * wb_min_ratio) / 100; - if (wb_thresh > (thresh * wb_max_ratio) / 100) - wb_thresh = thresh * wb_max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); + if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE)) + wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); return wb_thresh; } From 00df7d51263b46ed93f7572e2d09579746f7b1eb Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:00 -0800 Subject: [PATCH 212/313] mm: add bdi_get_max_bytes() function This adds a function to return the specified value for max_bytes. It converts the stored max_ratio of the bdi to the corresponding bytes value. It introduces the bdi_get_bytes helper function to do the conversion. This is an approximation as it is based on the value that is returned by global_dirty_limits(), which can change. The helper function will also be used by the min_bytes bdi knob. Link: https://lkml.kernel.org/r/20221119005215.3052436-6-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1b50c028e5ad..473686c32775 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -105,6 +105,7 @@ static inline unsigned long wb_stat_error(void) /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 +u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dd98b2654302..719404e0d03d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -650,6 +650,18 @@ void wb_domain_exit(struct wb_domain *dom) */ static unsigned int bdi_min_ratio; +static u64 bdi_get_bytes(unsigned int ratio) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + u64 bytes; + + global_dirty_limits(&background_thresh, &dirty_thresh); + bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; + + return bytes; +} + int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { unsigned int delta; @@ -701,6 +713,11 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); +u64 bdi_get_max_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->max_ratio); +} + int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) { if (strict_limit > 1) From efc3e6ad53ea14225b434fddca261c9a1c56c707 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:01 -0800 Subject: [PATCH 213/313] mm: split off __bdi_set_max_ratio() function This splits off __bdi_set_max_ratio() from bdi_set_max_ratio(). __bdi_set_max_ratio() will also be called from bdi_set_max_bytes(), which will be introduced in the next patch. Link: https://lkml.kernel.org/r/20221119005215.3052436-7-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/page-writeback.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 719404e0d03d..e74ef596dc27 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -692,14 +692,10 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) return ret; } -int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; - if (max_ratio > 100) - return -EINVAL; - max_ratio *= BDI_RATIO_SCALE; - spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; @@ -711,6 +707,14 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) return ret; } + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + if (max_ratio > 100) + return -EINVAL; + + return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); +} EXPORT_SYMBOL(bdi_set_max_ratio); u64 bdi_get_max_bytes(struct backing_dev_info *bdi) From 1bf27e98d26d1e62166a456ef17460be085cbe0b Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:02 -0800 Subject: [PATCH 214/313] mm: add bdi_set_max_bytes() function This introduces the bdi_set_max_bytes() function. The max_bytes function does not store the max_bytes value. Instead it converts the max_bytes value into the corresponding ratio value. Link: https://lkml.kernel.org/r/20221119005215.3052436-8-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 473686c32775..ea6c993433d5 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -108,6 +108,7 @@ static inline unsigned long wb_stat_error(void) u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e74ef596dc27..20ae9adeb22f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -650,6 +651,28 @@ void wb_domain_exit(struct wb_domain *dom) */ static unsigned int bdi_min_ratio; +static int bdi_check_pages_limit(unsigned long pages) +{ + unsigned long max_dirty_pages = global_dirtyable_memory(); + + if (pages > max_dirty_pages) + return -EINVAL; + + return 0; +} + +static unsigned long bdi_ratio_from_pages(unsigned long pages) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long ratio; + + global_dirty_limits(&background_thresh, &dirty_thresh); + ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); + + return ratio; +} + static u64 bdi_get_bytes(unsigned int ratio) { unsigned long background_thresh; @@ -722,6 +745,20 @@ u64 bdi_get_max_bytes(struct backing_dev_info *bdi) return bdi_get_bytes(bdi->max_ratio); } +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) +{ + int ret; + unsigned long pages = max_bytes >> PAGE_SHIFT; + unsigned long max_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + max_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_max_ratio(bdi, max_ratio); +} + int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) { if (strict_limit > 1) From c56e049a5e401a177c7c9b39a3bcc973ff5cec0b Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:03 -0800 Subject: [PATCH 215/313] mm: add knob /sys/class/bdi//max_bytes This adds the new knob max_bytes to specify a dirty memory limit for the corresponding bdi. The specified bytes value is converted to a ratio. Link: https://lkml.kernel.org/r/20221119005215.3052436-9-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/backing-dev.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 90fa517123dc..95d3229fc81f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -199,6 +199,34 @@ static ssize_t max_ratio_store(struct device *dev, } BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) +static ssize_t max_bytes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); +} + +static ssize_t max_bytes_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + u64 bytes; + ssize_t ret; + + ret = kstrtoull(buf, 10, &bytes); + if (ret < 0) + return ret; + + ret = bdi_set_max_bytes(bdi, bytes); + if (!ret) + ret = count; + + return ret; +} +DEVICE_ATTR_RW(max_bytes); + static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -241,6 +269,7 @@ static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, + &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, NULL, From c354d9268d7825eb8643f658c5091079d4f11a4a Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:04 -0800 Subject: [PATCH 216/313] mm: document /sys/class/bdi//max_bytes knob This documents the new /sys/class/bdi//max_bytes knob. Link: https://lkml.kernel.org/r/20221119005215.3052436-10-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-class-bdi | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index 68b5d4018c2f..580f723de049 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -56,6 +56,20 @@ Description: be trusted to play fair. (read-write) + +What: /sys/class/bdi//max_bytes +Date: October 2022 +Contact: Stefan Roesch +Description: + Allows limiting a particular device to use not more than the + given 'max_bytes' of the write-back cache. This is useful in + situations where we want to avoid one device taking all or + most of the write-back cache. For example in case of an NFS + mount that is prone to get stuck, a FUSE mount which cannot be + trusted to play fair, or a nbd device. + + (read-write) + What: /sys/class/bdi//strict_limit Date: October 2022 Contact: Stefan Roesch From 712c00d66a342a3ed375df41c3df7d3d2abad2c0 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:05 -0800 Subject: [PATCH 217/313] mm: add bdi_get_min_bytes() function This adds a function to return the specified value for min_bytes. It converts the stored min_ratio of the bdi to the corresponding bytes value. This is an approximation as it is based on the value that is returned by global_dirty_limits(), which can change. The returned value can be different than the value when the min_bytes value was set. Link: https://lkml.kernel.org/r/20221119005215.3052436-11-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ea6c993433d5..8e04567727e6 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -105,6 +105,7 @@ static inline unsigned long wb_stat_error(void) /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 +u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 20ae9adeb22f..c47824464f4c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -740,6 +740,11 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); +u64 bdi_get_min_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->min_ratio); +} + u64 bdi_get_max_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->max_ratio); From 8021fb3232f265b81c7e4e7aba15bc3a04ff1fd3 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:06 -0800 Subject: [PATCH 218/313] mm: split off __bdi_set_min_ratio() function This splits off the __bdi_set_min_ratio() function from the bdi_set_min_ratio() function. The __bdi_set_min_ratio() function will also be called from the bdi_set_min_bytes() function, which will be introduced in the next patch. Link: https://lkml.kernel.org/r/20221119005215.3052436-12-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/page-writeback.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c47824464f4c..cefee7210d83 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -685,7 +685,7 @@ static u64 bdi_get_bytes(unsigned int ratio) return bytes; } -int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { unsigned int delta; int ret = 0; @@ -731,6 +731,11 @@ static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ra return ret; } +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); +} + int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { if (max_ratio > 100) From 803c98050569850be5fd51a2025c67622de887d9 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:07 -0800 Subject: [PATCH 219/313] mm: add bdi_set_min_bytes() function This introduces the bdi_set_min_bytes() function. The min_bytes function does not store the min_bytes value. Instead it converts the min_bytes value into the corresponding ratio value. Link: https://lkml.kernel.org/r/20221119005215.3052436-13-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8e04567727e6..572669758c7f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index cefee7210d83..3d151e7a9b6c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -750,6 +750,20 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi) return bdi_get_bytes(bdi->min_ratio); } +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) +{ + int ret; + unsigned long pages = min_bytes >> PAGE_SHIFT; + unsigned long min_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + min_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_min_ratio(bdi, min_ratio); +} + u64 bdi_get_max_bytes(struct backing_dev_info *bdi) { return bdi_get_bytes(bdi->max_ratio); From 9c84819bd64ec15cb15d041c45ebe4725e9d4f3b Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:08 -0800 Subject: [PATCH 220/313] mm: add /sys/class/bdi//min_bytes knob bdi has two existing knobs to limit the amount of dirty memory: min_ratio and max_ratio. However the granularity of the knobs is limited and often it is more convenient to specify limits in terms of bytes. This change adds the min_bytes knob. It does not store the min_bytes value, instead it converts the max_bytes value to a ratio. The value is therefore more an approximation than an absolute value. It also maintains the sum over all the bdi min_ratio values stored in the variable bdi_min_ratio. Link: https://lkml.kernel.org/r/20221119005215.3052436-14-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/backing-dev.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 95d3229fc81f..3fab79061ade 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -199,6 +199,34 @@ static ssize_t max_ratio_store(struct device *dev, } BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) +static ssize_t min_bytes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); +} + +static ssize_t min_bytes_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + u64 bytes; + ssize_t ret; + + ret = kstrtoull(buf, 10, &bytes); + if (ret < 0) + return ret; + + ret = bdi_set_min_bytes(bdi, bytes); + if (!ret) + ret = count; + + return ret; +} +DEVICE_ATTR_RW(min_bytes); + static ssize_t max_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -269,6 +297,7 @@ static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, + &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, &dev_attr_strict_limit.attr, From 9c832a8d571784c998d0f9f5df480c62f7f3064c Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:09 -0800 Subject: [PATCH 221/313] mm: document /sys/class/bdi//min_bytes knob This documents the new /sys/class/bdi//min_bytes knob. [akpm@linux-foundation.org: fix htmldocs warnings] Link: https://lkml.kernel.org/r/20221119005215.3052436-15-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-class-bdi | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index 580f723de049..a9a2f588a2b7 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -57,6 +57,21 @@ Description: (read-write) +What: /sys/class/bdi//min_bytes +Date: October 2022 +Contact: Stefan Roesch +Description: + Under normal circumstances each device is given a part of the + total write-back cache that relates to its current average + writeout speed in relation to the other devices. + + The 'min_bytes' parameter allows assigning a minimum + percentage of the write-back cache to a particular device + expressed in bytes. + For example, this is useful for providing a minimum QoS. + + (read-write) + What: /sys/class/bdi//max_bytes Date: October 2022 Contact: Stefan Roesch From 4e230b406eda9bdf7f8a71e2cc3df18a824abcb0 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:10 -0800 Subject: [PATCH 222/313] mm: add bdi_set_max_ratio_no_scale() function This introduces bdi_set_max_ratio_no_scale(). It uses the max granularity for the ratio. This function by the new sysfs knob max_ratio_fine. Link: https://lkml.kernel.org/r/20221119005215.3052436-16-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 572669758c7f..d9acbb22ff25 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3d151e7a9b6c..f44ade72966c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -719,6 +719,9 @@ static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ra { int ret = 0; + if (max_ratio > 100 * BDI_RATIO_SCALE) + return -EINVAL; + spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; @@ -731,6 +734,11 @@ static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ra return ret; } +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio); +} + int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); @@ -738,9 +746,6 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { - if (max_ratio > 100) - return -EINVAL; - return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); } EXPORT_SYMBOL(bdi_set_max_ratio); From bca52dcbadc583f4db6435599c44a79f97293f06 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:11 -0800 Subject: [PATCH 223/313] mm: add /sys/class/bdi//max_ratio_fine knob This adds the max_ratio_fine knob. The knob specifies the values not based on 1 of 100, but instead 1 per million. Link: https://lkml.kernel.org/r/20221119005215.3052436-17-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/backing-dev.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3fab79061ade..94c2382367cf 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -199,6 +199,25 @@ static ssize_t max_ratio_store(struct device *dev, } BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) +static ssize_t max_ratio_fine_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_max_ratio_no_scale(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(max_ratio_fine, bdi->max_ratio) + static ssize_t min_bytes_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -297,6 +316,7 @@ static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, + &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, From 54790f30fea74247e2f38b4a632ee3dc2fe42d86 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:12 -0800 Subject: [PATCH 224/313] mm: document /sys/class/bdi//max_ratio_fine knob This documents the new /sys/class/bdi//max_ratio_fine knob. [akpm@linux-foundation.org: fix htmldocs warnings] Link: https://lkml.kernel.org/r/20221119005215.3052436-18-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-class-bdi | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index a9a2f588a2b7..e9c584db316f 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -57,6 +57,19 @@ Description: (read-write) +What: /sys/class/bdi//max_ratio_fine +Date: November 2022 +Contact: Stefan Roesch +Description: + Allows limiting a particular device to use not more than the + given value of the write-back cache. The value is given as part + of 1 million. This is useful in situations where we want to avoid + one device taking all or most of the write-back cache. For example + in case of an NFS mount that is prone to get stuck, or a FUSE mount + which cannot be trusted to play fair. + + (read-write) + What: /sys/class/bdi//min_bytes Date: October 2022 Contact: Stefan Roesch From 2c44af4f2aaa260199f218f11920c406e688693c Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:13 -0800 Subject: [PATCH 225/313] mm: add bdi_set_min_ratio_no_scale() function This introduces bdi_set_min_ratio_no_scale(). It uses the max granularity for the ratio. This function by the new sysfs knob min_ratio_fine. Link: https://lkml.kernel.org/r/20221119005215.3052436-19-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + mm/page-writeback.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d9acbb22ff25..fbad4fcd408e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -109,6 +109,7 @@ u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f44ade72966c..ad608ef2a243 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -690,6 +690,8 @@ static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ra unsigned int delta; int ret = 0; + if (min_ratio > 100 * BDI_RATIO_SCALE) + return -EINVAL; min_ratio *= BDI_RATIO_SCALE; spin_lock_bh(&bdi_lock); @@ -734,6 +736,11 @@ static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ra return ret; } +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio); +} + int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) { return __bdi_set_max_ratio(bdi, max_ratio); From ad3e6dabf6f7d9ffd68eb711191ef16cdbdd25f0 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:14 -0800 Subject: [PATCH 226/313] mm: add /sys/class/bdi//min_ratio_fine knob This adds the min_ratio_fine knob. The knob specifies the values not based on 1 of 100, but instead 1 per million. Link: https://lkml.kernel.org/r/20221119005215.3052436-20-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/backing-dev.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 94c2382367cf..a53b9360b72e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -180,6 +180,25 @@ static ssize_t min_ratio_store(struct device *dev, } BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) +static ssize_t min_ratio_fine_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_min_ratio_no_scale(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(min_ratio_fine, bdi->min_ratio) + static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -315,6 +334,7 @@ static DEVICE_ATTR_RW(strict_limit); static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, + &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, &dev_attr_max_ratio_fine.attr, &dev_attr_min_bytes.attr, From eba39236f18da7a50b6c51df5d902ee72c43e760 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 18 Nov 2022 16:52:15 -0800 Subject: [PATCH 227/313] mm: document /sys/class/bdi//min_ratio_fine knob This documents the new /sys/class/bdi//max_ratio_fine knob. [akpm@linux-foundation.org: fix htmldocs warnings] Link: https://lkml.kernel.org/r/20221119005215.3052436-21-shr@devkernel.io Signed-off-by: Stefan Roesch Cc: Chris Mason Cc: Jens Axboe Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-class-bdi | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index e9c584db316f..0d2abd88a18c 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -44,6 +44,21 @@ Description: (read-write) +What: /sys/class/bdi//min_ratio_fine +Date: November 2022 +Contact: Stefan Roesch +Description: + Under normal circumstances each device is given a part of the + total write-back cache that relates to its current average + writeout speed in relation to the other devices. + + The 'min_ratio_fine' parameter allows assigning a minimum reserve + of the write-back cache to a particular device. The value is + expressed as part of 1 million. For example, this is useful for + providing a minimum QoS. + + (read-write) + What: /sys/class/bdi//max_ratio Date: January 2008 Contact: Peter Zijlstra From be21b32afe470c5ae98e27e49201158a47032942 Mon Sep 17 00:00:00 2001 From: NARIBAYASHI Akira Date: Wed, 26 Oct 2022 20:24:38 +0900 Subject: [PATCH 228/313] mm, compaction: fix fast_isolate_around() to stay within boundaries Depending on the memory configuration, isolate_freepages_block() may scan pages out of the target range and causes panic. Panic can occur on systems with multiple zones in a single pageblock. The reason it is rare is that it only happens in special configurations. Depending on how many similar systems there are, it may be a good idea to fix this problem for older kernels as well. The problem is that pfn as argument of fast_isolate_around() could be out of the target range. Therefore we should consider the case where pfn < start_pfn, and also the case where end_pfn < pfn. This problem should have been addressd by the commit 6e2b7044c199 ("mm, compaction: make fast_isolate_freepages() stay within zone") but there was an oversight. Case1: pfn < start_pfn | node X's zone | node Y's zone +-----------------+------------------------------... pageblock ^ ^ ^ +-----------+-----------+-----------+-----------+... ^ ^ ^ ^ ^ end_pfn ^ start_pfn = cc->zone->zone_start_pfn pfn <---------> scanned range by "Scan After" Case2: end_pfn < pfn | node X's zone | node Y's zone +-----------------+------------------------------... pageblock ^ ^ ^ +-----------+-----------+-----------+-----------+... ^ ^ ^ ^ ^ pfn ^ end_pfn start_pfn <---------> scanned range by "Scan Before" It seems that there is no good reason to skip nr_isolated pages just after given pfn. So let perform simple scan from start to end instead of dividing the scan into "Before" and "After". Link: https://lkml.kernel.org/r/20221026112438.236336-1-a.naribayashi@fujitsu.com Fixes: 6e2b7044c199 ("mm, compaction: make fast_isolate_freepages() stay within zone"). Signed-off-by: NARIBAYASHI Akira Cc: David Rientjes Cc: Mel Gorman Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/compaction.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 1f6da31dd9a5..ca1603524bbe 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1344,7 +1344,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage) } static void -fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) +fast_isolate_around(struct compact_control *cc, unsigned long pfn) { unsigned long start_pfn, end_pfn; struct page *page; @@ -1365,21 +1365,13 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long if (!page) return; - /* Scan before */ - if (start_pfn != pfn) { - isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false); - if (cc->nr_freepages >= cc->nr_migratepages) - return; - } - - /* Scan after */ - start_pfn = pfn + nr_isolated; - if (start_pfn < end_pfn) - isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ if (cc->nr_freepages < cc->nr_migratepages) set_pageblock_skip(page); + + return; } /* Search orders in round-robin fashion */ @@ -1556,7 +1548,7 @@ fast_isolate_freepages(struct compact_control *cc) return cc->free_pfn; low_pfn = page_to_pfn(page); - fast_isolate_around(cc, low_pfn, nr_isolated); + fast_isolate_around(cc, low_pfn); return low_pfn; } From 7ce5f7e16afa82d33dc47d633404b8b1142a5e44 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 23 Nov 2022 10:43:19 +0530 Subject: [PATCH 229/313] documentation/mm: update pmd_present() in arch_pgtable_helpers.rst Although pmd_present() might seem to indicate a valid and mapped pmd entry, in reality it returns true when pmd_page() points to a valid page in memory , regardless whether the pmd entry is mapped or not. Andrea Arcangeli had earlier explained [1] the required semantics for pmd_present(). This just updates the documentation for pmd_present() as required. [1] https://lore.kernel.org/lkml/20181017020930.GN30832@redhat.com/ Link: https://lkml.kernel.org/r/20221123051319.1312582-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Cc: Jonathan Corbet Cc: Andrea Arcangeli Cc: Mike Rapoport Signed-off-by: Andrew Morton --- Documentation/mm/arch_pgtable_helpers.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst index cbaee9e59241..fd2a19df884e 100644 --- a/Documentation/mm/arch_pgtable_helpers.rst +++ b/Documentation/mm/arch_pgtable_helpers.rst @@ -94,7 +94,7 @@ PMD Page Table Helpers +---------------------------+--------------------------------------------------+ | pmd_trans_huge | Tests a Transparent Huge Page (THP) at PMD | +---------------------------+--------------------------------------------------+ -| pmd_present | Tests a valid mapped PMD | +| pmd_present | Tests whether pmd_page() points to valid memory | +---------------------------+--------------------------------------------------+ | pmd_young | Tests a young PMD | +---------------------------+--------------------------------------------------+ From 8d9b63708ddd1ac51e0260c7b8f641daf01f4caf Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 22 Nov 2022 12:30:22 +0900 Subject: [PATCH 230/313] zswap: do not allocate from atomic pool zswap_frontswap_load() should be called from preemptible context (we even call mutex_lock() there) and it does not look like we need to do GFP_ATOMIC allocaion for temp buffer. The same applies to zswap_writeback_entry(). Use GFP_KERNEL for temporary buffer allocation in both cases. Link: https://lkml.kernel.org/r/Y3xCTr6ikbtcUr/y@google.com Signed-off-by: Johannes Weiner Signed-off-by: Nhat Pham Signed-off-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Minchan Kim Cc: Sergey Senozhatsky Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zpool.c | 7 +++++++ mm/zswap.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/zpool.c b/mm/zpool.c index 68facc193496..f46c0d5e766c 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -387,6 +387,13 @@ bool zpool_evictable(struct zpool *zpool) * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. * @zpool: The zpool to test * + * Some allocators enter non-preemptible context in ->map() callback (e.g. + * disable pagefaults) and exit that context in ->unmap(), which limits what + * we can do with the mapped object. For instance, we cannot wait for + * asynchronous crypto API to decompress such an object or take mutexes + * since those will call into the scheduler. This function tells us whether + * we use such an allocator. + * * Returns: true if zpool can sleep; false otherwise. */ bool zpool_can_sleep_mapped(struct zpool *zpool) diff --git a/mm/zswap.c b/mm/zswap.c index 2d48fd59cc7a..3019f0bde194 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -958,7 +958,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) }; if (!zpool_can_sleep_mapped(pool)) { - tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC); + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!tmp) return -ENOMEM; } @@ -1311,7 +1311,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, } if (!zpool_can_sleep_mapped(entry->pool->zpool)) { - tmp = kmalloc(entry->length, GFP_ATOMIC); + tmp = kmalloc(entry->length, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto freeentry; From 373dfda2bac1173971099dc76254a016646f62ab Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 24 Nov 2022 18:46:41 +0530 Subject: [PATCH 231/313] mm/thp: rename pmd_to_page() as pmd_pgtable_page() Current pmd_to_page(), which derives the page table page containing the pmd address has a very misleading name. The problem being, it sounds similar to pmd_page() which derives page embedded in a given pmd entry either for next level page or a mapped huge page. Rename it as pmd_pgtable_page() instead. Link: https://lkml.kernel.org/r/20221124131641.1523772-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Mike Kravetz Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d8363ac34a7c..2c73dc112ffc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2510,7 +2510,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) #if USE_SPLIT_PMD_PTLOCKS -static struct page *pmd_to_page(pmd_t *pmd) +static struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); @@ -2518,7 +2518,7 @@ static struct page *pmd_to_page(pmd_t *pmd) static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(pmd_to_page(pmd)); + return ptlock_ptr(pmd_pgtable_page(pmd)); } static inline bool pmd_ptlock_init(struct page *page) @@ -2537,7 +2537,7 @@ static inline void pmd_ptlock_free(struct page *page) ptlock_free(page); } -#define pmd_huge_pte(mm, pmd) (pmd_to_page(pmd)->pmd_huge_pte) +#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte) #else From 7e25de77bc5ea56cc3ff618fc8f4ea1896a4dbb3 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 25 Nov 2022 09:15:02 +0530 Subject: [PATCH 232/313] s390/mm: use pmd_pgtable_page() helper in __gmap_segment_gaddr() In __gmap_segment_gaddr() pmd level page table page is being extracted from the pmd pointer, similar to pmd_pgtable_page() implementation. This reduces some redundancy by directly using pmd_pgtable_page() instead, though first making it available. Link: https://lkml.kernel.org/r/20221125034502.1559986-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Alexander Gordeev Cc: Christian Borntraeger Cc: David Hildenbrand Cc: Heiko Carstens Signed-off-by: Andrew Morton --- arch/s390/mm/gmap.c | 5 ++--- include/linux/mm.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 02d15c8dc92e..8947451ae021 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -336,12 +336,11 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, static unsigned long __gmap_segment_gaddr(unsigned long *entry) { struct page *page; - unsigned long offset, mask; + unsigned long offset; offset = (unsigned long) entry / sizeof(unsigned long); offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE; - mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); - page = virt_to_page((void *)((unsigned long) entry & mask)); + page = pmd_pgtable_page((pmd_t *) entry); return page->index + offset; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c73dc112ffc..8df5cae69c80 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2510,7 +2510,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) #if USE_SPLIT_PMD_PTLOCKS -static struct page *pmd_pgtable_page(pmd_t *pmd) +static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); From 8ef9c32a12a8a0012a4988050947c45521260c5d Mon Sep 17 00:00:00 2001 From: Xu Panda Date: Thu, 24 Nov 2022 19:29:01 +0800 Subject: [PATCH 233/313] mm: vmscan: use sysfs_emit() to instead of scnprintf() Replace open-coded snprintf() with sysfs_emit() to simplify the code. Link: https://lkml.kernel.org/r/202211241929015476424@zte.com.cn Signed-off-by: Xu Panda Signed-off-by: Yang Yang Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 805fa51b175c..9356a3ee639c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5407,7 +5407,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) caps |= BIT(LRU_GEN_NONLEAF_YOUNG); - return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); + return sysfs_emit(buf, "0x%04x\n", caps); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ From e833bc50340502a2a75b41bbd1a179aa769e2014 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Nov 2022 13:58:57 -0500 Subject: [PATCH 234/313] mm/thp: re-apply mkdirty for small pages after split We used to have 624a2c94f5b7 (Partly revert "mm/thp: carry over dirty bit when thp splits on pmd") fixing the regression reported here by Anatoly Pugachev on sparc64: https://lore.kernel.org/r/20221021160603.GA23307@u164.east.ru Where we temporarily ignored the dirty bit for small pages. Then, Hev also reported similar issue on loongarch: (the original mail was private, but Anatoly copied the list here) https://lore.kernel.org/r/CADxRZqxqb7f_WhMh=jweZP+ynf_JwGd-0VwbYgp4P+T0-AXosw@mail.gmail.com Hev pointed out that the issue is having HW write bit set within the pte_mkdirty() so the split pte can be written after split even if e.g. they were shared by more than one processes, causing data corrupt. Hev also tried to explain why loongarch set HW write bit in mkdirty: https://lore.kernel.org/r/CAHirt9itKO_K_HPboXh5AyJtt16Zf0cD73PtHvM=na39u_ztxA@mail.gmail.com One way to fix it is as what Huacai proposed here for loongarch (then we can re-apply the dirty bit in thp split): https://lore.kernel.org/r/20221117042532.4064448-1-chenhuacai@loongson.cnn We may need similar thing for sparc64, though. For now since we've found the root cause of the dirty bit issue the simpler solution (which won't lose the dirty bit for small) that will work for both is we wr-protect after pte_mkdirty(), so the HW write bit can be persistent after thp split. Add a comment for wrprotect, so we will not mess up the ordering later. With 624a2c94f5b7 (Partly revert "mm/thp: carry over dirty bit when thp splits on pmd") this is not a fix anymore, but just brings back the dirty bit for thp split safely, so we re-apply the optimization but in safe way. Provide a Tested-by credit to Hev too (not the exact same patch but the same outcome) for loongarch. Link: https://lkml.kernel.org/r/20221125185857.3110155-1-peterx@redhat.com Signed-off-by: Peter Xu Tested-by: Hev # loongarch Cc: Anatoly Pugachev Cc: Raghavendra K T Cc: Thorsten Leemhuis Cc: Mike Kravetz Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton --- mm/huge_memory.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86a30041a2e1..8f10afba17a6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2238,16 +2238,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = maybe_mkwrite(entry, vma); if (anon_exclusive) SetPageAnonExclusive(page + i); - if (!write) - entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); + /* NOTE: this may set soft-dirty too on some archs */ + if (dirty) + entry = pte_mkdirty(entry); /* - * NOTE: we don't do pte_mkdirty when dirty==true - * because it breaks sparc64 which can sigsegv - * random process. Need to revisit when we figure - * out what is special with sparc64. + * NOTE: this needs to happen after pte_mkdirty, + * because some archs (sparc64, loongarch) could + * set hw write bit when mkdirty. */ + if (!write) + entry = pte_wrprotect(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) From e0ff428042335c7b62785b3cf911c427a618bc86 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Fri, 25 Nov 2022 14:54:44 +0800 Subject: [PATCH 235/313] mm/memory-failure.c: cleanup in unpoison_memory If freeit is true, the value of ret must be zero, there is no need to check the value of freeit after label unlock_mutex. We can drop variable freeit to do this cleanup. Link: https://lkml.kernel.org/r/20221125065444.3462681-1-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: zhenwei pi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2e62940c7bae..c77a9e37e27e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2338,7 +2338,6 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int ret = -EBUSY; - int freeit = 0; unsigned long count = 1; bool huge = false; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -2413,10 +2412,9 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } } - freeit = !!TestClearPageHWPoison(p); put_page(page); - if (freeit) { + if (TestClearPageHWPoison(p)) { put_page(page); ret = 0; } @@ -2424,7 +2422,7 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); - if (!ret || freeit) { + if (!ret) { if (!huge) num_poisoned_pages_sub(pfn, 1); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", From 85463321e726fe59873bbc21f2f480747810aef8 Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Tue, 18 Oct 2022 19:12:22 -0400 Subject: [PATCH 236/313] selftests/vm: enable running select groups of tests Our memory management kernel CI testing at Red Hat uses the VM selftests and we have run into two problems: First, our LTP tests overlap with the VM selftests. We want to avoid unhelpful redundancy in our testing practices. Second, we have observed the current run_vmtests.sh to report overall failure/ambiguous results in the case that a machine lacks the necessary hardware to perform one or more of the tests. E.g. ksm tests that require more than one numa node. We want to be able to run the vm selftests suitable to particular hardware. Add the ability to run one or more groups of vm tests via run_vmtests.sh instead of simply all-or-none in order to solve these problems. Preserve existing default behavior of running all tests when the script is invoked with no arguments. Documentation of test groups is included in the patch as follows: # ./run_vmtests.sh [ -h || --help ] usage: ./tools/testing/selftests/vm/run_vmtests.sh [ -h | -t ""] -t: specify specific categories to tests to run -h: display this message The default behavior is to run all tests. Alternatively, specific groups tests can be run by passing a string to the -t argument containing one or more of the following categories separated by spaces: - mmap tests for mmap(2) - gup_test tests for gup using gup_test interface - userfaultfd tests for userfaultfd(2) - compaction a test for the patch "Allow compaction of unevictable pages" - mlock tests for mlock(2) - mremap tests for mremap(2) - hugevm tests for very large virtual address space - vmalloc vmalloc smoke tests - hmm hmm smoke tests - madv_populate test memadvise(2) MADV_POPULATE_{READ,WRITE} options - memfd_secret test memfd_secret(2) - process_mrelease test process_mrelease(2) - ksm ksm tests that do not require >=2 NUMA nodes - ksm_numa ksm tests that require >=2 NUMA nodes - pkey memory protection key tests - soft_dirty test soft dirty page bit semantics - anon_cow test anonymous copy-on-write semantics example: ./run_vmtests.sh -t "hmm mmap ksm" Link: https://lkml.kernel.org/r/20221018231222.1884715-1-jsavitz@redhat.com Signed-off-by: Joel Savitz Cc: Joel Savitz Cc: Nico Pache Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/run_vmtests.sh | 205 +++++++++++++++------- 1 file changed, 143 insertions(+), 62 deletions(-) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 54d7a822c2ce..e26661feacf5 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -1,13 +1,88 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 -#please run as root +# Please run as root # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 exitcode=0 -#get huge pagesize and freepages from /proc/meminfo +usage() { + cat <"] + -t: specify specific categories to tests to run + -h: display this message + +The default behavior is to run all tests. + +Alternatively, specific groups tests can be run by passing a string +to the -t argument containing one or more of the following categories +separated by spaces: +- mmap + tests for mmap(2) +- gup_test + tests for gup using gup_test interface +- userfaultfd + tests for userfaultfd(2) +- compaction + a test for the patch "Allow compaction of unevictable pages" +- mlock + tests for mlock(2) +- mremap + tests for mremap(2) +- hugevm + tests for very large virtual address space +- vmalloc + vmalloc smoke tests +- hmm + hmm smoke tests +- madv_populate + test memadvise(2) MADV_POPULATE_{READ,WRITE} options +- memfd_secret + test memfd_secret(2) +- process_mrelease + test process_mrelease(2) +- ksm + ksm tests that do not require >=2 NUMA nodes +- ksm_numa + ksm tests that require >=2 NUMA nodes +- pkey + memory protection key tests +- soft_dirty + test soft dirty page bit semantics +- cow + test copy-on-write semantics +example: ./run_vmtests.sh -t "hmm mmap ksm" +EOF + exit 0 +} + + +while getopts "ht:" OPT; do + case ${OPT} in + "h") usage ;; + "t") VM_SELFTEST_ITEMS=${OPTARG} ;; + esac +done +shift $((OPTIND -1)) + +# default behavior: run all tests +VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default} + +test_selected() { + if [ "$VM_SELFTEST_ITEMS" == "default" ]; then + # If no VM_SELFTEST_ITEMS are specified, run all tests + return 0 + fi + # If test selected argument is one of the test items + if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then + return 0 + else + return 1 + fi +} + +# get huge pagesize and freepages from /proc/meminfo while read -r name size unit; do if [ "$name" = "HugePages_Free:" ]; then freepgs="$size" @@ -27,7 +102,7 @@ hpgsize_MB=$((hpgsize_KB / 1024)) half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) needmem_KB=$((half_ufd_size_MB * 2 * 1024)) -#set proper nr_hugepages +# set proper nr_hugepages if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) needpgs=$((needmem_KB / hpgsize_KB)) @@ -56,136 +131,142 @@ else exit 1 fi -#filter 64bit architectures +# filter 64bit architectures ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64" if [ -z "$ARCH" ]; then ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') fi VADDR64=0 -echo "$ARCH64STR" | grep "$ARCH" && VADDR64=1 +echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1 # Usage: run_test [test binary] [arbitrary test arguments...] run_test() { - local title="running $*" - local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) - printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" + if test_selected ${CATEGORY}; then + local title="running $*" + local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) + printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" - "$@" - local ret=$? - if [ $ret -eq 0 ]; then - echo "[PASS]" - elif [ $ret -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip - else - echo "[FAIL]" - exitcode=1 - fi + "$@" + local ret=$? + if [ $ret -eq 0 ]; then + echo "[PASS]" + elif [ $ret -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip + else + echo "[FAIL]" + exitcode=1 + fi + fi # test_selected } -run_test ./hugepage-mmap +CATEGORY="hugetlb" run_test ./hugepage-mmap shmmax=$(cat /proc/sys/kernel/shmmax) shmall=$(cat /proc/sys/kernel/shmall) echo 268435456 > /proc/sys/kernel/shmmax echo 4194304 > /proc/sys/kernel/shmall -run_test ./hugepage-shm +CATEGORY="hugetlb" run_test ./hugepage-shm echo "$shmmax" > /proc/sys/kernel/shmmax echo "$shmall" > /proc/sys/kernel/shmall -run_test ./map_hugetlb -run_test ./hugepage-mremap -run_test ./hugepage-vmemmap -run_test ./hugetlb-madvise +CATEGORY="hugetlb" run_test ./map_hugetlb +CATEGORY="hugetlb" run_test ./hugepage-mremap +CATEGORY="hugetlb" run_test ./hugepage-vmemmap +CATEGORY="hugetlb" run_test ./hugetlb-madvise -echo "NOTE: The above hugetlb tests provide minimal coverage. Use" -echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" -echo " hugetlb regression testing." +if test_selected "hugetlb"; then + echo "NOTE: These hugetlb tests provide minimal coverage. Use" + echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" + echo " hugetlb regression testing." +fi -run_test ./map_fixed_noreplace +CATEGORY="mmap" run_test ./map_fixed_noreplace # get_user_pages_fast() benchmark -run_test ./gup_test -u +CATEGORY="gup_test" run_test ./gup_test -u # pin_user_pages_fast() benchmark -run_test ./gup_test -a +CATEGORY="gup_test" run_test ./gup_test -a # Dump pages 0, 19, and 4096, using pin_user_pages: -run_test ./gup_test -ct -F 0x1 0 19 0x1000 +CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 uffd_mods=("" ":dev") for mod in "${uffd_mods[@]}"; do - run_test ./userfaultfd anon${mod} 20 16 + CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16 # Hugetlb tests require source and destination huge pages. Pass in half # the size ($half_ufd_size_MB), which is used for *each*. - run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 - run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 - run_test ./userfaultfd shmem${mod} 20 16 + CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 + CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 + CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16 done #cleanup echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages -run_test ./compaction_test +CATEGORY="compaction" run_test ./compaction_test -run_test sudo -u nobody ./on-fault-limit +CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit -run_test ./map_populate +CATEGORY="mmap" run_test ./map_populate -run_test ./mlock-random-test +CATEGORY="mlock" run_test ./mlock-random-test -run_test ./mlock2-tests +CATEGORY="mlock" run_test ./mlock2-tests -run_test ./mrelease_test +CATEGORY="process_mrelease" run_test ./mrelease_test -run_test ./mremap_test +CATEGORY="mremap" run_test ./mremap_test -run_test ./thuge-gen +CATEGORY="hugetlb" run_test ./thuge-gen if [ $VADDR64 -ne 0 ]; then - run_test ./virtual_address_range + CATEGORY="hugevm" run_test ./virtual_address_range # virtual address 128TB switch test - run_test ./va_128TBswitch.sh + CATEGORY="hugevm" run_test ./va_128TBswitch.sh fi # VADDR64 # vmalloc stability smoke test -run_test ./test_vmalloc.sh smoke +CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke -run_test ./mremap_dontunmap +CATEGORY="mremap" run_test ./mremap_dontunmap -run_test ./test_hmm.sh smoke +CATEGORY="hmm" run_test ./test_hmm.sh smoke # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests -run_test ./madv_populate +CATEGORY="madv_populate" run_test ./madv_populate -run_test ./memfd_secret +CATEGORY="memfd_secret" run_test ./memfd_secret # KSM MADV_MERGEABLE test with 10 identical pages -run_test ./ksm_tests -M -p 10 +CATEGORY="ksm" run_test ./ksm_tests -M -p 10 # KSM unmerge test -run_test ./ksm_tests -U +CATEGORY="ksm" run_test ./ksm_tests -U # KSM test with 10 zero pages and use_zero_pages = 0 -run_test ./ksm_tests -Z -p 10 -z 0 +CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0 # KSM test with 10 zero pages and use_zero_pages = 1 -run_test ./ksm_tests -Z -p 10 -z 1 +CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1 # KSM test with 2 NUMA nodes and merge_across_nodes = 1 -run_test ./ksm_tests -N -m 1 +CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1 # KSM test with 2 NUMA nodes and merge_across_nodes = 0 -run_test ./ksm_tests -N -m 0 +CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 + +CATEGORY="ksm" run_test ./ksm_functional_tests # protection_keys tests if [ -x ./protection_keys_32 ] then - run_test ./protection_keys_32 + CATEGORY="pkey" run_test ./protection_keys_32 fi if [ -x ./protection_keys_64 ] then - run_test ./protection_keys_64 + CATEGORY="pkey" run_test ./protection_keys_64 fi -run_test ./soft-dirty +CATEGORY="soft_dirty" run_test ./soft-dirty -# COW tests for anonymous memory -run_test ./cow +# COW tests +CATEGORY="cow" run_test ./cow exit $exitcode From 93fb70aa5904c2577fab8100fa990ecfa4f5b4c7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:36 +0200 Subject: [PATCH 237/313] selftests/vm: add KSM unmerge tests Patch series "mm/ksm: break_ksm() cleanups and fixes", v2. This series cleans up and fixes break_ksm(). In summary, we no longer use fake write faults to break COW but instead FAULT_FLAG_UNSHARE. Further, we move away from using follow_page() --- that we can hopefully remove completely at one point --- and use new walk_page_range_vma() instead. Fortunately, we can get rid of VM_FAULT_WRITE and FOLL_MIGRATION in common code now. Extend the existing ksm tests by an unmerge benchmark, and a some new unmerge tests. Also, add a selftest to measure MADV_UNMERGEABLE performance. In my setup (AMD Ryzen 9 3900X), running the KSM selftest to test unmerge performance on 2 GiB (taskset 0x8 ./ksm_tests -D -s 2048), this results in a performance degradation of ~6% -- 7% (old: ~5250 MiB/s, new: ~4900 MiB/s). I don't think we particularly care for now, but it's good to be aware of the implication. This patch (of 9): Let's add three unmerge tests (MADV_UNMERGEABLE unmerging all pages in the range). test_unmerge(): basic unmerge tests test_unmerge_discarded(): have some pte_none() entries in the range test_unmerge_uffd_wp(): protect the merged pages using uffd-wp ksm_tests.c currently contains a mixture of benchmarks and tests, whereby each test is carried out by executing the ksm_tests binary with specific parameters. Let's add new ksm_functional_tests.c that performs multiple, smaller functional tests all at once. Link: https://lkml.kernel.org/r/20221021101141.84170-1-david@redhat.com Link: https://lkml.kernel.org/r/20221021101141.84170-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 2 + .../selftests/vm/ksm_functional_tests.c | 279 ++++++++++++++++++ tools/testing/selftests/vm/run_vmtests.sh | 2 + tools/testing/selftests/vm/vm_util.c | 10 + tools/testing/selftests/vm/vm_util.h | 1 + 5 files changed, 294 insertions(+) create mode 100644 tools/testing/selftests/vm/ksm_functional_tests.c diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index a4d764efd6e3..89c14e41bd43 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -55,6 +55,7 @@ TEST_GEN_FILES += userfaultfd TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test TEST_GEN_FILES += ksm_tests +TEST_GEN_PROGS += ksm_functional_tests ifeq ($(MACHINE),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) @@ -100,6 +101,7 @@ include ../lib.mk $(OUTPUT)/cow: vm_util.c $(OUTPUT)/khugepaged: vm_util.c +$(OUTPUT)/ksm_functional_tests: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c new file mode 100644 index 000000000000..96644be68962 --- /dev/null +++ b/tools/testing/selftests/vm/ksm_functional_tests.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KSM functional tests + * + * Copyright 2022, Red Hat, Inc. + * + * Author(s): David Hildenbrand + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "vm_util.h" + +#define KiB 1024u +#define MiB (1024 * KiB) + +static int ksm_fd; +static int ksm_full_scans_fd; +static int pagemap_fd; +static size_t pagesize; + +static bool range_maps_duplicates(char *addr, unsigned long size) +{ + unsigned long offs_a, offs_b, pfn_a, pfn_b; + + /* + * There is no easy way to check if there are KSM pages mapped into + * this range. We only check that the range does not map the same PFN + * twice by comaring each pair of mapped pages. + */ + for (offs_a = 0; offs_a < size; offs_a += pagesize) { + pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); + /* Page not present or PFN not exposed by the kernel. */ + if (pfn_a == -1ull || !pfn_a) + continue; + + for (offs_b = offs_a + pagesize; offs_b < size; + offs_b += pagesize) { + pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b); + if (pfn_b == -1ull || !pfn_b) + continue; + if (pfn_a == pfn_b) + return true; + } + } + return false; +} + +static long ksm_get_full_scans(void) +{ + char buf[10]; + ssize_t ret; + + ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0); + if (ret <= 0) + return -errno; + buf[ret] = 0; + + return strtol(buf, NULL, 10); +} + +static int ksm_merge(void) +{ + long start_scans, end_scans; + + /* Wait for two full scans such that any possible merging happened. */ + start_scans = ksm_get_full_scans(); + if (start_scans < 0) + return start_scans; + if (write(ksm_fd, "1", 1) != 1) + return -errno; + do { + end_scans = ksm_get_full_scans(); + if (end_scans < 0) + return end_scans; + } while (end_scans < start_scans + 2); + + return 0; +} + +static char *mmap_and_merge_range(char val, unsigned long size) +{ + char *map; + + map = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON, -1, 0); + if (map == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return MAP_FAILED; + } + + /* Don't use THP. Ignore if THP are not around on a kernel. */ + if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { + ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + goto unmap; + } + + /* Make sure each page contains the same values to merge them. */ + memset(map, val, size); + if (madvise(map, size, MADV_MERGEABLE)) { + ksft_test_result_fail("MADV_MERGEABLE failed\n"); + goto unmap; + } + + /* Run KSM to trigger merging and wait. */ + if (ksm_merge()) { + ksft_test_result_fail("Running KSM failed\n"); + goto unmap; + } + return map; +unmap: + munmap(map, size); + return MAP_FAILED; +} + +static void test_unmerge(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +static void test_unmerge_discarded(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + /* Discard half of all mapped pages so we have pte_none() entries. */ + if (madvise(map, size / 2, MADV_DONTNEED)) { + ksft_test_result_fail("MADV_DONTNEED failed\n"); + goto unmap; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +#ifdef __NR_userfaultfd +static void test_unmerge_uffd_wp(void) +{ + struct uffdio_writeprotect uffd_writeprotect; + struct uffdio_register uffdio_register; + const unsigned int size = 2 * MiB; + struct uffdio_api uffdio_api; + char *map; + int uffd; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + /* See if UFFD is around. */ + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + ksft_test_result_skip("__NR_userfaultfd failed\n"); + goto unmap; + } + + /* See if UFFD-WP is around. */ + uffdio_api.api = UFFD_API; + uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { + ksft_test_result_fail("UFFDIO_API failed\n"); + goto close_uffd; + } + if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { + ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n"); + goto close_uffd; + } + + /* Register UFFD-WP, no need for an actual handler. */ + uffdio_register.range.start = (unsigned long) map; + uffdio_register.range.len = size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { + ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n"); + goto close_uffd; + } + + /* Write-protect the range using UFFD-WP. */ + uffd_writeprotect.range.start = (unsigned long) map; + uffd_writeprotect.range.len = size; + uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { + ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n"); + goto close_uffd; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto close_uffd; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +close_uffd: + close(uffd); +unmap: + munmap(map, size); +} +#endif + +int main(int argc, char **argv) +{ + unsigned int tests = 2; + int err; + +#ifdef __NR_userfaultfd + tests++; +#endif + + ksft_print_header(); + ksft_set_plan(tests); + + pagesize = getpagesize(); + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); + ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); + if (ksm_full_scans_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); + + test_unmerge(); + test_unmerge_discarded(); +#ifdef __NR_userfaultfd + test_unmerge_uffd_wp(); +#endif + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index e26661feacf5..8984e0bb58c7 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -253,6 +253,8 @@ CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 CATEGORY="ksm" run_test ./ksm_functional_tests +run_test ./ksm_functional_tests + # protection_keys tests if [ -x ./protection_keys_32 ] then diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index 5bbf7641a0f0..710571902743 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -43,6 +43,16 @@ bool pagemap_is_populated(int fd, char *start) return entry & 0xc000000000000000ull; } +unsigned long pagemap_get_pfn(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* If present (63th bit), PFN is at bit 0 -- 54. */ + if (entry & 0x8000000000000000ull) + return entry & 0x007fffffffffffffull; + return -1ull; +} + void clear_softdirty(void) { int ret; diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 80d5a6ad413b..1995ee911ef2 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -6,6 +6,7 @@ uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); bool pagemap_is_swapped(int fd, char *start); bool pagemap_is_populated(int fd, char *start); +unsigned long pagemap_get_pfn(int fd, char *start); void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); From c31783eeae7b22dc3f6edde7339de6112959225d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:38 +0200 Subject: [PATCH 238/313] mm/pagewalk: don't trigger test_walk() in walk_page_vma() As Peter points out, the caller passes a single VMA and can just do that check itself. And in fact, no existing users rely on test_walk() getting called. So let's just remove it and make the implementation slightly more efficient. Link: https://lkml.kernel.org/r/20221021101141.84170-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 2 ++ mm/pagewalk.c | 7 ------- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index f3fafb731ffd..37dc0208862d 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -27,6 +27,8 @@ struct mm_walk; * "do page table walk over the current vma", returning * a negative value means "abort current page table walk * right now" and returning 1 means "skip the current vma" + * Note that this callback is not called when the caller + * passes in a single VMA as for walk_page_vma(). * @pre_vma: if set, called before starting walk on a non-null vma. * @post_vma: if set, called after a walk on a non-null vma, provided * that @pre_vma and the vma walk succeeded. diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2ff3a5bebceb..0a5d71aaf9c7 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -526,18 +526,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, .vma = vma, .private = private, }; - int err; if (!walk.mm) return -EINVAL; mmap_assert_locked(walk.mm); - - err = walk_page_test(vma->vm_start, vma->vm_end, &walk); - if (err > 0) - return 0; - if (err < 0) - return err; return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } From 5036880efdad976165c817dcb6a1c8c24fb16caa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:33 +0200 Subject: [PATCH 239/313] selftests/vm: add test to measure MADV_UNMERGEABLE performance Let's add a test to measure performance of KSM breaking not triggered via COW, but triggered by disabling KSM on an area filled with KSM pages via MADV_UNMERGEABLE. Link: https://lkml.kernel.org/r/20221021101141.84170-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Peter Xu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/ksm_tests.c | 76 +++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_tests.c b/tools/testing/selftests/vm/ksm_tests.c index 0d85be2350fa..f9eb4d67e0dd 100644 --- a/tools/testing/selftests/vm/ksm_tests.c +++ b/tools/testing/selftests/vm/ksm_tests.c @@ -40,6 +40,7 @@ enum ksm_test_name { CHECK_KSM_NUMA_MERGE, KSM_MERGE_TIME, KSM_MERGE_TIME_HUGE_PAGES, + KSM_UNMERGE_TIME, KSM_COW_TIME }; @@ -108,7 +109,10 @@ static void print_help(void) " -P evaluate merging time and speed.\n" " For this test, the size of duplicated memory area (in MiB)\n" " must be provided using -s option\n" - " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -D evaluate unmerging time and speed when disabling KSM.\n" " For this test, the size of duplicated memory area (in MiB)\n" " must be provided using -s option\n" " -C evaluate the time required to break COW of merged pages.\n\n"); @@ -188,6 +192,16 @@ static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, return 0; } +static int ksm_unmerge_pages(void *addr, size_t size, + struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_UNMERGEABLE)) { + perror("madvise"); + return 1; + } + return 0; +} + static bool assert_ksm_pages_count(long dupl_page_count) { unsigned long max_page_sharing, pages_sharing, pages_shared; @@ -560,6 +574,53 @@ err_out: return KSFT_FAIL; } +static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) { void *map_ptr; @@ -644,7 +705,7 @@ int main(int argc, char *argv[]) bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; long size_MB = 0; - while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCH")) != -1) { + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) { switch (opt) { case 'a': prot = str_to_prot(optarg); @@ -701,6 +762,9 @@ int main(int argc, char *argv[]) case 'H': test_name = KSM_MERGE_TIME_HUGE_PAGES; break; + case 'D': + test_name = KSM_UNMERGE_TIME; + break; case 'C': test_name = KSM_COW_TIME; break; @@ -762,6 +826,14 @@ int main(int argc, char *argv[]) ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, size_MB); break; + case KSM_UNMERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; case KSM_COW_TIME: ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, page_size); From 58f595c6659198e1ad0ed431a408ddd79b21e579 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:34 +0200 Subject: [PATCH 240/313] mm/ksm: simplify break_ksm() to not rely on VM_FAULT_WRITE Now that GUP no longer requires VM_FAULT_WRITE, break_ksm() is the sole remaining user of VM_FAULT_WRITE. As we also want to stop triggering a fake write fault and instead use FAULT_FLAG_UNSHARE -- similar to GUP-triggered unsharing when taking a R/O pin on a shared anonymous page (including KSM pages), let's stop relying on VM_FAULT_WRITE. Let's rework break_ksm() to not rely on the return value of handle_mm_fault() anymore to figure out whether COW-breaking was successful. Simply perform another follow_page() lookup to verify the result. While this makes break_ksm() slightly less efficient, we can simplify handle_mm_fault() a little and easily switch to FAULT_FLAG_UNSHARE without introducing similar KSM-specific behavior for FAULT_FLAG_UNSHARE. In my setup (AMD Ryzen 9 3900X), running the KSM selftest to test unmerge performance on 2 GiB (taskset 0x8 ./ksm_tests -D -s 2048), this results in a performance degradation of ~4% -- 5% (old: ~5250 MiB/s, new: ~5010 MiB/s). I don't think that we particularly care about that performance drop when unmerging. If it ever turns out to be an actual performance issue, we can think about a better alternative for FAULT_FLAG_UNSHARE -- let's just keep it simple for now. Link: https://lkml.kernel.org/r/20221021101141.84170-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Peter Xu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/ksm.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index a71245241d22..4efdc424a3fc 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -440,26 +440,27 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) vm_fault_t ret = 0; do { + bool ksm_page = false; + cond_resched(); page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) - ret = handle_mm_fault(vma, addr, - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, - NULL); - else - ret = VM_FAULT_WRITE; + ksm_page = true; put_page(page); - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); + + if (!ksm_page) + return 0; + ret = handle_mm_fault(vma, addr, + FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + NULL); + } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* - * We must loop because handle_mm_fault() may back out if there's - * any difficulty e.g. if pte accessed bit gets updated concurrently. - * - * VM_FAULT_WRITE is what we have been hoping for: it indicates that - * COW has been broken, even if the vma does not permit VM_WRITE; - * but note that a concurrent fault might break PageKsm for us. + * We must loop until we no longer find a KSM page because + * handle_mm_fault() may back out if there's any difficulty e.g. if + * pte accessed bit gets updated concurrently. * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's From cb8d863313436339fb60f7dd5131af2e5854621e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:35 +0200 Subject: [PATCH 241/313] mm: remove VM_FAULT_WRITE All users -- GUP and KSM -- are gone, let's just remove it. Link: https://lkml.kernel.org/r/20221021101141.84170-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Peter Xu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 --- mm/huge_memory.c | 2 +- mm/memory.c | 9 ++++----- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 018b1c098173..199f98be6f9c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -929,7 +929,6 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_OOM: Out Of Memory * @VM_FAULT_SIGBUS: Bad access * @VM_FAULT_MAJOR: Page read from storage - * @VM_FAULT_WRITE: Special case for get_user_pages * @VM_FAULT_HWPOISON: Hit poisoned small page * @VM_FAULT_HWPOISON_LARGE: Hit poisoned large page. Index encoded * in upper bits @@ -950,7 +949,6 @@ enum vm_fault_reason { VM_FAULT_OOM = (__force vm_fault_t)0x000001, VM_FAULT_SIGBUS = (__force vm_fault_t)0x000002, VM_FAULT_MAJOR = (__force vm_fault_t)0x000004, - VM_FAULT_WRITE = (__force vm_fault_t)0x000008, VM_FAULT_HWPOISON = (__force vm_fault_t)0x000010, VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020, VM_FAULT_SIGSEGV = (__force vm_fault_t)0x000040, @@ -976,7 +974,6 @@ enum vm_fault_reason { { VM_FAULT_OOM, "OOM" }, \ { VM_FAULT_SIGBUS, "SIGBUS" }, \ { VM_FAULT_MAJOR, "MAJOR" }, \ - { VM_FAULT_WRITE, "WRITE" }, \ { VM_FAULT_HWPOISON, "HWPOISON" }, \ { VM_FAULT_HWPOISON_LARGE, "HWPOISON_LARGE" }, \ { VM_FAULT_SIGSEGV, "SIGSEGV" }, \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8f10afba17a6..1d9ad909c87c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,7 +1376,7 @@ reuse: if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); - return VM_FAULT_WRITE; + return 0; } unlock_fallback: diff --git a/mm/memory.c b/mm/memory.c index 815d2ff05c62..aad226daf41b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3213,7 +3213,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } delayacct_wpcopy_end(); - return (page_copied && !unshare) ? VM_FAULT_WRITE : 0; + return 0; oom_free_new: put_page(new_page); oom: @@ -3277,14 +3277,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) return finish_mkwrite_fault(vmf); } wp_page_reuse(vmf); - return VM_FAULT_WRITE; + return 0; } static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - vm_fault_t ret = VM_FAULT_WRITE; + vm_fault_t ret = 0; get_page(vmf->page); @@ -3430,7 +3430,7 @@ reuse: return 0; } wp_page_reuse(vmf); - return VM_FAULT_WRITE; + return 0; } copy: /* @@ -3944,7 +3944,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; - ret |= VM_FAULT_WRITE; } rmap_flags |= RMAP_EXCLUSIVE; } From 6cce3314b928b2db7d5f48171e18314226551c3f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:37 +0200 Subject: [PATCH 242/313] mm/ksm: fix KSM COW breaking with userfaultfd-wp via FAULT_FLAG_UNSHARE Let's stop breaking COW via a fake write fault and let's use FAULT_FLAG_UNSHARE instead. This avoids any wrong side effects of the fake write fault, such as mapping the PTE writable and marking the pte dirty/softdirty. Consequently, we will no longer trigger a fake write fault and break COW without any such side-effects. Also, this fixes KSM interaction with userfaultfd-wp: when we have a KSM page that's write-protected by userfaultfd, break_ksm()->handle_mm_fault() will fail with VM_FAULT_SIGBUS and will simply return in break_ksm() with 0 instead of actually breaking COW. For now, the KSM unmerge tests can trigger that: $ sudo ./ksm_functional_tests TAP version 13 1..3 # [RUN] test_unmerge ok 1 Pages were unmerged # [RUN] test_unmerge_discarded ok 2 Pages were unmerged # [RUN] test_unmerge_uffd_wp not ok 3 Pages were unmerged Bail out! 1 out of 3 tests failed # Planned tests != run tests (2 != 3) # Totals: pass:2 fail:1 xfail:0 xpass:0 skip:0 error:0 The warning in dmesg also indicates this wrong handling: [ 230.096368] FAULT_FLAG_ALLOW_RETRY missing 881 [ 230.100822] CPU: 1 PID: 1643 Comm: ksm-uffd-wp [...] [ 230.110124] Hardware name: [...] [ 230.117775] Call Trace: [ 230.120227] [ 230.122334] dump_stack_lvl+0x44/0x5c [ 230.126010] handle_userfault.cold+0x14/0x19 [ 230.130281] ? tlb_finish_mmu+0x65/0x170 [ 230.134207] ? uffd_wp_range+0x65/0xa0 [ 230.137959] ? _raw_spin_unlock+0x15/0x30 [ 230.141972] ? do_wp_page+0x50/0x590 [ 230.145551] __handle_mm_fault+0x9f5/0xf50 [ 230.149652] ? mmput+0x1f/0x40 [ 230.152712] handle_mm_fault+0xb9/0x2a0 [ 230.156550] break_ksm+0x141/0x180 [ 230.159964] unmerge_ksm_pages+0x60/0x90 [ 230.163890] ksm_madvise+0x3c/0xb0 [ 230.167295] do_madvise.part.0+0x10c/0xeb0 [ 230.171396] ? do_syscall_64+0x67/0x80 [ 230.175157] __x64_sys_madvise+0x5a/0x70 [ 230.179082] do_syscall_64+0x58/0x80 [ 230.182661] ? do_syscall_64+0x67/0x80 [ 230.186413] entry_SYSCALL_64_after_hwframe+0x63/0xcd This is primarily a fix for KSM+userfaultfd-wp, however, the fake write fault was always questionable. As this fix is not easy to backport and it's not very critical, let's not cc stable. Link: https://lkml.kernel.org/r/20221021101141.84170-6-david@redhat.com Fixes: 529b930b87d9 ("userfaultfd: wp: hook userfault handler to write protection fault") Signed-off-by: David Hildenbrand Acked-by: Peter Xu Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/ksm.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 4efdc424a3fc..0805221e1d4c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -420,17 +420,15 @@ static inline bool ksm_test_exit(struct mm_struct *mm) } /* - * We use break_ksm to break COW on a ksm page: it's a stripped down + * We use break_ksm to break COW on a ksm page by triggering unsharing, + * such that the ksm page will get replaced by an exclusive anonymous page. * - * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) - * put_page(page); - * - * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * - * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context + * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. */ @@ -454,7 +452,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, NULL); } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* From e07cda5f232fac4de0925d8a4c92e51e41fa2f6e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:39 +0200 Subject: [PATCH 243/313] mm/pagewalk: add walk_page_range_vma() Let's add walk_page_range_vma(), which is similar to walk_page_vma(), however, is only interested in a subset of the VMA range. To be used in KSM code to stop using follow_page() next. Link: https://lkml.kernel.org/r/20221021101141.84170-8-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 3 +++ mm/pagewalk.c | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 37dc0208862d..959f52e5867d 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -101,6 +101,9 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, pgd_t *pgd, void *private); +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private); int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 0a5d71aaf9c7..7f1c9b274906 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -517,6 +517,26 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, return walk_pgd_range(start, end, &walk); } +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .mm = vma->vm_mm, + .vma = vma, + .private = private, + }; + + if (start >= end || !walk.mm) + return -EINVAL; + if (start < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + mmap_assert_locked(walk.mm); + return __walk_page_range(start, end, &walk); +} + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { From d7c0e68dab98f0f5a2af501eaefeb90cc855fc80 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:40 +0200 Subject: [PATCH 244/313] mm/ksm: convert break_ksm() to use walk_page_range_vma() FOLL_MIGRATION exists only for the purpose of break_ksm(), and actually, there is not even the need to wait for the migration to finish, we only want to know if we're dealing with a KSM page. Using follow_page() just to identify a KSM page overcomplicates GUP code. Let's use walk_page_range_vma() instead, because we don't actually care about the page itself, we only need to know a single property -- no need to even grab a reference. So, get rid of follow_page() usage such that we can get rid of FOLL_MIGRATION now and eventually be able to get rid of follow_page() in the future. In my setup (AMD Ryzen 9 3900X), running the KSM selftest to test unmerge performance on 2 GiB (taskset 0x8 ./ksm_tests -D -s 2048), this results in a performance degradation of ~2% (old: ~5010 MiB/s, new: ~4900 MiB/s). I don't think we particularly care for now. Interestingly, the benchmark reduction is due to the single callback. Adding a second callback (e.g., pud_entry()) reduces the benchmark by another 100-200 MiB/s. Link: https://lkml.kernel.org/r/20221021101141.84170-9-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/ksm.c | 49 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 0805221e1d4c..dd02780c387f 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include "internal.h" @@ -419,6 +420,39 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte; + int ret; + + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + return 0; + + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (pte_present(*pte)) { + page = vm_normal_page(walk->vma, addr, *pte); + } else if (!pte_none(*pte)) { + swp_entry_t entry = pte_to_swp_entry(*pte); + + /* + * As KSM pages remain KSM pages until freed, no need to wait + * here for migration to end. + */ + if (is_migration_entry(entry)) + page = pfn_swap_entry_to_page(entry); + } + ret = page && PageKsm(page); + pte_unmap_unlock(pte, ptl); + return ret; +} + +static const struct mm_walk_ops break_ksm_ops = { + .pmd_entry = break_ksm_pmd_entry, +}; + /* * We use break_ksm to break COW on a ksm page by triggering unsharing, * such that the ksm page will get replaced by an exclusive anonymous page. @@ -434,21 +468,16 @@ static inline bool ksm_test_exit(struct mm_struct *mm) */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { - struct page *page; vm_fault_t ret = 0; do { - bool ksm_page = false; + int ksm_page; cond_resched(); - page = follow_page(vma, addr, - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page)) - break; - if (PageKsm(page)) - ksm_page = true; - put_page(page); - + ksm_page = walk_page_range_vma(vma, addr, addr + 1, + &break_ksm_ops, NULL); + if (WARN_ON_ONCE(ksm_page < 0)) + return ksm_page; if (!ksm_page) return 0; ret = handle_mm_fault(vma, addr, From f7355e99d9f71fcde093193fd4b569a648ba5ce3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 21 Oct 2022 12:11:41 +0200 Subject: [PATCH 245/313] mm/gup: remove FOLL_MIGRATION Fortunately, the last user (KSM) is gone, so let's just remove this rather special code from generic GUP handling -- especially because KSM never required the PMD handling as KSM only deals with individual base pages. [akpm@linux-foundation.org: fix merge snafu]Link: https://lkml.kernel.org/r/20221021101141.84170-10-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/gup.c | 55 +++++----------------------------------------- 2 files changed, 5 insertions(+), 51 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8df5cae69c80..767c8c522e70 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,7 +3057,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * and return without waiting upon it */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_ANON 0x8000 /* don't do file mappings */ diff --git a/mm/gup.c b/mm/gup.c index 2860cf4a85e1..82b275bbaad5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -537,30 +537,13 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); -retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto retry; - } + if (!pte_present(pte)) + goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto no_page; @@ -668,28 +651,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmdval = READ_ONCE(*pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); -retry: - if (!pmd_present(pmdval)) { - /* - * Should never reach here, if thp migration is not supported; - * Otherwise, it must be a thp migration entry. - */ - VM_BUG_ON(!thp_migration_supported() || - !is_pmd_migration_entry(pmdval)); - - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - - pmd_migration_entry_wait(mm, pmd); - pmdval = READ_ONCE(*pmd); - /* - * MADV_DONTNEED may convert the pmd to null because - * mmap_lock is held in read mode - */ - if (pmd_none(pmdval)) - return no_page_table(vma, flags); - goto retry; - } + if (!pmd_present(pmdval)) + return no_page_table(vma, flags); if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); @@ -703,18 +666,10 @@ retry: if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) return no_page_table(vma, flags); -retry_locked: ptl = pmd_lock(mm, pmd); - if (unlikely(pmd_none(*pmd))) { - spin_unlock(ptl); - return no_page_table(vma, flags); - } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - pmd_migration_entry_wait(mm, pmd); - goto retry_locked; + return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); From 4c9473e87e75a2a77ccd02e55c91ffe6a52b5df6 Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Wed, 26 Oct 2022 10:52:18 +0530 Subject: [PATCH 246/313] mm/khugepaged: add tracepoint to collapse_file() "mm_khugepaged_collapse_file" for capturing is_shmem. Currently, is_shmem is not being captured. Capturing is_shmem is useful as it can indicate if tmpfs is being used as a backing store instead of persistent storage. Add the tracepoint in collapse_file() named "mm_khugepaged_collapse_file" for capturing is_shmem. [gautammenghani201@gmail.com: swap is_shmem and addr to save space, per Steven Rostedt] Link: https://lkml.kernel.org/r/20221202201807.182829-1-gautammenghani201@gmail.com Link: https://lkml.kernel.org/r/20221026052218.148234-1-gautammenghani201@gmail.com Signed-off-by: Gautam Menghani Reviewed-by: Steven Rostedt (Google) [tracing] Cc: David Hildenbrand Cc: Masami Hiramatsu (Google) Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 38 ++++++++++++++++++++++++++++++ mm/khugepaged.c | 7 +++--- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 760455dfa860..3e6fb05852f9 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -203,5 +203,43 @@ TRACE_EVENT(mm_khugepaged_scan_file, __print_symbolic(__entry->result, SCAN_STATUS)) ); +TRACE_EVENT(mm_khugepaged_collapse_file, + TP_PROTO(struct mm_struct *mm, struct page *hpage, pgoff_t index, + bool is_shmem, unsigned long addr, struct file *file, + int nr, int result), + TP_ARGS(mm, hpage, index, addr, is_shmem, file, nr, result), + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned long, hpfn) + __field(pgoff_t, index) + __field(unsigned long, addr) + __field(bool, is_shmem) + __string(filename, file->f_path.dentry->d_iname) + __field(int, nr) + __field(int, result) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->hpfn = hpage ? page_to_pfn(hpage) : -1; + __entry->index = index; + __entry->addr = addr; + __entry->is_shmem = is_shmem; + __assign_str(filename, file->f_path.dentry->d_iname); + __entry->nr = nr; + __entry->result = result; + ), + + TP_printk("mm=%p, hpage_pfn=0x%lx, index=%ld, addr=%ld, is_shmem=%d, filename=%s, nr=%d, result=%s", + __entry->mm, + __entry->hpfn, + __entry->index, + __entry->addr, + __entry->is_shmem, + __get_str(filename), + __entry->nr, + __print_symbolic(__entry->result, SCAN_STATUS)) +); + #endif /* __HUGE_MEMORY_H */ #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 913b0f489352..78ec2771cc65 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1744,12 +1744,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, { struct address_space *mapping = file->f_mapping; struct page *hpage; - pgoff_t index, end = start + HPAGE_PMD_NR; + pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); - int nr; + int nr = 0; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -2102,7 +2102,8 @@ out: mem_cgroup_uncharge(page_folio(hpage)); put_page(hpage); } - /* TODO: tracepoints */ + + trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); return result; } From fd3b1bc3c86ee11ba77421b00c70280605b521c6 Mon Sep 17 00:00:00 2001 From: Pavankumar Kondeti Date: Wed, 9 Nov 2022 10:48:36 +0530 Subject: [PATCH 247/313] mm/madvise: fix madvise_pageout for private file mappings When MADV_PAGEOUT is called on a private file mapping VMA region, we bail out early if the process is neither owner nor write capable of the file. However, this VMA may have both private/shared clean pages and private dirty pages. The opportunity of paging out the private dirty pages (Anon pages) is missed. Fix this behavior by allowing private file mappings pageout further and perform the file access check along with PageAnon() during page walk. We observe ~10% improvement in zram usage, thus leaving more available memory on a 4GB RAM system running Android. [quic_pkondeti@quicinc.com: v2] Link: https://lkml.kernel.org/r/1669962597-27724-1-git-send-email-quic_pkondeti@quicinc.com Link: https://lkml.kernel.org/r/1667971116-12900-1-git-send-email-quic_pkondeti@quicinc.com Signed-off-by: Pavankumar Kondeti Cc: Charan Teja Kalla Cc: Minchan Kim Cc: Suren Baghdasaryan Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/madvise.c | 53 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 83b0c91a126b..2573ea3ed684 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -318,6 +318,21 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static inline bool can_do_file_pageout(struct vm_area_struct *vma) +{ + if (!vma->vm_file) + return false; + /* + * paging out pagecache only for non-anonymous mappings that correspond + * to the files the calling process could (if tried) open for writing; + * otherwise we'd be including shared non-exclusive mappings, which + * opens a side channel. + */ + return inode_owner_or_capable(&init_user_ns, + file_inode(vma->vm_file)) || + file_permission(vma->vm_file, MAY_WRITE) == 0; +} + static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -331,10 +346,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, spinlock_t *ptl; struct page *page = NULL; LIST_HEAD(page_list); + bool pageout_anon_only_filter; if (fatal_signal_pending(current)) return -EINTR; + pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && + !can_do_file_pageout(vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; @@ -361,6 +380,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (page_mapcount(page) != 1) goto huge_unlock; + if (pageout_anon_only_filter && !PageAnon(page)) + goto huge_unlock; + if (next - addr != HPAGE_PMD_SIZE) { int err; @@ -429,6 +451,8 @@ regular_page: if (PageTransCompound(page)) { if (page_mapcount(page) != 1) break; + if (pageout_anon_only_filter && !PageAnon(page)) + break; get_page(page); if (!trylock_page(page)) { put_page(page); @@ -456,6 +480,9 @@ regular_page: if (!PageLRU(page) || page_mapcount(page) != 1) continue; + if (pageout_anon_only_filter && !PageAnon(page)) + continue; + VM_BUG_ON_PAGE(PageTransCompound(page), page); if (pte_young(ptent)) { @@ -550,23 +577,6 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } -static inline bool can_do_pageout(struct vm_area_struct *vma) -{ - if (vma_is_anonymous(vma)) - return true; - if (!vma->vm_file) - return false; - /* - * paging out pagecache only for non-anonymous mappings that correspond - * to the files the calling process could (if tried) open for writing; - * otherwise we'd be including shared non-exclusive mappings, which - * opens a side channel. - */ - return inode_owner_or_capable(&init_user_ns, - file_inode(vma->vm_file)) || - file_permission(vma->vm_file, MAY_WRITE) == 0; -} - static long madvise_pageout(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) @@ -578,7 +588,14 @@ static long madvise_pageout(struct vm_area_struct *vma, if (!can_madv_lru_vma(vma)) return -EINVAL; - if (!can_do_pageout(vma)) + /* + * If the VMA belongs to a private file mapping, there can be private + * dirty pages which can be paged out if even this process is neither + * owner nor write capable of the file. We allow private file mappings + * further to pageout dirty anon pages. + */ + if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && + (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); From 6b3379e8dcbea09b7e27bf0eea2f53fd15a164ac Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 28 Nov 2022 11:16:10 -0800 Subject: [PATCH 248/313] zswap: fix writeback lock ordering for zsmalloc Patch series "Implement writeback for zsmalloc", v7. Unlike other zswap allocators such as zbud or z3fold, zsmalloc currently lacks the writeback mechanism. This means that when the zswap pool is full, it will simply reject further allocations, and the pages will be written directly to swap. This series of patches implements writeback for zsmalloc. When the zswap pool becomes full, zsmalloc will attempt to evict all the compressed objects in the least-recently used zspages. This patch (of 6): zswap's customary lock order is tree->lock before pool->lock, because the tree->lock protects the entries' refcount, and the free callbacks in the backends acquire their respective pool locks to dispatch the backing object. zsmalloc's map callback takes the pool lock, so zswap must not grab the tree->lock while a handle is mapped. This currently only happens during writeback, which isn't implemented for zsmalloc. In preparation for it, move the tree->lock section out of the mapped entry section Link: https://lkml.kernel.org/r/20221128191616.1261026-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20221128191616.1261026-2-nphamcs@gmail.com Signed-off-by: Johannes Weiner Signed-off-by: Nhat Pham Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Minchan Kim Cc: Nitin Gupta Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 3019f0bde194..f6c89049cf70 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -968,6 +968,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) swpentry = zhdr->swpentry; /* here */ tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); + zpool_unmap_handle(pool, handle); /* find and ref zswap entry */ spin_lock(&tree->lock); @@ -975,20 +976,12 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) if (!entry) { /* entry was invalidated */ spin_unlock(&tree->lock); - zpool_unmap_handle(pool, handle); kfree(tmp); return 0; } spin_unlock(&tree->lock); BUG_ON(offset != entry->offset); - src = (u8 *)zhdr + sizeof(struct zswap_header); - if (!zpool_can_sleep_mapped(pool)) { - memcpy(tmp, src, entry->length); - src = tmp; - zpool_unmap_handle(pool, handle); - } - /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ @@ -1006,6 +999,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); dlen = PAGE_SIZE; + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); + src = (u8 *)zhdr + sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + memcpy(tmp, src, entry->length); + src = tmp; + zpool_unmap_handle(pool, handle); + } + mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); @@ -1015,6 +1016,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) dlen = acomp_ctx->req->dlen; mutex_unlock(acomp_ctx->mutex); + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); + else + zpool_unmap_handle(pool, handle); + BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -1045,7 +1051,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - goto end; + return ret; + +fail: + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); /* * if we get here due to ZSWAP_SWAPCACHE_EXIST @@ -1054,17 +1064,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) * if we free the entry in the following put * it is also okay to return !0 */ -fail: spin_lock(&tree->lock); zswap_entry_put(tree, entry); spin_unlock(&tree->lock); -end: - if (zpool_can_sleep_mapped(pool)) - zpool_unmap_handle(pool, handle); - else - kfree(tmp); - return ret; } From 6a05aa30109d5cd4bebfb89415c58fa4599ef875 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 28 Nov 2022 11:16:11 -0800 Subject: [PATCH 249/313] zpool: clean out dead code There is a lot of provision for flexibility that isn't actually needed or used. Zswap (the only zpool user) always passes zpool_ops with an .evict method set. The backends who reclaim only do so for zswap, so they can also directly call zpool_ops without indirection or checks. Finally, there is no need to check the retries parameters and bail with -EINVAL in the reclaim function, when that's called just a few lines below with a hard-coded 8. There is no need to duplicate the evictable and sleep_mapped attrs from the driver in zpool_ops. Link: https://lkml.kernel.org/r/20221128191616.1261026-3-nphamcs@gmail.com Reviewed-by: Sergey Senozhatsky Signed-off-by: Johannes Weiner Signed-off-by: Nhat Pham Cc: Dan Streetman Cc: Minchan Kim Cc: Nitin Gupta Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 36 +++++------------------------------- mm/zbud.c | 32 +++++--------------------------- mm/zpool.c | 10 ++-------- 3 files changed, 12 insertions(+), 66 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index cf71da10d04e..a4de0c317ac7 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -68,9 +68,6 @@ * Structures *****************/ struct z3fold_pool; -struct z3fold_ops { - int (*evict)(struct z3fold_pool *pool, unsigned long handle); -}; enum buddy { HEADLESS = 0, @@ -138,8 +135,6 @@ struct z3fold_header { * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @c_handle: cache for z3fold_buddy_slots allocation - * @ops: pointer to a structure of user defined operations specified at - * pool creation time. * @zpool: zpool driver * @zpool_ops: zpool operations structure with an evict callback * @compact_wq: workqueue for page layout background optimization @@ -158,7 +153,6 @@ struct z3fold_pool { struct list_head stale; atomic64_t pages_nr; struct kmem_cache *c_handle; - const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; @@ -907,13 +901,11 @@ out_fail: * z3fold_create_pool() - create a new z3fold pool * @name: pool name * @gfp: gfp flags when allocating the z3fold pool structure - * @ops: user-defined operations for the z3fold pool * * Return: pointer to the new z3fold pool or NULL if the metadata allocation * failed. */ -static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, - const struct z3fold_ops *ops) +static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp) { struct z3fold_pool *pool = NULL; int i, cpu; @@ -949,7 +941,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, if (!pool->release_wq) goto out_wq; INIT_WORK(&pool->work, free_pages_work); - pool->ops = ops; return pool; out_wq: @@ -1230,10 +1221,6 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE); spin_lock(&pool->lock); - if (!pool->ops || !pool->ops->evict || retries == 0) { - spin_unlock(&pool->lock); - return -EINVAL; - } for (i = 0; i < retries; i++) { if (list_empty(&pool->lru)) { spin_unlock(&pool->lock); @@ -1319,17 +1306,17 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) } /* Issue the eviction callback(s) */ if (middle_handle) { - ret = pool->ops->evict(pool, middle_handle); + ret = pool->zpool_ops->evict(pool->zpool, middle_handle); if (ret) goto next; } if (first_handle) { - ret = pool->ops->evict(pool, first_handle); + ret = pool->zpool_ops->evict(pool->zpool, first_handle); if (ret) goto next; } if (last_handle) { - ret = pool->ops->evict(pool, last_handle); + ret = pool->zpool_ops->evict(pool->zpool, last_handle); if (ret) goto next; } @@ -1593,26 +1580,13 @@ static const struct movable_operations z3fold_mops = { * zpool ****************/ -static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle) -{ - if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) - return pool->zpool_ops->evict(pool->zpool, handle); - else - return -ENOENT; -} - -static const struct z3fold_ops z3fold_zpool_ops = { - .evict = z3fold_zpool_evict -}; - static void *z3fold_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { struct z3fold_pool *pool; - pool = z3fold_create_pool(name, gfp, - zpool_ops ? &z3fold_zpool_ops : NULL); + pool = z3fold_create_pool(name, gfp); if (pool) { pool->zpool = zpool; pool->zpool_ops = zpool_ops; diff --git a/mm/zbud.c b/mm/zbud.c index 6348932430b8..3acd26193920 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -74,10 +74,6 @@ struct zbud_pool; -struct zbud_ops { - int (*evict)(struct zbud_pool *pool, unsigned long handle); -}; - /** * struct zbud_pool - stores metadata for each zbud pool * @lock: protects all pool fields and first|last_chunk fields of any @@ -90,8 +86,6 @@ struct zbud_ops { * @lru: list tracking the zbud pages in LRU order by most recently * added buddy. * @pages_nr: number of zbud pages in the pool. - * @ops: pointer to a structure of user defined operations specified at - * pool creation time. * @zpool: zpool driver * @zpool_ops: zpool operations structure with an evict callback * @@ -110,7 +104,6 @@ struct zbud_pool { }; struct list_head lru; u64 pages_nr; - const struct zbud_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; }; @@ -212,12 +205,11 @@ static int num_free_chunks(struct zbud_header *zhdr) /** * zbud_create_pool() - create a new zbud pool * @gfp: gfp flags when allocating the zbud pool structure - * @ops: user-defined operations for the zbud pool * * Return: pointer to the new zbud pool or NULL if the metadata allocation * failed. */ -static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) +static struct zbud_pool *zbud_create_pool(gfp_t gfp) { struct zbud_pool *pool; int i; @@ -231,7 +223,6 @@ static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) INIT_LIST_HEAD(&pool->buddied); INIT_LIST_HEAD(&pool->lru); pool->pages_nr = 0; - pool->ops = ops; return pool; } @@ -419,8 +410,7 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) unsigned long first_handle = 0, last_handle = 0; spin_lock(&pool->lock); - if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || - retries == 0) { + if (list_empty(&pool->lru)) { spin_unlock(&pool->lock); return -EINVAL; } @@ -444,12 +434,12 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) /* Issue the eviction callback(s) */ if (first_handle) { - ret = pool->ops->evict(pool, first_handle); + ret = pool->zpool_ops->evict(pool->zpool, first_handle); if (ret) goto next; } if (last_handle) { - ret = pool->ops->evict(pool, last_handle); + ret = pool->zpool_ops->evict(pool->zpool, last_handle); if (ret) goto next; } @@ -524,25 +514,13 @@ static u64 zbud_get_pool_size(struct zbud_pool *pool) * zpool ****************/ -static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) -{ - if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) - return pool->zpool_ops->evict(pool->zpool, handle); - else - return -ENOENT; -} - -static const struct zbud_ops zbud_zpool_ops = { - .evict = zbud_zpool_evict -}; - static void *zbud_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { struct zbud_pool *pool; - pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); + pool = zbud_create_pool(gfp); if (pool) { pool->zpool = zpool; pool->zpool_ops = zpool_ops; diff --git a/mm/zpool.c b/mm/zpool.c index f46c0d5e766c..571f5c5031dd 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -21,9 +21,6 @@ struct zpool { struct zpool_driver *driver; void *pool; - const struct zpool_ops *ops; - bool evictable; - bool can_sleep_mapped; }; static LIST_HEAD(drivers_head); @@ -177,9 +174,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); - zpool->ops = ops; - zpool->evictable = driver->shrink && ops && ops->evict; - zpool->can_sleep_mapped = driver->sleep_mapped; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -380,7 +374,7 @@ u64 zpool_get_total_size(struct zpool *zpool) */ bool zpool_evictable(struct zpool *zpool) { - return zpool->evictable; + return zpool->driver->shrink; } /** @@ -398,7 +392,7 @@ bool zpool_evictable(struct zpool *zpool) */ bool zpool_can_sleep_mapped(struct zpool *zpool) { - return zpool->can_sleep_mapped; + return zpool->driver->sleep_mapped; } MODULE_LICENSE("GPL"); From c0547d0b6a4b637db05406b90ba82e1b2e71de56 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 28 Nov 2022 11:16:12 -0800 Subject: [PATCH 250/313] zsmalloc: consolidate zs_pool's migrate_lock and size_class's locks Currently, zsmalloc has a hierarchy of locks, which includes a pool-level migrate_lock, and a lock for each size class. We have to obtain both locks in the hotpath in most cases anyway, except for zs_malloc. This exception will no longer exist when we introduce a LRU into the zs_pool for the new writeback functionality - we will need to obtain a pool-level lock to synchronize LRU handling even in zs_malloc. In preparation for zsmalloc writeback, consolidate these locks into a single pool-level lock, which drastically reduces the complexity of synchronization in zsmalloc. We have also benchmarked the lock consolidation to see the performance effect of this change on zram. First, we ran a synthetic FS workload on a server machine with 36 cores (same machine for all runs), using fs_mark -d ../zram1mnt -s 100000 -n 2500 -t 32 -k before and after for btrfs and ext4 on zram (FS usage is 80%). Here is the result (unit is file/second): With lock consolidation (btrfs): Average: 13520.2, Median: 13531.0, Stddev: 137.5961482019028 Without lock consolidation (btrfs): Average: 13487.2, Median: 13575.0, Stddev: 309.08283679298665 With lock consolidation (ext4): Average: 16824.4, Median: 16839.0, Stddev: 89.97388510006668 Without lock consolidation (ext4) Average: 16958.0, Median: 16986.0, Stddev: 194.7370021336469 As you can see, we observe a 0.3% regression for btrfs, and a 0.9% regression for ext4. This is a small, barely measurable difference in my opinion. For a more realistic scenario, we also tries building the kernel on zram. Here is the time it takes (in seconds): With lock consolidation (btrfs): real Average: 319.6, Median: 320.0, Stddev: 0.8944271909999159 user Average: 6894.2, Median: 6895.0, Stddev: 25.528415540334656 sys Average: 521.4, Median: 522.0, Stddev: 1.51657508881031 Without lock consolidation (btrfs): real Average: 319.8, Median: 320.0, Stddev: 0.8366600265340756 user Average: 6896.6, Median: 6899.0, Stddev: 16.04057355583023 sys Average: 520.6, Median: 521.0, Stddev: 1.140175425099138 With lock consolidation (ext4): real Average: 320.0, Median: 319.0, Stddev: 1.4142135623730951 user Average: 6896.8, Median: 6878.0, Stddev: 28.621670111997307 sys Average: 521.2, Median: 521.0, Stddev: 1.7888543819998317 Without lock consolidation (ext4) real Average: 319.6, Median: 319.0, Stddev: 0.8944271909999159 user Average: 6886.2, Median: 6887.0, Stddev: 16.93221781102523 sys Average: 520.4, Median: 520.0, Stddev: 1.140175425099138 The difference is entirely within the noise of a typical run on zram. This hardly justifies the complexity of maintaining both the pool lock and the class lock. In fact, for writeback, we would need to introduce yet another lock to prevent data races on the pool's LRU, further complicating the lock handling logic. IMHO, it is just better to collapse all of these into a single pool-level lock. Link: https://lkml.kernel.org/r/20221128191616.1261026-4-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Minchan Kim Acked-by: Johannes Weiner Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Nitin Gupta Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 87 ++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 78feda34ad9a..5427a00a0518 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -33,8 +33,7 @@ /* * lock ordering: * page_lock - * pool->migrate_lock - * class->lock + * pool->lock * zspage->lock */ @@ -192,7 +191,6 @@ static const int fullness_threshold_frac = 4; static size_t huge_class_size; struct size_class { - spinlock_t lock; struct list_head fullness_list[NR_ZS_FULLNESS]; /* * Size of objects stored in this class. Must be multiple @@ -247,8 +245,7 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct work_struct free_work; #endif - /* protect page/zspage migration */ - rwlock_t migrate_lock; + spinlock_t lock; }; struct zspage { @@ -355,7 +352,7 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } -/* class->lock(which owns the handle) synchronizes races */ +/* pool->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { *(unsigned long *)handle = obj; @@ -452,7 +449,7 @@ static __maybe_unused int is_first_page(struct page *page) return PagePrivate(page); } -/* Protected by class->lock */ +/* Protected by pool->lock */ static inline int get_zspage_inuse(struct zspage *zspage) { return zspage->inuse; @@ -597,13 +594,13 @@ static int zs_stats_size_show(struct seq_file *s, void *v) if (class->index != i) continue; - spin_lock(&class->lock); + spin_lock(&pool->lock); class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); obj_used = zs_stat_get(class, OBJ_USED); freeable = zs_can_compact(class); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); objs_per_zspage = class->objs_per_zspage; pages_used = obj_allocated / objs_per_zspage * @@ -916,7 +913,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, get_zspage_mapping(zspage, &class_idx, &fg); - assert_spin_locked(&class->lock); + assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(fg != ZS_EMPTY); @@ -1268,19 +1265,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, BUG_ON(in_interrupt()); /* It guarantees it can get zspage from handle safely */ - read_lock(&pool->migrate_lock); + spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); /* - * migration cannot move any zpages in this zspage. Here, class->lock + * migration cannot move any zpages in this zspage. Here, pool->lock * is too heavy since callers would take some time until they calls * zs_unmap_object API so delegate the locking from class to zspage * which is smaller granularity. */ migrate_read_lock(zspage); - read_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; @@ -1433,8 +1430,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; - /* class->lock effectively protects the zpage migration */ - spin_lock(&class->lock); + /* pool->lock effectively protects the zpage migration */ + spin_lock(&pool->lock); zspage = find_get_zspage(class); if (likely(zspage)) { obj = obj_malloc(pool, zspage, handle); @@ -1442,12 +1439,12 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) fix_fullness_group(class, zspage); record_obj(handle, obj); class_stat_inc(class, OBJ_USED, 1); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); return handle; } - spin_unlock(&class->lock); + spin_unlock(&pool->lock); zspage = alloc_zspage(pool, class, gfp); if (!zspage) { @@ -1455,7 +1452,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) return (unsigned long)ERR_PTR(-ENOMEM); } - spin_lock(&class->lock); + spin_lock(&pool->lock); obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); @@ -1468,7 +1465,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); return handle; } @@ -1512,16 +1509,14 @@ void zs_free(struct zs_pool *pool, unsigned long handle) return; /* - * The pool->migrate_lock protects the race with zpage's migration + * The pool->lock protects the race with zpage's migration * so it's safe to get the page from handle. */ - read_lock(&pool->migrate_lock); + spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_page(obj, &f_page); zspage = get_zspage(f_page); class = zspage_class(pool, zspage); - spin_lock(&class->lock); - read_unlock(&pool->migrate_lock); obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); @@ -1531,7 +1526,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) free_zspage(pool, class, zspage); out: - spin_unlock(&class->lock); + spin_unlock(&pool->lock); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1888,16 +1883,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page, pool = zspage->pool; /* - * The pool migrate_lock protects the race between zpage migration + * The pool's lock protects the race between zpage migration * and zs_free. */ - write_lock(&pool->migrate_lock); + spin_lock(&pool->lock); class = zspage_class(pool, zspage); - /* - * the class lock protects zpage alloc/free in the zspage. - */ - spin_lock(&class->lock); /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); @@ -1927,10 +1918,9 @@ static int zs_page_migrate(struct page *newpage, struct page *page, replace_sub_page(class, zspage, newpage, page); /* * Since we complete the data copy and set up new zspage structure, - * it's okay to release migration_lock. + * it's okay to release the pool's lock. */ - write_unlock(&pool->migrate_lock); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); dec_zspage_isolation(zspage); migrate_write_unlock(zspage); @@ -1985,9 +1975,9 @@ static void async_free_zspage(struct work_struct *work) if (class->index != i) continue; - spin_lock(&class->lock); + spin_lock(&pool->lock); list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); } list_for_each_entry_safe(zspage, tmp, &free_pages, list) { @@ -1997,9 +1987,9 @@ static void async_free_zspage(struct work_struct *work) get_zspage_mapping(zspage, &class_idx, &fullness); VM_BUG_ON(fullness != ZS_EMPTY); class = pool->size_class[class_idx]; - spin_lock(&class->lock); + spin_lock(&pool->lock); __free_zspage(pool, class, zspage); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); } }; @@ -2060,10 +2050,11 @@ static unsigned long __zs_compact(struct zs_pool *pool, struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; - /* protect the race between zpage migration and zs_free */ - write_lock(&pool->migrate_lock); - /* protect zpage allocation/free */ - spin_lock(&class->lock); + /* + * protect the race between zpage migration and zs_free + * as well as zpage allocation/free + */ + spin_lock(&pool->lock); while ((src_zspage = isolate_zspage(class, true))) { /* protect someone accessing the zspage(i.e., zs_map_object) */ migrate_write_lock(src_zspage); @@ -2088,7 +2079,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, putback_zspage(class, dst_zspage); migrate_write_unlock(dst_zspage); dst_zspage = NULL; - if (rwlock_is_contended(&pool->migrate_lock)) + if (spin_is_contended(&pool->lock)) break; } @@ -2105,11 +2096,9 @@ static unsigned long __zs_compact(struct zs_pool *pool, pages_freed += class->pages_per_zspage; } else migrate_write_unlock(src_zspage); - spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); cond_resched(); - write_lock(&pool->migrate_lock); - spin_lock(&class->lock); + spin_lock(&pool->lock); } if (src_zspage) { @@ -2117,8 +2106,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, migrate_write_unlock(src_zspage); } - spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); return pages_freed; } @@ -2221,7 +2209,7 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); - rwlock_init(&pool->migrate_lock); + spin_lock_init(&pool->lock); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) @@ -2292,7 +2280,6 @@ struct zs_pool *zs_create_pool(const char *name) class->index = i; class->pages_per_zspage = pages_per_zspage; class->objs_per_zspage = objs_per_zspage; - spin_lock_init(&class->lock); pool->size_class[i] = class; for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; fullness++) From 64f768c6b32e1957e2b65b70e97cb4cb62344bc4 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 28 Nov 2022 11:16:13 -0800 Subject: [PATCH 251/313] zsmalloc: add a LRU to zs_pool to keep track of zspages in LRU order This helps determines the coldest zspages as candidates for writeback. Link: https://lkml.kernel.org/r/20221128191616.1261026-5-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Minchan Kim Cc: Nitin Gupta Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5427a00a0518..b1bc231d94a3 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -239,6 +239,11 @@ struct zs_pool { /* Compact classes */ struct shrinker shrinker; +#ifdef CONFIG_ZPOOL + /* List tracking the zspages in LRU order by most recently added object */ + struct list_head lru; +#endif + #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif @@ -260,6 +265,12 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ + +#ifdef CONFIG_ZPOOL + /* links the zspage to the lru list in the pool */ + struct list_head lru; +#endif + struct zs_pool *pool; #ifdef CONFIG_COMPACTION rwlock_t lock; @@ -953,6 +964,9 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, } remove_zspage(class, zspage, ZS_EMPTY); +#ifdef CONFIG_ZPOOL + list_del(&zspage->lru); +#endif __free_zspage(pool, class, zspage); } @@ -998,6 +1012,10 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) off %= PAGE_SIZE; } +#ifdef CONFIG_ZPOOL + INIT_LIST_HEAD(&zspage->lru); +#endif + set_freeobj(zspage, 0); } @@ -1270,6 +1288,31 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); +#ifdef CONFIG_ZPOOL + /* + * Move the zspage to front of pool's LRU. + * + * Note that this is swap-specific, so by definition there are no ongoing + * accesses to the memory while the page is swapped out that would make + * it "hot". A new entry is hot, then ages to the tail until it gets either + * written back or swaps back in. + * + * Furthermore, map is also called during writeback. We must not put an + * isolated page on the LRU mid-reclaim. + * + * As a result, only update the LRU when the page is mapped for write + * when it's first instantiated. + * + * This is a deviation from the other backends, which perform this update + * in the allocation function (zbud_alloc, z3fold_alloc). + */ + if (mm == ZS_MM_WO) { + if (!list_empty(&zspage->lru)) + list_del(&zspage->lru); + list_add(&zspage->lru, &pool->lru); + } +#endif + /* * migration cannot move any zpages in this zspage. Here, pool->lock * is too heavy since callers would take some time until they calls @@ -1988,6 +2031,9 @@ static void async_free_zspage(struct work_struct *work) VM_BUG_ON(fullness != ZS_EMPTY); class = pool->size_class[class_idx]; spin_lock(&pool->lock); +#ifdef CONFIG_ZPOOL + list_del(&zspage->lru); +#endif __free_zspage(pool, class, zspage); spin_unlock(&pool->lock); } @@ -2299,6 +2345,10 @@ struct zs_pool *zs_create_pool(const char *name) */ zs_register_shrinker(pool); +#ifdef CONFIG_ZPOOL + INIT_LIST_HEAD(&pool->lru); +#endif + return pool; err: From bd0fded29689a762e6a749a1258a59cc2b99a18a Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 28 Nov 2022 11:16:14 -0800 Subject: [PATCH 252/313] zsmalloc: add zpool_ops field to zs_pool to store evict handlers This adds a new field to zs_pool to store evict handlers for writeback, analogous to the zbud allocator. Link: https://lkml.kernel.org/r/20221128191616.1261026-6-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Minchan Kim Acked-by: Johannes Weiner Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Nitin Gupta Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b1bc231d94a3..d06f9150b9da 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -242,6 +242,8 @@ struct zs_pool { #ifdef CONFIG_ZPOOL /* List tracking the zspages in LRU order by most recently added object */ struct list_head lru; + struct zpool *zpool; + const struct zpool_ops *zpool_ops; #endif #ifdef CONFIG_ZSMALLOC_STAT @@ -382,7 +384,14 @@ static void *zs_zpool_create(const char *name, gfp_t gfp, * different contexts and its caller must provide a valid * gfp mask. */ - return zs_create_pool(name); + struct zs_pool *pool = zs_create_pool(name); + + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + + return pool; } static void zs_zpool_destroy(void *pool) From 9997bc017549acd6425e32300eff28424ffeeb6b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Mon, 28 Nov 2022 11:16:15 -0800 Subject: [PATCH 253/313] zsmalloc: implement writeback mechanism for zsmalloc This commit adds the writeback mechanism for zsmalloc, analogous to the zbud allocator. Zsmalloc will attempt to determine the coldest zspage (i.e least recently used) in the pool, and attempt to write back all the stored compressed objects via the pool's evict handler. Link: https://lkml.kernel.org/r/20221128191616.1261026-7-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Minchan Kim Cc: Nitin Gupta Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 196 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 184 insertions(+), 12 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d06f9150b9da..9445bee6b014 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -271,12 +271,13 @@ struct zspage { #ifdef CONFIG_ZPOOL /* links the zspage to the lru list in the pool */ struct list_head lru; + bool under_reclaim; + /* list of unfreed handles whose objects have been reclaimed */ + unsigned long *deferred_handles; #endif struct zs_pool *pool; -#ifdef CONFIG_COMPACTION rwlock_t lock; -#endif }; struct mapping_area { @@ -297,10 +298,11 @@ static bool ZsHugePage(struct zspage *zspage) return zspage->huge; } -#ifdef CONFIG_COMPACTION static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); + +#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage); static void migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); @@ -308,9 +310,6 @@ static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static void migrate_lock_init(struct zspage *zspage) {} -static void migrate_read_lock(struct zspage *zspage) {} -static void migrate_read_unlock(struct zspage *zspage) {} static void migrate_write_lock(struct zspage *zspage) {} static void migrate_write_lock_nested(struct zspage *zspage) {} static void migrate_write_unlock(struct zspage *zspage) {} @@ -413,6 +412,27 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries); + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zs_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -451,6 +471,7 @@ static struct zpool_driver zs_zpool_driver = { .malloc_support_movable = true, .malloc = zs_zpool_malloc, .free = zs_zpool_free, + .shrink = zs_zpool_shrink, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, @@ -924,6 +945,25 @@ unlock: return 0; } +#ifdef CONFIG_ZPOOL +/* + * Free all the deferred handles whose objects are freed in zs_free. + */ +static void free_handles(struct zs_pool *pool, struct zspage *zspage) +{ + unsigned long handle = (unsigned long)zspage->deferred_handles; + + while (handle) { + unsigned long nxt_handle = handle_to_obj(handle); + + cache_free_handle(pool, handle); + handle = nxt_handle; + } +} +#else +static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {} +#endif + static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { @@ -938,6 +978,9 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(fg != ZS_EMPTY); + /* Free all deferred handles from zs_free */ + free_handles(pool, zspage); + next = page = get_first_page(zspage); do { VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1023,6 +1066,8 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) #ifdef CONFIG_ZPOOL INIT_LIST_HEAD(&zspage->lru); + zspage->under_reclaim = false; + zspage->deferred_handles = NULL; #endif set_freeobj(zspage, 0); @@ -1572,12 +1617,26 @@ void zs_free(struct zs_pool *pool, unsigned long handle) obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); - fullness = fix_fullness_group(class, zspage); - if (fullness != ZS_EMPTY) - goto out; - free_zspage(pool, class, zspage); -out: +#ifdef CONFIG_ZPOOL + if (zspage->under_reclaim) { + /* + * Reclaim needs the handles during writeback. It'll free + * them along with the zspage when it's done with them. + * + * Record current deferred handle at the memory location + * whose address is given by handle. + */ + record_obj(handle, (unsigned long)zspage->deferred_handles); + zspage->deferred_handles = (unsigned long *)handle; + spin_unlock(&pool->lock); + return; + } +#endif + fullness = fix_fullness_group(class, zspage); + if (fullness == ZS_EMPTY) + free_zspage(pool, class, zspage); + spin_unlock(&pool->lock); cache_free_handle(pool, handle); } @@ -1777,7 +1836,7 @@ static enum fullness_group putback_zspage(struct size_class *class, return fullness; } -#ifdef CONFIG_COMPACTION +#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) /* * To prevent zspage destroy during migration, zspage freeing should * hold locks of all pages in the zspage. @@ -1819,6 +1878,24 @@ static void lock_zspage(struct zspage *zspage) } migrate_read_unlock(zspage); } +#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */ + +#ifdef CONFIG_ZPOOL +/* + * Unlocks all the pages of the zspage. + * + * pool->lock must be held before this function is called + * to prevent the underlying pages from migrating. + */ +static void unlock_zspage(struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); + + do { + unlock_page(page); + } while ((page = get_next_page(page)) != NULL); +} +#endif /* CONFIG_ZPOOL */ static void migrate_lock_init(struct zspage *zspage) { @@ -1835,6 +1912,7 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) read_unlock(&zspage->lock); } +#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); @@ -2399,6 +2477,100 @@ void zs_destroy_pool(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_destroy_pool); +#ifdef CONFIG_ZPOOL +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) +{ + int i, obj_idx, ret = 0; + unsigned long handle; + struct zspage *zspage; + struct page *page; + enum fullness_group fullness; + + /* Lock LRU and fullness list */ + spin_lock(&pool->lock); + if (list_empty(&pool->lru)) { + spin_unlock(&pool->lock); + return -EINVAL; + } + + for (i = 0; i < retries; i++) { + struct size_class *class; + + zspage = list_last_entry(&pool->lru, struct zspage, lru); + list_del(&zspage->lru); + + /* zs_free may free objects, but not the zspage and handles */ + zspage->under_reclaim = true; + + class = zspage_class(pool, zspage); + fullness = get_fullness_group(class, zspage); + + /* Lock out object allocations and object compaction */ + remove_zspage(class, zspage, fullness); + + spin_unlock(&pool->lock); + cond_resched(); + + /* Lock backing pages into place */ + lock_zspage(zspage); + + obj_idx = 0; + page = get_first_page(zspage); + while (1) { + handle = find_alloced_obj(class, page, &obj_idx); + if (!handle) { + page = get_next_page(page); + if (!page) + break; + obj_idx = 0; + continue; + } + + /* + * This will write the object and call zs_free. + * + * zs_free will free the object, but the + * under_reclaim flag prevents it from freeing + * the zspage altogether. This is necessary so + * that we can continue working with the + * zspage potentially after the last object + * has been freed. + */ + ret = pool->zpool_ops->evict(pool->zpool, handle); + if (ret) + goto next; + + obj_idx++; + } + +next: + /* For freeing the zspage, or putting it back in the pool and LRU list. */ + spin_lock(&pool->lock); + zspage->under_reclaim = false; + + if (!get_zspage_inuse(zspage)) { + /* + * Fullness went stale as zs_free() won't touch it + * while the page is removed from the pool. Fix it + * up for the check in __free_zspage(). + */ + zspage->fullness = ZS_EMPTY; + + __free_zspage(pool, class, zspage); + spin_unlock(&pool->lock); + return 0; + } + + putback_zspage(class, zspage); + list_add(&zspage->lru, &pool->lru); + unlock_zspage(zspage); + } + + spin_unlock(&pool->lock); + return -EAGAIN; +} +#endif /* CONFIG_ZPOOL */ + static int __init zs_init(void) { int ret; From feeb9b26952367bc1171592ee476f95aa81ee588 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 28 Nov 2022 10:56:31 -0500 Subject: [PATCH 254/313] filemap: skip write and wait if end offset precedes start Patch series "filemap: skip write and wait if end offset precedes start", v2. A fix for the odd write and wait behavior described in the patch 1 commit log. Technically patch 1 could simply remove the check rather than lift it into the callers, but this seemed a bit more user friendly to me. Patch 2 is appended after observation that fadvise() interacted poorly with the v1 patch. This is no longer a problem with v2, making patch 2 purely a cleanup. This series survived both fstests and ltp regression runs without observable problems. I had (end < start) warning checks in each relevant function, with fadvise() being the only caller that triggered them. That said, I dropped the warnings after testing because there seemed to much potential for noise from the various other callers. This patch (of 2): A call to file[map]_write_and_wait_range() with an end offset that precedes the start offset but happens to land in the same page can trigger writeback submission but fails to wait on the submitted page. Writeback submission occurs because __filemap_fdatawrite_range() passes both offsets down into write_cache_pages(), which rounds down to page indexes before it starts processing writeback. However, __filemap_fdatawait_range() immediately returns if the byte-granular end offset precedes the start offset. This behavior was observed in the form of unpredictable latency from a frequent write and wait call with incorrect parameters. The behavior gave the impression that the fdatawait path might occasionally fail to wait on writeback, but further investigation showed the latency was from write_cache_pages() waiting on writeback state to clear for a page already under writeback. Therefore, this indicated that fdatawait actually never waits on writeback in this particular situation. The byte granular check in __filemap_fdatawait_range() goes all the way back to the old wait_on_page_writeback() helper. It originally used page offsets and so would have waited in this problematic case. That changed to byte granularity file offsets in commit 94004ed726f3 ("kill wait_on_page_writeback_range"), which subtly changed this behavior. The check itself has become somewhat redundant since the error checking code that used to follow the wait loop (at the time of the aforementioned commit) has now been removed and lifted into the higher level callers. Therefore, we can restore historical fdatawait behavior by simply removing the check. Since the current fdatawait behavior has been in place for quite some time and is consistent with other interfaces that use file offsets, instead lift the check into the file[map]_write_and_wait_range() helpers to provide consistent behavior between the write and wait. Link: https://lkml.kernel.org/r/20221128155632.3950447-1-bfoster@redhat.com Link: https://lkml.kernel.org/r/20221128155632.3950447-2-bfoster@redhat.com Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/filemap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 65eee6ec1066..242cd8bd8330 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -506,9 +506,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping, struct pagevec pvec; int nr_pages; - if (end_byte < start_byte) - return; - pagevec_init(&pvec); while (index <= end) { unsigned i; @@ -670,6 +667,9 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0, err2; + if (lend < lstart) + return 0; + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); @@ -770,6 +770,9 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) int err = 0, err2; struct address_space *mapping = file->f_mapping; + if (lend < lstart) + return 0; + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); From 3cd629e5775397103e0428f62ce64747741dbfe5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 28 Nov 2022 10:56:32 -0500 Subject: [PATCH 255/313] mm/fadvise: use LLONG_MAX instead of -1 for eof generic_fadvise() sets endbyte = -1 to specify end of file (i.e. if length == 0 is passed from userspace). Most other callers to filemap_fdatawrite_range() use LLONG_MAX for this purpose, particularly if they also call fdatawait_range() (which requires end >= start). For example, sync_file_range(), vfs_fsync() (where the range is passed down through per-fs ->fsync() callbacks), filemap_flush(), etc. generic_fadvise() does not currently wait on writeback, but fix the call up to be consistent with other callers. Link: https://lkml.kernel.org/r/20221128155632.3950447-3-bfoster@redhat.com Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/fadvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/fadvise.c b/mm/fadvise.c index c76ee665355a..bf04fec87f35 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -72,7 +72,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) */ endbyte = (u64)offset + (u64)len; if (!len || endbyte < len) - endbyte = -1; + endbyte = LLONG_MAX; else endbyte--; /* inclusive */ From d3a89233583bf8edab18ac09732759c71dbe0173 Mon Sep 17 00:00:00 2001 From: zhang songyi Date: Mon, 28 Nov 2022 21:07:43 +0800 Subject: [PATCH 256/313] include/linux/pgtable.h: : remove redundant pte variable Return value from ptep_get_and_clear_full() directly instead of taking this in another redundant variable. Link: https://lkml.kernel.org/r/202211282107437343474@zte.com.cn Signed-off-by: zhang songyi Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c74cce67eec8..dfabd549d2e7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -425,9 +425,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { - pte_t pte; - pte = ptep_get_and_clear(mm, address, ptep); - return pte; + return ptep_get_and_clear(mm, address, ptep); } #endif From 1e8e4a7cc2fa3017b1daf02612e095d51924ce1e Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 28 Nov 2022 10:45:40 +0100 Subject: [PATCH 257/313] lockdep: allow instrumenting lockdep.c with KMSAN Lockdep and KMSAN used to play badly together, causing deadlocks when KMSAN instrumentation of lockdep.c called lockdep functions recursively. Looks like this is no more the case, and a kernel can run (yet slower) with both KMSAN and lockdep enabled. This patch should fix false positives on wq_head->lock->dep_map, which KMSAN used to consider uninitialized because of lockdep.c not being instrumented. Link: https://lore.kernel.org/lkml/Y3b9AAEKp2Vr3e6O@sol.localdomain/ Link: https://lkml.kernel.org/r/20221128094541.2645890-1-glider@google.com Signed-off-by: Alexander Potapenko Reported-by: Eric Biggers Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Marco Elver Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/locking/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index ea925731fa40..0db4093d17b8 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -7,7 +7,6 @@ obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o # Avoid recursion lockdep -> sanitizer -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n -KMSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) From 85716a80c16dd6b6d1aaed87cd4b91c9b1d9b9b2 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 28 Nov 2022 10:45:41 +0100 Subject: [PATCH 258/313] kmsan: allow using __msan_instrument_asm_store() inside runtime In certain cases (e.g. when handling a softirq) __msan_instrument_asm_store(&var, sizeof(var)) may be called with from within KMSAN runtime, but later the value of @var is used with !kmsan_in_runtime(), leading to false positives. Because kmsan_internal_unpoison_memory() doesn't take locks, it should be fine to call it without kmsan_in_runtime() checks, which fixes the mentioned false positives. Link: https://lkml.kernel.org/r/20221128094541.2645890-2-glider@google.com Signed-off-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Ingo Molnar Cc: Marco Elver Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/kmsan/instrumentation.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 271f135f97a1..770fe02904f3 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -81,12 +81,16 @@ DECLARE_METADATA_PTR_GETTER(8); * Handle a memory store performed by inline assembly. KMSAN conservatively * attempts to unpoison the outputs of asm() directives to prevent false * positives caused by missed stores. + * + * __msan_instrument_asm_store() may be called for inline assembly code when + * entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure + * the memory written to in these cases is also marked as initialized. */ void __msan_instrument_asm_store(void *addr, uintptr_t size) { unsigned long ua_flags; - if (!kmsan_enabled || kmsan_in_runtime()) + if (!kmsan_enabled) return; ua_flags = user_access_save(); @@ -103,10 +107,8 @@ void __msan_instrument_asm_store(void *addr, uintptr_t size) user_access_restore(ua_flags); return; } - kmsan_enter_runtime(); /* Unpoisoning the memory on best effort. */ kmsan_internal_unpoison_memory(addr, size, /*checked*/ false); - kmsan_leave_runtime(); user_access_restore(ua_flags); } EXPORT_SYMBOL(__msan_instrument_asm_store); From 22c4e80466eb88cff283ed50a5d0b0ff1654d0c3 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Thu, 27 Oct 2022 20:52:50 +0800 Subject: [PATCH 259/313] MIPS&LoongArch&NIOS2: adjust prototypes of p?d_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm/sparse-vmemmap: Generalise helpers and enable for LoongArch", v14. This series is in order to enable sparse-vmemmap for LoongArch. But LoongArch cannot use generic helpers directly because MIPS&LoongArch need to call pgd_init()/pud_init()/pmd_init() when populating page tables. So we adjust the prototypes of p?d_init() to make generic helpers can call them, then enable sparse-vmemmap with generic helpers, and to be further, generalise vmemmap_populate_hugepages() for ARM64, X86 and LoongArch. This patch (of 4): We are preparing to add sparse vmemmap support to LoongArch. MIPS and LoongArch need to call pgd_init()/pud_init()/pmd_init() when populating page tables, so adjust their prototypes to make generic helpers can call them. NIOS2 declares pmd_init() but doesn't use, just remove it to avoid build errors. Link: https://lkml.kernel.org/r/20221027125253.3458989-1-chenhuacai@loongson.cn Link: https://lkml.kernel.org/r/20221027125253.3458989-2-chenhuacai@loongson.cn Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen Reviewed-by: Jiaxun Yang Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: Dinh Nguyen Cc: Guo Ren Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Will Deacon Cc: Xuefeng Li Cc: Xuerui Wang Cc: Min Zhou Cc: Muchun Song Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgalloc.h | 13 ++----------- arch/loongarch/include/asm/pgtable.h | 8 ++++---- arch/loongarch/kernel/numa.c | 4 ++-- arch/loongarch/mm/pgtable.c | 23 +++++++++++++---------- arch/mips/include/asm/pgalloc.h | 10 +++++----- arch/mips/include/asm/pgtable-64.h | 8 ++++---- arch/mips/kvm/mmu.c | 3 +-- arch/mips/mm/pgtable-32.c | 9 ++++----- arch/mips/mm/pgtable-64.c | 18 ++++++++++-------- arch/mips/mm/pgtable.c | 2 +- arch/nios2/include/asm/pgalloc.h | 5 ----- 11 files changed, 46 insertions(+), 57 deletions(-) diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index 4bfeb3c9c9ac..af1d1e4a6965 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -42,15 +42,6 @@ static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) extern void pagetable_init(void); -/* - * Initialize a new pmd table with invalid pointers. - */ -extern void pmd_init(unsigned long page, unsigned long pagetable); - -/* - * Initialize a new pgd / pmd table with invalid pointers. - */ -extern void pgd_init(unsigned long page); extern pgd_t *pgd_alloc(struct mm_struct *mm); #define __pte_free_tlb(tlb, pte, address) \ @@ -76,7 +67,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) } pmd = (pmd_t *)page_address(pg); - pmd_init((unsigned long)pmd, (unsigned long)invalid_pte_table); + pmd_init(pmd); return pmd; } @@ -92,7 +83,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) pud = (pud_t *) __get_free_page(GFP_KERNEL); if (pud) - pud_init((unsigned long)pud, (unsigned long)invalid_pmd_table); + pud_init(pud); return pud; } diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index b8d837ee6910..9e6651846db9 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -237,11 +237,11 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pm #define pfn_pmd(pfn, prot) __pmd(((pfn) << _PFN_SHIFT) | pgprot_val(prot)) /* - * Initialize a new pgd / pmd table with invalid pointers. + * Initialize a new pgd / pud / pmd table with invalid pointers. */ -extern void pgd_init(unsigned long page); -extern void pud_init(unsigned long page, unsigned long pagetable); -extern void pmd_init(unsigned long page, unsigned long pagetable); +extern void pgd_init(void *addr); +extern void pud_init(void *addr); +extern void pmd_init(void *addr); /* * Non-present pages: high 40 bits are offset, next 8 bits type, diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index a13f92593cfd..eb5d3a4c8a7a 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -78,7 +78,7 @@ void __init pcpu_populate_pte(unsigned long addr) new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); pgd_populate(&init_mm, pgd, new); #ifndef __PAGETABLE_PUD_FOLDED - pud_init((unsigned long)new, (unsigned long)invalid_pmd_table); + pud_init(new); #endif } @@ -89,7 +89,7 @@ void __init pcpu_populate_pte(unsigned long addr) new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); pud_populate(&init_mm, pud, new); #ifndef __PAGETABLE_PMD_FOLDED - pmd_init((unsigned long)new, (unsigned long)invalid_pte_table); + pmd_init(new); #endif } diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index ee179ccd3e3f..36a6dc0148ae 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -16,7 +16,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) ret = (pgd_t *) __get_free_page(GFP_KERNEL); if (ret) { init = pgd_offset(&init_mm, 0UL); - pgd_init((unsigned long)ret); + pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); } @@ -25,7 +25,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(pgd_alloc); -void pgd_init(unsigned long page) +void pgd_init(void *addr) { unsigned long *p, *end; unsigned long entry; @@ -38,7 +38,7 @@ void pgd_init(unsigned long page) entry = (unsigned long)invalid_pte_table; #endif - p = (unsigned long *) page; + p = (unsigned long *)addr; end = p + PTRS_PER_PGD; do { @@ -56,11 +56,12 @@ void pgd_init(unsigned long page) EXPORT_SYMBOL_GPL(pgd_init); #ifndef __PAGETABLE_PMD_FOLDED -void pmd_init(unsigned long addr, unsigned long pagetable) +void pmd_init(void *addr) { unsigned long *p, *end; + unsigned long pagetable = (unsigned long)invalid_pte_table; - p = (unsigned long *) addr; + p = (unsigned long *)addr; end = p + PTRS_PER_PMD; do { @@ -79,9 +80,10 @@ EXPORT_SYMBOL_GPL(pmd_init); #endif #ifndef __PAGETABLE_PUD_FOLDED -void pud_init(unsigned long addr, unsigned long pagetable) +void pud_init(void *addr) { unsigned long *p, *end; + unsigned long pagetable = (unsigned long)invalid_pmd_table; p = (unsigned long *)addr; end = p + PTRS_PER_PUD; @@ -98,6 +100,7 @@ void pud_init(unsigned long addr, unsigned long pagetable) p[-1] = pagetable; } while (p != end); } +EXPORT_SYMBOL_GPL(pud_init); #endif pmd_t mk_pmd(struct page *page, pgprot_t prot) @@ -119,12 +122,12 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, void __init pagetable_init(void) { /* Initialize the entire pgd. */ - pgd_init((unsigned long)swapper_pg_dir); - pgd_init((unsigned long)invalid_pg_dir); + pgd_init(swapper_pg_dir); + pgd_init(invalid_pg_dir); #ifndef __PAGETABLE_PUD_FOLDED - pud_init((unsigned long)invalid_pud_table, (unsigned long)invalid_pmd_table); + pud_init(invalid_pud_table); #endif #ifndef __PAGETABLE_PMD_FOLDED - pmd_init((unsigned long)invalid_pmd_table, (unsigned long)invalid_pte_table); + pmd_init(invalid_pmd_table); #endif } diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 796035784c73..f72e737dda21 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -33,7 +33,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, /* * Initialize a new pmd table with invalid pointers. */ -extern void pmd_init(unsigned long page, unsigned long pagetable); +extern void pmd_init(void *addr); #ifndef __PAGETABLE_PMD_FOLDED @@ -44,9 +44,9 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) #endif /* - * Initialize a new pgd / pmd table with invalid pointers. + * Initialize a new pgd table with invalid pointers. */ -extern void pgd_init(unsigned long page); +extern void pgd_init(void *addr); extern pgd_t *pgd_alloc(struct mm_struct *mm); static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -77,7 +77,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) } pmd = (pmd_t *)page_address(pg); - pmd_init((unsigned long)pmd, (unsigned long)invalid_pte_table); + pmd_init(pmd); return pmd; } @@ -93,7 +93,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER); if (pud) - pud_init((unsigned long)pud, (unsigned long)invalid_pmd_table); + pud_init(pud); return pud; } diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 436c29d698fa..c6310192b654 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -313,11 +313,11 @@ static inline pmd_t *pud_pgtable(pud_t pud) #endif /* - * Initialize a new pgd / pmd table with invalid pointers. + * Initialize a new pgd / pud / pmd table with invalid pointers. */ -extern void pgd_init(unsigned long page); -extern void pud_init(unsigned long page, unsigned long pagetable); -extern void pmd_init(unsigned long page, unsigned long pagetable); +extern void pgd_init(void *addr); +extern void pud_init(void *addr); +extern void pmd_init(void *addr); /* * Non-present pages: high 40 bits are offset, next 8 bits type, diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 74cd64a24d05..e8c08988ed37 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -122,8 +122,7 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, if (!cache) return NULL; new_pmd = kvm_mmu_memory_cache_alloc(cache); - pmd_init((unsigned long)new_pmd, - (unsigned long)invalid_pte_table); + pmd_init(new_pmd); pud_populate(NULL, pud, new_pmd); } pmd = pmd_offset(pud, addr); diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c index 61891af25019..f57fb69472f8 100644 --- a/arch/mips/mm/pgtable-32.c +++ b/arch/mips/mm/pgtable-32.c @@ -13,9 +13,9 @@ #include #include -void pgd_init(unsigned long page) +void pgd_init(void *addr) { - unsigned long *p = (unsigned long *) page; + unsigned long *p = (unsigned long *)addr; int i; for (i = 0; i < USER_PTRS_PER_PGD; i+=8) { @@ -61,9 +61,8 @@ void __init pagetable_init(void) #endif /* Initialize the entire pgd. */ - pgd_init((unsigned long)swapper_pg_dir); - pgd_init((unsigned long)swapper_pg_dir - + sizeof(pgd_t) * USER_PTRS_PER_PGD); + pgd_init(swapper_pg_dir); + pgd_init(&swapper_pg_dir[USER_PTRS_PER_PGD]); pgd_base = swapper_pg_dir; diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c index 7536f7804c44..b4386a0e2ef8 100644 --- a/arch/mips/mm/pgtable-64.c +++ b/arch/mips/mm/pgtable-64.c @@ -13,7 +13,7 @@ #include #include -void pgd_init(unsigned long page) +void pgd_init(void *addr) { unsigned long *p, *end; unsigned long entry; @@ -26,7 +26,7 @@ void pgd_init(unsigned long page) entry = (unsigned long)invalid_pte_table; #endif - p = (unsigned long *) page; + p = (unsigned long *) addr; end = p + PTRS_PER_PGD; do { @@ -43,11 +43,12 @@ void pgd_init(unsigned long page) } #ifndef __PAGETABLE_PMD_FOLDED -void pmd_init(unsigned long addr, unsigned long pagetable) +void pmd_init(void *addr) { unsigned long *p, *end; + unsigned long pagetable = (unsigned long)invalid_pte_table; - p = (unsigned long *) addr; + p = (unsigned long *)addr; end = p + PTRS_PER_PMD; do { @@ -66,9 +67,10 @@ EXPORT_SYMBOL_GPL(pmd_init); #endif #ifndef __PAGETABLE_PUD_FOLDED -void pud_init(unsigned long addr, unsigned long pagetable) +void pud_init(void *addr) { unsigned long *p, *end; + unsigned long pagetable = (unsigned long)invalid_pmd_table; p = (unsigned long *)addr; end = p + PTRS_PER_PUD; @@ -108,12 +110,12 @@ void __init pagetable_init(void) pgd_t *pgd_base; /* Initialize the entire pgd. */ - pgd_init((unsigned long)swapper_pg_dir); + pgd_init(swapper_pg_dir); #ifndef __PAGETABLE_PUD_FOLDED - pud_init((unsigned long)invalid_pud_table, (unsigned long)invalid_pmd_table); + pud_init(invalid_pud_table); #endif #ifndef __PAGETABLE_PMD_FOLDED - pmd_init((unsigned long)invalid_pmd_table, (unsigned long)invalid_pte_table); + pmd_init(invalid_pmd_table); #endif pgd_base = swapper_pg_dir; /* diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c index 3b7590660a04..b13314be5d0e 100644 --- a/arch/mips/mm/pgtable.c +++ b/arch/mips/mm/pgtable.c @@ -15,7 +15,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER); if (ret) { init = pgd_offset(&init_mm, 0UL); - pgd_init((unsigned long)ret); + pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); } diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 3c4ae74d5798..ecd1657bb2ce 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -26,11 +26,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, set_pmd(pmd, __pmd((unsigned long)page_address(pte))); } -/* - * Initialize a new pmd table with invalid pointers. - */ -extern void pmd_init(unsigned long page, unsigned long pagetable); - extern pgd_t *pgd_alloc(struct mm_struct *mm); #define __pte_free_tlb(tlb, pte, addr) \ From 7b09f5af01ede480cbe7abcb281cf17550a46ff5 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Thu, 27 Oct 2022 20:52:51 +0800 Subject: [PATCH 260/313] LoongArch: add sparse memory vmemmap support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add sparse memory vmemmap support for LoongArch. SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. Link: https://lkml.kernel.org/r/20221027125253.3458989-3-chenhuacai@loongson.cn Signed-off-by: Min Zhou Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen Reviewed-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: Dinh Nguyen Cc: Guo Ren Cc: Jiaxun Yang Cc: Peter Zijlstra Cc: Philippe Mathieu-Daudé Cc: Thomas Bogendoerfer Cc: Will Deacon Cc: Xuefeng Li Cc: Xuerui Wang Cc: Muchun Song Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/pgtable.h | 7 ++- arch/loongarch/include/asm/sparsemem.h | 8 +++ arch/loongarch/mm/init.c | 72 ++++++++++++++++++++++++-- include/linux/mm.h | 2 + mm/sparse-vmemmap.c | 10 ++++ 6 files changed, 96 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 903096bd87f8..6f7fa0c0ca08 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -487,6 +487,7 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y + select SPARSEMEM_VMEMMAP_ENABLE help Say Y to support efficient handling of sparse physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 9e6651846db9..022ec6be3602 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -11,6 +11,7 @@ #include #include +#include #include #if CONFIG_PGTABLE_LEVELS == 2 @@ -59,6 +60,7 @@ #include #include #include +#include struct mm_struct; struct vm_area_struct; @@ -86,7 +88,10 @@ extern unsigned long zero_page_mask; #define VMALLOC_START MODULES_END #define VMALLOC_END \ (vm_map_base + \ - min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE) + min(PTRS_PER_PGD * PTRS_PER_PUD * PTRS_PER_PMD * PTRS_PER_PTE * PAGE_SIZE, (1UL << cpu_vabits)) - PMD_SIZE - VMEMMAP_SIZE) + +#define vmemmap ((struct page *)((VMALLOC_END + PMD_SIZE) & PMD_MASK)) +#define VMEMMAP_END ((unsigned long)vmemmap + VMEMMAP_SIZE - 1) #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %016lx.\n", __FILE__, __LINE__, pte_val(e)) diff --git a/arch/loongarch/include/asm/sparsemem.h b/arch/loongarch/include/asm/sparsemem.h index 3d18cdf1b069..8d4af6aff8a8 100644 --- a/arch/loongarch/include/asm/sparsemem.h +++ b/arch/loongarch/include/asm/sparsemem.h @@ -11,8 +11,16 @@ #define SECTION_SIZE_BITS 29 /* 2^29 = Largest Huge Page Size */ #define MAX_PHYSMEM_BITS 48 +#ifdef CONFIG_SPARSEMEM_VMEMMAP +#define VMEMMAP_SIZE (sizeof(struct page) * (1UL << (cpu_pabits + 1 - PAGE_SHIFT))) +#endif + #endif /* CONFIG_SPARSEMEM */ +#ifndef VMEMMAP_SIZE +#define VMEMMAP_SIZE 0 /* 1, For FLATMEM; 2, For SPARSEMEM without VMEMMAP. */ +#endif + #ifdef CONFIG_MEMORY_HOTPLUG int memory_add_physaddr_to_nid(u64 addr); #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 080061793c85..451d93667bcc 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include @@ -152,6 +152,72 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #endif +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ + unsigned long addr = start; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + pud = vmemmap_pud_populate(p4d, addr, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p = NULL; + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, NULL); + if (p) { + pmd_t entry; + + entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL); + pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL; + set_pmd_at(&init_mm, addr, pmd, entry); + + continue; + } + } else if (pmd_val(*pmd) & _PAGE_HUGE) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; + } + if (vmemmap_populate_basepages(addr, next, node, NULL)) + return -ENOMEM; + } + + return 0; +} + +int __meminit vmemmap_populate(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ +#if CONFIG_PGTABLE_LEVELS == 2 + return vmemmap_populate_basepages(start, end, node, NULL); +#else + return vmemmap_populate_hugepages(start, end, node, NULL); +#endif +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap) +{ +} +#endif +#endif + static pte_t *fixmap_pte(unsigned long addr) { pgd_t *pgd; @@ -168,7 +234,7 @@ static pte_t *fixmap_pte(unsigned long addr) new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); pgd_populate(&init_mm, pgd, new); #ifndef __PAGETABLE_PUD_FOLDED - pud_init((unsigned long)new, (unsigned long)invalid_pmd_table); + pud_init(new); #endif } @@ -179,7 +245,7 @@ static pte_t *fixmap_pte(unsigned long addr) new = memblock_alloc_low(PAGE_SIZE, PAGE_SIZE); pud_populate(&init_mm, pud, new); #ifndef __PAGETABLE_PMD_FOLDED - pmd_init((unsigned long)new, (unsigned long)invalid_pte_table); + pmd_init(new); #endif } diff --git a/include/linux/mm.h b/include/linux/mm.h index 767c8c522e70..7c31f898337c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3360,6 +3360,8 @@ void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); +void pmd_init(void *addr); +void pud_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 46ae542118c0..797b30e9050c 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -196,6 +196,10 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) return pmd; } +void __weak __meminit pmd_init(void *addr) +{ +} + pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); @@ -203,11 +207,16 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; + pmd_init(p); pud_populate(&init_mm, pud, p); } return pud; } +void __weak __meminit pud_init(void *addr) +{ +} + p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); @@ -215,6 +224,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; + pud_init(p); p4d_populate(&init_mm, p4d, p); } return p4d; From 2045a3b8911b6ee64dd9b522d61abc468ecdcdb5 Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Thu, 27 Oct 2022 20:52:52 +0800 Subject: [PATCH 261/313] mm/sparse-vmemmap: generalise vmemmap_populate_hugepages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generalise vmemmap_populate_hugepages() so ARM64 & X86 & LoongArch can share its implementation. Link: https://lkml.kernel.org/r/20221027125253.3458989-4-chenhuacai@loongson.cn Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen Acked-by: Will Deacon Acked-by: Dave Hansen Reviewed-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dinh Nguyen Cc: Guo Ren Cc: Jiaxun Yang Cc: Min Zhou Cc: Peter Zijlstra Cc: Philippe Mathieu-Daudé Cc: Thomas Bogendoerfer Cc: Xuefeng Li Cc: Xuerui Wang Cc: Muchun Song Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 55 +++++++---------------- arch/loongarch/mm/init.c | 55 ++++++----------------- arch/x86/mm/init_64.c | 96 ++++++++++++++-------------------------- include/linux/mm.h | 6 +++ mm/sparse-vmemmap.c | 63 ++++++++++++++++++++++++++ 5 files changed, 132 insertions(+), 143 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 556154d821bf..27217ba12e57 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1137,53 +1137,28 @@ static void free_empty_tables(unsigned long addr, unsigned long end, } #endif +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, + unsigned long addr, unsigned long next) +{ + pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); +} + +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + vmemmap_verify((pte_t *)pmdp, node, addr, next); + return 1; +} + int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) { - unsigned long addr = start; - unsigned long next; - pgd_t *pgdp; - p4d_t *p4dp; - pud_t *pudp; - pmd_t *pmdp; - WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END)); if (!ARM64_KERNEL_USES_PMD_MAPS) return vmemmap_populate_basepages(start, end, node, altmap); - - do { - next = pmd_addr_end(addr, end); - - pgdp = vmemmap_pgd_populate(addr, node); - if (!pgdp) - return -ENOMEM; - - p4dp = vmemmap_p4d_populate(pgdp, addr, node); - if (!p4dp) - return -ENOMEM; - - pudp = vmemmap_pud_populate(p4dp, addr, node); - if (!pudp) - return -ENOMEM; - - pmdp = pmd_offset(pudp, addr); - if (pmd_none(READ_ONCE(*pmdp))) { - void *p = NULL; - - p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); - if (!p) { - if (vmemmap_populate_basepages(addr, next, node, altmap)) - return -ENOMEM; - continue; - } - - pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); - } else - vmemmap_verify((pte_t *)pmdp, node, addr, next); - } while (addr = next, addr != end); - - return 0; + else + return vmemmap_populate_hugepages(start, end, node, altmap); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 451d93667bcc..e018aed34586 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -153,52 +153,25 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, - int node, struct vmem_altmap *altmap) +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) { - unsigned long addr = start; - unsigned long next; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; + pmd_t entry; - for (addr = start; addr < end; addr = next) { - next = pmd_addr_end(addr, end); + entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL); + pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL; + set_pmd_at(&init_mm, addr, pmd, entry); +} - pgd = vmemmap_pgd_populate(addr, node); - if (!pgd) - return -ENOMEM; - p4d = vmemmap_p4d_populate(pgd, addr, node); - if (!p4d) - return -ENOMEM; - pud = vmemmap_pud_populate(p4d, addr, node); - if (!pud) - return -ENOMEM; +int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + int huge = pmd_val(*pmd) & _PAGE_HUGE; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - void *p = NULL; + if (huge) + vmemmap_verify((pte_t *)pmd, node, addr, next); - p = vmemmap_alloc_block_buf(PMD_SIZE, node, NULL); - if (p) { - pmd_t entry; - - entry = pfn_pmd(virt_to_pfn(p), PAGE_KERNEL); - pmd_val(entry) |= _PAGE_HUGE | _PAGE_HGLOBAL; - set_pmd_at(&init_mm, addr, pmd, entry); - - continue; - } - } else if (pmd_val(*pmd) & _PAGE_HUGE) { - vmemmap_verify((pte_t *)pmd, node, addr, next); - continue; - } - if (vmemmap_populate_basepages(addr, next, node, NULL)) - return -ENOMEM; - } - - return 0; + return huge; } int __meminit vmemmap_populate(unsigned long start, unsigned long end, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index e8db4edd7cc9..a190aae8ceaf 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1492,72 +1492,44 @@ static long __meminitdata addr_start, addr_end; static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; -static int __meminit vmemmap_populate_hugepages(unsigned long start, - unsigned long end, int node, struct vmem_altmap *altmap) +void __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) { - unsigned long addr; - unsigned long next; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; + pte_t entry; - for (addr = start; addr < end; addr = next) { - next = pmd_addr_end(addr, end); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); + set_pmd(pmd, __pmd(pte_val(entry))); - pgd = vmemmap_pgd_populate(addr, node); - if (!pgd) - return -ENOMEM; - - p4d = vmemmap_p4d_populate(pgd, addr, node); - if (!p4d) - return -ENOMEM; - - pud = vmemmap_pud_populate(p4d, addr, node); - if (!pud) - return -ENOMEM; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { - void *p; - - p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); - if (p) { - pte_t entry; - - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, - PAGE_KERNEL_LARGE); - set_pmd(pmd, __pmd(pte_val(entry))); - - /* check to see if we have contiguous blocks */ - if (p_end != p || node_start != node) { - if (p_start) - pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", - addr_start, addr_end-1, p_start, p_end-1, node_start); - addr_start = addr; - node_start = node; - p_start = p; - } - - addr_end = addr + PMD_SIZE; - p_end = p + PMD_SIZE; - - if (!IS_ALIGNED(addr, PMD_SIZE) || - !IS_ALIGNED(next, PMD_SIZE)) - vmemmap_use_new_sub_pmd(addr, next); - - continue; - } else if (altmap) - return -ENOMEM; /* no fallback */ - } else if (pmd_large(*pmd)) { - vmemmap_verify((pte_t *)pmd, node, addr, next); - vmemmap_use_sub_pmd(addr, next); - continue; - } - if (vmemmap_populate_basepages(addr, next, node, NULL)) - return -ENOMEM; + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + pr_debug(" [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; } - return 0; + + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; + + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) + vmemmap_use_new_sub_pmd(addr, next); +} + +int __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_large(*pmd); + + if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + vmemmap_use_sub_pmd(addr, next); + } + + return large; } int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c31f898337c..472cb60ace07 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3373,8 +3373,14 @@ struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); +void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next); +int vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +int vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); void vmemmap_populate_print_last(void); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 797b30e9050c..c5398a5960d0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -295,6 +295,69 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, return vmemmap_populate_range(start, end, node, altmap, NULL); } +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) +{ +} + +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + return 0; +} + +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, addr, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, addr); + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + continue; + } else if (altmap) { + /* + * No fallback: In any case we care about, the + * altmap should be reasonably sized and aligned + * such that vmemmap_alloc_block_buf() will always + * succeed. For consistency with the PTE case, + * return an error here as failure could indicate + * a configuration issue with the size of the altmap. + */ + return -ENOMEM; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) + continue; + if (vmemmap_populate_basepages(addr, next, node, altmap)) + return -ENOMEM; + } + return 0; +} + /* * For compound pages bigger than section size (e.g. x86 1G compound * pages with 2M subsection size) fill the rest of sections as tail From c5a303a51b9ca85b52250fd8d92bf4918fbbdf0d Mon Sep 17 00:00:00 2001 From: Feiyang Chen Date: Thu, 27 Oct 2022 20:52:53 +0800 Subject: [PATCH 262/313] LoongArch: enable ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The feature of minimizing overhead of struct page associated with each HugeTLB page is implemented on x86_64. However, the infrastructure of this feature is already there, so just select ARCH_WANT_HUGETLB_PAGE_ OPTIMIZE_VMEMMAP is enough to enable this feature for LoongArch. Link: https://lkml.kernel.org/r/20221027125253.3458989-5-chenhuacai@loongson.cn Signed-off-by: Feiyang Chen Signed-off-by: Huacai Chen Reviewed-by: Philippe Mathieu-Daudé Acked-by: Muchun Song Reviewed-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: Dinh Nguyen Cc: Guo Ren Cc: Jiaxun Yang Cc: Min Zhou Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Will Deacon Cc: Xuefeng Li Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 6f7fa0c0ca08..0a6ef613124c 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -52,6 +52,7 @@ config LOONGARCH select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT + select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_NO_INSTR select BUILDTIME_TABLE_SORT From 3720dd6dcac38d03424d6ba38107f39af5318bcf Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:22 -0700 Subject: [PATCH 263/313] filemap: convert replace_page_cache_page() to replace_page_cache_folio() Patch series "Removing the lru_cache_add() wrapper". This patchset replaces all calls of lru_cache_add() with the folio equivalent: folio_add_lru(). This is allows us to get rid of the wrapper The series passes xfstests and the userfaultfd selftests. This patch (of 5): Eliminates 7 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221101175326.13265-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 2 +- include/linux/pagemap.h | 2 +- mm/filemap.c | 50 ++++++++++++++++++++--------------------- 3 files changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4a6e0a1b945..26817a2db463 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -837,7 +837,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (WARN_ON(PageMlocked(oldpage))) goto out_fallback_unlock; - replace_page_cache_page(oldpage, newpage); + replace_page_cache_folio(page_folio(oldpage), page_folio(newpage)); get_page(newpage); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2ec0ca1f3d38..29e1f9e76eb6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1102,7 +1102,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp); void filemap_remove_folio(struct folio *folio); void __filemap_remove_folio(struct folio *folio, void *shadow); -void replace_page_cache_page(struct page *old, struct page *new); +void replace_page_cache_folio(struct folio *old, struct folio *new); void delete_from_page_cache_batch(struct address_space *mapping, struct folio_batch *fbatch); bool filemap_release_folio(struct folio *folio, gfp_t gfp); diff --git a/mm/filemap.c b/mm/filemap.c index 242cd8bd8330..c4d4ace9cc70 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -788,56 +788,54 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) EXPORT_SYMBOL(file_write_and_wait_range); /** - * replace_page_cache_page - replace a pagecache page with a new one - * @old: page to be replaced - * @new: page to replace with + * replace_page_cache_folio - replace a pagecache folio with a new one + * @old: folio to be replaced + * @new: folio to replace with * - * This function replaces a page in the pagecache with a new one. On - * success it acquires the pagecache reference for the new page and - * drops it for the old page. Both the old and new pages must be - * locked. This function does not add the new page to the LRU, the + * This function replaces a folio in the pagecache with a new one. On + * success it acquires the pagecache reference for the new folio and + * drops it for the old folio. Both the old and new folios must be + * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. */ -void replace_page_cache_page(struct page *old, struct page *new) +void replace_page_cache_folio(struct folio *old, struct folio *new) { - struct folio *fold = page_folio(old); - struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); - VM_BUG_ON_PAGE(!PageLocked(old), old); - VM_BUG_ON_PAGE(!PageLocked(new), new); - VM_BUG_ON_PAGE(new->mapping, new); + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(new->mapping, new); - get_page(new); + folio_get(new); new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(fold, fnew); + mem_cgroup_migrate(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. */ - if (!PageHuge(old)) - __dec_lruvec_page_state(old, NR_FILE_PAGES); - if (!PageHuge(new)) - __inc_lruvec_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(old)) - __dec_lruvec_page_state(old, NR_SHMEM); - if (PageSwapBacked(new)) - __inc_lruvec_page_state(new, NR_SHMEM); + if (!folio_test_hugetlb(old)) + __lruvec_stat_sub_folio(old, NR_FILE_PAGES); + if (!folio_test_hugetlb(new)) + __lruvec_stat_add_folio(new, NR_FILE_PAGES); + if (folio_test_swapbacked(old)) + __lruvec_stat_sub_folio(old, NR_SHMEM); + if (folio_test_swapbacked(new)) + __lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) - free_folio(fold); - folio_put(fold); + free_folio(old); + folio_put(old); } -EXPORT_SYMBOL_GPL(replace_page_cache_page); +EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) From 063aaad792eef49a11d7575dc9914b43c0fa3792 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:23 -0700 Subject: [PATCH 264/313] fuse: convert fuse_try_move_page() to use folios Converts the function to try to move folios instead of pages. Also converts fuse_check_page() to fuse_get_folio() since this is its only caller. This change removes 15 calls to compound_head(). Link: https://lkml.kernel.org/r/20221101175326.13265-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Miklos Szeredi Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- fs/fuse/dev.c | 55 ++++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 26817a2db463..204c332cd343 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -764,11 +764,11 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) return ncpy; } -static int fuse_check_page(struct page *page) +static int fuse_check_folio(struct folio *folio) { - if (page_mapcount(page) || - page->mapping != NULL || - (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + if (folio_mapped(folio) || + folio->mapping != NULL || + (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_uptodate | @@ -778,7 +778,7 @@ static int fuse_check_page(struct page *page) 1 << PG_reclaim | 1 << PG_waiters | LRU_GEN_MASK | LRU_REFS_MASK))) { - dump_page(page, "fuse: trying to steal weird page"); + dump_page(&folio->page, "fuse: trying to steal weird page"); return 1; } return 0; @@ -787,11 +787,11 @@ static int fuse_check_page(struct page *page) static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; - struct page *oldpage = *pagep; - struct page *newpage; + struct folio *oldfolio = page_folio(*pagep); + struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; - get_page(oldpage); + folio_get(oldfolio); err = unlock_request(cs->req); if (err) goto out_put_old; @@ -814,35 +814,36 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (!pipe_buf_try_steal(cs->pipe, buf)) goto out_fallback; - newpage = buf->page; + newfolio = page_folio(buf->page); - if (!PageUptodate(newpage)) - SetPageUptodate(newpage); + if (!folio_test_uptodate(newfolio)) + folio_mark_uptodate(newfolio); - ClearPageMappedToDisk(newpage); + folio_clear_mappedtodisk(newfolio); - if (fuse_check_page(newpage) != 0) + if (fuse_check_folio(newfolio) != 0) goto out_fallback_unlock; /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it */ - if (WARN_ON(page_mapped(oldpage))) + if (WARN_ON(folio_mapped(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(page_has_private(oldpage))) + if (WARN_ON(folio_has_private(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + if (WARN_ON(folio_test_dirty(oldfolio) || + folio_test_writeback(oldfolio))) goto out_fallback_unlock; - if (WARN_ON(PageMlocked(oldpage))) + if (WARN_ON(folio_test_mlocked(oldfolio))) goto out_fallback_unlock; - replace_page_cache_folio(page_folio(oldpage), page_folio(newpage)); + replace_page_cache_folio(oldfolio, newfolio); - get_page(newpage); + folio_get(newfolio); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(newpage); + folio_add_lru(newfolio); /* * Release while we have extra ref on stolen page. Otherwise @@ -855,28 +856,28 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = newpage; + *pagep = &newfolio->page; spin_unlock(&cs->req->waitq.lock); if (err) { - unlock_page(newpage); - put_page(newpage); + folio_unlock(newfolio); + folio_put(newfolio); goto out_put_old; } - unlock_page(oldpage); + folio_unlock(oldfolio); /* Drop ref for ap->pages[] array */ - put_page(oldpage); + folio_put(oldfolio); cs->len = 0; err = 0; out_put_old: /* Drop ref obtained in this function */ - put_page(oldpage); + folio_put(oldfolio); return err; out_fallback_unlock: - unlock_page(newpage); + folio_unlock(newfolio); out_fallback: cs->pg = buf->page; cs->offset = buf->offset; From 28965f0f8be62e1ed8296fe0240b5d5dc064b681 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:24 -0700 Subject: [PATCH 265/313] userfaultfd: replace lru_cache functions with folio_add functions Replaces lru_cache_add() and lru_cache_add_inactive_or_unevictable() with folio_add_lru() and folio_add_lru_vma(). This is in preparation for the removal of lru_cache_add(). Link: https://lkml.kernel.org/r/20221101175326.13265-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 650ab6cfd5f4..b7a9479bece2 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -66,6 +66,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, bool vm_shared = dst_vma->vm_flags & VM_SHARED; bool page_in_cache = page_mapping(page); spinlock_t *ptl; + struct folio *folio; struct inode *inode; pgoff_t offset, max_off; @@ -113,14 +114,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, if (!pte_none_mostly(*dst_pte)) goto out_unlock; + folio = page_folio(page); if (page_in_cache) { /* Usually, cache pages are already added to LRU */ if (newly_allocated) - lru_cache_add(page); + folio_add_lru(folio); page_add_file_rmap(page, dst_vma, false); } else { page_add_new_anon_rmap(page, dst_vma, dst_addr); - lru_cache_add_inactive_or_unevictable(page, dst_vma); + folio_add_lru_vma(folio, dst_vma); } /* From 284a344ed19dc92526024d062b30f90774fea50f Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:25 -0700 Subject: [PATCH 266/313] khugepage: replace lru_cache_add() with folio_add_lru() Replaces some calls with their folio equivalents. This is in preparation for the removal of lru_cache_add(). This replaces 3 calls to compound_head() with 1. Link: https://lkml.kernel.org/r/20221101175326.13265-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 78ec2771cc65..5a7d2d5093f9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2013,6 +2013,7 @@ xa_unlocked: if (result == SCAN_SUCCEED) { struct page *page, *tmp; + struct folio *folio; /* * Replacing old pages with new one has succeeded, now we @@ -2040,11 +2041,13 @@ xa_unlocked: index++; } - SetPageUptodate(hpage); - page_ref_add(hpage, HPAGE_PMD_NR - 1); + folio = page_folio(hpage); + folio_mark_uptodate(folio); + folio_ref_add(folio, HPAGE_PMD_NR - 1); + if (is_shmem) - set_page_dirty(hpage); - lru_cache_add(hpage); + folio_mark_dirty(folio); + folio_add_lru(folio); /* * Remove pte page tables, so we can re-fault the page as huge. From 6e1ca48d0669b0f5efcbaa051b23cd8e651a1614 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Tue, 1 Nov 2022 10:53:26 -0700 Subject: [PATCH 267/313] folio-compat: remove lru_cache_add() There are no longer any callers of lru_cache_add(), so remove it. This saves 79 bytes of kernel text. Also cleanup some comments such that they reference the new folio_add_lru() instead. Link: https://lkml.kernel.org/r/20221101175326.13265-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Miklos Szeredi Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/folio-compat.c | 6 ------ mm/truncate.c | 2 +- mm/workingset.c | 5 ++++- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index b61e2007d156..0ceed49516ad 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -381,7 +381,6 @@ void lru_note_cost(struct lruvec *lruvec, bool file, void lru_note_cost_refault(struct folio *); void folio_add_lru(struct folio *); void folio_add_lru_vma(struct folio *, struct vm_area_struct *); -void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 86933fa8f3e1..69ed25790c68 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -76,12 +76,6 @@ bool redirty_page_for_writepage(struct writeback_control *wbc, } EXPORT_SYMBOL(redirty_page_for_writepage); -void lru_cache_add(struct page *page) -{ - folio_add_lru(page_folio(page)); -} -EXPORT_SYMBOL(lru_cache_add); - void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { diff --git a/mm/truncate.c b/mm/truncate.c index c7bfd247a651..7b4ea4c4a46b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -565,7 +565,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave pages behind because * shrink_page_list() has a temp ref on them, or because they're transiently - * sitting in the lru_cache_add() pagevecs. + * sitting in the folio_add_lru() pagevecs. */ static int invalidate_complete_folio2(struct address_space *mapping, struct folio *folio) diff --git a/mm/workingset.c b/mm/workingset.c index d2d02978588c..1a86645b7b3c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -492,7 +492,10 @@ void workingset_refault(struct folio *folio, void *shadow) /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); - /* XXX: Move to lru_cache_add() when it supports new vs putback */ + /* + * XXX: Move to folio_add_lru() when it supports new vs + * putback + */ lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } From 9fd330582b2fe43c49ebcd02b2480f051f85aad4 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:30 -0800 Subject: [PATCH 268/313] mm: add folio dtor and order setter functions Patch series "convert core hugetlb functions to folios", v5. ============== OVERVIEW =========================== Now that many hugetlb helper functions that deal with hugetlb specific flags[1] and hugetlb cgroups[2] are converted to folios, higher level allocation, prep, and freeing functions within hugetlb can also be converted to operate in folios. Patch 1 of this series implements the wrapper functions around setting the compound destructor and compound order for a folio. Besides the user added in patch 1, patch 2 and patch 9 also use these helper functions. Patches 2-10 convert the higher level hugetlb functions to folios. ============== TESTING =========================== LTP: Ran 10 back to back rounds of the LTP hugetlb test suite. Gigantic Huge Pages: Test allocation and freeing via hugeadm commands: hugeadm --pool-pages-min 1GB:10 hugeadm --pool-pages-min 1GB:0 Demote: Demote 1 1GB hugepages to 512 2MB hugepages echo 1 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages echo 1 > /sys/kernel/mm/hugepages/hugepages-1048576kB/demote cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # 512 cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages # 0 [1] https://lore.kernel.org/lkml/20220922154207.1575343-1-sidhartha.kumar@oracle.com/ [2] https://lore.kernel.org/linux-mm/20221101223059.460937-1-sidhartha.kumar@oracle.com/ This patch (of 10): Add folio equivalents for set_compound_order() and set_compound_page_dtor(). Also remove extra new-lines introduced by mm/hugetlb: convert move_hugetlb_state() to folios and mm/hugetlb_cgroup: convert hugetlb_cgroup_uncharge_page() to folios. [sidhartha.kumar@oracle.com: clarify folio_set_compound_order() zero support] Link: https://lkml.kernel.org/r/20221207223731.32784-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221129225039.82257-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221129225039.82257-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Suggested-by: Mike Kravetz Suggested-by: Muchun Song Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Tarun Sahu Cc: Rasmus Villemoes Cc: Wei Chen Signed-off-by: Andrew Morton --- include/linux/mm.h | 23 +++++++++++++++++++++++ mm/hugetlb.c | 4 +--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 472cb60ace07..7dc376052d40 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -997,6 +997,13 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } +static inline void folio_set_compound_dtor(struct folio *folio, + enum compound_dtor_id compound_dtor) +{ + VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio); + folio->_folio_dtor = compound_dtor; +} + void destroy_large_folio(struct folio *folio); static inline int head_compound_pincount(struct page *head) @@ -1012,6 +1019,22 @@ static inline void set_compound_order(struct page *page, unsigned int order) #endif } +/* + * folio_set_compound_order is generally passed a non-zero order to + * initialize a large folio. However, hugetlb code abuses this by + * passing in zero when 'dissolving' a large folio. + */ +static inline void folio_set_compound_order(struct folio *folio, + unsigned int order) +{ + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + + folio->_folio_order = order; +#ifdef CONFIG_64BIT + folio->_folio_nr_pages = order ? 1U << order : 0; +#endif +} + /* Returns the number of pages in this potentially compound page. */ static inline unsigned long compound_nr(struct page *page) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9d97c9a2a15d..22512f7b0237 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1780,7 +1780,7 @@ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - folio->_folio_dtor = HUGETLB_PAGE_DTOR; + folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -2938,7 +2938,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * a reservation exists for the allocation. */ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); - if (!page) { spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); @@ -7343,7 +7342,6 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re int old_nid = folio_nid(old_folio); int new_nid = folio_nid(new_folio); - folio_set_hugetlb_temporary(old_folio); folio_clear_hugetlb_temporary(new_folio); From 911565b8285381e62d3bfd0cae2889a022737c37 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:31 -0800 Subject: [PATCH 269/313] mm/hugetlb: convert destroy_compound_gigantic_page() to folios Convert page operations within __destroy_compound_gigantic_page() to the corresponding folio operations. Link: https://lkml.kernel.org/r/20221129225039.82257-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Rasmus Villemoes Cc: Tarun Sahu Cc: Wei Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 22512f7b0237..5960a05cb370 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1325,43 +1325,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) nr_nodes--) /* used to demote non-gigantic_huge pages as well */ -static void __destroy_compound_gigantic_page(struct page *page, +static void __destroy_compound_gigantic_folio(struct folio *folio, unsigned int order, bool demote) { int i; int nr_pages = 1 << order; struct page *p; - atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(subpages_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(folio_mapcount_ptr(folio), 0); + atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(folio_pincount_ptr(folio), 0); for (i = 1; i < nr_pages; i++) { - p = nth_page(page, i); + p = folio_page(folio, i); p->mapping = NULL; clear_compound_head(p); if (!demote) set_page_refcounted(p); } - set_compound_order(page, 0); -#ifdef CONFIG_64BIT - page[1].compound_nr = 0; -#endif - __ClearPageHead(page); + folio_set_compound_order(folio, 0); + __folio_clear_head(folio); } -static void destroy_compound_hugetlb_page_for_demote(struct page *page, +static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, unsigned int order) { - __destroy_compound_gigantic_page(page, order, true); + __destroy_compound_gigantic_folio(folio, order, true); } #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_page(struct page *page, +static void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { - __destroy_compound_gigantic_page(page, order, false); + __destroy_compound_gigantic_folio(folio, order, false); } static void free_gigantic_page(struct page *page, unsigned int order) @@ -1430,7 +1427,7 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, return NULL; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } -static inline void destroy_compound_gigantic_page(struct page *page, +static inline void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { } #endif @@ -1477,8 +1474,8 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page, * * For gigantic pages set the destructor to the null dtor. This * destructor will never be called. Before freeing the gigantic - * page destroy_compound_gigantic_page will turn the compound page - * into a simple group of pages. After this the destructor does not + * page destroy_compound_gigantic_folio will turn the folio into a + * simple group of pages. After this the destructor does not * apply. * * This handles the case where more than one ref is held when and @@ -1559,6 +1556,7 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, static void __update_and_free_page(struct hstate *h, struct page *page) { int i; + struct folio *folio = page_folio(page); struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) @@ -1587,8 +1585,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * Move PageHWPoison flag from head page to the raw error pages, * which makes any healthy subpages reusable. */ - if (unlikely(PageHWPoison(page))) - hugetlb_clear_page_hwpoison(page); + if (unlikely(folio_test_hwpoison(folio))) + hugetlb_clear_page_hwpoison(&folio->page); for (i = 0; i < pages_per_huge_page(h); i++) { subpage = nth_page(page, i); @@ -1604,7 +1602,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) */ if (hstate_is_gigantic(h) || hugetlb_cma_page(page, huge_page_order(h))) { - destroy_compound_gigantic_page(page, huge_page_order(h)); + destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); } else { __free_pages(page, huge_page_order(h)); @@ -3437,6 +3435,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) { int i, nid = page_to_nid(page); struct hstate *target_hstate; + struct folio *folio = page_folio(page); struct page *subpage; int rc = 0; @@ -3455,10 +3454,10 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) } /* - * Use destroy_compound_hugetlb_page_for_demote for all huge page + * Use destroy_compound_hugetlb_folio_for_demote for all huge page * sizes as it will not ref count pages. */ - destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h)); + destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. From 1a7cdab59b22465b850501e3897a3f3aa01670d8 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:32 -0800 Subject: [PATCH 270/313] mm/hugetlb: convert dissolve_free_huge_page() to folios Removes compound_head() call by using a folio rather than a head page. Link: https://lkml.kernel.org/r/20221129225039.82257-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Rasmus Villemoes Cc: Tarun Sahu Cc: Wei Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5960a05cb370..d02293fd2e64 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2128,21 +2128,21 @@ static struct page *remove_pool_huge_page(struct hstate *h, int dissolve_free_huge_page(struct page *page) { int rc = -EBUSY; + struct folio *folio = page_folio(page); retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ - if (!PageHuge(page)) + if (!folio_test_hugetlb(folio)) return 0; spin_lock_irq(&hugetlb_lock); - if (!PageHuge(page)) { + if (!folio_test_hugetlb(folio)) { rc = 0; goto out; } - if (!page_count(page)) { - struct page *head = compound_head(page); - struct hstate *h = page_hstate(head); + if (!folio_ref_count(folio)) { + struct hstate *h = folio_hstate(folio); if (!available_huge_pages(h)) goto out; @@ -2150,7 +2150,7 @@ retry: * We should make sure that the page is already on the free list * when it is dissolved. */ - if (unlikely(!HPageFreed(head))) { + if (unlikely(!folio_test_hugetlb_freed(folio))) { spin_unlock_irq(&hugetlb_lock); cond_resched(); @@ -2165,7 +2165,7 @@ retry: goto retry; } - remove_hugetlb_page(h, head, false); + remove_hugetlb_page(h, &folio->page, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); @@ -2177,12 +2177,12 @@ retry: * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = hugetlb_vmemmap_restore(h, head); + rc = hugetlb_vmemmap_restore(h, &folio->page); if (!rc) { - update_and_free_page(h, head, false); + update_and_free_page(h, &folio->page, false); } else { spin_lock_irq(&hugetlb_lock); - add_hugetlb_page(h, head, false); + add_hugetlb_page(h, &folio->page, false); h->max_huge_pages++; spin_unlock_irq(&hugetlb_lock); } From cfd5082b514765f873504cc60a50cce30738bfd3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:33 -0800 Subject: [PATCH 271/313] mm/hugetlb: convert remove_hugetlb_page() to folios Removes page_folio() call by converting callers to directly pass a folio into __remove_hugetlb_page(). Link: https://lkml.kernel.org/r/20221129225039.82257-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Rasmus Villemoes Cc: Tarun Sahu Cc: Wei Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d02293fd2e64..9b1c9d05ba34 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1432,19 +1432,18 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio, #endif /* - * Remove hugetlb page from lists, and update dtor so that page appears + * Remove hugetlb folio from lists, and update dtor so that the folio appears * as just a compound page. * - * A reference is held on the page, except in the case of demote. + * A reference is held on the folio, except in the case of demote. * * Must be called with hugetlb lock held. */ -static void __remove_hugetlb_page(struct hstate *h, struct page *page, +static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus, bool demote) { - int nid = page_to_nid(page); - struct folio *folio = page_folio(page); + int nid = folio_nid(folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); @@ -1453,9 +1452,9 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page, if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - list_del(&page->lru); + list_del(&folio->lru); - if (HPageFreed(page)) { + if (folio_test_hugetlb_freed(folio)) { h->free_huge_pages--; h->free_huge_pages_node[nid]--; } @@ -1485,26 +1484,26 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page, * be turned into a page of smaller size. */ if (!demote) - set_page_refcounted(page); + folio_ref_unfreeze(folio, 1); if (hstate_is_gigantic(h)) - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR); else - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } -static void remove_hugetlb_page(struct hstate *h, struct page *page, +static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { - __remove_hugetlb_page(h, page, adjust_surplus, false); + __remove_hugetlb_folio(h, folio, adjust_surplus, false); } -static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page, +static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, bool adjust_surplus) { - __remove_hugetlb_page(h, page, adjust_surplus, true); + __remove_hugetlb_folio(h, folio, adjust_surplus, true); } static void add_hugetlb_page(struct hstate *h, struct page *page, @@ -1639,8 +1638,9 @@ static void free_hpage_workfn(struct work_struct *work) /* * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() * is going to trigger because a previous call to - * remove_hugetlb_page() will set_compound_page_dtor(page, - * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. + * remove_hugetlb_folio() will call folio_set_compound_dtor + * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate() + * directly. */ h = size_to_hstate(page_size(page)); @@ -1749,12 +1749,12 @@ void free_huge_page(struct page *page) h->resv_huge_pages++; if (folio_test_hugetlb_temporary(folio)) { - remove_hugetlb_page(h, page, false); + remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - remove_hugetlb_page(h, page, true); + remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page, true); } else { @@ -2092,6 +2092,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, { int nr_nodes, node; struct page *page = NULL; + struct folio *folio; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { @@ -2103,7 +2104,8 @@ static struct page *remove_pool_huge_page(struct hstate *h, !list_empty(&h->hugepage_freelists[node])) { page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - remove_hugetlb_page(h, page, acct_surplus); + folio = page_folio(page); + remove_hugetlb_folio(h, folio, acct_surplus); break; } } @@ -2165,7 +2167,7 @@ retry: goto retry; } - remove_hugetlb_page(h, &folio->page, false); + remove_hugetlb_folio(h, folio, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); @@ -2803,7 +2805,7 @@ retry: * and enqueue_huge_page() for new_page. The counters will remain * stable since this happens under the lock. */ - remove_hugetlb_page(h, old_page, false); + remove_hugetlb_folio(h, old_folio, false); /* * Ref count on new page is already zero as it was dropped @@ -3230,7 +3232,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, goto out; if (PageHighMem(page)) continue; - remove_hugetlb_page(h, page, false); + remove_hugetlb_folio(h, page_folio(page), false); list_add(&page->lru, &page_list); } } @@ -3441,7 +3443,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - remove_hugetlb_page_for_demote(h, page, false); + remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); rc = hugetlb_vmemmap_restore(h, page); From d6ef19e25df2aa50f932a78c368d7bb710eaaa1b Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:34 -0800 Subject: [PATCH 272/313] mm/hugetlb: convert update_and_free_page() to folios Make more progress on converting the free_huge_page() destructor to operate on folios by converting update_and_free_page() to folios. Link: https://lkml.kernel.org/r/20221129225039.82257-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar \ Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Rasmus Villemoes Cc: Tarun Sahu Cc: Wei Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9b1c9d05ba34..5f3622ce791f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1478,7 +1478,7 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, * apply. * * This handles the case where more than one ref is held when and - * after update_and_free_page is called. + * after update_and_free_hugetlb_folio is called. * * In the case of demote we do not ref count the page as it will soon * be turned into a page of smaller size. @@ -1609,7 +1609,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) } /* - * As update_and_free_page() can be called under any context, so we cannot + * As update_and_free_hugetlb_folio() can be called under any context, so we cannot * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate * the vmemmap pages. @@ -1657,11 +1657,11 @@ static inline void flush_free_hpage_work(struct hstate *h) flush_work(&free_hpage_work); } -static void update_and_free_page(struct hstate *h, struct page *page, +static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { - if (!HPageVmemmapOptimized(page) || !atomic) { - __update_and_free_page(h, page); + if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { + __update_and_free_page(h, &folio->page); return; } @@ -1672,16 +1672,18 @@ static void update_and_free_page(struct hstate *h, struct page *page, * empty. Otherwise, schedule_work() had been called but the workfn * hasn't retrieved the list yet. */ - if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) + if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) schedule_work(&free_hpage_work); } static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { struct page *page, *t_page; + struct folio *folio; list_for_each_entry_safe(page, t_page, list, lru) { - update_and_free_page(h, page, false); + folio = page_folio(page); + update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } @@ -1751,12 +1753,12 @@ void free_huge_page(struct page *page) if (folio_test_hugetlb_temporary(folio)) { remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page, true); + update_and_free_hugetlb_folio(h, folio, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page, true); + update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); @@ -2172,8 +2174,8 @@ retry: spin_unlock_irq(&hugetlb_lock); /* - * Normally update_and_free_page will allocate required vmemmmap - * before freeing the page. update_and_free_page will fail to + * Normally update_and_free_hugtlb_folio will allocate required vmemmmap + * before freeing the page. update_and_free_hugtlb_folio will fail to * free the page if it can not allocate required vmemmap. We * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take @@ -2181,7 +2183,7 @@ retry: */ rc = hugetlb_vmemmap_restore(h, &folio->page); if (!rc) { - update_and_free_page(h, &folio->page, false); + update_and_free_hugetlb_folio(h, folio, false); } else { spin_lock_irq(&hugetlb_lock); add_hugetlb_page(h, &folio->page, false); @@ -2818,7 +2820,7 @@ retry: * Pages have been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, old_page, false); + update_and_free_hugetlb_folio(h, old_folio, false); } return ret; @@ -2827,7 +2829,7 @@ free_new: spin_unlock_irq(&hugetlb_lock); /* Page has a zero ref count, but needs a ref to be freed */ folio_ref_unfreeze(new_folio, 1); - update_and_free_page(h, new_page, false); + update_and_free_hugetlb_folio(h, new_folio, false); return ret; } From 2f6c57d696abcd2d27d07b8506d5e6bcc060e77a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:35 -0800 Subject: [PATCH 273/313] mm/hugetlb: convert add_hugetlb_page() to folios and add hugetlb_cma_folio() Convert add_hugetlb_page() to take in a folio, also convert hugetlb_cma_page() to take in a folio. Link: https://lkml.kernel.org/r/20221129225039.82257-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Rasmus Villemoes Cc: Tarun Sahu Cc: Wei Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5f3622ce791f..d80a83490b9a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -54,13 +54,13 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_page(struct page *page, unsigned int order) +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { - return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page, + return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, 1 << order); } #else -static bool hugetlb_cma_page(struct page *page, unsigned int order) +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { return false; } @@ -1506,17 +1506,17 @@ static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *foli __remove_hugetlb_folio(h, folio, adjust_surplus, true); } -static void add_hugetlb_page(struct hstate *h, struct page *page, +static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int zeroed; - int nid = page_to_nid(page); + int nid = folio_nid(folio); - VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page); + VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); lockdep_assert_held(&hugetlb_lock); - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&folio->lru); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; @@ -1525,21 +1525,21 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, h->surplus_huge_pages_node[nid]++; } - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - set_page_private(page, 0); + folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + folio_change_private(folio, NULL); /* - * We have to set HPageVmemmapOptimized again as above - * set_page_private(page, 0) cleared it. + * We have to set hugetlb_vmemmap_optimized again as above + * folio_change_private(folio, NULL) cleared it. */ - SetHPageVmemmapOptimized(page); + folio_set_hugetlb_vmemmap_optimized(folio); /* - * This page is about to be managed by the hugetlb allocator and + * This folio is about to be managed by the hugetlb allocator and * should have no users. Drop our reference, and check for others * just in case. */ - zeroed = put_page_testzero(page); - if (!zeroed) + zeroed = folio_put_testzero(folio); + if (unlikely(!zeroed)) /* * It is VERY unlikely soneone else has taken a ref on * the page. In this case, we simply return as the @@ -1548,8 +1548,8 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, */ return; - arch_clear_hugepage_flags(page); - enqueue_huge_page(h, page); + arch_clear_hugepage_flags(&folio->page); + enqueue_huge_page(h, &folio->page); } static void __update_and_free_page(struct hstate *h, struct page *page) @@ -1575,7 +1575,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * page and put the page back on the hugetlb free list and treat * as a surplus page. */ - add_hugetlb_page(h, page, true); + add_hugetlb_folio(h, page_folio(page), true); spin_unlock_irq(&hugetlb_lock); return; } @@ -1600,7 +1600,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * need to be given back to CMA in free_gigantic_page. */ if (hstate_is_gigantic(h) || - hugetlb_cma_page(page, huge_page_order(h))) { + hugetlb_cma_folio(folio, huge_page_order(h))) { destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); } else { @@ -2186,7 +2186,7 @@ retry: update_and_free_hugetlb_folio(h, folio, false); } else { spin_lock_irq(&hugetlb_lock); - add_hugetlb_page(h, &folio->page, false); + add_hugetlb_folio(h, folio, false); h->max_huge_pages++; spin_unlock_irq(&hugetlb_lock); } @@ -3453,7 +3453,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) /* Allocation of vmemmmap failed, we can not demote page */ spin_lock_irq(&hugetlb_lock); set_page_refcounted(page); - add_hugetlb_page(h, page, false); + add_hugetlb_folio(h, page_folio(page), false); return rc; } From 240d67a86ecb0fa18863821a0cb55783ad50ef30 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:36 -0800 Subject: [PATCH 274/313] mm/hugetlb: convert enqueue_huge_page() to folios Convert callers of enqueue_huge_page() to pass in a folio, function is renamed to enqueue_hugetlb_folio(). Link: https://lkml.kernel.org/r/20221129225039.82257-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Rasmus Villemoes Cc: Tarun Sahu Cc: Wei Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d80a83490b9a..6da673f1d830 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1127,17 +1127,17 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) return false; } -static void enqueue_huge_page(struct hstate *h, struct page *page) +static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); lockdep_assert_held(&hugetlb_lock); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); - list_move(&page->lru, &h->hugepage_freelists[nid]); + list_move(&folio->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; - SetHPageFreed(page); + folio_set_hugetlb_freed(folio); } static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) @@ -1549,7 +1549,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, return; arch_clear_hugepage_flags(&folio->page); - enqueue_huge_page(h, &folio->page); + enqueue_hugetlb_folio(h, folio); } static void __update_and_free_page(struct hstate *h, struct page *page) @@ -1761,7 +1761,7 @@ void free_huge_page(struct page *page) update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugepage_flags(page); - enqueue_huge_page(h, page); + enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } @@ -2438,7 +2438,7 @@ retry: if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ - enqueue_huge_page(h, page); + enqueue_hugetlb_folio(h, page_folio(page)); } free: spin_unlock_irq(&hugetlb_lock); @@ -2804,8 +2804,8 @@ retry: * Ok, old_page is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling __prep_account_new_huge_page() - * and enqueue_huge_page() for new_page. The counters will remain - * stable since this happens under the lock. + * and enqueue_hugetlb_folio() for new_folio. The counters will + * remain stable since this happens under the lock. */ remove_hugetlb_folio(h, old_folio, false); @@ -2814,7 +2814,7 @@ retry: * earlier. It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); - enqueue_huge_page(h, new_page); + enqueue_hugetlb_folio(h, new_folio); /* * Pages have been replaced, we can safely free the old one. From 7f325a8d25631e68cd75afaeaf330187e45e0eb5 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:37 -0800 Subject: [PATCH 275/313] mm/hugetlb: convert free_gigantic_page() to folios Convert callers of free_gigantic_page() to use folios, function is then renamed to free_gigantic_folio(). Link: https://lkml.kernel.org/r/20221129225039.82257-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Rasmus Villemoes Cc: Tarun Sahu Cc: Wei Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6da673f1d830..eb58b0f38222 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1361,18 +1361,20 @@ static void destroy_compound_gigantic_folio(struct folio *folio, __destroy_compound_gigantic_folio(folio, order, false); } -static void free_gigantic_page(struct page *page, unsigned int order) +static void free_gigantic_folio(struct folio *folio, unsigned int order) { /* * If the page isn't allocated using the cma allocator, * cma_release() returns false. */ #ifdef CONFIG_CMA - if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) + int nid = folio_nid(folio); + + if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) return; #endif - free_contig_range(page_to_pfn(page), 1 << order); + free_contig_range(folio_pfn(folio), 1 << order); } #ifdef CONFIG_CONTIG_ALLOC @@ -1426,7 +1428,8 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, { return NULL; } -static inline void free_gigantic_page(struct page *page, unsigned int order) { } +static inline void free_gigantic_folio(struct folio *folio, + unsigned int order) { } static inline void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { } #endif @@ -1565,7 +1568,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * If we don't know which subpages are hwpoisoned, we can't free * the hugepage, so it's leaked intentionally. */ - if (HPageRawHwpUnreliable(page)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; if (hugetlb_vmemmap_restore(h, page)) { @@ -1575,7 +1578,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * page and put the page back on the hugetlb free list and treat * as a surplus page. */ - add_hugetlb_folio(h, page_folio(page), true); + add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); return; } @@ -1588,7 +1591,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) hugetlb_clear_page_hwpoison(&folio->page); for (i = 0; i < pages_per_huge_page(h); i++) { - subpage = nth_page(page, i); + subpage = folio_page(folio, i); subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | @@ -1597,12 +1600,12 @@ static void __update_and_free_page(struct hstate *h, struct page *page) /* * Non-gigantic pages demoted from CMA allocated gigantic pages - * need to be given back to CMA in free_gigantic_page. + * need to be given back to CMA in free_gigantic_folio. */ if (hstate_is_gigantic(h) || hugetlb_cma_folio(folio, huge_page_order(h))) { destroy_compound_gigantic_folio(folio, huge_page_order(h)); - free_gigantic_page(page, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); } else { __free_pages(page, huge_page_order(h)); } @@ -2025,6 +2028,7 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, nodemask_t *node_alloc_noretry) { struct page *page; + struct folio *folio; bool retry = false; retry: @@ -2035,14 +2039,14 @@ retry: nid, nmask, node_alloc_noretry); if (!page) return NULL; - + folio = page_folio(page); if (hstate_is_gigantic(h)) { if (!prep_compound_gigantic_page(page, huge_page_order(h))) { /* * Rare failure to convert pages to compound page. * Free pages and try again - ONCE! */ - free_gigantic_page(page, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); if (!retry) { retry = true; goto retry; @@ -3050,6 +3054,7 @@ static void __init gather_bootmem_prealloc(void) list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); + struct folio *folio = page_folio(page); struct hstate *h = m->hstate; VM_BUG_ON(!hstate_is_gigantic(h)); @@ -3060,7 +3065,7 @@ static void __init gather_bootmem_prealloc(void) free_huge_page(page); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ - free_gigantic_page(page, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); } /* From d1c6095572d0cf00c0cd30378639ff9387b34edd Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:38 -0800 Subject: [PATCH 276/313] mm/hugetlb: convert hugetlb prep functions to folios Convert prep_new_huge_page() and __prep_compound_gigantic_page() to folios. Link: https://lkml.kernel.org/r/20221129225039.82257-10-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Rasmus Villemoes Cc: Tarun Sahu Cc: Wei Chen Signed-off-by: Andrew Morton --- mm/hugetlb.c | 69 +++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eb58b0f38222..903ee75cccd6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1789,29 +1789,27 @@ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) set_hugetlb_cgroup_rsvd(folio, NULL); } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { - struct folio *folio = page_folio(page); - __prep_new_hugetlb_folio(h, folio); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } -static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, - bool demote) +static bool __prep_compound_gigantic_folio(struct folio *folio, + unsigned int order, bool demote) { int i, j; int nr_pages = 1 << order; struct page *p; - /* we rely on prep_new_huge_page to set the destructor */ - set_compound_order(page, order); - __ClearPageReserved(page); - __SetPageHead(page); + /* we rely on prep_new_hugetlb_folio to set the destructor */ + folio_set_compound_order(folio, order); + __folio_clear_reserved(folio); + __folio_set_head(folio); for (i = 0; i < nr_pages; i++) { - p = nth_page(page, i); + p = folio_page(folio, i); /* * For gigantic hugepages allocated through bootmem at @@ -1853,43 +1851,41 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, VM_BUG_ON_PAGE(page_count(p), p); } if (i != 0) - set_compound_head(p, page); + set_compound_head(p, &folio->page); } - atomic_set(compound_mapcount_ptr(page), -1); - atomic_set(subpages_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(folio_mapcount_ptr(folio), -1); + atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(folio_pincount_ptr(folio), 0); return true; out_error: /* undo page modifications made above */ for (j = 0; j < i; j++) { - p = nth_page(page, j); + p = folio_page(folio, j); if (j != 0) clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ for (; j < nr_pages; j++) { - p = nth_page(page, j); + p = folio_page(folio, j); __ClearPageReserved(p); } - set_compound_order(page, 0); -#ifdef CONFIG_64BIT - page[1].compound_nr = 0; -#endif - __ClearPageHead(page); + folio_set_compound_order(folio, 0); + __folio_clear_head(folio); return false; } -static bool prep_compound_gigantic_page(struct page *page, unsigned int order) -{ - return __prep_compound_gigantic_page(page, order, false); -} - -static bool prep_compound_gigantic_page_for_demote(struct page *page, +static bool prep_compound_gigantic_folio(struct folio *folio, unsigned int order) { - return __prep_compound_gigantic_page(page, order, true); + return __prep_compound_gigantic_folio(folio, order, false); +} + +static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, + unsigned int order) +{ + return __prep_compound_gigantic_folio(folio, order, true); } /* @@ -2041,7 +2037,7 @@ retry: return NULL; folio = page_folio(page); if (hstate_is_gigantic(h)) { - if (!prep_compound_gigantic_page(page, huge_page_order(h))) { + if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* * Rare failure to convert pages to compound page. * Free pages and try again - ONCE! @@ -2054,7 +2050,7 @@ retry: return NULL; } } - prep_new_huge_page(h, page, page_to_nid(page)); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); return page; } @@ -3058,10 +3054,10 @@ static void __init gather_bootmem_prealloc(void) struct hstate *h = m->hstate; VM_BUG_ON(!hstate_is_gigantic(h)); - WARN_ON(page_count(page) != 1); - if (prep_compound_gigantic_page(page, huge_page_order(h))) { - WARN_ON(PageReserved(page)); - prep_new_huge_page(h, page, page_to_nid(page)); + WARN_ON(folio_ref_count(folio) != 1); + if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { + WARN_ON(folio_test_reserved(folio)); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); free_huge_page(page); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ @@ -3480,13 +3476,14 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { subpage = nth_page(page, i); + folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_page_for_demote(subpage, + prep_compound_gigantic_folio_for_demote(folio, target_hstate->order); else prep_compound_page(subpage, target_hstate->order); set_page_private(subpage, 0); - prep_new_huge_page(target_hstate, subpage, nid); + prep_new_hugetlb_folio(target_hstate, folio, nid); free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); From 19fc1a7e8b2b3b0e18fbea84ee26517e1b0f1a6e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 29 Nov 2022 14:50:39 -0800 Subject: [PATCH 277/313] mm/hugetlb: change hugetlb allocation functions to return a folio Many hugetlb allocation helper functions have now been converting to folios, update their higher level callers to be compatible with folios. alloc_pool_huge_page is reorganized to avoid a smatch warning reporting the folio variable is uninitialized. [sidhartha.kumar@oracle.com: update alloc_and_dissolve_hugetlb_folio comments] Link: https://lkml.kernel.org/r/20221206233512.146535-1-sidhartha.kumar@oracle.com Link: https://lkml.kernel.org/r/20221129225039.82257-11-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reported-by: Wei Chen Suggested-by: John Hubbard Suggested-by: Rasmus Villemoes Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mina Almasry Cc: Muchun Song Cc: Tarun Sahu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 134 ++++++++++++++++++++++++--------------------------- 1 file changed, 64 insertions(+), 70 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 903ee75cccd6..8c6fe2286814 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1378,23 +1378,23 @@ static void free_gigantic_folio(struct folio *folio, unsigned int order) } #ifdef CONFIG_CONTIG_ALLOC -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { + struct page *page; unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); #ifdef CONFIG_CMA { - struct page *page; int node; if (hugetlb_cma[nid]) { page = cma_alloc(hugetlb_cma[nid], nr_pages, huge_page_order(h), true); if (page) - return page; + return page_folio(page); } if (!(gfp_mask & __GFP_THISNODE)) { @@ -1405,17 +1405,18 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, page = cma_alloc(hugetlb_cma[node], nr_pages, huge_page_order(h), true); if (page) - return page; + return page_folio(page); } } } #endif - return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); + page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); + return page ? page_folio(page) : NULL; } #else /* !CONFIG_CONTIG_ALLOC */ -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; @@ -1423,7 +1424,7 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; @@ -1950,7 +1951,7 @@ pgoff_t hugetlb_basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *alloc_buddy_huge_page(struct hstate *h, +static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { @@ -1988,11 +1989,6 @@ retry: page = NULL; } - if (page) - __count_vm_event(HTLB_BUDDY_PGALLOC); - else - __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); - /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this * indicates an overall state change. Clear bit so that we resume @@ -2009,7 +2005,13 @@ retry: if (node_alloc_noretry && !page && alloc_try_hard) node_set(nid, *node_alloc_noretry); - return page; + if (!page) { + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + return NULL; + } + + __count_vm_event(HTLB_BUDDY_PGALLOC); + return page_folio(page); } /* @@ -2019,23 +2021,21 @@ retry: * Note that returned page is 'frozen': ref count of head page and all tail * pages is zero. */ -static struct page *alloc_fresh_huge_page(struct hstate *h, +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { - struct page *page; struct folio *folio; bool retry = false; retry: if (hstate_is_gigantic(h)) - page = alloc_gigantic_page(h, gfp_mask, nid, nmask); + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); else - page = alloc_buddy_huge_page(h, gfp_mask, + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); - if (!page) + if (!folio) return NULL; - folio = page_folio(page); if (hstate_is_gigantic(h)) { if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* @@ -2052,7 +2052,7 @@ retry: } prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - return page; + return folio; } /* @@ -2062,23 +2062,20 @@ retry: static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, nodemask_t *node_alloc_noretry) { - struct page *page; + struct folio *folio; int nr_nodes, node; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, - node_alloc_noretry); - if (page) - break; + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, + nodes_allowed, node_alloc_noretry); + if (folio) { + free_huge_page(&folio->page); /* free it into the hugepage allocator */ + return 1; + } } - if (!page) - return 0; - - free_huge_page(page); /* free it into the hugepage allocator */ - - return 1; + return 0; } /* @@ -2237,7 +2234,7 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page = NULL; + struct folio *folio = NULL; if (hstate_is_gigantic(h)) return NULL; @@ -2247,8 +2244,8 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); - if (!page) + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (!folio) return NULL; spin_lock_irq(&hugetlb_lock); @@ -2260,43 +2257,42 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - SetHPageTemporary(page); + folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); - free_huge_page(page); + free_huge_page(&folio->page); return NULL; } h->surplus_huge_pages++; - h->surplus_huge_pages_node[page_to_nid(page)]++; + h->surplus_huge_pages_node[folio_nid(folio)]++; out_unlock: spin_unlock_irq(&hugetlb_lock); - return page; + return &folio->page; } static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page; + struct folio *folio; if (hstate_is_gigantic(h)) return NULL; - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); - if (!page) + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (!folio) return NULL; /* fresh huge pages are frozen */ - set_page_refcounted(page); - + folio_ref_unfreeze(folio, 1); /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ - SetHPageTemporary(page); + folio_set_hugetlb_temporary(folio); - return page; + return &folio->page; } /* @@ -2745,54 +2741,52 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, } /* - * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve + * the old one * @h: struct hstate old page belongs to - * @old_page: Old page to dissolve + * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, - struct list_head *list) +static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, + struct folio *old_folio, struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - struct folio *old_folio = page_folio(old_page); int nid = folio_nid(old_folio); - struct page *new_page; struct folio *new_folio; int ret = 0; /* - * Before dissolving the page, we need to allocate a new one for the - * pool to remain stable. Here, we allocate the page and 'prep' it + * Before dissolving the folio, we need to allocate a new one for the + * pool to remain stable. Here, we allocate the folio and 'prep' it * by doing everything but actually updating counters and adding to * the pool. This simplifies and let us do most of the processing * under the lock. */ - new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); - if (!new_page) + new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); + if (!new_folio) return -ENOMEM; - new_folio = page_folio(new_page); __prep_new_hugetlb_folio(h, new_folio); retry: spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { /* - * Freed from under us. Drop new_page too. + * Freed from under us. Drop new_folio too. */ goto free_new; } else if (folio_ref_count(old_folio)) { /* - * Someone has grabbed the page, try to isolate it here. + * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(old_page, list); + ret = isolate_hugetlb(&old_folio->page, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!folio_test_hugetlb_freed(old_folio)) { /* - * Page's refcount is 0 but it has not been enqueued in the + * Folio's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if * we retry. */ @@ -2801,7 +2795,7 @@ retry: goto retry; } else { /* - * Ok, old_page is still a genuine free hugepage. Remove it from + * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling __prep_account_new_huge_page() * and enqueue_hugetlb_folio() for new_folio. The counters will @@ -2810,14 +2804,14 @@ retry: remove_hugetlb_folio(h, old_folio, false); /* - * Ref count on new page is already zero as it was dropped + * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); enqueue_hugetlb_folio(h, new_folio); /* - * Pages have been replaced, we can safely free the old one. + * Folio has been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, old_folio, false); @@ -2827,7 +2821,7 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); - /* Page has a zero ref count, but needs a ref to be freed */ + /* Folio has a zero ref count, but needs a ref to be freed */ folio_ref_unfreeze(new_folio, 1); update_and_free_hugetlb_folio(h, new_folio, false); @@ -2865,7 +2859,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) ret = 0; else if (!folio_ref_count(folio)) - ret = alloc_and_dissolve_huge_page(h, &folio->page, list); + ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); return ret; } @@ -3083,14 +3077,14 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) if (!alloc_bootmem_huge_page(h, nid)) break; } else { - struct page *page; + struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - page = alloc_fresh_huge_page(h, gfp_mask, nid, + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); - if (!page) + if (!folio) break; - free_huge_page(page); /* free it into the hugepage allocator */ + free_huge_page(&folio->page); /* free it into the hugepage allocator */ } cond_resched(); } From c8c7016f50c85688d71feea2dba1bd955d5f5358 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 30 Nov 2022 16:02:03 +0100 Subject: [PATCH 278/313] kasan: fail non-kasan KUnit tests on KASAN reports After the recent changes done to KUnit-enabled KASAN tests, non-KASAN KUnit tests stopped being failed when KASAN report is detected. Recover that property by failing the currently running non-KASAN KUnit test when KASAN detects and prints a report for a bad memory access. Note that if the bad accesses happened in a kernel thread that doesn't have a reference to the currently running KUnit-test available via current->kunit_test, the test won't be failed. This is a limitation of KUnit, which doesn't yet provide a thread-agnostic way to find the reference to the currenly running test. Link: https://lkml.kernel.org/r/7be29a8ea967cee6b7e48d3d5a242d1d0bd96851.1669820505.git.andreyknvl@google.com Fixes: 49d9977ac909 ("kasan: check CONFIG_KASAN_KUNIT_TEST instead of CONFIG_KUNIT") Fixes: 7ce0ea19d50e ("kasan: switch kunit tests to console tracepoints") Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: David Gow Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 12 ++++++++++ mm/kasan/kasan_test.c | 4 ++++ mm/kasan/report.c | 53 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index a84491bc4867..ea8cf1310b1e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -541,6 +541,18 @@ static inline bool kasan_arch_is_ready(void) { return true; } #error kasan_arch_is_ready only works in KASAN generic outline mode! #endif +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +void kasan_kunit_test_suite_start(void); +void kasan_kunit_test_suite_end(void); + +#else /* CONFIG_KASAN_KUNIT_TEST */ + +static inline void kasan_kunit_test_suite_start(void) { } +static inline void kasan_kunit_test_suite_end(void) { } + +#endif /* CONFIG_KASAN_KUNIT_TEST */ + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) bool kasan_save_enable_multi_shot(void); diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index e27591ef2777..9aa892e7b76c 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -76,6 +76,9 @@ static int kasan_suite_init(struct kunit_suite *suite) return -1; } + /* Stop failing KUnit tests on KASAN reports. */ + kasan_kunit_test_suite_start(); + /* * Temporarily enable multi-shot mode. Otherwise, KASAN would only * report the first detected bug and panic the kernel if panic_on_warn @@ -94,6 +97,7 @@ static int kasan_suite_init(struct kunit_suite *suite) static void kasan_suite_exit(struct kunit_suite *suite) { + kasan_kunit_test_suite_end(); kasan_restore_multi_shot(multishot); for_each_kernel_tracepoint(unregister_tracepoints, NULL); tracepoint_synchronize_unregister(); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 31355851a5ec..f2db8605ee0f 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -9,6 +9,7 @@ * Andrey Konovalov */ +#include #include #include #include @@ -112,10 +113,62 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); #endif +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +/* + * Whether the KASAN KUnit test suite is currently being executed. + * Updated in kasan_test.c. + */ +bool kasan_kunit_executing; + +void kasan_kunit_test_suite_start(void) +{ + WRITE_ONCE(kasan_kunit_executing, true); +} +EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_start); + +void kasan_kunit_test_suite_end(void) +{ + WRITE_ONCE(kasan_kunit_executing, false); +} +EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_end); + +static bool kasan_kunit_test_suite_executing(void) +{ + return READ_ONCE(kasan_kunit_executing); +} + +#else /* CONFIG_KASAN_KUNIT_TEST */ + +static inline bool kasan_kunit_test_suite_executing(void) { return false; } + +#endif /* CONFIG_KASAN_KUNIT_TEST */ + +#if IS_ENABLED(CONFIG_KUNIT) + +static void fail_non_kasan_kunit_test(void) +{ + struct kunit *test; + + if (kasan_kunit_test_suite_executing()) + return; + + test = current->kunit_test; + if (test) + kunit_set_failure(test); +} + +#else /* CONFIG_KUNIT */ + +static inline void fail_non_kasan_kunit_test(void) { } + +#endif /* CONFIG_KUNIT */ + static DEFINE_SPINLOCK(report_lock); static void start_report(unsigned long *flags, bool sync) { + fail_non_kasan_kunit_test(); /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */ disable_trace_on_warning(); /* Do not allow LOCKDEP mangling KASAN reports. */ From 0b7623bdf89b9f6d320784e929acb78291aaf5f6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 1 Dec 2022 17:08:34 +0000 Subject: [PATCH 279/313] selftests/damon: test removed scheme sysfs dir access bug A DAMON sysfs user could start DAMON with a scheme, remove the sysfs directory for the scheme, and then ask stats or schemes tried regions update. The related logic were not aware of the already removed directory situation, so it was able to results in invalid memory accesses. The fix has made with commit 8468b486612c ("mm/damon/sysfs-schemes: skip stats update if the scheme directory is removed"), though. Add a selftest to prevent such kinds of bugs from being introduced again. Link: https://lkml.kernel.org/r/20221201170834.62823-1-sj@kernel.org Signed-off-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 2 +- .../damon/sysfs_update_removed_scheme_dir.sh | 58 +++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 838a8e49f77b..b71247ba7196 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -8,7 +8,7 @@ TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += debugfs_rm_non_contexts.sh -TEST_PROGS += sysfs.sh +TEST_PROGS += sysfs.sh sysfs_update_removed_scheme_dir.sh TEST_PROGS += reclaim.sh lru_sort.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh b/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh new file mode 100644 index 000000000000..ade35576e748 --- /dev/null +++ b/tools/testing/selftests/damon/sysfs_update_removed_scheme_dir.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +if [ $EUID -ne 0 ] +then + echo "Run as root" + exit $ksft_skip +fi + +damon_sysfs="/sys/kernel/mm/damon/admin" +if [ ! -d "$damon_sysfs" ] +then + echo "damon sysfs not found" + exit $ksft_skip +fi + +# clear log +dmesg -C + +# start DAMON with a scheme +echo 1 > "$damon_sysfs/kdamonds/nr_kdamonds" +echo 1 > "$damon_sysfs/kdamonds/0/contexts/nr_contexts" +echo "vaddr" > "$damon_sysfs/kdamonds/0/contexts/0/operations" +echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/targets/nr_targets" +echo $$ > "$damon_sysfs/kdamonds/0/contexts/0/targets/0/pid_target" +echo 1 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/nr_schemes" +scheme_dir="$damon_sysfs/kdamonds/0/contexts/0/schemes/0" +echo 4096000 > "$scheme_dir/access_pattern/sz/max" +echo 20 > "$scheme_dir/access_pattern/nr_accesses/max" +echo 1024 > "$scheme_dir/access_pattern/age/max" +echo "on" > "$damon_sysfs/kdamonds/0/state" +sleep 0.3 + +# remove scheme sysfs dir +echo 0 > "$damon_sysfs/kdamonds/0/contexts/0/schemes/nr_schemes" + +# try to update stat of already removed scheme sysfs dir +echo "update_schemes_stats" > "$damon_sysfs/kdamonds/0/state" +if dmesg | grep -q BUG +then + echo "update_schemes_stats triggers a kernel bug" + dmesg + exit 1 +fi + +# try to update tried regions of already removed scheme sysfs dir +echo "update_schemes_tried_regions" > "$damon_sysfs/kdamonds/0/state" +if dmesg | grep -q BUG +then + echo "update_schemes_tried_regions triggers a kernel bug" + dmesg + exit 1 +fi + +echo "off" > "$damon_sysfs/kdamonds/0/state" From 169004265860327182ecf92297b25b6271e81e96 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:51 +0000 Subject: [PATCH 280/313] fsdax: introduce page->share for fsdax in reflink mode Patch series "fsdax,xfs: fix warning messages", v2. Many testcases failed in dax+reflink mode with warning message in dmesg. Such as generic/051,075,127. The warning message is like this: [ 775.509337] ------------[ cut here ]------------ [ 775.509636] WARNING: CPU: 1 PID: 16815 at fs/dax.c:386 dax_insert_entry.cold+0x2e/0x69 [ 775.510151] Modules linked in: auth_rpcgss oid_registry nfsv4 algif_hash af_alg af_packet nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink ip6table_filter ip6_tables iptable_filter ip_tables x_tables dax_pmem nd_pmem nd_btt sch_fq_codel configfs xfs libcrc32c fuse [ 775.524288] CPU: 1 PID: 16815 Comm: fsx Kdump: loaded Tainted: G W 6.1.0-rc4+ #164 eb34e4ee4200c7cbbb47de2b1892c5a3e027fd6d [ 775.524904] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch Linux 1.16.0-3-3 04/01/2014 [ 775.525460] RIP: 0010:dax_insert_entry.cold+0x2e/0x69 [ 775.525797] Code: c7 c7 18 eb e0 81 48 89 4c 24 20 48 89 54 24 10 e8 73 6d ff ff 48 83 7d 18 00 48 8b 54 24 10 48 8b 4c 24 20 0f 84 e3 e9 b9 ff <0f> 0b e9 dc e9 b9 ff 48 c7 c6 a0 20 c3 81 48 c7 c7 f0 ea e0 81 48 [ 775.526708] RSP: 0000:ffffc90001d57b30 EFLAGS: 00010082 [ 775.527042] RAX: 000000000000002a RBX: 0000000000000000 RCX: 0000000000000042 [ 775.527396] RDX: ffffea000a0f6c80 RSI: ffffffff81dfab1b RDI: 00000000ffffffff [ 775.527819] RBP: ffffea000a0f6c40 R08: 0000000000000000 R09: ffffffff820625e0 [ 775.528241] R10: ffffc90001d579d8 R11: ffffffff820d2628 R12: ffff88815fc98320 [ 775.528598] R13: ffffc90001d57c18 R14: 0000000000000000 R15: 0000000000000001 [ 775.528997] FS: 00007f39fc75d740(0000) GS:ffff88817bc80000(0000) knlGS:0000000000000000 [ 775.529474] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 775.529800] CR2: 00007f39fc772040 CR3: 0000000107eb6001 CR4: 00000000003706e0 [ 775.530214] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 775.530592] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 775.531002] Call Trace: [ 775.531230] [ 775.531444] dax_fault_iter+0x267/0x6c0 [ 775.531719] dax_iomap_pte_fault+0x198/0x3d0 [ 775.532002] __xfs_filemap_fault+0x24a/0x2d0 [xfs aa8d25411432b306d9554da38096f4ebb86bdfe7] [ 775.532603] __do_fault+0x30/0x1e0 [ 775.532903] do_fault+0x314/0x6c0 [ 775.533166] __handle_mm_fault+0x646/0x1250 [ 775.533480] handle_mm_fault+0xc1/0x230 [ 775.533810] do_user_addr_fault+0x1ac/0x610 [ 775.534110] exc_page_fault+0x63/0x140 [ 775.534389] asm_exc_page_fault+0x22/0x30 [ 775.534678] RIP: 0033:0x7f39fc55820a [ 775.534950] Code: 00 01 00 00 00 74 99 83 f9 c0 0f 87 7b fe ff ff c5 fe 6f 4e 20 48 29 fe 48 83 c7 3f 49 8d 0c 10 48 83 e7 c0 48 01 fe 48 29 f9 a4 c4 c1 7e 7f 00 c4 c1 7e 7f 48 20 c5 f8 77 c3 0f 1f 44 00 00 [ 775.535839] RSP: 002b:00007ffc66a08118 EFLAGS: 00010202 [ 775.536157] RAX: 00007f39fc772001 RBX: 0000000000042001 RCX: 00000000000063c1 [ 775.536537] RDX: 0000000000006400 RSI: 00007f39fac42050 RDI: 00007f39fc772040 [ 775.536919] RBP: 0000000000006400 R08: 00007f39fc772001 R09: 0000000000042000 [ 775.537304] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000001 [ 775.537694] R13: 00007f39fc772000 R14: 0000000000006401 R15: 0000000000000003 [ 775.538086] [ 775.538333] ---[ end trace 0000000000000000 ]--- This also affects dax+noreflink mode if we run the test after a dax+reflink test. So, the most urgent thing is solving the warning messages. With these fixes, most warning messages in dax_associate_entry() are gone. But honestly, generic/388 will randomly failed with the warning. The case shutdown the xfs when fsstress is running, and do it for many times. I think the reason is that dax pages in use are not able to be invalidated in time when fs is shutdown. The next time dax page to be associated, it still remains the mapping value set last time. I'll keep on solving it. The warning message in dax_writeback_one() can also be fixed because of the dax unshare. This patch (of 8): fsdax page is used not only when CoW, but also mapread. To make the it easily understood, use 'share' to indicate that the dax page is shared by more than one extent. And add helper functions to use it. Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED. [ruansy.fnst@fujitsu.com: rename several functions] Link: https://lkml.kernel.org/r/1669972991-246-1-git-send-email-ruansy.fnst@fujitsu.com [ruansy.fnst@fujitsu.com: v2.2] Link: https://lkml.kernel.org/r/1670381359-53-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-2-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Allison Henderson Reviewed-by: Darrick J. Wong Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: Alistair Popple Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 38 ++++++++++++++++++++++---------------- include/linux/mm_types.h | 5 ++++- include/linux/page-flags.h | 2 +- 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 1c6867810cbd..84fadea08705 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) -static inline bool dax_mapping_is_cow(struct address_space *mapping) +static inline bool dax_page_is_shared(struct page *page) { - return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; + return page->mapping == PAGE_MAPPING_DAX_SHARED; } /* - * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount. + * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the + * refcount. */ -static inline void dax_mapping_set_cow(struct page *page) +static inline void dax_page_share_get(struct page *page) { - if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + if (page->mapping != PAGE_MAPPING_DAX_SHARED) { /* * Reset the index if the page was already mapped * regularly before. */ if (page->mapping) - page->index = 1; - page->mapping = (void *)PAGE_MAPPING_DAX_COW; + page->share = 1; + page->mapping = PAGE_MAPPING_DAX_SHARED; } - page->index++; + page->share++; +} + +static inline unsigned long dax_page_share_put(struct page *page) +{ + return --page->share; } /* - * When it is called in dax_insert_entry(), the cow flag will indicate that + * When it is called in dax_insert_entry(), the shared flag will indicate that * whether this entry is shared by multiple files. If so, set the page->mapping - * FS_DAX_MAPPING_COW, and use page->index as refcount. + * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount. */ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address, bool cow) + struct vm_area_struct *vma, unsigned long address, bool shared) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - if (cow) { - dax_mapping_set_cow(page); + if (shared) { + dax_page_share_get(page); } else { WARN_ON_ONCE(page->mapping); page->mapping = mapping; @@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - if (dax_mapping_is_cow(page->mapping)) { - /* keep the CoW flag if this page is still shared */ - if (page->index-- > 0) + if (dax_page_is_shared(page)) { + /* keep the shared flag if this page is still shared */ + if (dax_page_share_put(page) > 0) continue; } else WARN_ON_ONCE(page->mapping && page->mapping != mapping); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 199f98be6f9c..3b8475007734 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -104,7 +104,10 @@ struct page { }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; - pgoff_t index; /* Our offset within mapping. */ + union { + pgoff_t index; /* Our offset within mapping. */ + unsigned long share; /* share count for fsdax */ + }; /** * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e42c55a7e012..9aec9fd8c50b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -638,7 +638,7 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) * Different with flags above, this flag is used only for fsdax mode. It * indicates that this page->mapping is now under reflink case. */ -#define PAGE_MAPPING_DAX_COW 0x1 +#define PAGE_MAPPING_DAX_SHARED ((void *)0x1) static __always_inline bool folio_mapping_flags(struct folio *folio) { From f80e1668888f34c0764822e74953c997daf2ccdb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:52 +0000 Subject: [PATCH 281/313] fsdax: invalidate pages when CoW CoW changes the share state of a dax page, but the share count of the page isn't updated. The next time access this page, it should have been a newly accessed, but old association exists. So, we need to clear the share state when CoW happens, in both dax_iomap_rw() and dax_zero_iter(). Link: https://lkml.kernel.org/r/1669908538-55-3-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 84fadea08705..c975d075e77b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1264,6 +1264,15 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) return length; + /* + * invalidate the pages whose sharing state is to be changed + * because of CoW. + */ + if (iomap->flags & IOMAP_F_SHARED) + invalidate_inode_pages2_range(iter->inode->i_mapping, + pos >> PAGE_SHIFT, + (pos + length - 1) >> PAGE_SHIFT); + do { unsigned offset = offset_in_page(pos); unsigned size = min_t(u64, PAGE_SIZE - offset, length); @@ -1324,12 +1333,13 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; - const struct iomap *srcmap = &iomi->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iomi); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; bool write = iov_iter_rw(iter) == WRITE; + bool cow = write && iomap->flags & IOMAP_F_SHARED; ssize_t ret = 0; size_t xfer; int id; @@ -1356,7 +1366,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ - if (iomap->flags & IOMAP_F_NEW) { + if (iomap->flags & IOMAP_F_NEW || cow) { invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); @@ -1390,8 +1400,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { + if (cow) { ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, kaddr); if (ret) From 708dfad2eb4169324189782edd6d3763237e0489 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:53 +0000 Subject: [PATCH 282/313] fsdax: zero the edges if source is HOLE or UNWRITTEN If srcmap contains invalid data, such as HOLE and UNWRITTEN, the dest page should be zeroed. Otherwise, since it's a pmem, old data may remains on the dest page, the result of CoW will be incorrect. The function name is also not easy to understand, rename it to "dax_iomap_copy_around()", which means it copies data around the range. [akpm@linux-foundation.org: update dax_iomap_copy_around() kerneldoc, per Darrick] Link: https://lkml.kernel.org/r/1669973145-318-1-git-send-email-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/1669908538-55-4-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Allison Henderson Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 79 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 30 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index c975d075e77b..359b958eb835 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1092,7 +1092,8 @@ out: } /** - * dax_iomap_cow_copy - Copy the data from source to destination before write + * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page + * by copying the data before and after the range to be written. * @pos: address to do copy from. * @length: size of copy operation. * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) @@ -1101,35 +1102,50 @@ out: * * This can be called from two places. Either during DAX write fault (page * aligned), to copy the length size data to daddr. Or, while doing normal DAX - * write operation, dax_iomap_actor() might call this to do the copy of either + * write operation, dax_iomap_iter() might call this to do the copy of either * start or end unaligned address. In the latter case the rest of the copy of - * aligned ranges is taken care by dax_iomap_actor() itself. + * aligned ranges is taken care by dax_iomap_iter() itself. + * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the + * area to make sure no old data remains. */ -static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, +static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size, const struct iomap *srcmap, void *daddr) { loff_t head_off = pos & (align_size - 1); size_t size = ALIGN(head_off + length, align_size); loff_t end = pos + length; loff_t pg_end = round_up(end, align_size); + /* copy_all is usually in page fault case */ bool copy_all = head_off == 0 && end == pg_end; + /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */ + bool zero_edge = srcmap->flags & IOMAP_F_SHARED || + srcmap->type == IOMAP_UNWRITTEN; void *saddr = 0; int ret = 0; - ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); - if (ret) - return ret; + if (!zero_edge) { + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); + if (ret) + return ret; + } if (copy_all) { - ret = copy_mc_to_kernel(daddr, saddr, length); - return ret ? -EIO : 0; + if (zero_edge) + memset(daddr, 0, size); + else + ret = copy_mc_to_kernel(daddr, saddr, length); + goto out; } /* Copy the head part of the range */ if (head_off) { - ret = copy_mc_to_kernel(daddr, saddr, head_off); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr, 0, head_off); + else { + ret = copy_mc_to_kernel(daddr, saddr, head_off); + if (ret) + return -EIO; + } } /* Copy the tail part of the range */ @@ -1137,12 +1153,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, loff_t tail_off = head_off + length; loff_t tail_len = pg_end - end; - ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, - tail_len); - if (ret) - return -EIO; + if (zero_edge) + memset(daddr + tail_off, 0, tail_len); + else { + ret = copy_mc_to_kernel(daddr + tail_off, + saddr + tail_off, tail_len); + if (ret) + return -EIO; + } } - return 0; +out: + if (zero_edge) + dax_flush(srcmap->dax_dev, daddr, size); + return ret ? -EIO : 0; } /* @@ -1241,13 +1264,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) if (ret < 0) return ret; memset(kaddr + offset, 0, size); - if (srcmap->addr != iomap->addr) { - ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, - kaddr); - if (ret < 0) - return ret; - dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE); - } else + if (iomap->flags & IOMAP_F_SHARED) + ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap, + kaddr); + else dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } @@ -1401,8 +1421,8 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } if (cow) { - ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, - kaddr); + ret = dax_iomap_copy_around(pos, length, PAGE_SIZE, + srcmap, kaddr); if (ret) break; } @@ -1547,7 +1567,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, struct xa_state *xas, void **entry, bool pmd) { const struct iomap *iomap = &iter->iomap; - const struct iomap *srcmap = &iter->srcmap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); size_t size = pmd ? PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; @@ -1578,9 +1598,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); - if (write && - srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { - err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr); + if (write && iomap->flags & IOMAP_F_SHARED) { + err = dax_iomap_copy_around(pos, size, size, srcmap, kaddr); if (err) return dax_fault_return(err); } From c6f0b395b2110aa26a134a9a395875b1ec0a5aae Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:28:54 +0000 Subject: [PATCH 283/313] fsdax,xfs: set the shared flag when file extent is shared If a dax page is shared, mapread at different offsets can also trigger page fault on same dax page. So, change the flag from "cow" to "shared". And get the shared flag from filesystem when read. Link: https://lkml.kernel.org/r/1669908538-55-5-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 19 +++++++------------ fs/xfs/xfs_iomap.c | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 359b958eb835..fa547ce41add 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -846,12 +846,6 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter, (iter->iomap.flags & IOMAP_F_DIRTY); } -static bool dax_fault_is_cow(const struct iomap_iter *iter) -{ - return (iter->flags & IOMAP_WRITE) && - (iter->iomap.flags & IOMAP_F_SHARED); -} - /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to @@ -865,13 +859,14 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); - bool dirty = !dax_fault_is_synchronous(iter, vmf->vma); - bool cow = dax_fault_is_cow(iter); + bool write = iter->flags & IOMAP_WRITE; + bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma); + bool shared = iter->iomap.flags & IOMAP_F_SHARED; if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { + if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) @@ -883,12 +878,12 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, xas_reset(xas); xas_lock_irq(xas); - if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, - cow); + shared); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -908,7 +903,7 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); - if (cow) + if (write && shared) xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); xas_unlock_irq(xas); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 07da03976ec1..881de99766ca 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1215,7 +1215,7 @@ xfs_read_iomap_begin( return error; error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); - if (!error && (flags & IOMAP_REPORT)) + if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode))) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); xfs_iunlock(ip, lockmode); From 0e79e3736d54bb8efbc9fb29cc3b54a132783565 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:31:41 +0000 Subject: [PATCH 284/313] fsdax: dedupe: iter two files at the same time The iomap_iter() on a range of one file may loop more than once. In this case, the inner dst_iter can update its iomap but the outer src_iter can't. This may cause the wrong remapping in filesystem. Let them called at the same time. Link: https://lkml.kernel.org/r/1669908701-93-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index fa547ce41add..8fb928cd9dce 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1965,15 +1965,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret; + int ret, compared = 0; - while ((ret = iomap_iter(&src_iter, ops)) > 0) { - while ((ret = iomap_iter(&dst_iter, ops)) > 0) { - dst_iter.processed = dax_range_compare_iter(&src_iter, - &dst_iter, len, same); - } - if (ret <= 0) - src_iter.processed = ret; + while ((ret = iomap_iter(&src_iter, ops)) > 0 && + (ret = iomap_iter(&dst_iter, ops)) > 0) { + compared = dax_range_compare_iter(&src_iter, &dst_iter, len, + same); + if (compared < 0) + return ret; + src_iter.processed = dst_iter.processed = compared; } return ret; } From 64e6edc185da7e101e867c4732c097fedb1da08e Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:10 +0000 Subject: [PATCH 285/313] xfs: use dax ops for zero and truncate in fsdax mode Zero and truncate on a dax file may execute CoW. So use dax ops which contains end work for CoW. Link: https://lkml.kernel.org/r/1669908730-131-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/xfs/xfs_iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 881de99766ca..d9401d0300ad 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -1370,7 +1370,7 @@ xfs_zero_range( if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, &xfs_buffered_write_iomap_ops); } @@ -1385,7 +1385,7 @@ xfs_truncate_page( if (IS_DAX(inode)) return dax_truncate_page(inode, pos, did_zero, - &xfs_direct_write_iomap_ops); + &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, &xfs_buffered_write_iomap_ops); } From d984648e428bf88cbd94ebe346c73632cb92fffb Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:33 +0000 Subject: [PATCH 286/313] fsdax,xfs: port unshare to fsdax Implement unshare in fsdax mode: copy data from srcmap to iomap. Link: https://lkml.kernel.org/r/1669908753-169-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/dax.c | 52 ++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_reflink.c | 8 +++++-- include/linux/dax.h | 2 ++ 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 8fb928cd9dce..c48a3a93ab29 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1245,6 +1245,58 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ +static s64 dax_unshare_iter(struct iomap_iter *iter) +{ + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); + int id = 0; + s64 ret = 0; + void *daddr = NULL, *saddr = NULL; + + /* don't bother with blocks that are not shared to start with */ + if (!(iomap->flags & IOMAP_F_SHARED)) + return length; + /* don't bother with holes or unwritten extents */ + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) + return length; + + id = dax_read_lock(); + ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = copy_mc_to_kernel(daddr, saddr, length); + if (ret) + ret = -EIO; + +out_unlock: + dax_read_unlock(id); + return ret; +} + +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops) +{ + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = dax_unshare_iter(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(dax_file_unshare); + static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { const struct iomap *iomap = &iter->iomap; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 93bdd25680bc..fe46bce8cae6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1693,8 +1693,12 @@ xfs_reflink_unshare( inode_dio_wait(inode); - error = iomap_file_unshare(inode, offset, len, - &xfs_buffered_write_iomap_ops); + if (IS_DAX(inode)) + error = dax_file_unshare(inode, offset, len, + &xfs_dax_write_iomap_ops); + else + error = iomap_file_unshare(inode, offset, len, + &xfs_buffered_write_iomap_ops); if (error) goto out; diff --git a/include/linux/dax.h b/include/linux/dax.h index ba985333e26b..2b5ecb591059 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -205,6 +205,8 @@ static inline void dax_unlock_mapping_entry(struct address_space *mapping, } #endif +int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, + const struct iomap_ops *ops); int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops); int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, From 480017957d6380d3336a8e80ad90f70415bb86f7 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 1 Dec 2022 15:32:53 +0000 Subject: [PATCH 287/313] xfs: remove restrictions for fsdax and reflink Since the basic function for fsdax and reflink has been implemented, remove the restrictions of them for widly test. Link: https://lkml.kernel.org/r/1669908773-207-1-git-send-email-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Alistair Popple Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: John Hubbard Signed-off-by: Andrew Morton --- fs/xfs/xfs_ioctl.c | 4 ---- fs/xfs/xfs_iops.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 1f783e979629..13f1b2add390 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1138,10 +1138,6 @@ xfs_ioctl_setattr_xflags( if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; - /* Don't allow us to set DAX mode for a reflinked file for now. */ - if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) - return -EINVAL; - /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); if (i_flags2 && !xfs_has_v3inodes(mp)) diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2e10e1c66ad6..bf0495f7a5e1 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1185,10 +1185,6 @@ xfs_inode_supports_dax( if (!S_ISREG(VFS_I(ip)->i_mode)) return false; - /* Only supported on non-reflinked files. */ - if (xfs_is_reflink_inode(ip)) - return false; - /* Block size must match page size */ if (mp->m_sb.sb_blocksize != PAGE_SIZE) return false; From ac4b2901a112e4dcee1455c96d89ef83cc7aa545 Mon Sep 17 00:00:00 2001 From: Deyan Wang Date: Thu, 1 Dec 2022 21:50:45 +0800 Subject: [PATCH 288/313] mm/page_alloc: update comments in __free_pages_ok() Add a comment to explain why we call get_pfnblock_migratetype() twice in __free_pages_ok(). Link: https://lkml.kernel.org/r/20221201135045.31663-1-wonder_rock@126.com Signed-off-by: Deyan Wang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ab9dd29ef7e..0745aedebb37 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1702,6 +1702,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, if (!free_pages_prepare(page, order, true, fpi_flags)) return; + /* + * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here + * is used to avoid calling get_pfnblock_migratetype() under the lock. + * This will reduce the lock holding time. + */ migratetype = get_pfnblock_migratetype(page, pfn); spin_lock_irqsave(&zone->lock, flags); From a11774122180a782b327b0a9a5091d99c91a4db7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:38 +0100 Subject: [PATCH 289/313] extfat: remove ->writepage Patch series "start removing writepage instances v2". The VM doesn't need or want ->writepage for writeback and is fine with just having ->writepages as long as ->migrate_folio is implemented. This series removes all ->writepage instances that use block_write_full_page directly and also have a plain mpage_writepages based ->writepages. This patch (of 7): ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or a a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-1-hch@lst.de Link: https://lkml.kernel.org/r/20221202102644.770505-2-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Namjae Jeon Acked-by: Johannes Weiner Cc: Bob Copeland Cc: Dave Kleikamp Cc: Jan Kara Cc: Mikulas Patocka Cc: OGAWA Hirofumi Cc: Sungjong Seo Signed-off-by: Andrew Morton --- fs/exfat/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 5590a1e83126..eac95bcd9a8a 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -345,11 +345,6 @@ static void exfat_readahead(struct readahead_control *rac) mpage_readahead(rac, exfat_get_block); } -static int exfat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, exfat_get_block, wbc); -} - static int exfat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -473,12 +468,12 @@ static const struct address_space_operations exfat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = exfat_read_folio, .readahead = exfat_readahead, - .writepage = exfat_writepage, .writepages = exfat_writepages, .write_begin = exfat_write_begin, .write_end = exfat_write_end, .direct_IO = exfat_direct_IO, - .bmap = exfat_aop_bmap + .bmap = exfat_aop_bmap, + .migrate_folio = buffer_migrate_folio, }; static inline unsigned long exfat_hash(loff_t i_pos) From ee649af0d9a60ea61a5dad99ef5d6b4aa346f0a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:39 +0100 Subject: [PATCH 290/313] fat: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or a a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-3-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/fat/inode.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 1cbcc4608dc7..d99b8549ec8f 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -194,11 +194,6 @@ static int fat_get_block(struct inode *inode, sector_t iblock, return 0; } -static int fat_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, fat_get_block, wbc); -} - static int fat_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -346,12 +341,12 @@ static const struct address_space_operations fat_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = fat_read_folio, .readahead = fat_readahead, - .writepage = fat_writepage, .writepages = fat_writepages, .write_begin = fat_write_begin, .write_end = fat_write_end, .direct_IO = fat_direct_IO, - .bmap = _fat_bmap + .bmap = _fat_bmap, + .migrate_folio = buffer_migrate_folio, }; /* From ba195d9f14829690b8e4f67549960d83169a314e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:40 +0100 Subject: [PATCH 291/313] hfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or a a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and stop wiring up ->writepage for hfs_aops. Link: https://lkml.kernel.org/r/20221202102644.770505-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index c4526f16355d..16466a5e88b4 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -173,12 +173,12 @@ const struct address_space_operations hfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfs_read_folio, - .writepage = hfs_writepage, .write_begin = hfs_write_begin, .write_end = generic_write_end, .bmap = hfs_bmap, .direct_IO = hfs_direct_IO, .writepages = hfs_writepages, + .migrate_folio = buffer_migrate_folio, }; /* From 12f9b9a73dc603e658bf24eed2777cecdaf4103e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:41 +0100 Subject: [PATCH 292/313] hfsplus: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or a a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and stop wiring up ->writepage for hfsplus_aops. Link: https://lkml.kernel.org/r/20221202102644.770505-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hfsplus/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index aeab83ed1c9c..d6572ad2407a 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -170,12 +170,12 @@ const struct address_space_operations hfsplus_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hfsplus_read_folio, - .writepage = hfsplus_writepage, .write_begin = hfsplus_write_begin, .write_end = generic_write_end, .bmap = hfsplus_bmap, .direct_IO = hfsplus_direct_IO, .writepages = hfsplus_writepages, + .migrate_folio = buffer_migrate_folio, }; const struct dentry_operations hfsplus_dentry_operations = { From cd2e6024260de27a523e0af6ee47a20a6b8b8aa8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:42 +0100 Subject: [PATCH 293/313] hpfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or a a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-6-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/hpfs/file.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index f7547a62c81f..88952d4a631e 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -163,11 +163,6 @@ static int hpfs_read_folio(struct file *file, struct folio *folio) return mpage_read_folio(folio, hpfs_get_block); } -static int hpfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, hpfs_get_block, wbc); -} - static void hpfs_readahead(struct readahead_control *rac) { mpage_readahead(rac, hpfs_get_block); @@ -248,12 +243,12 @@ const struct address_space_operations hpfs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = hpfs_read_folio, - .writepage = hpfs_writepage, .readahead = hpfs_readahead, .writepages = hpfs_writepages, .write_begin = hpfs_write_begin, .write_end = hpfs_write_end, - .bmap = _hpfs_bmap + .bmap = _hpfs_bmap, + .migrate_folio = buffer_migrate_folio, }; const struct file_operations hpfs_file_ops = From 2274c3b281bb47e6980ae42fb8dc93b7a38192d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:43 +0100 Subject: [PATCH 294/313] jfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or a a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Dave Kleikamp Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/jfs/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index d1ec920aa030..8ac10e396050 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -264,11 +264,6 @@ int jfs_get_block(struct inode *ip, sector_t lblock, return rc; } -static int jfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, jfs_get_block, wbc); -} - static int jfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -355,12 +350,12 @@ const struct address_space_operations jfs_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = jfs_read_folio, .readahead = jfs_readahead, - .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, .write_end = jfs_write_end, .bmap = jfs_bmap, .direct_IO = jfs_direct_IO, + .migrate_folio = buffer_migrate_folio, }; /* From 1bda9dad5aa0199c8592bac32b91afbf8ea236ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Dec 2022 11:26:44 +0100 Subject: [PATCH 295/313] omfs: remove ->writepage ->writepage is a very inefficient method to write back data, and only used through write_cache_pages or a a fallback when no ->migrate_folio method is present. Set ->migrate_folio to the generic buffer_head based helper, and remove the ->writepage implementation. Link: https://lkml.kernel.org/r/20221202102644.770505-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Bob Copeland Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- fs/omfs/file.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/omfs/file.c b/fs/omfs/file.c index fa7fe2393ff6..3a5b4b88a583 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -294,11 +294,6 @@ static void omfs_readahead(struct readahead_control *rac) mpage_readahead(rac, omfs_get_block); } -static int omfs_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, omfs_get_block, wbc); -} - static int omfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -375,10 +370,10 @@ const struct address_space_operations omfs_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = omfs_read_folio, .readahead = omfs_readahead, - .writepage = omfs_writepage, .writepages = omfs_writepages, .write_begin = omfs_write_begin, .write_end = generic_write_end, .bmap = omfs_bmap, + .migrate_folio = buffer_migrate_folio, }; From 675eaca1f441acd4f0d403d71036b100cd49036a Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 2 Dec 2022 15:53:39 +1100 Subject: [PATCH 296/313] mm/mmap: properly unaccount memory on mas_preallocate() failure security_vm_enough_memory_mm() accounts memory via a call to vm_acct_memory(). Therefore any subsequent failures should unaccount for this memory prior to returning the error. Link: https://lkml.kernel.org/r/20221202045339.2999017-1-apopple@nvidia.com Fixes: 28c5609fb236 ("mm/mmap: preallocate maple nodes for brk vma expansion") Signed-off-by: Alistair Popple Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1741273ac34c..7d24fc478ffa 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2953,7 +2953,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { mas_set_range(mas, vma->vm_start, addr + len - 1); if (mas_preallocate(mas, vma, GFP_KERNEL)) - return -ENOMEM; + goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); if (vma->anon_vma) { @@ -2975,7 +2975,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) - goto vma_alloc_fail; + goto unacct_fail; vma_set_anonymous(vma); vma->vm_start = addr; @@ -3000,7 +3000,7 @@ out: mas_store_fail: vm_area_free(vma); -vma_alloc_fail: +unacct_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } From adb8213014b25c7f1d75d5b219becaadcd695efb Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 2 Dec 2022 03:15:10 +0000 Subject: [PATCH 297/313] mm: memcg: fix stale protection of reclaim target memcg Patch series "mm: memcg: fix protection of reclaim target memcg", v3. This series fixes a bug in calculating the protection of the reclaim target memcg where we end up using stale effective protection values from the last reclaim operation, instead of completely ignoring the protection of the reclaim target as intended. More detailed explanation and examples in patch 1, which includes the fix. Patches 2 & 3 introduce a selftest case that catches the bug. This patch (of 3): When we are doing memcg reclaim, the intended behavior is that we ignore any protection (memory.min, memory.low) of the target memcg (but not its children). Ever since the patch pointed to by the "Fixes" tag, we actually read a stale value for the target memcg protection when deciding whether to skip the memcg or not because it is protected. If the stale value happens to be high enough, we don't reclaim from the target memcg. Essentially, in some cases we may falsely skip reclaiming from the target memcg of reclaim because we read a stale protection value from last time we reclaimed from it. During reclaim, mem_cgroup_calculate_protection() is used to determine the effective protection (emin and elow) values of a memcg. The protection of the reclaim target is ignored, but we cannot set their effective protection to 0 due to a limitation of the current implementation (see comment in mem_cgroup_protection()). Instead, we leave their effective protection values unchaged, and later ignore it in mem_cgroup_protection(). However, mem_cgroup_protection() is called later in shrink_lruvec()->get_scan_count(), which is after the mem_cgroup_below_{min/low}() checks in shrink_node_memcgs(). As a result, the stale effective protection values of the target memcg may lead us to skip reclaiming from the target memcg entirely, before calling shrink_lruvec(). This can be even worse with recursive protection, where the stale target memcg protection can be higher than its standalone protection. See two examples below (a similar version of example (a) is added to test_memcontrol in a later patch). (a) A simple example with proactive reclaim is as follows. Consider the following hierarchy: ROOT | A | B (memory.min = 10M) Consider the following scenario: - B has memory.current = 10M. - The system undergoes global reclaim (or memcg reclaim in A). - In shrink_node_memcgs(): - mem_cgroup_calculate_protection() calculates the effective min (emin) of B as 10M. - mem_cgroup_below_min() returns true for B, we do not reclaim from B. - Now if we want to reclaim 5M from B using proactive reclaim (memory.reclaim), we should be able to, as the protection of the target memcg should be ignored. - In shrink_node_memcgs(): - mem_cgroup_calculate_protection() immediately returns for B without doing anything, as B is the target memcg, relying on mem_cgroup_protection() to ignore B's stale effective min (still 10M). - mem_cgroup_below_min() reads the stale effective min for B and we skip it instead of ignoring its protection as intended, as we never reach mem_cgroup_protection(). (b) An more complex example with recursive protection is as follows. Consider the following hierarchy with memory_recursiveprot: ROOT | A (memory.min = 50M) | B (memory.min = 10M, memory.high = 40M) Consider the following scenario: - B has memory.current = 35M. - The system undergoes global reclaim (target memcg is NULL). - B will have an effective min of 50M (all of A's unclaimed protection). - B will not be reclaimed from. - Now allocate 10M more memory in B, pushing it above it's high limit. - The system undergoes memcg reclaim from B (target memcg is B). - Like example (a), we do nothing in mem_cgroup_calculate_protection(), then call mem_cgroup_below_min(), which will read the stale effective min for B (50M) and skip it. In this case, it's even worse because we are not just considering B's standalone protection (10M), but we are reading a much higher stale protection (50M) which will cause us to not reclaim from B at all. This is an artifact of commit 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks") which made mem_cgroup_calculate_protection() only change the state without returning any value. Before that commit, we used to return MEMCG_PROT_NONE for the target memcg, which would cause us to skip the mem_cgroup_below_{min/low}() checks. After that commit we do not return anything and we end up checking the min & low effective protections for the target memcg, which are stale. Update mem_cgroup_supports_protection() to also check if we are reclaiming from the target, and rename it to mem_cgroup_unprotected() (now returns true if we should not protect the memcg, much simpler logic). Link: https://lkml.kernel.org/r/20221202031512.1365483-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20221202031512.1365483-2-yosryahmed@google.com Fixes: 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks") Signed-off-by: Yosry Ahmed Reviewed-by: Roman Gushchin Cc: Chris Down Cc: David Rientjes Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 31 +++++++++++++++++++++---------- mm/vmscan.c | 11 ++++++----- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e1644a24009c..d3c8203cab6c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -615,28 +615,32 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root, void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); -static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg) +static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, + struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support - * protection. + * protection. The target memcg's protection is ignored, see + * mem_cgroup_calculate_protection() and mem_cgroup_protection() */ - return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg); - + return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) || + memcg == target; } -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, + struct mem_cgroup *memcg) { - if (!mem_cgroup_supports_protection(memcg)) + if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, + struct mem_cgroup *memcg) { - if (!mem_cgroup_supports_protection(memcg)) + if (mem_cgroup_unprotected(target, memcg)) return false; return READ_ONCE(memcg->memory.emin) >= @@ -1209,12 +1213,19 @@ static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, { } -static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) +static inline bool mem_cgroup_unprotected(struct mem_cgroup *target, + struct mem_cgroup *memcg) +{ + return true; +} +static inline bool mem_cgroup_below_low(struct mem_cgroup *target, + struct mem_cgroup *memcg) { return false; } -static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) +static inline bool mem_cgroup_below_min(struct mem_cgroup *target, + struct mem_cgroup *memcg) { return false; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9356a3ee639c..dcd476a66a59 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4513,7 +4513,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned mem_cgroup_calculate_protection(NULL, memcg); - if (mem_cgroup_below_min(memcg)) + if (mem_cgroup_below_min(NULL, memcg)) return false; need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); @@ -5100,8 +5100,9 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - if (mem_cgroup_below_min(memcg) || - (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || + (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && + !sc->memcg_low_reclaim)) return 0; *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); @@ -6096,13 +6097,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) mem_cgroup_calculate_protection(target_memcg, memcg); - if (mem_cgroup_below_min(memcg)) { + if (mem_cgroup_below_min(target_memcg, memcg)) { /* * Hard protection. * If there is no reclaimable memory, OOM. */ continue; - } else if (mem_cgroup_below_low(memcg)) { + } else if (mem_cgroup_below_low(target_memcg, memcg)) { /* * Soft protection. * Respect the protection only as long as From e5d64edac64531375716fabe35c9e0a502ca2894 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 2 Dec 2022 03:15:11 +0000 Subject: [PATCH 298/313] selftests: cgroup: refactor proactive reclaim code to reclaim_until() Refactor the code that drives writing to memory.reclaim (retrying, error handling, etc) from test_memcg_reclaim() to a helper called reclaim_until(), which proactively reclaims from a memcg until its usage reaches a certain value. While we are at it, refactor and simplify the reclaim loop. This will be used in a following patch in another test. Link: https://lkml.kernel.org/r/20221202031512.1365483-3-yosryahmed@google.com Signed-off-by: Yosry Ahmed Suggested-by: Roman Gushchin Reviewed-by: Roman Gushchin Cc: Chris Down Cc: David Rientjes Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- .../selftests/cgroup/test_memcontrol.c | 80 ++++++++++--------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index 8833359556f3..a8f4700353a4 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -645,6 +645,48 @@ cleanup: return ret; } +/* + * Reclaim from @memcg until usage reaches @goal by writing to + * memory.reclaim. + * + * This function will return false if the usage is already below the + * goal. + * + * This function assumes that writing to memory.reclaim is the only + * source of change in memory.current (no concurrent allocations or + * reclaim). + * + * This function makes sure memory.reclaim is sane. It will return + * false if memory.reclaim's error codes do not make sense, even if + * the usage goal was satisfied. + */ +static bool reclaim_until(const char *memcg, long goal) +{ + char buf[64]; + int retries, err; + long current, to_reclaim; + bool reclaimed = false; + + for (retries = 5; retries > 0; retries--) { + current = cg_read_long(memcg, "memory.current"); + + if (current < goal || values_close(current, goal, 3)) + break; + /* Did memory.reclaim return 0 incorrectly? */ + else if (reclaimed) + return false; + + to_reclaim = current - goal; + snprintf(buf, sizeof(buf), "%ld", to_reclaim); + err = cg_write(memcg, "memory.reclaim", buf); + if (!err) + reclaimed = true; + else if (err != -EAGAIN) + return false; + } + return reclaimed; +} + /* * This test checks that memory.reclaim reclaims the given * amount of memory (from both anon and file, if possible). @@ -653,8 +695,7 @@ static int test_memcg_reclaim(const char *root) { int ret = KSFT_FAIL, fd, retries; char *memcg; - long current, expected_usage, to_reclaim; - char buf[64]; + long current, expected_usage; memcg = cg_name(root, "memcg_test"); if (!memcg) @@ -705,41 +746,8 @@ static int test_memcg_reclaim(const char *root) * Reclaim until current reaches 30M, this makes sure we hit both anon * and file if swap is enabled. */ - retries = 5; - while (true) { - int err; - - current = cg_read_long(memcg, "memory.current"); - to_reclaim = current - MB(30); - - /* - * We only keep looping if we get EAGAIN, which means we could - * not reclaim the full amount. - */ - if (to_reclaim <= 0) - goto cleanup; - - - snprintf(buf, sizeof(buf), "%ld", to_reclaim); - err = cg_write(memcg, "memory.reclaim", buf); - if (!err) { - /* - * If writing succeeds, then the written amount should have been - * fully reclaimed (and maybe more). - */ - current = cg_read_long(memcg, "memory.current"); - if (!values_close(current, MB(30), 3) && current > MB(30)) - goto cleanup; - break; - } - - /* The kernel could not reclaim the full amount, try again. */ - if (err == -EAGAIN && retries--) - continue; - - /* We got an unexpected error or ran out of retries. */ + if (!reclaim_until(memcg, MB(30))) goto cleanup; - } ret = KSFT_PASS; cleanup: From 1c74697776e17619e485a40cf8cfdb4bf18fd18e Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 2 Dec 2022 03:15:12 +0000 Subject: [PATCH 299/313] selftests: cgroup: make sure reclaim target memcg is unprotected Make sure that we ignore protection of a memcg that is the target of memcg reclaim. Link: https://lkml.kernel.org/r/20221202031512.1365483-4-yosryahmed@google.com Signed-off-by: Yosry Ahmed Reviewed-by: Roman Gushchin Cc: Chris Down Cc: David Rientjes Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Cc: Tejun Heo Cc: Vasily Averin Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_memcontrol.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index a8f4700353a4..1e616a8c6a9c 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -238,6 +238,8 @@ static int cg_test_proc_killed(const char *cgroup) return -1; } +static bool reclaim_until(const char *memcg, long goal); + /* * First, this test creates the following hierarchy: * A memory.min = 0, memory.max = 200M @@ -266,6 +268,12 @@ static int cg_test_proc_killed(const char *cgroup) * unprotected memory in A available, and checks that: * a) memory.min protects pagecache even in this case, * b) memory.low allows reclaiming page cache with low events. + * + * Then we try to reclaim from A/B/C using memory.reclaim until its + * usage reaches 10M. + * This makes sure that: + * (a) We ignore the protection of the reclaim target memcg. + * (b) The previously calculated emin value (~29M) should be dismissed. */ static int test_memcg_protection(const char *root, bool min) { @@ -385,6 +393,9 @@ static int test_memcg_protection(const char *root, bool min) if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3)) goto cleanup; + if (!reclaim_until(children[0], MB(10))) + goto cleanup; + if (min) { ret = KSFT_PASS; goto cleanup; From 6b426d071419a40f61fe41fe1bd9e1b4fa5aeb37 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 1 Dec 2022 15:33:17 -0800 Subject: [PATCH 300/313] mm: disable top-tier fallback to reclaim on proactive reclaim Reclaiming directly from top tier nodes breaks the aging pipeline of memory tiers. If we have a RAM -> CXL -> storage hierarchy, we should demote from RAM to CXL and from CXL to storage. If we reclaim a page from RAM, it means we 'demote' it directly from RAM to storage, bypassing potentially a huge amount of pages colder than it in CXL. However disabling reclaim from top tier nodes entirely would cause ooms in edge scenarios where lower tier memory is unreclaimable for whatever reason, e.g. memory being mlocked() or too hot to reclaim. In these cases we would rather the job run with a performance regression rather than it oom altogether. However, we can disable reclaim from top tier nodes for proactive reclaim. That reclaim is not real memory pressure, and we don't have any cause to be breaking the aging pipeline. [akpm@linux-foundation.org: restore comment layout, per Ying Huang] Link: https://lkml.kernel.org/r/20221201233317.1394958-1-almasrymina@google.com Signed-off-by: Mina Almasry Reviewed-by: "Huang, Ying" Reviewed-by: Yang Shi Cc: Greg Thelen Cc: Shakeel Butt Cc: Tim Chen Cc: Wei Xu Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/vmscan.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index dcd476a66a59..1a59171c6695 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2088,10 +2088,29 @@ keep: nr_reclaimed += demote_folio_list(&demote_folios, pgdat); /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { - /* Folios which weren't demoted go back on @folio_list for retry: */ + /* Folios which weren't demoted go back on @folio_list */ list_splice_init(&demote_folios, folio_list); - do_demote_pass = false; - goto retry; + + /* + * goto retry to reclaim the undemoted folios in folio_list if + * desired. + * + * Reclaiming directly from top tier nodes is not often desired + * due to it breaking the LRU ordering: in general memory + * should be reclaimed from lower tier nodes and demoted from + * top tier nodes. + * + * However, disabling reclaim from top tier nodes entirely + * would cause ooms in edge scenarios where lower tier memory + * is unreclaimable for whatever reason, eg memory being + * mlocked or too hot to reclaim. We can disable reclaim + * from top tier nodes in proactive reclaim though as that is + * not real memory pressure. + */ + if (!sc->proactive) { + do_demote_pass = false; + goto retry; + } } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; From 12a5d3955227b0d7e04fb793ccceeb2a1dd275c5 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 2 Dec 2022 14:35:31 -0800 Subject: [PATCH 301/313] mm: add nodes= arg to memory.reclaim The nodes= arg instructs the kernel to only scan the given nodes for proactive reclaim. For example use cases, consider a 2 tier memory system: nodes 0,1 -> top tier nodes 2,3 -> second tier $ echo "1m nodes=0" > memory.reclaim This instructs the kernel to attempt to reclaim 1m memory from node 0. Since node 0 is a top tier node, demotion will be attempted first. This is useful to direct proactive reclaim to specific nodes that are under pressure. $ echo "1m nodes=2,3" > memory.reclaim This instructs the kernel to attempt to reclaim 1m memory in the second tier, since this tier of memory has no demotion targets the memory will be reclaimed. $ echo "1m nodes=0,1" > memory.reclaim Instructs the kernel to reclaim memory from the top tier nodes, which can be desirable according to the userspace policy if there is pressure on the top tiers. Since these nodes have demotion targets, the kernel will attempt demotion first. Since commit 3f1509c57b1b ("Revert "mm/vmscan: never demote for memcg reclaim""), the proactive reclaim interface memory.reclaim does both reclaim and demotion. Reclaim and demotion incur different latency costs to the jobs in the cgroup. Demoted memory would still be addressable by the userspace at a higher latency, but reclaimed memory would need to incur a pagefault. The 'nodes' arg is useful to allow the userspace to control demotion and reclaim independently according to its policy: if the memory.reclaim is called on a node with demotion targets, it will attempt demotion first; if it is called on a node without demotion targets, it will only attempt reclaim. Link: https://lkml.kernel.org/r/20221202223533.1785418-1-almasrymina@google.com Signed-off-by: Mina Almasry Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Muchun Song Cc: Bagas Sanjaya Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Roman Gushchin Cc: Tejun Heo Cc: Wei Xu Cc: Yang Shi Cc: Yosry Ahmed Cc: zefan li Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 15 +++--- include/linux/swap.h | 3 +- mm/memcontrol.c | 67 ++++++++++++++++++++----- mm/vmscan.c | 4 +- 4 files changed, 68 insertions(+), 21 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 74cec76be9f2..c8ae7c897f14 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1245,17 +1245,13 @@ PAGE_SIZE multiple when read back. This is a simple interface to trigger memory reclaim in the target cgroup. - This file accepts a single key, the number of bytes to reclaim. - No nested keys are currently supported. + This file accepts a string which contains the number of bytes to + reclaim. Example:: echo "1G" > memory.reclaim - The interface can be later extended with nested keys to - configure the reclaim behavior. For example, specify the - type of memory to reclaim from (anon, file, ..). - Please note that the kernel can over or under reclaim from the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. @@ -1267,6 +1263,13 @@ PAGE_SIZE multiple when read back. This means that the networking layer will not adapt based on reclaim induced by memory.reclaim. + This file also allows the user to specify the nodes to reclaim from, + via the 'nodes=' key, for example:: + + echo "1G nodes=0,1" > memory.reclaim + + The above instructs the kernel to reclaim memory from nodes 0,1. + memory.peak A read-only single value file which exists on non-root cgroups. diff --git a/include/linux/swap.h b/include/linux/swap.h index 0ceed49516ad..2787b84eaf12 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,7 +418,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options); + unsigned int reclaim_options, + nodemask_t *nodemask); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2c7a91689fef..ff65bc23be13 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "internal.h" #include #include @@ -2392,7 +2393,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP); + MEMCG_RECLAIM_MAY_SWAP, + NULL); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2683,7 +2685,8 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options); + gfp_mask, reclaim_options, + NULL); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3503,7 +3506,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { + memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, + NULL)) { ret = -EBUSY; break; } @@ -3614,7 +3618,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP)) + MEMCG_RECLAIM_MAY_SWAP, + NULL)) nr_retries--; } @@ -6418,7 +6423,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, + NULL); if (!reclaimed && !nr_retries--) break; @@ -6467,7 +6473,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, + NULL)) nr_reclaims--; continue; } @@ -6590,21 +6597,54 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +enum { + MEMORY_RECLAIM_NODES = 0, + MEMORY_RECLAIM_NULL, +}; + +static const match_table_t if_tokens = { + { MEMORY_RECLAIM_NODES, "nodes=%s" }, + { MEMORY_RECLAIM_NULL, NULL }, +}; + static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - unsigned int reclaim_options; - int err; + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | + MEMCG_RECLAIM_PROACTIVE; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + int token; + char value[256]; + nodemask_t nodemask = NODE_MASK_ALL; buf = strstrip(buf); - err = page_counter_memparse(buf, "", &nr_to_reclaim); - if (err) - return err; - reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + token = match_token(start, if_tokens, args); + match_strlcpy(value, args, sizeof(value)); + switch (token) { + case MEMORY_RECLAIM_NODES: + if (nodelist_parse(value, nodemask) < 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6621,7 +6661,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options); + GFP_KERNEL, reclaim_options, + &nodemask); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/vmscan.c b/mm/vmscan.c index 1a59171c6695..2b42ac9ad755 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6758,7 +6758,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options) + unsigned int reclaim_options, + nodemask_t *nodemask) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -6773,6 +6774,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), + .nodemask = nodemask, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put From c449deb2b99ff2458214ed4a3526277bc9e40757 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 4 Dec 2022 17:01:03 -0800 Subject: [PATCH 302/313] mm: memcg: fix swapcached stat accounting I'd been worried by high "swapcached" counts in memcg OOM reports, thought we had a problem freeing swapcache, but it was just the accounting that was wrong. Two issues: 1. When __remove_mapping() removes swapcache, __delete_from_swap_cache() relies on memcg_data for the right counts to be updated; but that had already been reset by mem_cgroup_swapout(). Swap those calls around - mem_cgroup_swapout() does not require the swapcached flag to be set. 6.1 commit ac35a4902374 ("mm: multi-gen LRU: minimal implementation") already made a similar swap for workingset_eviction(), but not for this. 2. memcg's "swapcached" count was added for memcg v2 stats, but displayed on OOM even for memcg v1: so mem_cgroup_move_account() ought to move it. Link: https://lkml.kernel.org/r/b8b96ee0-1e1e-85f8-df97-c82a11d7cd14@google.com Fixes: b6038942480e ("mm: memcg: add swapcache stat for memcg v2") Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Michal Hocko Cc: Roman Gushchin Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 ++++++ mm/vmscan.c | 3 +-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ff65bc23be13..ab457f0394ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5767,6 +5767,12 @@ static int mem_cgroup_move_account(struct page *page, } } +#ifdef CONFIG_SWAP + if (folio_test_swapcache(folio)) { + __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); + __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); + } +#endif if (folio_test_writeback(folio)) { __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 2b42ac9ad755..aba991c505f1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1368,11 +1368,10 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (folio_test_swapcache(folio)) { swp_entry_t swap = folio_swap_entry(folio); - /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */ if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - mem_cgroup_swapout(folio, swap); __delete_from_swap_cache(folio, swap, shadow); + mem_cgroup_swapout(folio, swap); xa_unlock_irq(&mapping->i_pages); put_swap_folio(folio, swap); } else { From 6287b7dae80944bfa37784a8f9d6861a4facaa6e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 4 Dec 2022 17:57:07 -0800 Subject: [PATCH 303/313] mm,thp,rmap: fix races between updates of subpages_mapcount Commit 4b51634cd16a, introducing the COMPOUND_MAPPED bit, paid attention to the impossibility of subpages_mapcount ever appearing negative; but did not attend to those races in which it can momentarily appear larger than thought possible. These arise from how page_remove_rmap() first decrements page->_mapcount or compound_mapcount, then, if that transition goes negative (logical 0), decrements subpages_mapcount. The initial decrement lets a racing page_add_*_rmap() reincrement _mapcount or compound_mapcount immediately, and then in rare cases its corresponding increment of subpages_mapcount may be completed before page_remove_rmap()'s decrement. There could even (with increasing unlikelihood) be a series of increments intermixed with the decrements. In practice, checking subpages_mapcount with a temporary WARN on range, has caught values of 0x1000000 (2*COMPOUND_MAPPED, when move_pages() was using remove_migration_pmd()) and 0x800201 (do_huge_pmd_wp_page() using __split_huge_pmd()): page_add_anon_rmap() racing page_remove_rmap(), as predicted. I certainly found it harder to reason about than when bit_spin_locked, but the easy case gives a clue to how to handle the harder case. The easy case being the three !(nr & COMPOUND_MAPPED) checks, which should obviously be replaced by (nr < COMPOUND_MAPPED) checks - to count a page as compound mapped, even while the bit in that position is 0. The harder case is when trying to decide how many subpages are newly covered or uncovered, when compound map is first added or last removed: not knowing all that racily happened between first and second atomic ops. But the easy way to handle that, is again to count the page as compound mapped all the while that its subpages_mapcount indicates so - ignoring the _mapcount or compound_mapcount transition while it is on the way to being reversed. Link: https://lkml.kernel.org/r/4388158-3092-a960-ff2d-55f2b0fe4ef8@google.com Fixes: 4b51634cd16a ("mm,thp,rmap: subpages_mapcount COMPOUND_MAPPED if PMD-mapped") Signed-off-by: Hugh Dickins Cc: David Hildenbrand Cc: James Houghton Cc: Johannes Weiner Cc: John Hubbard Cc: "Kirill A . Shutemov" Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Peter Xu Cc: Sidhartha Kumar Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/rmap.c | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 459dc1c44d8a..b616870a09be 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1232,7 +1232,7 @@ void page_add_anon_rmap(struct page *page, if (first && PageCompound(page)) { mapped = subpages_mapcount_ptr(compound_head(page)); nr = atomic_inc_return_relaxed(mapped); - nr = !(nr & COMPOUND_MAPPED); + nr = (nr < COMPOUND_MAPPED); } } else if (PageTransHuge(page)) { /* That test is redundant: it's for safety or to optimize out */ @@ -1241,8 +1241,16 @@ void page_add_anon_rmap(struct page *page, if (first) { mapped = subpages_mapcount_ptr(page); nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); - nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + /* Raced ahead of a remove and another add? */ + if (unlikely(nr < 0)) + nr = 0; + } else { + /* Raced ahead of a remove of COMPOUND_MAPPED */ + nr = 0; + } } } @@ -1330,7 +1338,7 @@ void page_add_file_rmap(struct page *page, if (first && PageCompound(page)) { mapped = subpages_mapcount_ptr(compound_head(page)); nr = atomic_inc_return_relaxed(mapped); - nr = !(nr & COMPOUND_MAPPED); + nr = (nr < COMPOUND_MAPPED); } } else if (PageTransHuge(page)) { /* That test is redundant: it's for safety or to optimize out */ @@ -1339,8 +1347,16 @@ void page_add_file_rmap(struct page *page, if (first) { mapped = subpages_mapcount_ptr(page); nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); - nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + /* Raced ahead of a remove and another add? */ + if (unlikely(nr < 0)) + nr = 0; + } else { + /* Raced ahead of a remove of COMPOUND_MAPPED */ + nr = 0; + } } } @@ -1387,7 +1403,7 @@ void page_remove_rmap(struct page *page, if (last && PageCompound(page)) { mapped = subpages_mapcount_ptr(compound_head(page)); nr = atomic_dec_return_relaxed(mapped); - nr = !(nr & COMPOUND_MAPPED); + nr = (nr < COMPOUND_MAPPED); } } else if (PageTransHuge(page)) { /* That test is redundant: it's for safety or to optimize out */ @@ -1396,8 +1412,16 @@ void page_remove_rmap(struct page *page, if (last) { mapped = subpages_mapcount_ptr(page); nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); - nr_pmdmapped = thp_nr_pages(page); - nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + if (likely(nr < COMPOUND_MAPPED)) { + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + /* Raced ahead of another remove and an add? */ + if (unlikely(nr < 0)) + nr = 0; + } else { + /* An add of COMPOUND_MAPPED raced ahead */ + nr = 0; + } } } From a0ac9b3598fac36907bd1baec28c99656d2ed0b6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 5 Dec 2022 20:37:13 +0100 Subject: [PATCH 304/313] mm/gup_test: fix PIN_LONGTERM_TEST_READ with highmem Patch series "selftests/vm: fix some tests on 32bit". I finally had the time to run some of the selftests written by me (especially "cow") on x86 PAE. I found some unexpected "surprises" :) With these changes, and with [1] on top of mm-unstable, the "cow" tests and the "ksm_functional_tests" compile and pass as expected (expected failures with hugetlb in the "cow" tests). "madv_populate" has one expected test failure -- x86 does not support softdirty tracking. #1-#3 fix commits with stable commit ids. #4 fixes a test that is not in mm-stable yet. A note that there are many other compile errors/warnings when compiling on 32bit and with older Linux headers ... something for another day. [1] https://lkml.kernel.org/r/20221205150857.167583-1-david@redhat.com This patch (of 4): ... we have to kmap()/kunmap(), otherwise this won't work as expected with highmem. Link: https://lkml.kernel.org/r/20221205193716.276024-1-david@redhat.com Link: https://lkml.kernel.org/r/20221205193716.276024-2-david@redhat.com Fixes: c77369b437f9 ("mm/gup_test: start/stop/read functionality for PIN LONGTERM test") Signed-off-by: David Hildenbrand Cc: Shuah Khan , Cc: Yang Li Signed-off-by: Andrew Morton --- mm/gup_test.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/gup_test.c b/mm/gup_test.c index 0d76d9b4bb5a..33f431e0da60 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "gup_test.h" static void put_back_pages(unsigned int cmd, struct page **pages, @@ -297,10 +298,13 @@ static inline int pin_longterm_test_read(unsigned long arg) return -EFAULT; for (i = 0; i < pin_longterm_test_nr_pages; i++) { - void *addr = page_to_virt(pin_longterm_test_pages[i]); + void *addr = kmap_local_page(pin_longterm_test_pages[i]); + unsigned long ret; - if (copy_to_user((void __user *)(unsigned long)user_addr, addr, - PAGE_SIZE)) + ret = copy_to_user((void __user *)(unsigned long)user_addr, addr, + PAGE_SIZE); + kunmap_local(addr); + if (ret) return -EFAULT; user_addr += PAGE_SIZE; } From d88825f22b8f1cad8acb429b6bf0a54c85d7b93f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 5 Dec 2022 20:37:14 +0100 Subject: [PATCH 305/313] selftests/vm: madv_populate: fix missing MADV_POPULATE_(READ|WRITE) definitions The tests fail to compile in some environments (e.g., Debian 11.5 on x86). Let's simply conditionally define MADV_POPULATE_(READ|WRITE) if not already defined, similar to how the khugepaged.c test handles it. Link: https://lkml.kernel.org/r/20221205193716.276024-3-david@redhat.com Fixes: 39b2e5cae43d ("selftests/vm: make MADV_POPULATE_(READ|WRITE) use in-tree headers") Signed-off-by: David Hildenbrand Cc: Shuah Khan Cc: Yang Li Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/madv_populate.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/testing/selftests/vm/madv_populate.c b/tools/testing/selftests/vm/madv_populate.c index 60547245e479..262eae6b58f2 100644 --- a/tools/testing/selftests/vm/madv_populate.c +++ b/tools/testing/selftests/vm/madv_populate.c @@ -20,6 +20,13 @@ #include "../kselftest.h" #include "vm_util.h" +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif /* MADV_POPULATE_READ */ +#ifndef MADV_POPULATE_WRITE +#define MADV_POPULATE_WRITE 23 +#endif /* MADV_POPULATE_WRITE */ + /* * For now, we're using 2 MiB of private anonymous memory for all tests. */ From 380969fe5aacdea661d3f9ab32e84327e8624ae2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 5 Dec 2022 20:37:15 +0100 Subject: [PATCH 306/313] selftests/vm: cow: fix compile warning on 32bit The compiler complains about the conversion of a pointer to an int of different width. Link: https://lkml.kernel.org/r/20221205193716.276024-4-david@redhat.com Fixes: 6f1405efc61b ("selftests/vm: anon_cow: add R/O longterm tests via gup_test") Signed-off-by: David Hildenbrand Cc: Shuah Khan Cc: Yang Li Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/cow.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/cow.c b/tools/testing/selftests/vm/cow.c index 73e05b52c49e..26f6ea3079e2 100644 --- a/tools/testing/selftests/vm/cow.c +++ b/tools/testing/selftests/vm/cow.c @@ -650,7 +650,7 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, } /* Take a R/O pin. This should trigger unsharing. */ - args.addr = (__u64)mem; + args.addr = (__u64)(uintptr_t)mem; args.size = size; args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); @@ -669,7 +669,7 @@ static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, * Read back the content via the pin to the temporary buffer and * test if we observed the modification. */ - tmp_val = (__u64)tmp; + tmp_val = (__u64)(uintptr_t)tmp; ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); if (ret) ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); From 9d789c3b4170574baa4242dd8cae5988cf97d48d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 5 Dec 2022 20:37:16 +0100 Subject: [PATCH 307/313] selftests/vm: ksm_functional_tests: fixes for 32bit The test currently fails on 32bit. Fixing the "-1ull" vs. "-1ul" seems to make the test pass and the compiler happy. Note: This test is not in mm-stable yet. This fix should be squashed into "selftests/vm: add KSM unmerge tests". Link: https://lkml.kernel.org/r/20221205193716.276024-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Shuah Khan Cc: Yang Li Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/ksm_functional_tests.c | 4 ++-- tools/testing/selftests/vm/vm_util.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/ksm_functional_tests.c b/tools/testing/selftests/vm/ksm_functional_tests.c index 96644be68962..b11b7e5115dc 100644 --- a/tools/testing/selftests/vm/ksm_functional_tests.c +++ b/tools/testing/selftests/vm/ksm_functional_tests.c @@ -42,13 +42,13 @@ static bool range_maps_duplicates(char *addr, unsigned long size) for (offs_a = 0; offs_a < size; offs_a += pagesize) { pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); /* Page not present or PFN not exposed by the kernel. */ - if (pfn_a == -1ull || !pfn_a) + if (pfn_a == -1ul || !pfn_a) continue; for (offs_b = offs_a + pagesize; offs_b < size; offs_b += pagesize) { pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b); - if (pfn_b == -1ull || !pfn_b) + if (pfn_b == -1ul || !pfn_b) continue; if (pfn_a == pfn_b) return true; diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index 710571902743..40e795624ff3 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -50,7 +50,7 @@ unsigned long pagemap_get_pfn(int fd, char *start) /* If present (63th bit), PFN is at bit 0 -- 54. */ if (entry & 0x8000000000000000ull) return entry & 0x007fffffffffffffull; - return -1ull; + return -1ul; } void clear_softdirty(void) From 8614d6c5eda005ad72b37afeaae2879d7c101b18 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 5 Dec 2022 18:30:07 +0100 Subject: [PATCH 308/313] mm: do not show fs mm pc for VM_LOCKONFAULT pages When VM_LOCKONFAULT was added, /proc/PID/smaps wasn't hooked up to it, so looking at /proc/PID/smaps, it shows '??' instead of something intelligable. This can be reached by userspace by simply calling `mlock2(..., MLOCK_ONFAULT);`. Fix this by adding "lf" to denote VM_LOCKONFAULT. Link: https://lkml.kernel.org/r/20221205173007.580210-1-Jason@zx2c4.com Fixes: de60f5f10c58 ("mm: introduce VM_LOCKONFAULT") Signed-off-by: Jason A. Donenfeld Acked-by: Vlastimil Babka Cc: Eric B Munson Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 89338950afd3..e35a0398db63 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -674,6 +674,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_RAND_READ)] = "rr", [ilog2(VM_DONTCOPY)] = "dc", [ilog2(VM_DONTEXPAND)] = "de", + [ilog2(VM_LOCKONFAULT)] = "lf", [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", From de2e5171433126d340573cb7d0d4fcac084ab2a0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 5 Dec 2022 22:03:27 +0800 Subject: [PATCH 309/313] mm: add cond_resched() in swapin_walk_pmd_entry() When handling MADV_WILLNEED in madvise(), a soflockup may occurr in swapin_walk_pmd_entry() if swapping in lots of memory on a slow device. Add a cond_resched() to avoid the possible softlockup. Link: https://lkml.kernel.org/r/20221205140327.72304-1-wangkefeng.wang@huawei.com Fixes: 1998cc048901 ("mm: make madvise(MADV_WILLNEED) support swap file prefetch") Signed-off-by: Kefeng Wang Cc: Shaohua Li Cc: Hugh Dickins Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/madvise.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/madvise.c b/mm/madvise.c index 2573ea3ed684..b7d9b1a1c135 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -223,6 +223,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, put_page(page); } swap_read_unplug(splug); + cond_resched(); return 0; } From 5478afc55a2104caaef5b78c7c1f9acb9ec1f92a Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 5 Dec 2022 15:57:40 +0100 Subject: [PATCH 310/313] kmsan: fix memcpy tests Recent Clang changes may cause it to delete calls of memcpy(), if the source is an uninitialized volatile local. This happens because passing a pointer to a volatile local into memcpy() discards the volatile qualifier, giving the compiler a free hand to optimize the memcpy() call away. Use OPTIMIZER_HIDE_VAR() to hide the uninitialized var from the too-smart compiler. Link: https://lkml.kernel.org/r/20221205145740.694038-1-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Reviewed-by: Marco Elver Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 9a29ea2dbfb9..eb44ef3c5f29 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -419,6 +419,7 @@ static void test_memcpy_aligned_to_aligned(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to aligned dst (UMR report)\n"); + OPTIMIZER_HIDE_VAR(uninit_src); memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); kmsan_check_memory((void *)&dst, sizeof(dst)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); @@ -441,6 +442,7 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n"); + OPTIMIZER_HIDE_VAR(uninit_src); memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); kmsan_check_memory((void *)dst, 4); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); @@ -464,6 +466,7 @@ static void test_memcpy_aligned_to_unaligned2(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); + OPTIMIZER_HIDE_VAR(uninit_src); memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); From c7cdf94e9cd7a03549e61b0f85949959191b8a10 Mon Sep 17 00:00:00 2001 From: Wang Yong Date: Wed, 7 Dec 2022 07:40:11 +0000 Subject: [PATCH 311/313] mm: fix typo in struct pglist_data code comment change "stat" to "start". Link: https://lkml.kernel.org/r/20221207074011.GA151242@cloud Fixes: c959924b0dc5 ("memory tiering: adjust hot threshold automatically") Signed-off-by: Wang Yong Reviewed-by: "Huang, Ying" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f74891556f3..128f3cde800c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1200,7 +1200,7 @@ typedef struct pglist_data { /* start time in ms of current promote threshold adjustment period */ unsigned int nbp_th_start; /* - * number of promote candidate pages at stat time of current promote + * number of promote candidate pages at start time of current promote * threshold adjustment period */ unsigned long nbp_th_nr_cand; From c47454823bd4e3ab34ed3f795afd4479ab938a3f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 6 Dec 2022 11:15:09 -0800 Subject: [PATCH 312/313] mm: mmu_gather: allow more than one batch of delayed rmaps Commit 5df397dec7c4 ("mm: delay page_remove_rmap() until after the TLB has been flushed") limited the page batching for the mmu gather operation when a dirty shared page needed to delay rmap removal until after the TLB had been flushed. It did so because it needs to walk that array of pages while still holding the page table lock, and our mmu_gather infrastructure allows for batching quite a lot of pages. We may have thousands on pages queued up for freeing, and we wanted to walk only the last batch if we then added a dirty page to the queue. However, when I limited it to one batch, I didn't think of the degenerate case of the special first batch that is embedded on-stack in the mmu_gather structure (called "local") and that only has eight entries. So with the right pattern, that "limit delayed rmap to just one batch" will trigger over and over in that first small batch, and we'll waste a lot of time flushing TLB's every eight pages. And those right patterns are trivially triggered by just having a shared mappings with lots of adjacent dirty pages. Like the 'page_fault3' subtest of the 'will-it-scale' benchmark, that just maps a shared area, dirties all pages, and unmaps it. Rinse and repeat. We still want to limit the batching, but to fix this (easily triggered) degenerate case, just expand the "only one batch" logic to instead be "only one batch that isn't the special first on-stack ('local') batch". That way, when we need to flush the delayed rmaps, we can still limit our walk to just the last batch - and that first small one. Link: https://lkml.kernel.org/r/CAHk-=whkL5aM1fR7kYUmhHQHBcMUc-bDoFP7EwYjTxy64DGtvw@mail.gmail.com Fixes: 5df397dec7c4 ("mm: delay page_remove_rmap() until after the TLB has been flushed") Signed-off-by: Linus Torvalds Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202212051534.852804af-yujie.liu@intel.com Tested-by: Huang, Ying Tested-by: Hugh Dickins Cc: Feng Tang Cc: Johannes Weiner Cc: Nadav Amit Cc: Xing Zhengjun Cc: "Yin, Fengwei" Signed-off-by: Andrew Morton --- mm/mmu_gather.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 8247553a69c2..2b93cf6ac9ae 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -19,8 +19,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; - /* No more batching if we have delayed rmaps pending */ - if (tlb->delayed_rmap) + /* Limit batching if we have delayed rmaps pending */ + if (tlb->delayed_rmap && tlb->active != &tlb->local) return false; batch = tlb->active; @@ -48,22 +48,8 @@ static bool tlb_next_batch(struct mmu_gather *tlb) } #ifdef CONFIG_SMP -/** - * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB - * @tlb: the current mmu_gather - * - * Note that because of how tlb_next_batch() above works, we will - * never start new batches with pending delayed rmaps, so we only - * need to walk through the current active batch. - */ -void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) +static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) { - struct mmu_gather_batch *batch; - - if (!tlb->delayed_rmap) - return; - - batch = tlb->active; for (int i = 0; i < batch->nr; i++) { struct encoded_page *enc = batch->encoded_pages[i]; @@ -72,7 +58,25 @@ void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) page_remove_rmap(page, vma, false); } } +} +/** + * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB + * @tlb: the current mmu_gather + * + * Note that because of how tlb_next_batch() above works, we will + * never start multiple new batches with pending delayed rmaps, so + * we only need to walk through the current active batch and the + * original local one. + */ +void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + if (!tlb->delayed_rmap) + return; + + tlb_flush_rmap_batch(&tlb->local, vma); + if (tlb->active != &tlb->local) + tlb_flush_rmap_batch(tlb->active, vma); tlb->delayed_rmap = 0; } #endif From c45bc55a99957b20e4e0333bcd42e12d1833a7f5 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 12 Dec 2022 14:55:29 -0800 Subject: [PATCH 313/313] mm/hugetlb: set head flag before setting compound_order in __prep_compound_gigantic_folio folio_set_compound_order() checks if the passed in folio is a large folio. A large folio is indicated by the PG_head flag. Call __folio_set_head() before setting the order. Link: https://lkml.kernel.org/r/20221212225529.22493-1-sidhartha.kumar@oracle.com Fixes: d1c6095572d0 ("mm/hugetlb: convert hugetlb prep functions to folios") Signed-off-by: Sidhartha Kumar Reported-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8c6fe2286814..7cdbcc22587b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1805,10 +1805,10 @@ static bool __prep_compound_gigantic_folio(struct folio *folio, int nr_pages = 1 << order; struct page *p; - /* we rely on prep_new_hugetlb_folio to set the destructor */ - folio_set_compound_order(folio, order); __folio_clear_reserved(folio); __folio_set_head(folio); + /* we rely on prep_new_hugetlb_folio to set the destructor */ + folio_set_compound_order(folio, order); for (i = 0; i < nr_pages; i++) { p = folio_page(folio, i);