From cd16dd03737c30608b78d235b17b3ab935ed18db Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 25 May 2022 15:52:20 -0400 Subject: [PATCH 001/282] mm/x86: remove dead code for hugetlbpage.c It seems to exist since the old times and never used once. Remove them. Link: https://lkml.kernel.org/r/20220525195220.10241-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Muchun Song Signed-off-by: Andrew Morton --- arch/x86/mm/hugetlbpage.c | 39 --------------------------------------- 1 file changed, 39 deletions(-) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index a0d023cb4292..509408da0da1 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -19,44 +19,6 @@ #include #include -#if 0 /* This is just for testing */ -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - unsigned long start = address; - int length = 1; - int nr; - struct page *page; - struct vm_area_struct *vma; - - vma = find_vma(mm, addr); - if (!vma || !is_vm_hugetlb_page(vma)) - return ERR_PTR(-EINVAL); - - pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); - - /* hugetlb should be locked, and hence, prefaulted */ - WARN_ON(!pte || pte_none(*pte)); - - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - - WARN_ON(!PageHead(page)); - - return page; -} - -int pmd_huge(pmd_t pmd) -{ - return 0; -} - -int pud_huge(pud_t pud) -{ - return 0; -} - -#else - /* * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. 
@@ -72,7 +34,6 @@ int pud_huge(pud_t pud) { return !!(pud_val(pud) & _PAGE_PSE); } -#endif #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, From 0b82ade6c042907e4f24bbe826958a896d24700d Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Thu, 26 May 2022 22:02:57 +0800 Subject: [PATCH 002/282] mm: use PAGE_ALIGNED instead of IS_ALIGNED <linux/mm.h> already provides the PAGE_ALIGNED macro. Let's use this macro instead of IS_ALIGNED and passing PAGE_SIZE directly. Link: https://lkml.kernel.org/r/20220526140257.1568744-1-bh1scw@gmail.com Signed-off-by: Fanjun Kong Acked-by: Muchun Song Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index f4fa61dbbee3..49cb15cbe590 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -200,8 +200,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, unsigned long next; pgd_t *pgd; - VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE)); - VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE)); + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); pgd = pgd_offset_k(addr); do { From 4f5ceb8851f0081af54313abbf56de1615911faf Mon Sep 17 00:00:00 2001 From: Yuanzheng Song Date: Sat, 28 May 2022 06:31:17 +0000 Subject: [PATCH 003/282] tools/vm/slabinfo: use alphabetic order when two values are equal When the number of partial slabs in each cache is the same (e.g., the values are 0), the results of the `slabinfo -X -N5` and `slabinfo -P -N5` are different. / # slabinfo -X -N5 ...
Slabs sorted by number of partial slabs --------------------------------------- Name Objects Objsize Space Slabs/Part/Cpu O/S O %Fr %Ef Flg inode_cache 15180 392 6217728 758/0/1 20 1 0 95 a kernfs_node_cache 22494 88 2002944 488/0/1 46 0 0 98 shmem_inode_cache 663 464 319488 38/0/1 17 1 0 96 biovec-max 50 3072 163840 4/0/1 10 3 0 93 A dentry 19050 136 2600960 633/0/2 30 0 0 99 a / # slabinfo -P -N5 Name Objects Objsize Space Slabs/Part/Cpu O/S O %Fr %Ef Flg bdev_cache 32 984 32.7K 1/0/1 16 2 0 96 Aa ext4_inode_cache 42 752 32.7K 1/0/1 21 2 0 96 a dentry 19050 136 2.6M 633/0/2 30 0 0 99 a TCPv6 17 1840 32.7K 0/0/1 17 3 0 95 A RAWv6 18 856 16.3K 0/0/1 18 2 0 94 A This problem is caused by the sort_slabs(). So let's use alphabetic order when two values are equal in the sort_slabs(). By the way, the content of the `slabinfo -h` is not aligned because the `-P|--partial Sort by number of partial slabs` uses tabs instead of spaces. So let's use spaces instead of tabs to fix it. Link: https://lkml.kernel.org/r/20220528063117.935158-1-songyuanzheng@huawei.com Fixes: 1106b205a3fe ("tools/vm/slabinfo: add partial slab listing to -X") Signed-off-by: Yuanzheng Song Cc: "Tobin C. 
Harding" Signed-off-by: Andrew Morton --- tools/vm/slabinfo.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 9b68658b6bb8..3ae985dc24b6 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -125,7 +125,7 @@ static void usage(void) "-n|--numa Show NUMA information\n" "-N|--lines=K Show the first K slabs\n" "-o|--ops Show kmem_cache_ops\n" - "-P|--partial Sort by number of partial slabs\n" + "-P|--partial Sort by number of partial slabs\n" "-r|--report Detailed report on single slabs\n" "-s|--shrink Shrink slabs\n" "-S|--Size Sort by size\n" @@ -1045,15 +1045,27 @@ static void sort_slabs(void) for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) { int result; - if (sort_size) - result = slab_size(s1) < slab_size(s2); - else if (sort_active) - result = slab_activity(s1) < slab_activity(s2); - else if (sort_loss) - result = slab_waste(s1) < slab_waste(s2); - else if (sort_partial) - result = s1->partial < s2->partial; - else + if (sort_size) { + if (slab_size(s1) == slab_size(s2)) + result = strcasecmp(s1->name, s2->name); + else + result = slab_size(s1) < slab_size(s2); + } else if (sort_active) { + if (slab_activity(s1) == slab_activity(s2)) + result = strcasecmp(s1->name, s2->name); + else + result = slab_activity(s1) < slab_activity(s2); + } else if (sort_loss) { + if (slab_waste(s1) == slab_waste(s2)) + result = strcasecmp(s1->name, s2->name); + else + result = slab_waste(s1) < slab_waste(s2); + } else if (sort_partial) { + if (s1->partial == s2->partial) + result = strcasecmp(s1->name, s2->name); + else + result = s1->partial < s2->partial; + } else result = strcasecmp(s1->name, s2->name); if (show_inverted) From d92725256b4f22d084b813b37ddc394da79aacab Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 30 May 2022 14:34:50 -0400 Subject: [PATCH 004/282] mm: avoid unnecessary page fault retires on shared memory types I observed that for each of the shared 
file-backed page faults, we're very likely to retry one more time for the 1st write fault upon no page. It's because we'll need to release the mmap lock for dirty rate limiting purposes with balance_dirty_pages_ratelimited() (in fault_dirty_shared_page()). Then after that throttling we return VM_FAULT_RETRY. We did that probably because VM_FAULT_RETRY is the only way we can return to the fault handler at that time telling it we've released the mmap lock. However that's not ideal because it's very likely the fault does not need to be retried at all since the pgtable was well installed before the throttling, so the next continuous fault (including taking the mmap read lock, walking the pgtable, etc.) could be in most cases unnecessary. This not only slows down page faults for shared file-backed memory, but also adds more mmap lock contention, which in most cases is not needed at all. To observe this, one could try to write to some shmem page and look at the "pgfault" value in /proc/vmstat, then we should expect 2 counts for each shmem write simply because we retried, and the vm event "pgfault" will capture that. To make it more efficient, add a new VM_FAULT_COMPLETED return code just to show that we've completed the whole fault and released the lock. It's also a hint that we should very possibly not need another fault immediately on this page because we've just completed it. This patch provides a ~12% perf boost on my aarch64 test VM with a simple program sequentially dirtying a 400MB shmem file being mmap()ed, and these are the times it needs: Before: 650.980 ms (+-1.94%) After: 569.396 ms (+-1.38%) I believe it could help more than that. We need some special care on GUP and the s390 pgfault handler (for gmap code before returning from pgfault), the rest of the changes in the page fault handlers should be relatively straightforward. Another thing to mention is that mm_account_fault() does take this new fault as a generic fault to be accounted, unlike VM_FAULT_RETRY.
I explicitly didn't touch hmm_vma_fault() and break_ksm() because they do not handle VM_FAULT_RETRY even with existing code, so I'm literally keeping them as-is. Link: https://lkml.kernel.org/r/20220530183450.42886-1-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: Geert Uytterhoeven Acked-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Acked-by: Vineet Gupta Acked-by: Guo Ren Acked-by: Max Filippov Acked-by: Christian Borntraeger Acked-by: Michael Ellerman (powerpc) Acked-by: Catalin Marinas Reviewed-by: Alistair Popple Reviewed-by: Ingo Molnar Acked-by: Russell King (Oracle) [arm part] Acked-by: Heiko Carstens Cc: Vasily Gorbik Cc: Stafford Horne Cc: David S. Miller Cc: Johannes Berg Cc: Brian Cain Cc: Richard Henderson Cc: Richard Weinberger Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Janosch Frank Cc: Albert Ou Cc: Anton Ivanov Cc: Dave Hansen Cc: Borislav Petkov Cc: Sven Schnelle Cc: Andrea Arcangeli Cc: James Bottomley Cc: Al Viro Cc: Alexander Gordeev Cc: Jonas Bonn Cc: Will Deacon Cc: Vlastimil Babka Cc: Michal Simek Cc: Matt Turner Cc: Paul Mackerras Cc: David Hildenbrand Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Stefan Kristiansson Cc: Paul Walmsley Cc: Ivan Kokshaysky Cc: Chris Zankel Cc: Hugh Dickins Cc: Dinh Nguyen Cc: Rich Felker Cc: H. 
Peter Anvin Cc: Andy Lutomirski Cc: Thomas Bogendoerfer Cc: Helge Deller Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/mm/fault.c | 4 ++++ arch/arc/mm/fault.c | 4 ++++ arch/arm/mm/fault.c | 4 ++++ arch/arm64/mm/fault.c | 4 ++++ arch/csky/mm/fault.c | 4 ++++ arch/hexagon/mm/vm_fault.c | 4 ++++ arch/ia64/mm/fault.c | 4 ++++ arch/m68k/mm/fault.c | 4 ++++ arch/microblaze/mm/fault.c | 4 ++++ arch/mips/mm/fault.c | 4 ++++ arch/nios2/mm/fault.c | 4 ++++ arch/openrisc/mm/fault.c | 4 ++++ arch/parisc/mm/fault.c | 4 ++++ arch/powerpc/mm/copro_fault.c | 5 +++++ arch/powerpc/mm/fault.c | 5 +++++ arch/riscv/mm/fault.c | 4 ++++ arch/s390/mm/fault.c | 12 ++++++++++++ arch/sh/mm/fault.c | 4 ++++ arch/sparc/mm/fault_32.c | 4 ++++ arch/sparc/mm/fault_64.c | 5 +++++ arch/um/kernel/trap.c | 4 ++++ arch/x86/mm/fault.c | 4 ++++ arch/xtensa/mm/fault.c | 4 ++++ include/linux/mm_types.h | 2 ++ mm/gup.c | 34 +++++++++++++++++++++++++++++++++- mm/memory.c | 2 +- 26 files changed, 139 insertions(+), 2 deletions(-) diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index ec20c1004abf..ef427a6bdd1a 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -155,6 +155,10 @@ retry: if (fault_signal_pending(fault, regs)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index dad27e4d69ff..5ca59a482632 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -146,6 +146,10 @@ retry: return; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + /* * Fault retry nuances, mmap_lock already relinquished by core mm */ diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index a062e07516dd..46cccd6bf705 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -322,6 +322,10 @@ retry: 
return 0; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; + if (!(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index c5e11768e5c1..de166cdeb89a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -608,6 +608,10 @@ retry: return 0; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; + if (fault & VM_FAULT_RETRY) { mm_flags |= FAULT_FLAG_TRIED; goto retry; diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index 7215a46b6b8e..e15f736cca4b 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -285,6 +285,10 @@ good_area: return; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY))) { flags |= FAULT_FLAG_TRIED; diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 4fac4b9eb316..f73c7cbfe326 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c @@ -96,6 +96,10 @@ good_area: if (fault_signal_pending(fault, regs)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + /* The most common case -- we are done. 
*/ if (likely(!(fault & VM_FAULT_ERROR))) { if (fault & VM_FAULT_RETRY) { diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 07379d1a227f..ef78c2d66cdd 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -139,6 +139,10 @@ retry: if (fault_signal_pending(fault, regs)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_ERROR)) { /* * We ran out of memory, or some other thing happened diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 71aa9f6315dc..4d2837eb3e2a 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -141,6 +141,10 @@ good_area: if (fault_signal_pending(fault, regs)) return 0; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index a9626e6a68af..5c40c3ebe52f 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -222,6 +222,10 @@ good_area: if (fault_signal_pending(fault, regs)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index b08bc556d30d..a27045f5a556 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -162,6 +162,10 @@ good_area: return; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index a32f14cd72f2..edaca0a6c1c1 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -139,6 +139,10 @@ good_area: if 
(fault_signal_pending(fault, regs)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 53b760af3bb7..b4762d66e9ef 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -165,6 +165,10 @@ good_area: if (fault_signal_pending(fault, regs)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 84bc437be5cd..9ad80d4d3389 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -311,6 +311,10 @@ good_area: if (fault_signal_pending(fault, regs)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_ERROR)) { /* * We hit a shared mapping outside of the file, or some diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index c1cb21a00884..7c507fb48182 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -65,6 +65,11 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, ret = 0; *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0, NULL); + + /* The fault is fully completed (including releasing mmap lock) */ + if (*flt & VM_FAULT_COMPLETED) + return 0; + if (unlikely(*flt & VM_FAULT_ERROR)) { if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index d53fed4eccbd..014005428687 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -511,6 +511,10 @@ retry: if (fault_signal_pending(fault, regs)) return user_mode(regs) ? 
0 : SIGBUS; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + goto out; + /* * Handle the retry right now, the mmap_lock has been released in that * case. @@ -525,6 +529,7 @@ retry: if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); +out: /* * Major/minor page fault accounting. */ diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 40694f0cab9e..f2fbd1400b7c 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -326,6 +326,10 @@ good_area: if (fault_signal_pending(fault, regs)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_RETRY)) { flags |= FAULT_FLAG_TRIED; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index e173b6187ad5..973dcd05c293 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -433,6 +433,17 @@ retry: goto out_up; goto out; } + + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) { + if (gmap) { + mmap_read_lock(mm); + goto out_gmap; + } + fault = 0; + goto out; + } + if (unlikely(fault & VM_FAULT_ERROR)) goto out_up; @@ -452,6 +463,7 @@ retry: mmap_read_lock(mm); goto retry; } +out_gmap: if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, address); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index e175667b1363..acd2f5e50bfc 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -485,6 +485,10 @@ good_area: if (mm_fault_error(regs, error_code, address, fault)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (fault & VM_FAULT_RETRY) { flags |= FAULT_FLAG_TRIED; diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index ad569d9bd124..91259f291c54 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -190,6 
+190,10 @@ good_area: if (fault_signal_pending(fault, regs)) return; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 253e07043298..4acc12eafbf5 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -427,6 +427,10 @@ good_area: if (fault_signal_pending(fault, regs)) goto exit_exception; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + goto lock_released; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; @@ -449,6 +453,7 @@ good_area: } mmap_read_unlock(mm); +lock_released: mm_rss = get_mm_rss(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) mm_rss -= (mm->context.thp_pte_count * (HPAGE_SIZE / PAGE_SIZE)); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index d1d5d0be0308..d3ce21c4ca32 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -76,6 +76,10 @@ good_area: if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) goto out_nosemaphore; + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return 0; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fad8faa29d04..fe10c6d76bac 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1408,6 +1408,10 @@ good_area: return; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + /* * If we need to retry the mmap_lock has already been released, * and if there is a fatal signal pending there is no guarantee diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 16f0a5ff5799..8c781b05c0bd 100644 --- a/arch/xtensa/mm/fault.c +++ 
b/arch/xtensa/mm/fault.c @@ -172,6 +172,10 @@ good_area: return; } + /* The fault is fully completed (including releasing mmap lock) */ + if (fault & VM_FAULT_COMPLETED) + return; + if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c29ab4c0cd5c..6b961a29bf26 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -729,6 +729,7 @@ typedef __bitwise unsigned int vm_fault_t; * @VM_FAULT_NEEDDSYNC: ->fault did not modify page tables and needs * fsync() to complete (for synchronous page faults * in DAX) + * @VM_FAULT_COMPLETED: ->fault completed, meanwhile mmap lock released * @VM_FAULT_HINDEX_MASK: mask HINDEX value * */ @@ -746,6 +747,7 @@ enum vm_fault_reason { VM_FAULT_FALLBACK = (__force vm_fault_t)0x000800, VM_FAULT_DONE_COW = (__force vm_fault_t)0x001000, VM_FAULT_NEEDDSYNC = (__force vm_fault_t)0x002000, + VM_FAULT_COMPLETED = (__force vm_fault_t)0x004000, VM_FAULT_HINDEX_MASK = (__force vm_fault_t)0x0f0000, }; diff --git a/mm/gup.c b/mm/gup.c index 551264407624..407a81d5ca03 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -951,6 +951,25 @@ static int faultin_page(struct vm_area_struct *vma, } ret = handle_mm_fault(vma, address, fault_flags, NULL); + + if (ret & VM_FAULT_COMPLETED) { + /* + * With FAULT_FLAG_RETRY_NOWAIT we'll never release the + * mmap lock in the page fault handler. Sanity check this. + */ + WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT); + if (locked) + *locked = 0; + /* + * We should do the same as VM_FAULT_RETRY, but let's not + * return -EBUSY since that's not reflecting the reality of + * what has happened - we've just fully completed a page + * fault, with the mmap lock released. Use -EAGAIN to show + * that we want to take the mmap lock _again_. 
+ */ + return -EAGAIN; + } + if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, *flags); @@ -1177,6 +1196,7 @@ retry: case 0: goto retry; case -EBUSY: + case -EAGAIN: ret = 0; fallthrough; case -EFAULT: @@ -1303,6 +1323,18 @@ retry: return -EINTR; ret = handle_mm_fault(vma, address, fault_flags, NULL); + + if (ret & VM_FAULT_COMPLETED) { + /* + * NOTE: it's a pity that we need to retake the lock here + * to pair with the unlock() in the callers. Ideally we + * could tell the callers so they do not need to unlock. + */ + mmap_read_lock(mm); + *unlocked = true; + return 0; + } + if (ret & VM_FAULT_ERROR) { int err = vm_fault_to_errno(ret, 0); @@ -1368,7 +1400,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, /* VM_FAULT_RETRY couldn't trigger, bypass */ return ret; - /* VM_FAULT_RETRY cannot return errors */ + /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */ if (!*locked) { BUG_ON(ret < 0); BUG_ON(ret >= nr_pages); diff --git a/mm/memory.c b/mm/memory.c index 7a089145cad4..580c62febe42 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3020,7 +3020,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) balance_dirty_pages_ratelimited(mapping); if (fpin) { fput(fpin); - return VM_FAULT_RETRY; + return VM_FAULT_COMPLETED; } } From 833de10ff58e230fba523ad618e17c93d33b6fa3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 30 May 2022 19:58:41 +0800 Subject: [PATCH 005/282] mm/shmem.c: clean up comment of shmem_swapin_folio shmem_swapin_folio has changed to use folio but comment still mentions page. Update the relevant comment accordingly as suggested by Naoya. 
Link: https://lkml.kernel.org/r/20220530115841.4348-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Suggested-by: Naoya Horiguchi Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/shmem.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index a6f565308133..12d45a03f7fc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1706,10 +1706,10 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, } /* - * Swap in the page pointed to by *pagep. - * Caller has to make sure that *pagep contains a valid swapped page. - * Returns 0 and the page in pagep if success. On failure, returns the - * error code and NULL in *pagep. + * Swap in the folio pointed to by *foliop. + * Caller has to make sure that *foliop contains a valid swapped folio. + * Returns 0 and the folio in foliop if success. On failure, returns the + * error code and NULL in *foliop. */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, @@ -1749,7 +1749,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } folio = page_folio(page); - /* We have to do this with page locked to prevent races */ + /* We have to do this with folio locked to prevent races */ folio_lock(folio); if (!folio_test_swapcache(folio) || folio_swap_entry(folio).val != swap.val || From 943189db4f3ed1445dd630dc0b96e115357c4330 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 31 May 2022 14:34:41 +0530 Subject: [PATCH 006/282] mm/memory_hotplug: drop 'reason' argument from check_pfn_span() In check_pfn_span(), a 'reason' string is being used to recreate the caller function name, while printing the warning message. It is really unnecessary as the warning message could just be printed inside the caller depending on the return code. Currently there are just two callers for check_pfn_span() i.e __add_pages() and __remove_pages(). Let's clean this up. 
Link: https://lkml.kernel.org/r/20220531090441.170650-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Oscar Salvador Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1213d0c67a53..1f1a730c4499 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -237,8 +237,7 @@ static void release_memory_resource(struct resource *res) kfree(res); } -static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, - const char *reason) +static int check_pfn_span(unsigned long pfn, unsigned long nr_pages) { /* * Disallow all operations smaller than a sub-section and only @@ -255,12 +254,8 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, min_align = PAGES_PER_SUBSECTION; else min_align = PAGES_PER_SECTION; - if (!IS_ALIGNED(pfn, min_align) - || !IS_ALIGNED(nr_pages, min_align)) { - WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n", - reason, pfn, pfn + nr_pages - 1); + if (!IS_ALIGNED(pfn | nr_pages, min_align)) return -EINVAL; - } return 0; } @@ -337,9 +332,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, altmap->alloc = 0; } - err = check_pfn_span(pfn, nr_pages, "add"); - if (err) - return err; + if (check_pfn_span(pfn, nr_pages)) { + WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1); + return -EINVAL; + } for (; pfn < end_pfn; pfn += cur_nr_pages) { /* Select all remaining pages up to the next section boundary */ @@ -536,8 +532,10 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, map_offset = vmem_altmap_offset(altmap); - if (check_pfn_span(pfn, nr_pages, "remove")) + if (check_pfn_span(pfn, nr_pages)) { + WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1); return; + } for (; pfn < end_pfn; pfn += cur_nr_pages) { cond_resched(); From 
bcc728eb4f446073e0160671d7d0059a4e9aa300 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Tue, 31 May 2022 10:04:21 +0800 Subject: [PATCH 007/282] mm/damon: remove obsolete comments of kdamond_stop Since commit 0f91d13366a4 ("mm/damon: simplify stop mechanism") deleted kdamond_stop and changed to use the kthread stop mechanism, these obsolete comments should be removed accordingly. Link: https://lkml.kernel.org/r/20220531020421.46849-1-zhouchengming@bytedance.com Signed-off-by: Chengming Zhou Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7c62da31ce4b..2765c7d99beb 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -397,7 +397,6 @@ struct damon_callback { * detail. * * @kdamond: Kernel thread who does the monitoring. - * @kdamond_stop: Notifies whether kdamond should stop. * @kdamond_lock: Mutex for the synchronizations with @kdamond. * * For each monitoring context, one kernel thread for the monitoring is @@ -406,14 +405,14 @@ struct damon_callback { * Once started, the monitoring thread runs until explicitly required to be * terminated or every monitoring target is invalid. The validity of the * targets is checked via the &damon_operations.target_valid of @ops. The - * termination can also be explicitly requested by writing non-zero to - * @kdamond_stop. The thread sets @kdamond to NULL when it terminates. - * Therefore, users can know whether the monitoring is ongoing or terminated by - * reading @kdamond. Reads and writes to @kdamond and @kdamond_stop from - * outside of the monitoring thread must be protected by @kdamond_lock. + * termination can also be explicitly requested by calling damon_stop(). + * The thread sets @kdamond to NULL when it terminates. Therefore, users can + * know whether the monitoring is ongoing or terminated by reading @kdamond.
+ * Reads and writes to @kdamond from outside of the monitoring thread must + * be protected by @kdamond_lock. * - * Note that the monitoring thread protects only @kdamond and @kdamond_stop via - * @kdamond_lock. Accesses to other fields must be protected by themselves. + * Note that the monitoring thread protects only @kdamond via @kdamond_lock. + * Accesses to other fields must be protected by themselves. * * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. From 1b23ff80b399ae4561bbfd45f7c9c98f62797304 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 7 Jun 2022 18:59:55 +0800 Subject: [PATCH 008/282] mm/vmalloc: invoke classify_va_fit_type() in adjust_va_to_fit_type() Patch series "Cleanup patches of vmalloc", v2. Some cleanup patches found when reading vmalloc code. This patch (of 4): adjust_va_to_fit_type() checks all values of passed in fit type, including NOTHING_FIT in the else branch. However, the check of NOTHING_FIT has been done inside adjust_va_to_fit_type() and before it's called in all call sites. In fact, both of these functions are coupled tightly, since classify_va_fit_type() is doing the preparation work for adjust_va_to_fit_type(). So putting invocation of classify_va_fit_type() inside adjust_va_to_fit_type() can simplify code logic and the redundant check of NOTHING_FIT issue will go away. 
Link: https://lkml.kernel.org/r/20220607105958.382076-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20220607105958.382076-2-bhe@redhat.com Signed-off-by: Baoquan He Suggested-by: Uladzislau Rezki (Sony) Reviewed-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 07db42455dd4..f9d45aa90b7c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1335,10 +1335,10 @@ classify_va_fit_type(struct vmap_area *va, static __always_inline int adjust_va_to_fit_type(struct vmap_area *va, - unsigned long nva_start_addr, unsigned long size, - enum fit_type type) + unsigned long nva_start_addr, unsigned long size) { struct vmap_area *lva = NULL; + enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); if (type == FL_FIT_TYPE) { /* @@ -1444,7 +1444,6 @@ __alloc_vmap_area(unsigned long size, unsigned long align, bool adjust_search_size = true; unsigned long nva_start_addr; struct vmap_area *va; - enum fit_type type; int ret; /* @@ -1472,14 +1471,9 @@ __alloc_vmap_area(unsigned long size, unsigned long align, if (nva_start_addr + size > vend) return vend; - /* Classify what we have found. */ - type = classify_va_fit_type(va, nva_start_addr, size); - if (WARN_ON_ONCE(type == NOTHING_FIT)) - return vend; - /* Update the free vmap_area. 
*/ - ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); - if (ret) + ret = adjust_va_to_fit_type(va, nva_start_addr, size); + if (WARN_ON_ONCE(ret)) return vend; #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK @@ -3735,7 +3729,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, int area, area2, last_area, term_area; unsigned long base, start, size, end, last_end, orig_start, orig_end; bool purged = false; - enum fit_type type; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); @@ -3846,15 +3839,11 @@ retry: /* It is a BUG(), but trigger recovery instead. */ goto recovery; - type = classify_va_fit_type(va, start, size); - if (WARN_ON_ONCE(type == NOTHING_FIT)) + ret = adjust_va_to_fit_type(va, start, size); + if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; - ret = adjust_va_to_fit_type(va, start, size, type); - if (unlikely(ret)) - goto recovery; - /* Allocated area. */ va = vas[area]; va->va_start = start; From 753df96be5d3a21cd70d8ab4f7464a868e1d2cb4 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 7 Jun 2022 18:59:56 +0800 Subject: [PATCH 009/282] mm/vmalloc: remove the redundant boundary check In find_va_links(), when traversing the vmap_area tree, the comparing to check if the passed in 'va' is above or below 'tmp_va' is redundant, assuming both 'va' and 'tmp_va' has ->va_start <= ->va_end. Here, to simplify the checking as code change. 
Link: https://lkml.kernel.org/r/20220607105958.382076-3-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f9d45aa90b7c..b711bf82fd5d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -874,11 +874,9 @@ find_va_links(struct vmap_area *va, * Trigger the BUG() if there are sides(left/right) * or full overlaps. */ - if (va->va_start < tmp_va->va_end && - va->va_end <= tmp_va->va_start) + if (va->va_end <= tmp_va->va_start) link = &(*link)->rb_left; - else if (va->va_end > tmp_va->va_start && - va->va_start >= tmp_va->va_end) + else if (va->va_start >= tmp_va->va_end) link = &(*link)->rb_right; else { WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n", From baa468a648b489e35475c8de9dd1d77f0a687b4d Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 7 Jun 2022 18:59:57 +0800 Subject: [PATCH 010/282] mm/vmalloc: fix typo in local variable name In __purge_vmap_area_lazy(), rename local_pure_list to local_purge_list. 
Link: https://lkml.kernel.org/r/20220607105958.382076-4-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b711bf82fd5d..b9bf7dfe71ec 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1669,32 +1669,32 @@ static void purge_fragmented_blocks_allcpus(void); static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; - struct list_head local_pure_list; + struct list_head local_purge_list; struct vmap_area *va, *n_va; lockdep_assert_held(&vmap_purge_lock); spin_lock(&purge_vmap_area_lock); purge_vmap_area_root = RB_ROOT; - list_replace_init(&purge_vmap_area_list, &local_pure_list); + list_replace_init(&purge_vmap_area_list, &local_purge_list); spin_unlock(&purge_vmap_area_lock); - if (unlikely(list_empty(&local_pure_list))) + if (unlikely(list_empty(&local_purge_list))) return false; start = min(start, - list_first_entry(&local_pure_list, + list_first_entry(&local_purge_list, struct vmap_area, list)->va_start); end = max(end, - list_last_entry(&local_pure_list, + list_last_entry(&local_purge_list, struct vmap_area, list)->va_end); flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; spin_lock(&free_vmap_area_lock); - list_for_each_entry_safe(va, n_va, &local_pure_list, list) { + list_for_each_entry_safe(va, n_va, &local_purge_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; From 153090f2c6d595c9636c582ed4b6c4dac1739a41 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 7 Jun 2022 18:59:58 +0800 Subject: [PATCH 011/282] mm/vmalloc: add code comment for find_vmap_area_exceed_addr() Its behaviour is like find_vma() which finds an area above the specified address, add comment to make it 
easier to understand. Also fix a grammar mistake and a typo in two other comments. Link: https://lkml.kernel.org/r/20220607105958.382076-5-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmalloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b9bf7dfe71ec..fff3925c6f8f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -790,6 +790,7 @@ unsigned long vmalloc_nr_pages(void) return atomic_long_read(&nr_vmalloc_pages); } +/* Look up the first VA which satisfies addr < va_end, NULL if none. */ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) { struct vmap_area *va = NULL; @@ -929,7 +930,7 @@ link_va(struct vmap_area *va, struct rb_root *root, * Some explanation here. Just perform simple insertion * to the tree. We do not set va->subtree_max_size to * its current size before calling rb_insert_augmented(). - * It is because of we populate the tree from the bottom + * It is because we populate the tree from the bottom * to parent levels when the node _is_ in the tree. * * Therefore we set subtree_max_size to zero after insertion, @@ -1655,7 +1656,7 @@ static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual critical section protected - * by this look, but we want to avoid concurrent calls for performance + * by this lock, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic.
*/ static DEFINE_MUTEX(vmap_purge_lock); From 673520f8da64f16077c1ecb190cbb38aa939fb41 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Sat, 4 Jun 2022 16:22:09 +0800 Subject: [PATCH 012/282] mm: memcontrol: add {pgscan,pgsteal}_{kswapd,direct} items in memory.stat of cgroup v2 There are already statistics of {pgscan,pgsteal}_kswapd and {pgscan,pgsteal}_direct of memcg event here, but now only the sum of the two is displayed in memory.stat of cgroup v2. In order to obtain more accurate information during monitoring and debugging, and to align with the display in /proc/vmstat, it better to display {pgscan,pgsteal}_kswapd and {pgscan,pgsteal}_direct separately. Also, for forward compatibility, we still display pgscan and pgsteal items so that it won't break existing applications. [zhengqi.arch@bytedance.com: add comment for memcg_vm_event_stat (suggested by Michal)] Link: https://lkml.kernel.org/r/20220606154028.55030-1-zhengqi.arch@bytedance.com [zhengqi.arch@bytedance.com: fix the doc, thanks to Johannes] Link: https://lkml.kernel.org/r/20220607064803.79363-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20220604082209.55174-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Muchun Song Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Muchun Song Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 24 ++++++++--- mm/memcontrol.c | 55 ++++++++++++------------- 2 files changed, 45 insertions(+), 34 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 176298f2f4de..ad9ba3ec90a5 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1433,6 +1433,24 @@ PAGE_SIZE multiple when read back. 
workingset_nodereclaim Number of times a shadow node has been reclaimed + pgscan (npn) + Amount of scanned pages (in an inactive LRU list) + + pgsteal (npn) + Amount of reclaimed pages + + pgscan_kswapd (npn) + Amount of scanned pages by kswapd (in an inactive LRU list) + + pgscan_direct (npn) + Amount of scanned pages directly (in an inactive LRU list) + + pgsteal_kswapd (npn) + Amount of reclaimed pages by kswapd + + pgsteal_direct (npn) + Amount of reclaimed pages directly + pgfault (npn) Total number of page faults incurred @@ -1442,12 +1460,6 @@ PAGE_SIZE multiple when read back. pgrefill (npn) Amount of scanned pages (in an active LRU list) - pgscan (npn) - Amount of scanned pages (in an inactive LRU list) - - pgsteal (npn) - Amount of reclaimed pages - pgactivate (npn) Amount of pages moved to the active LRU list diff --git a/mm/memcontrol.c b/mm/memcontrol.c index abec50f31fe6..28c1532cc91f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1460,6 +1460,29 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } +/* Subset of vm_event_item to report for memcg event stats */ +static const unsigned int memcg_vm_event_stat[] = { + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGFAULT, + PGMAJFAULT, + PGREFILL, + PGACTIVATE, + PGDEACTIVATE, + PGLAZYFREE, + PGLAZYFREED, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, +#endif +}; + static char *memory_stat_format(struct mem_cgroup *memcg) { struct seq_buf s; @@ -1495,41 +1518,17 @@ static char *memory_stat_format(struct mem_cgroup *memcg) } /* Accumulated memory events */ - - seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT), - memcg_events(memcg, PGFAULT)); - seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT), - memcg_events(memcg, PGMAJFAULT)); - seq_buf_printf(&s, "%s 
%lu\n", vm_event_name(PGREFILL), - memcg_events(memcg, PGREFILL)); seq_buf_printf(&s, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + memcg_events(memcg, PGSCAN_DIRECT)); seq_buf_printf(&s, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT)); - seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE), - memcg_events(memcg, PGACTIVATE)); - seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE), - memcg_events(memcg, PGDEACTIVATE)); - seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE), - memcg_events(memcg, PGLAZYFREE)); - seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED), - memcg_events(memcg, PGLAZYFREED)); -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) - seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN), - memcg_events(memcg, ZSWPIN)); - seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT), - memcg_events(memcg, ZSWPOUT)); -#endif - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC), - memcg_events(memcg, THP_FAULT_ALLOC)); - seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC), - memcg_events(memcg, THP_COLLAPSE_ALLOC)); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) + seq_buf_printf(&s, "%s %lu\n", + vm_event_name(memcg_vm_event_stat[i]), + memcg_events(memcg, memcg_vm_event_stat[i])); /* The above should easily fit into one page */ WARN_ON_ONCE(seq_buf_has_overflowed(&s)); From 9384d79249d04b03572abb7e551a35d99c9268c0 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 6 Jun 2022 16:15:33 +0200 Subject: [PATCH 013/282] mm/highmem: delete memmove_page() Matthew Wilcox reported that, while he was looking at memmove_page(), he realized that it can't actually work. The reasons are hidden in its implementation, which makes use of memmove() on logical addresses provided by kmap_local_page(). memmove() does the wrong thing when it tests "if (dest <= src)". 
Therefore, delete memmove_page(). No need to change any other code because we have no call sites of memmove_page() across the whole kernel. Link: https://lkml.kernel.org/r/20220606141533.555-1-fmdefrancesco@gmail.com Signed-off-by: Fabio M. De Francesco Reported-by: Matthew Wilcox Reviewed-by: Baoquan He Reviewed-by: Ira Weiny Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/highmem.h | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 3af34de54330..fee9835e3793 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -336,19 +336,6 @@ static inline void memcpy_page(struct page *dst_page, size_t dst_off, kunmap_local(dst); } -static inline void memmove_page(struct page *dst_page, size_t dst_off, - struct page *src_page, size_t src_off, - size_t len) -{ - char *dst = kmap_local_page(dst_page); - char *src = kmap_local_page(src_page); - - VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); - memmove(dst + dst_off, src + src_off, len); - kunmap_local(src); - kunmap_local(dst); -} - static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { From 446ec83805ddaab5b8734d30ba4ae8c56739a9b4 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sun, 5 Jun 2022 17:25:37 +0200 Subject: [PATCH 014/282] mm/page_alloc: use might_alloc() ... instead of open coding it. Completely equivalent code, just a notch more meaningful when reading. 
Link: https://lkml.kernel.org/r/20220605152539.3196045-1-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e008a3df0485..81fadb266973 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5197,10 +5197,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, *alloc_flags |= ALLOC_CPUSET; } - fs_reclaim_acquire(gfp_mask); - fs_reclaim_release(gfp_mask); - - might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); + might_alloc(gfp_mask); if (should_fail_alloc_page(gfp_mask, order)) return false; From a3967244430eb91698ac8dca7db8bd0871251305 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sun, 5 Jun 2022 17:25:38 +0200 Subject: [PATCH 015/282] mm/slab: delete cache_alloc_debugcheck_before() It only does a might_sleep_if(GFP_RECLAIM) check, which is already covered by the might_alloc() in slab_pre_alloc_hook(). And all callers of cache_alloc_debugcheck_before() call that beforehand already. 
Link: https://lkml.kernel.org/r/20220605152539.3196045-2-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/slab.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index f8cd00f4ba13..47151fb2b2d2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2958,12 +2958,6 @@ direct_grow: return ac->entry[--ac->avail]; } -static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, - gfp_t flags) -{ - might_sleep_if(gfpflags_allow_blocking(flags)); -} - #if DEBUG static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) @@ -3205,7 +3199,6 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_ if (unlikely(ptr)) goto out_hooks; - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); if (nodeid == NUMA_NO_NODE) @@ -3290,7 +3283,6 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, if (unlikely(objp)) goto out; - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); @@ -3527,8 +3519,6 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, if (!s) return 0; - cache_alloc_debugcheck_before(s, flags); - local_irq_disable(); for (i = 0; i < size; i++) { void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags); From 21bfe8db0a4223c16d4f863ef4250dce5ffd08bb Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Sun, 5 Jun 2022 17:25:39 +0200 Subject: [PATCH 016/282] mm/mempool: use might_alloc() mempool are generally used for GFP_NOIO, so this wont benefit all that much because might_alloc currently only checks GFP_NOFS. 
But it does validate against mmu notifier pte zapping, which might catch some drivers doing really silly things, plus it's a bit more meaningful in what we're checking for here. Link: https://lkml.kernel.org/r/20220605152539.3196045-3-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Reviewed-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/mempool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempool.c b/mm/mempool.c index b933d0fc21b8..96488b13a1ef 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -379,7 +379,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); + might_alloc(gfp_mask); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ From 23689037e0986724e3bdc41bb2ee6fa1b497b8f9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 7 Jun 2022 22:36:21 +0800 Subject: [PATCH 017/282] mm/memremap: fix wrong function name above memremap_pages() Fix the wrong function name dev_memremap_pages above memremap_pages() to avoid confusion. Minor readability improvement. Link: https://lkml.kernel.org/r/20220607143621.58989-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memremap.c b/mm/memremap.c index b870a659eee6..377ccbc7e356 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -279,8 +279,8 @@ err_pfn_remap: /* - * Not device managed version of dev_memremap_pages, undone by - * memunmap_pages(). Please use dev_memremap_pages if you have a struct + * Not device managed version of devm_memremap_pages, undone by + * memunmap_pages(). Please use devm_memremap_pages if you have a struct * device available.
*/ void *memremap_pages(struct dev_pagemap *pgmap, int nid) From ed913b055a74b723976f8e885a3395162a0371e6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 9 Jun 2022 21:08:35 +0800 Subject: [PATCH 018/282] lib/test_hmm: avoid accessing uninitialized pages If make_device_exclusive_range() fails or returns fewer pages marked for exclusive access than required, the remaining fields of pages will be left uninitialized. So dmirror_atomic_map() will access those yet uninitialized fields of pages. To fix it, do dmirror_atomic_map() iff all pages are marked for exclusive access (we will break if mapped is less than required anyway) so we won't access those uninitialized fields of pages. Link: https://lkml.kernel.org/r/20220609130835.35110-1-linmiaohe@huawei.com Fixes: b659baea7546 ("mm: selftests for exclusive device memory") Signed-off-by: Miaohe Lin Cc: Jerome Glisse Cc: Alistair Popple Cc: Jason Gunthorpe Cc: Ralph Campbell Signed-off-by: Andrew Morton --- lib/test_hmm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index cfe632047839..f2c3015c5c82 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -732,7 +732,7 @@ static int dmirror_exclusive(struct dmirror *dmirror, mmap_read_lock(mm); for (addr = start; addr < end; addr = next) { - unsigned long mapped; + unsigned long mapped = 0; int i; if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT)) @@ -741,7 +741,13 @@ static int dmirror_exclusive(struct dmirror *dmirror, next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT); ret = make_device_exclusive_range(mm, addr, next, pages, NULL); - mapped = dmirror_atomic_map(addr, next, pages, dmirror); + /* + * Do dmirror_atomic_map() iff all pages are marked for + * exclusive access to avoid accessing uninitialized + * fields of pages.
+ */ + if (ret == (next - addr) >> PAGE_SHIFT) + mapped = dmirror_atomic_map(addr, next, pages, dmirror); for (i = 0; i < ret; i++) { if (pages[i]) { unlock_page(pages[i]); From c200d90049dbe08fa8b016f74b713fddefca0479 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 11 Jun 2022 11:55:48 +0800 Subject: [PATCH 019/282] mm: kmemleak: remove kmemleak_not_leak_phys() and the min_count argument to kmemleak_alloc_phys() Patch series "mm: kmemleak: store objects allocated with physical address separately and check when scan", v4. The kmemleak_*_phys() interface uses "min_low_pfn" and "max_low_pfn" to check the address. But on some architectures, kmemleak_*_phys() is called before those two variables are initialized. The following steps will be taken: 1) Add OBJECT_PHYS flag and rbtree for the objects allocated with physical address 2) Store physical address in objects if allocated with OBJECT_PHYS 3) Check the boundary when scan instead of in kmemleak_*_phys() This patch set will solve: https://lore.kernel.org/r/20220527032504.30341-1-yee.lee@mediatek.com https://lore.kernel.org/r/9dd08bb5-f39e-53d8-f88d-bec598a08c93@gmail.com v3: https://lore.kernel.org/r/20220609124950.1694394-1-patrick.wang.shcn@gmail.com v2: https://lore.kernel.org/r/20220603035415.1243913-1-patrick.wang.shcn@gmail.com v1: https://lore.kernel.org/r/20220531150823.1004101-1-patrick.wang.shcn@gmail.com This patch (of 4): Remove the unused kmemleak_not_leak_phys() function. And remove the min_count argument to kmemleak_alloc_phys() function, assume it's 0.
Link: https://lkml.kernel.org/r/20220611035551.1823303-1-patrick.wang.shcn@gmail.com Link: https://lkml.kernel.org/r/20220611035551.1823303-2-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Cc: Yee Lee Signed-off-by: Andrew Morton --- Documentation/dev-tools/kmemleak.rst | 1 - drivers/of/fdt.c | 2 +- include/linux/kmemleak.h | 8 ++------ mm/kmemleak.c | 20 +++----------------- mm/memblock.c | 14 +++++++------- tools/testing/memblock/linux/kmemleak.h | 2 +- 6 files changed, 14 insertions(+), 33 deletions(-) diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index 1c935f41cd3a..5483fd39ef29 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -174,7 +174,6 @@ mapping: - ``kmemleak_alloc_phys`` - ``kmemleak_free_part_phys`` -- ``kmemleak_not_leak_phys`` - ``kmemleak_ignore_phys`` Dealing with false positives/negatives diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index a8f5b6532165..2c677e84c3f5 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -529,7 +529,7 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", uname, &base, (unsigned long)(size / SZ_1M)); if (!nomap) - kmemleak_alloc_phys(base, size, 0, 0); + kmemleak_alloc_phys(base, size, 0); } else pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 34684b2026ab..6a3cd1bf4680 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -29,10 +29,9 @@ extern void kmemleak_not_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; -extern void kmemleak_alloc_phys(phys_addr_t phys, size_t 
size, int min_count, +extern void kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) __ref; extern void kmemleak_free_part_phys(phys_addr_t phys, size_t size) __ref; -extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, @@ -107,15 +106,12 @@ static inline void kmemleak_no_scan(const void *ptr) { } static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size, - int min_count, gfp_t gfp) + gfp_t gfp) { } static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size) { } -static inline void kmemleak_not_leak_phys(phys_addr_t phys) -{ -} static inline void kmemleak_ignore_phys(phys_addr_t phys) { } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a182f5ddaf68..156eafafa182 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1125,15 +1125,13 @@ EXPORT_SYMBOL(kmemleak_no_scan); * address argument * @phys: physical address of the object * @size: size of the object - * @min_count: minimum number of references to this object. 
- * See kmemleak_alloc() * @gfp: kmalloc() flags used for kmemleak internal memory allocations */ -void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, - gfp_t gfp) +void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) { if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) - kmemleak_alloc(__va(phys), size, min_count, gfp); + /* assume min_count 0 */ + kmemleak_alloc(__va(phys), size, 0, gfp); } EXPORT_SYMBOL(kmemleak_alloc_phys); @@ -1151,18 +1149,6 @@ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) } EXPORT_SYMBOL(kmemleak_free_part_phys); -/** - * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical - * address argument - * @phys: physical address of the object - */ -void __ref kmemleak_not_leak_phys(phys_addr_t phys) -{ - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) - kmemleak_not_leak(__va(phys)); -} -EXPORT_SYMBOL(kmemleak_not_leak_phys); - /** * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical * address argument diff --git a/mm/memblock.c b/mm/memblock.c index e4f03a6e8e56..749abd2685c4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1345,8 +1345,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, * from the regions with mirroring enabled and then retried from any * memory region. * - * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for - * allocated boot memory block, so that it is never reported as leaks. + * In addition, function using kmemleak_alloc_phys for allocated boot + * memory block, it is never reported as leaks. * * Return: * Physical address of allocated memory block on success, %0 on failure. @@ -1398,12 +1398,12 @@ done: */ if (end != MEMBLOCK_ALLOC_NOLEAKTRACE) /* - * The min_count is set to 0 so that memblock allocated - * blocks are never reported as leaks. 
This is because many - * of these blocks are only referred via the physical - * address which is not looked up by kmemleak. + * Memblock allocated blocks are never reported as + * leaks. This is because many of these blocks are + * only referred via the physical address which is + * not looked up by kmemleak. */ - kmemleak_alloc_phys(found, size, 0, 0); + kmemleak_alloc_phys(found, size, 0); return found; } diff --git a/tools/testing/memblock/linux/kmemleak.h b/tools/testing/memblock/linux/kmemleak.h index 462f8c5e8aa0..5fed13bb9ec4 100644 --- a/tools/testing/memblock/linux/kmemleak.h +++ b/tools/testing/memblock/linux/kmemleak.h @@ -7,7 +7,7 @@ static inline void kmemleak_free_part_phys(phys_addr_t phys, size_t size) } static inline void kmemleak_alloc_phys(phys_addr_t phys, size_t size, - int min_count, gfp_t gfp) + gfp_t gfp) { } From 8e0c4ab36c61c514a9c1caaf5707d1f55ab5f6df Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 11 Jun 2022 11:55:49 +0800 Subject: [PATCH 020/282] mm: kmemleak: add OBJECT_PHYS flag for objects allocated with physical address Add OBJECT_PHYS flag for object. This flag is used to identify the objects allocated with physical address. The create_object_phys() function is added as well to set that flag and is used by kmemleak_alloc_phys(). 
Link: https://lkml.kernel.org/r/20220611035551.1823303-3-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Cc: Yee Lee Signed-off-by: Andrew Morton --- mm/kmemleak.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 156eafafa182..d82d8db0e8df 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -172,6 +172,8 @@ struct kmemleak_object { #define OBJECT_NO_SCAN (1 << 2) /* flag set to fully scan the object when scan_area allocation failed */ #define OBJECT_FULL_SCAN (1 << 3) +/* flag set for object allocated with physical address */ +#define OBJECT_PHYS (1 << 4) #define HEX_PREFIX " " /* number of bytes to print per line; must be 16 or 32 */ @@ -574,8 +576,9 @@ static int __save_stack_trace(unsigned long *trace) * Create the metadata (struct kmemleak_object) corresponding to an allocated * memory block and add it to the object_list and object_tree_root. */ -static struct kmemleak_object *create_object(unsigned long ptr, size_t size, - int min_count, gfp_t gfp) +static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp, + bool is_phys) { unsigned long flags; struct kmemleak_object *object, *parent; @@ -595,7 +598,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, INIT_HLIST_HEAD(&object->area_list); raw_spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); - object->flags = OBJECT_ALLOCATED; + object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); object->pointer = ptr; object->size = kfence_ksize((void *)ptr) ?: size; object->excess_ref = 0; @@ -662,6 +665,20 @@ out: return object; } +/* Create kmemleak object which allocated with virtual address. 
*/ +static struct kmemleak_object *create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) +{ + return __create_object(ptr, size, min_count, gfp, false); +} + +/* Create kmemleak object which allocated with physical address. */ +static struct kmemleak_object *create_object_phys(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) +{ + return __create_object(ptr, size, min_count, gfp, true); +} + /* * Mark the object as not allocated and schedule RCU freeing via put_object(). */ @@ -728,11 +745,11 @@ static void delete_object_part(unsigned long ptr, size_t size) start = object->pointer; end = object->pointer + object->size; if (ptr > start) - create_object(start, ptr - start, object->min_count, - GFP_KERNEL); + __create_object(start, ptr - start, object->min_count, + GFP_KERNEL, object->flags & OBJECT_PHYS); if (ptr + size < end) - create_object(ptr + size, end - ptr - size, object->min_count, - GFP_KERNEL); + __create_object(ptr + size, end - ptr - size, object->min_count, + GFP_KERNEL, object->flags & OBJECT_PHYS); __delete_object(object); } @@ -1129,9 +1146,14 @@ EXPORT_SYMBOL(kmemleak_no_scan); */ void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) { + pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size); + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) - /* assume min_count 0 */ - kmemleak_alloc(__va(phys), size, 0, gfp); + /* + * Create object with OBJECT_PHYS flag and + * assume min_count 0. + */ + create_object_phys((unsigned long)__va(phys), size, 0, gfp); } EXPORT_SYMBOL(kmemleak_alloc_phys); From 0c24e061196c21d53328d60f4ad0e5a2b3183343 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 11 Jun 2022 11:55:50 +0800 Subject: [PATCH 021/282] mm: kmemleak: add rbtree and store physical address for objects allocated with PA Add object_phys_tree_root to store the objects allocated with physical address. Distinguish it from object_tree_root by OBJECT_PHYS flag or function argument. 
The physical address is stored directly in those objects. Link: https://lkml.kernel.org/r/20220611035551.1823303-4-patrick.wang.shcn@gmail.com Signed-off-by: Patrick Wang Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Cc: Yee Lee Signed-off-by: Andrew Morton --- mm/kmemleak.c | 133 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 91 insertions(+), 42 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d82d8db0e8df..ee94b028cb8a 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -14,14 +14,16 @@ * The following locks and mutexes are used by kmemleak: * * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and - * accesses to the object_tree_root. The object_list is the main list - * holding the metadata (struct kmemleak_object) for the allocated memory - * blocks. The object_tree_root is a red black tree used to look-up - * metadata based on a pointer to the corresponding memory block. The - * kmemleak_object structures are added to the object_list and - * object_tree_root in the create_object() function called from the - * kmemleak_alloc() callback and removed in delete_object() called from the - * kmemleak_free() callback + * accesses to the object_tree_root (or object_phys_tree_root). The + * object_list is the main list holding the metadata (struct kmemleak_object) + * for the allocated memory blocks. The object_tree_root and object_phys_tree_root + * are red black trees used to look-up metadata based on a pointer to the + * corresponding memory block. The object_phys_tree_root is for objects + * allocated with physical address. The kmemleak_object structures are + * added to the object_list and object_tree_root (or object_phys_tree_root) + * in the create_object() function called from the kmemleak_alloc() (or + * kmemleak_alloc_phys()) callback and removed in delete_object() called from + * the kmemleak_free() callback * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object. 
* Accesses to the metadata (e.g. count) are protected by this lock. Note * that some members of this structure may be protected by other means @@ -195,7 +197,9 @@ static int mem_pool_free_count = ARRAY_SIZE(mem_pool); static LIST_HEAD(mem_pool_free_list); /* search tree for object boundaries */ static struct rb_root object_tree_root = RB_ROOT; -/* protecting the access to object_list and object_tree_root */ +/* search tree for object (with OBJECT_PHYS flag) boundaries */ +static struct rb_root object_phys_tree_root = RB_ROOT; +/* protecting the access to object_list, object_tree_root (or object_phys_tree_root) */ static DEFINE_RAW_SPINLOCK(kmemleak_lock); /* allocation caches for kmemleak internal data */ @@ -287,6 +291,9 @@ static void hex_dump_object(struct seq_file *seq, const u8 *ptr = (const u8 *)object->pointer; size_t len; + if (WARN_ON_ONCE(object->flags & OBJECT_PHYS)) + return; + /* limit the number of lines to HEX_MAX_LINES */ len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); @@ -380,9 +387,11 @@ static void dump_object_info(struct kmemleak_object *object) * beginning of the memory block are allowed. The kmemleak_lock must be held * when calling this function. */ -static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) +static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias, + bool is_phys) { - struct rb_node *rb = object_tree_root.rb_node; + struct rb_node *rb = is_phys ? object_phys_tree_root.rb_node : + object_tree_root.rb_node; unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); while (rb) { @@ -408,6 +417,12 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) return NULL; } +/* Look-up a kmemleak object which allocated with virtual address. */ +static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) +{ + return __lookup_object(ptr, alias, false); +} + /* * Increment the object use_count. Return 1 if successful or 0 otherwise. 
Note * that once an object's use_count reached 0, the RCU freeing was already @@ -517,14 +532,15 @@ static void put_object(struct kmemleak_object *object) /* * Look up an object in the object search tree and increase its use_count. */ -static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) +static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alias, + bool is_phys) { unsigned long flags; struct kmemleak_object *object; rcu_read_lock(); raw_spin_lock_irqsave(&kmemleak_lock, flags); - object = lookup_object(ptr, alias); + object = __lookup_object(ptr, alias, is_phys); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); /* check whether the object is still available */ @@ -535,28 +551,39 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) return object; } +/* Look up and get an object which allocated with virtual address. */ +static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) +{ + return __find_and_get_object(ptr, alias, false); +} + /* - * Remove an object from the object_tree_root and object_list. Must be called - * with the kmemleak_lock held _if_ kmemleak is still enabled. + * Remove an object from the object_tree_root (or object_phys_tree_root) + * and object_list. Must be called with the kmemleak_lock held _if_ kmemleak + * is still enabled. */ static void __remove_object(struct kmemleak_object *object) { - rb_erase(&object->rb_node, &object_tree_root); + rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ? + &object_phys_tree_root : + &object_tree_root); list_del_rcu(&object->object_list); } /* * Look up an object in the object search tree and remove it from both - * object_tree_root and object_list. The returned object's use_count should be - * at least 1, as initially set by create_object(). + * object_tree_root (or object_phys_tree_root) and object_list. 
The + * returned object's use_count should be at least 1, as initially set + * by create_object(). */ -static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias) +static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias, + bool is_phys) { unsigned long flags; struct kmemleak_object *object; raw_spin_lock_irqsave(&kmemleak_lock, flags); - object = lookup_object(ptr, alias); + object = __lookup_object(ptr, alias, is_phys); if (object) __remove_object(object); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); @@ -574,7 +601,8 @@ static int __save_stack_trace(unsigned long *trace) /* * Create the metadata (struct kmemleak_object) corresponding to an allocated - * memory block and add it to the object_list and object_tree_root. + * memory block and add it to the object_list and object_tree_root (or + * object_phys_tree_root). */ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, int min_count, gfp_t gfp, @@ -631,9 +659,16 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, raw_spin_lock_irqsave(&kmemleak_lock, flags); untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); - min_addr = min(min_addr, untagged_ptr); - max_addr = max(max_addr, untagged_ptr + size); - link = &object_tree_root.rb_node; + /* + * Only update min_addr and max_addr with object + * storing virtual address. + */ + if (!is_phys) { + min_addr = min(min_addr, untagged_ptr); + max_addr = max(max_addr, untagged_ptr + size); + } + link = is_phys ? &object_phys_tree_root.rb_node : + &object_tree_root.rb_node; rb_parent = NULL; while (*link) { rb_parent = *link; @@ -657,7 +692,8 @@ static struct kmemleak_object *__create_object(unsigned long ptr, size_t size, } } rb_link_node(&object->rb_node, rb_parent, link); - rb_insert_color(&object->rb_node, &object_tree_root); + rb_insert_color(&object->rb_node, is_phys ? 
&object_phys_tree_root : + &object_tree_root); list_add_tail_rcu(&object->object_list, &object_list); out: @@ -707,7 +743,7 @@ static void delete_object_full(unsigned long ptr) { struct kmemleak_object *object; - object = find_and_remove_object(ptr, 0); + object = find_and_remove_object(ptr, 0, false); if (!object) { #ifdef DEBUG kmemleak_warn("Freeing unknown object at 0x%08lx\n", @@ -723,12 +759,12 @@ static void delete_object_full(unsigned long ptr) * delete it. If the memory block is partially freed, the function may create * additional metadata for the remaining parts of the block. */ -static void delete_object_part(unsigned long ptr, size_t size) +static void delete_object_part(unsigned long ptr, size_t size, bool is_phys) { struct kmemleak_object *object; unsigned long start, end; - object = find_and_remove_object(ptr, 1); + object = find_and_remove_object(ptr, 1, is_phys); if (!object) { #ifdef DEBUG kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", @@ -746,10 +782,10 @@ static void delete_object_part(unsigned long ptr, size_t size) end = object->pointer + object->size; if (ptr > start) __create_object(start, ptr - start, object->min_count, - GFP_KERNEL, object->flags & OBJECT_PHYS); + GFP_KERNEL, is_phys); if (ptr + size < end) __create_object(ptr + size, end - ptr - size, object->min_count, - GFP_KERNEL, object->flags & OBJECT_PHYS); + GFP_KERNEL, is_phys); __delete_object(object); } @@ -770,11 +806,11 @@ static void paint_it(struct kmemleak_object *object, int color) raw_spin_unlock_irqrestore(&object->lock, flags); } -static void paint_ptr(unsigned long ptr, int color) +static void paint_ptr(unsigned long ptr, int color, bool is_phys) { struct kmemleak_object *object; - object = find_and_get_object(ptr, 0); + object = __find_and_get_object(ptr, 0, is_phys); if (!object) { kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n", ptr, @@ -792,16 +828,16 @@ static void paint_ptr(unsigned long ptr, int color) */ static void 
make_gray_object(unsigned long ptr) { - paint_ptr(ptr, KMEMLEAK_GREY); + paint_ptr(ptr, KMEMLEAK_GREY, false); } /* * Mark the object as black-colored so that it is ignored from scans and * reporting. */ -static void make_black_object(unsigned long ptr) +static void make_black_object(unsigned long ptr, bool is_phys) { - paint_ptr(ptr, KMEMLEAK_BLACK); + paint_ptr(ptr, KMEMLEAK_BLACK, is_phys); } /* @@ -1007,7 +1043,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) pr_debug("%s(0x%p)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) - delete_object_part((unsigned long)ptr, size); + delete_object_part((unsigned long)ptr, size, false); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -1095,7 +1131,7 @@ void __ref kmemleak_ignore(const void *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) - make_black_object((unsigned long)ptr); + make_black_object((unsigned long)ptr, false); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1153,7 +1189,7 @@ void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) * Create object with OBJECT_PHYS flag and * assume min_count 0. 
*/ - create_object_phys((unsigned long)__va(phys), size, 0, gfp); + create_object_phys((unsigned long)phys, size, 0, gfp); } EXPORT_SYMBOL(kmemleak_alloc_phys); @@ -1166,8 +1202,10 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); */ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { + pr_debug("%s(0x%pa)\n", __func__, &phys); + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) - kmemleak_free_part(__va(phys), size); + delete_object_part((unsigned long)phys, size, true); } EXPORT_SYMBOL(kmemleak_free_part_phys); @@ -1178,8 +1216,10 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); */ void __ref kmemleak_ignore_phys(phys_addr_t phys) { + pr_debug("%s(0x%pa)\n", __func__, &phys); + if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) - kmemleak_ignore(__va(phys)); + make_black_object((unsigned long)phys, true); } EXPORT_SYMBOL(kmemleak_ignore_phys); @@ -1190,6 +1230,9 @@ static bool update_checksum(struct kmemleak_object *object) { u32 old_csum = object->checksum; + if (WARN_ON_ONCE(object->flags & OBJECT_PHYS)) + return false; + kasan_disable_current(); kcsan_disable_current(); object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size); @@ -1343,6 +1386,7 @@ static void scan_object(struct kmemleak_object *object) { struct kmemleak_scan_area *area; unsigned long flags; + void *obj_ptr; /* * Once the object->lock is acquired, the corresponding memory block @@ -1354,10 +1398,15 @@ static void scan_object(struct kmemleak_object *object) if (!(object->flags & OBJECT_ALLOCATED)) /* already freed object */ goto out; + + obj_ptr = object->flags & OBJECT_PHYS ? 
+ __va((phys_addr_t)object->pointer) : + (void *)object->pointer; + if (hlist_empty(&object->area_list) || object->flags & OBJECT_FULL_SCAN) { - void *start = (void *)object->pointer; - void *end = (void *)(object->pointer + object->size); + void *start = obj_ptr; + void *end = obj_ptr + object->size; void *next; do { From 84c326299191042a719655d3327538fc52aa8473 Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Sat, 11 Jun 2022 11:55:51 +0800 Subject: [PATCH 022/282] mm: kmemleak: check physical address when scan Check the physical address of each object against the lowmem boundaries at scan time instead of in kmemleak_*_phys(). Link: https://lkml.kernel.org/r/20220611035551.1823303-5-patrick.wang.shcn@gmail.com Fixes: 23c2d497de21 ("mm: kmemleak: take a full lowmem check in kmemleak_*_phys()") Signed-off-by: Patrick Wang Suggested-by: Catalin Marinas Reviewed-by: Catalin Marinas Cc: Yee Lee Signed-off-by: Andrew Morton --- mm/kmemleak.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index ee94b028cb8a..466126625d76 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1184,7 +1184,7 @@ void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) { pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size); - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + if (kmemleak_enabled) /* * Create object with OBJECT_PHYS flag and * assume min_count 0.
@@ -1204,7 +1204,7 @@ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { pr_debug("%s(0x%pa)\n", __func__, &phys); - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + if (kmemleak_enabled) delete_object_part((unsigned long)phys, size, true); } EXPORT_SYMBOL(kmemleak_free_part_phys); @@ -1218,7 +1218,7 @@ void __ref kmemleak_ignore_phys(phys_addr_t phys) { pr_debug("%s(0x%pa)\n", __func__, &phys); - if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn) + if (kmemleak_enabled) make_black_object((unsigned long)phys, true); } EXPORT_SYMBOL(kmemleak_ignore_phys); @@ -1493,6 +1493,17 @@ static void kmemleak_scan(void) dump_object_info(object); } #endif + + /* ignore objects outside lowmem (paint them black) */ + if ((object->flags & OBJECT_PHYS) && + !(object->flags & OBJECT_NO_SCAN)) { + unsigned long phys = object->pointer; + + if (PHYS_PFN(phys) < min_low_pfn || + PHYS_PFN(phys + object->size) >= max_low_pfn) + __paint_it(object, KMEMLEAK_BLACK); + } + /* reset the reference count (whiten the object) */ object->count = 0; if (color_gray(object) && get_object(object)) From 1e57ffb6e3fd9583268c6462c4e3853575b21701 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 9 Jun 2022 20:13:05 +0800 Subject: [PATCH 023/282] mm/memremap: fix memunmap_pages() race with get_dev_pagemap() Consider the following scenario: CPU1 CPU2 memunmap_pages percpu_ref_exit __percpu_ref_exit free_percpu(percpu_count); /* percpu_count is freed here! */ get_dev_pagemap xa_load(&pgmap_array, PHYS_PFN(phys)) /* pgmap still in the pgmap_array */ percpu_ref_tryget_live(&pgmap->ref) if __ref_is_percpu /* __PERCPU_REF_ATOMIC_DEAD not set yet */ this_cpu_inc(*percpu_count) /* access freed percpu_count here! */ ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; /* too late... */ pageunmap_range To fix the issue, do percpu_ref_exit() after pgmap_array is emptied, so that percpu_ref_tryget_live() is never called on a percpu_ref that is being freed.
Link: https://lkml.kernel.org/r/20220609121305.2508-1-linmiaohe@huawei.com Fixes: b7b3c01b1915 ("mm/memremap_pages: support multiple ranges per invocation") Signed-off-by: Miaohe Lin Cc: Dan Williams Signed-off-by: Andrew Morton --- mm/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memremap.c b/mm/memremap.c index 377ccbc7e356..8b5c8fd4ea8e 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -141,10 +141,10 @@ void memunmap_pages(struct dev_pagemap *pgmap) for (i = 0; i < pgmap->nr_range; i++) percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); wait_for_completion(&pgmap->done); - percpu_ref_exit(&pgmap->ref); for (i = 0; i < pgmap->nr_range; i++) pageunmap_range(pgmap, i); + percpu_ref_exit(&pgmap->ref); WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); devmap_managed_enable_put(pgmap); From fc4db90fe71e640e3fe88df346f7cf653b75315d Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 10 Jun 2022 11:03:10 -0700 Subject: [PATCH 024/282] mm: kmem: make mem_cgroup_from_obj() vmalloc()-safe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently mem_cgroup_from_obj() is not working properly with objects allocated using vmalloc(). It creates problems in some cases, when it's called for static objects belonging to modules or generally allocated using vmalloc(). This patch makes mem_cgroup_from_obj() safe to be called on objects allocated using vmalloc(). It also introduces mem_cgroup_from_slab_obj(), which is a faster version to use in places when we know the object is either a slab object or a generic slab page (e.g. when adding an object to a lru list). 
Link: https://lkml.kernel.org/r/20220610180310.1725111-1-roman.gushchin@linux.dev Suggested-by: Kefeng Wang Signed-off-by: Roman Gushchin Tested-by: Linux Kernel Functional Testing Acked-by: Shakeel Butt Tested-by: Vasily Averin Acked-by: Michal Hocko Acked-by: Muchun Song Cc: Johannes Weiner Cc: Naresh Kamboju Cc: Qian Cai Cc: Kefeng Wang Cc: David S. Miller Cc: Eric Dumazet Cc: Florian Westphal Cc: Jakub Kicinski Cc: Michal Koutný Cc: Paolo Abeni Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 6 ++++ mm/list_lru.c | 2 +- mm/memcontrol.c | 71 +++++++++++++++++++++++++++----------- 3 files changed, 57 insertions(+), 22 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9ecead1042b9..3ce96ce5fe3e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1740,6 +1740,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_obj(void *p); +struct mem_cgroup *mem_cgroup_from_slab_obj(void *p); static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) @@ -1801,6 +1802,11 @@ static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) return NULL; } +static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +{ + return NULL; +} + static inline void count_objcg_event(struct obj_cgroup *objcg, enum vm_event_item idx) { diff --git a/mm/list_lru.c b/mm/list_lru.c index ba76428ceece..a05e5bef3b40 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -71,7 +71,7 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr, if (!list_lru_memcg_aware(lru)) goto out; - memcg = mem_cgroup_from_obj(ptr); + memcg = mem_cgroup_from_slab_obj(ptr); if (!memcg) goto out; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 28c1532cc91f..c1ae9b3f8d35 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -783,7 +783,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) struct lruvec *lruvec; 
rcu_read_lock(); - memcg = mem_cgroup_from_obj(p); + memcg = mem_cgroup_from_slab_obj(p); /* * Untracked pages have no memcg, no lruvec. Update only the @@ -2841,27 +2841,9 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, return 0; } -/* - * Returns a pointer to the memory cgroup to which the kernel object is charged. - * - * A passed kernel object can be a slab object or a generic kernel page, so - * different mechanisms for getting the memory cgroup pointer should be used. - * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller - * can not know for sure how the kernel object is implemented. - * mem_cgroup_from_obj() can be safely used in such cases. - * - * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), - * cgroup_mutex, etc. - */ -struct mem_cgroup *mem_cgroup_from_obj(void *p) +static __always_inline +struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) { - struct folio *folio; - - if (mem_cgroup_disabled()) - return NULL; - - folio = virt_to_folio(p); - /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in @@ -2894,6 +2876,53 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) return page_memcg_check(folio_page(folio, 0)); } +/* + * Returns a pointer to the memory cgroup to which the kernel object is charged. + * + * A passed kernel object can be a slab object, vmalloc object or a generic + * kernel page, so different mechanisms for getting the memory cgroup pointer + * should be used. + * + * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller + * can not know for sure how the kernel object is implemented. + * mem_cgroup_from_obj() can be safely used in such cases. + * + * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), + * cgroup_mutex, etc. 
+ */ +struct mem_cgroup *mem_cgroup_from_obj(void *p) +{ + struct folio *folio; + + if (mem_cgroup_disabled()) + return NULL; + + if (unlikely(is_vmalloc_addr(p))) + folio = page_folio(vmalloc_to_page(p)); + else + folio = virt_to_folio(p); + + return mem_cgroup_from_obj_folio(folio, p); +} + +/* + * Returns a pointer to the memory cgroup to which the kernel object is charged. + * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects, + * allocated using vmalloc(). + * + * A passed kernel object must be a slab object or a generic kernel page. + * + * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), + * cgroup_mutex, etc. + */ +struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) +{ + if (mem_cgroup_disabled()) + return NULL; + + return mem_cgroup_from_obj_folio(virt_to_folio(p), p); +} + static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { struct obj_cgroup *objcg = NULL; From 1d0403d20f6c281cb3d14c5f1db5317caeec48e9 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 3 Jun 2022 07:19:43 +0300 Subject: [PATCH 025/282] net: set proper memcg for net_init hooks allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __register_pernet_operations() executes init hook of registered pernet_operation structure in all existing net namespaces. Typically, these hooks are called by a process associated with the specified net namespace, and all __GFP_ACCOUNT marked allocation are accounted for corresponding container/memcg. However __register_pernet_operations() calls the hooks in the same context, and as a result all marked allocations are accounted to one memcg for all processed net namespaces. This patch adjusts active memcg for each net namespace and helps to account memory allocated inside ops_init() into the proper memcg. 
Link: https://lkml.kernel.org/r/f9394752-e272-9bf9-645f-a18c56d1c4ec@openvz.org Signed-off-by: Vasily Averin Acked-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Michal Koutný Cc: Vlastimil Babka Cc: Michal Hocko Cc: Florian Westphal Cc: David S. Miller Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Eric Dumazet Cc: Johannes Weiner Cc: Kefeng Wang Cc: Linux Kernel Functional Testing Cc: Muchun Song Cc: Naresh Kamboju Cc: Qian Cai Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 47 +++++++++++++++++++++++++++++++++++++- net/core/net_namespace.c | 7 ++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3ce96ce5fe3e..04f2f33607e9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1756,6 +1756,42 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, rcu_read_unlock(); } +/** + * get_mem_cgroup_from_obj - get a memcg associated with passed kernel object. + * @p: pointer to object from which memcg should be extracted. It can be NULL. + * + * Retrieves the memory group into which the memory of the pointed kernel + * object is accounted. If memcg is found, its reference is taken. + * If a passed kernel object is uncharged, or if proper memcg cannot be found, + * as well as if mem_cgroup is disabled, NULL is returned. + * + * Return: valid memcg pointer with taken reference or NULL. + */ +static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); + do { + memcg = mem_cgroup_from_obj(p); + } while (memcg && !css_tryget(&memcg->css)); + rcu_read_unlock(); + return memcg; +} + +/** + * mem_cgroup_or_root - always returns a pointer to a valid memory cgroup. + * @memcg: pointer to a valid memory cgroup or NULL. + * + * If passed argument is not NULL, returns it without any additional checks + * and changes. Otherwise, root_mem_cgroup is returned. 
+ * + * NOTE: root_mem_cgroup can be NULL during early boot. + */ +static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) +{ + return memcg ? memcg : root_mem_cgroup; +} #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1799,7 +1835,7 @@ static inline int memcg_kmem_id(struct mem_cgroup *memcg) static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) { - return NULL; + return NULL; } static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p) @@ -1812,6 +1848,15 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { } +static inline struct mem_cgroup *get_mem_cgroup_from_obj(void *p) +{ + return NULL; +} + +static inline struct mem_cgroup *mem_cgroup_or_root(struct mem_cgroup *memcg) +{ + return NULL; +} #endif /* CONFIG_MEMCG_KMEM */ #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0ec2f5906a27..6b9f19122ec1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -1143,7 +1144,13 @@ static int __register_pernet_operations(struct list_head *list, * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { + struct mem_cgroup *old, *memcg; + + memcg = mem_cgroup_or_root(get_mem_cgroup_from_obj(net)); + old = set_active_memcg(memcg); error = ops_init(ops, net); + set_active_memcg(old); + mem_cgroup_put(memcg); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); From c5de43634c572b0cec0b32eecf24a17c649711c1 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Fri, 10 Jun 2022 15:12:44 +0800 Subject: [PATCH 026/282] userfaultfd/selftests: fix typo in comment Delete the redundant word 'in'. 
Link: https://lkml.kernel.org/r/20220610071244.59679-1-wangxiang@cdjrlc.com Signed-off-by: Xiang wangx Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 0bdfc1955229..4bc24581760d 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -860,7 +860,7 @@ static int stress(struct uffd_stats *uffd_stats) /* * Be strict and immediately zap area_src, the whole area has * been transferred already by the background treads. The - * area_src could then be faulted in in a racy way by still + * area_src could then be faulted in a racy way by still * running uffdio_threads reading zeropages after we zapped * area_src (but they're guaranteed to get -EEXIST from * UFFDIO_COPY without writing zero pages into area_dst From b623d434f00868c3ec76ec6e6bbd85e9e6c06457 Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Thu, 9 Jun 2022 16:32:17 -0400 Subject: [PATCH 027/282] selftests: make use of GUP_TEST_FILE macro Commit 17de1e559cf1 ("selftests: clarify common error when running gup_test") had most of its hunks dropped due to a conflict with another patch accepted into Linux around the same time that implemented the same behavior as a subset of other changes. However, the remaining hunk defines the GUP_TEST_FILE macro without making use of it. This patch makes use of the macro in the two relevant places. Furthermore, the above mentioned commit's log message erroneously describes the changes that were dropped from the patch. This patch corrects the record. 
Link: https://lkml.kernel.org/r/20220609203217.3206247-1-jsavitz@redhat.com Fixes: 17de1e559cf1 ("selftests: clarify common error when running gup_test") Signed-off-by: Joel Savitz Reviewed-by: Shuah Khan Acked-by: Nico Pache Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/gup_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index 6bb36ca71cb5..a309876d832f 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -209,7 +209,7 @@ int main(int argc, char **argv) if (write) gup.gup_flags |= FOLL_WRITE; - gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + gup_fd = open(GUP_TEST_FILE, O_RDWR); if (gup_fd == -1) { switch (errno) { case EACCES: @@ -224,7 +224,7 @@ int main(int argc, char **argv) printf("check if CONFIG_GUP_TEST is enabled in kernel config\n"); break; default: - perror("failed to open /sys/kernel/debug/gup_test"); + perror("failed to open " GUP_TEST_FILE); break; } exit(KSFT_SKIP); From 55896f935a60b919ce699d11754061f6df936a7d Mon Sep 17 00:00:00 2001 From: Gautam Menghani Date: Sun, 12 Jun 2022 11:23:20 -0700 Subject: [PATCH 028/282] mm/sparse-vmemmap.c: remove unwanted initialization in vmemmap_populate_compound_pages() Remove unnecessary initialization for the variable 'next'. 
This fixes the clang scan warning: Value stored to 'next' during its initialization is never read [deadcode.DeadStores] Link: https://lkml.kernel.org/r/20220612182320.160651-1-gautammenghani201@gmail.com Signed-off-by: Gautam Menghani Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Reviewed-by: Joao Martins Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 49cb15cbe590..652f11a05749 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -737,7 +737,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page)); for (addr = start; addr < end; addr += size) { - unsigned long next = addr, last = addr + size; + unsigned long next, last = addr + size; /* Populate the head page vmemmap page */ pte = vmemmap_populate_address(addr, node, NULL, NULL); From 00c155066eca8bcfc9b255db017119b84eea6909 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 14 Jun 2022 18:03:57 -0400 Subject: [PATCH 029/282] mm/kmemleak: use _irq lock/unlock variants in kmemleak_scan/_clear() Patch series "mm/kmemleak: Avoid soft lockup in kmemleak_scan()", v2. There are 3 RCU-based object iteration loops in kmemleak_scan(). Because of the need to take RCU read lock, we can't insert cond_resched() into the loop like other parts of the function. As there can be millions of objects to be scanned, it takes a while to iterate all of them. The kmemleak functionality is usually enabled in a debug kernel which is much slower than a non-debug kernel. With sufficient number of kmemleak objects, the time to iterate them all may exceed 22s causing soft lockup. watchdog: BUG: soft lockup - CPU#3 stuck for 22s! [kmemleak:625] This patch series makes changes to the 3 object iteration loops in kmemleak_scan() to prevent them from causing soft lockup.
This patch (of 3): kmemleak_scan() is called only from the kmemleak scan thread or from write to the kmemleak debugfs file. Both are in task context and so we can directly use the simpler _irq() lock/unlock calls instead of the more complex _irqsave/_irqrestore variants. Similarly, kmemleak_clear() is called only from write to the kmemleak debugfs file. The same change can be applied. Link: https://lkml.kernel.org/r/20220614220359.59282-1-longman@redhat.com Link: https://lkml.kernel.org/r/20220614220359.59282-2-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Muchun Song Reviewed-by: Catalin Marinas Signed-off-by: Andrew Morton --- mm/kmemleak.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 466126625d76..dc1aa9a7125f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1470,7 +1470,6 @@ static void scan_gray_list(void) */ static void kmemleak_scan(void) { - unsigned long flags; struct kmemleak_object *object; struct zone *zone; int __maybe_unused i; @@ -1481,7 +1480,7 @@ static void kmemleak_scan(void) /* prepare the kmemleak_object's */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - raw_spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* * With a few exceptions there should be a maximum of @@ -1509,7 +1508,7 @@ static void kmemleak_scan(void) if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); - raw_spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irq(&object->lock); } rcu_read_unlock(); @@ -1577,14 +1576,14 @@ static void kmemleak_scan(void) */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - raw_spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irq(&object->lock); if (color_white(object) && (object->flags & OBJECT_ALLOCATED) && update_checksum(object) && get_object(object)) { /* color it gray temporarily */ 
object->count = object->min_count; list_add_tail(&object->gray_list, &gray_list); } - raw_spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irq(&object->lock); } rcu_read_unlock(); @@ -1604,7 +1603,7 @@ static void kmemleak_scan(void) */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - raw_spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irq(&object->lock); if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; @@ -1614,7 +1613,7 @@ static void kmemleak_scan(void) new_leaks++; } - raw_spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irq(&object->lock); } rcu_read_unlock(); @@ -1816,15 +1815,14 @@ static int dump_str_object_info(const char *str) static void kmemleak_clear(void) { struct kmemleak_object *object; - unsigned long flags; rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { - raw_spin_lock_irqsave(&object->lock, flags); + raw_spin_lock_irq(&object->lock); if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) __paint_it(object, KMEMLEAK_GREY); - raw_spin_unlock_irqrestore(&object->lock, flags); + raw_spin_unlock_irq(&object->lock); } rcu_read_unlock(); From 64977918c2381aaadd544535708294213cc964f6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 14 Jun 2022 18:03:58 -0400 Subject: [PATCH 030/282] mm/kmemleak: skip unlikely objects in kmemleak_scan() without taking lock There are 3 RCU-based object iteration loops in kmemleak_scan(). Because of the need to take RCU read lock, we can't insert cond_resched() into the loop like other parts of the function. As there can be millions of objects to be scanned, it takes a while to iterate all of them. The kmemleak functionality is usually enabled in a debug kernel which is much slower than a non-debug kernel. With sufficient number of kmemleak objects, the time to iterate them all may exceed 22s causing soft lockup. 
watchdog: BUG: soft lockup - CPU#3 stuck for 22s! [kmemleak:625] In this particular bug report, the soft lockup happened in the 2nd iteration loop. In the 2nd and 3rd loops, most of the objects are checked and then skipped under the object lock. Only a select few are modified. Those objects certainly need lock protection. However, the lock/unlock operation is slow especially with interrupt disabling and enabling included. We can actually do some basic checks like color_white() without taking the lock and skip the object accordingly. Of course, this kind of check is racy and may miss objects that are being modified concurrently. The cost of missed objects, however, is just that they will be discovered in the next scan instead. The advantage of doing so is that iteration can be done much faster especially with LOCKDEP enabled in a debug kernel. With a debug kernel running on a 2-socket 96-thread x86-64 system (HZ=1000), the speedup of the 2nd and 3rd iteration loops with this patch on the first kmemleak_scan() call after bootup is shown in the table below. Before patch After patch Loop # # of objects Elapsed time # of objects Elapsed time ------ ------------ ------------ ------------ ------------ 2 2,599,850 2.392s 2,596,364 0.266s 3 2,600,176 2.171s 2,597,061 0.260s This patch reduces loop iteration times by about 88%. This will greatly reduce the chance of a soft lockup happening in the 2nd or 3rd iteration loops. Even though the first loop runs a little bit faster, it can still be problematic if many kmemleak objects are there. As the object count has to be modified in every object, we cannot avoid taking the object lock. So another way to prevent soft lockup will be needed.
Link: https://lkml.kernel.org/r/20220614220359.59282-3-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Catalin Marinas Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/kmemleak.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index dc1aa9a7125f..30d9bd41e5a1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1576,6 +1576,13 @@ static void kmemleak_scan(void) */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * This is racy but we can save the overhead of lock/unlock + * calls. The missed objects, if any, should be caught in + * the next scan. + */ + if (!color_white(object)) + continue; raw_spin_lock_irq(&object->lock); if (color_white(object) && (object->flags & OBJECT_ALLOCATED) && update_checksum(object) && get_object(object)) { @@ -1603,6 +1610,13 @@ static void kmemleak_scan(void) */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * This is racy but we can save the overhead of lock/unlock + * calls. The missed objects, if any, should be caught in + * the next scan. + */ + if (!color_white(object)) + continue; raw_spin_lock_irq(&object->lock); if (unreferenced_object(object) && !(object->flags & OBJECT_REPORTED)) { From 6edda04ccc7cfb281d139e352dbd5dd933bd2751 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 14 Jun 2022 18:03:59 -0400 Subject: [PATCH 031/282] mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan() The first RCU-based object iteration loop has to modify the object count. So we cannot skip taking the object lock. One way to avoid soft lockup is to insert occasional cond_resched() call into the loop. This cannot be done while holding the RCU read lock which is to protect objects from being freed. However, taking a reference to the object will prevent it from being freed. We can then do a cond_resched() call after every 64k objects safely. 
Link: https://lkml.kernel.org/r/20220614220359.59282-4-longman@redhat.com Signed-off-by: Waiman Long Reviewed-by: Catalin Marinas Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/kmemleak.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 30d9bd41e5a1..1eddc0132f7f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1474,12 +1474,16 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; + int loop1_cnt = 0; jiffies_last_scan = jiffies; /* prepare the kmemleak_object's */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { + bool obj_pinned = false; + + loop1_cnt++; raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1505,10 +1509,32 @@ static void kmemleak_scan(void) /* reset the reference count (whiten the object) */ object->count = 0; - if (color_gray(object) && get_object(object)) + if (color_gray(object) && get_object(object)) { list_add_tail(&object->gray_list, &gray_list); + obj_pinned = true; + } raw_spin_unlock_irq(&object->lock); + + /* + * Do a cond_resched() to avoid soft lockup every 64k objects. + * Make sure a reference has been taken so that the object + * won't go away without RCU read lock. + */ + if (!(loop1_cnt & 0xffff)) { + if (!obj_pinned && !get_object(object)) { + /* Try the next object instead */ + loop1_cnt--; + continue; + } + + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + + if (!obj_pinned) + put_object(object); + } } rcu_read_unlock(); From ee65728e103bb7dd99d8604bf6c7aa89c7d7e446 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 27 Jun 2022 09:00:26 +0300 Subject: [PATCH 032/282] docs: rename Documentation/vm to Documentation/mm so it will be consistent with code mm directory and with Documentation/admin-guide/mm and won't be confused with virtual machines. 
Signed-off-by: Mike Rapoport Suggested-by: Matthew Wilcox Tested-by: Ira Weiny Acked-by: Jonathan Corbet Acked-by: Wu XiangCheng --- Documentation/ABI/testing/sysfs-kernel-mm-ksm | 2 +- Documentation/ABI/testing/sysfs-kernel-slab | 4 ++-- Documentation/admin-guide/kernel-parameters.txt | 10 +++++----- Documentation/admin-guide/mm/concepts.rst | 2 +- Documentation/admin-guide/mm/damon/index.rst | 2 +- Documentation/admin-guide/mm/damon/reclaim.rst | 2 +- Documentation/admin-guide/mm/damon/usage.rst | 8 ++++---- Documentation/admin-guide/sysctl/vm.rst | 2 +- Documentation/core-api/index.rst | 2 +- Documentation/filesystems/proc.rst | 2 +- Documentation/index.rst | 2 +- Documentation/{vm => mm}/active_mm.rst | 0 Documentation/{vm => mm}/arch_pgtable_helpers.rst | 0 Documentation/{vm => mm}/balance.rst | 0 Documentation/{vm => mm}/bootmem.rst | 0 Documentation/{vm => mm}/damon/api.rst | 0 Documentation/{vm => mm}/damon/design.rst | 0 Documentation/{vm => mm}/damon/faq.rst | 0 Documentation/{vm => mm}/damon/index.rst | 0 Documentation/{vm => mm}/free_page_reporting.rst | 0 Documentation/{vm => mm}/frontswap.rst | 0 Documentation/{vm => mm}/highmem.rst | 0 Documentation/{vm => mm}/hmm.rst | 0 Documentation/{vm => mm}/hugetlbfs_reserv.rst | 0 Documentation/{vm => mm}/hwpoison.rst | 0 Documentation/{vm => mm}/index.rst | 0 Documentation/{vm => mm}/ksm.rst | 0 Documentation/{vm => mm}/memory-model.rst | 2 +- Documentation/{vm => mm}/mmu_notifier.rst | 0 Documentation/{vm => mm}/numa.rst | 0 Documentation/{vm => mm}/oom.rst | 0 Documentation/{vm => mm}/overcommit-accounting.rst | 0 Documentation/{vm => mm}/page_allocation.rst | 0 Documentation/{vm => mm}/page_cache.rst | 0 Documentation/{vm => mm}/page_frags.rst | 0 Documentation/{vm => mm}/page_migration.rst | 0 Documentation/{vm => mm}/page_owner.rst | 0 Documentation/{vm => mm}/page_reclaim.rst | 0 Documentation/{vm => mm}/page_table_check.rst | 0 Documentation/{vm => mm}/page_tables.rst | 0 Documentation/{vm => 
mm}/physical_memory.rst | 0 Documentation/{vm => mm}/process_addrs.rst | 0 Documentation/{vm => mm}/remap_file_pages.rst | 0 Documentation/{vm => mm}/shmfs.rst | 0 Documentation/{vm => mm}/slab.rst | 0 Documentation/{vm => mm}/slub.rst | 0 Documentation/{vm => mm}/split_page_table_lock.rst | 0 Documentation/{vm => mm}/swap.rst | 0 Documentation/{vm => mm}/transhuge.rst | 0 Documentation/{vm => mm}/unevictable-lru.rst | 0 Documentation/{vm => mm}/vmalloc.rst | 0 Documentation/{vm => mm}/vmalloced-kernel-stacks.rst | 0 Documentation/{vm => mm}/vmemmap_dedup.rst | 0 Documentation/{vm => mm}/z3fold.rst | 0 Documentation/{vm => mm}/zsmalloc.rst | 0 .../zh_CN/admin-guide/mm/damon/index.rst | 2 +- .../zh_CN/admin-guide/mm/damon/reclaim.rst | 2 +- .../zh_CN/admin-guide/mm/damon/usage.rst | 8 ++++---- Documentation/translations/zh_CN/core-api/index.rst | 2 +- Documentation/translations/zh_CN/index.rst | 2 +- .../translations/zh_CN/{vm => mm}/active_mm.rst | 2 +- .../translations/zh_CN/{vm => mm}/balance.rst | 2 +- .../translations/zh_CN/{vm => mm}/damon/api.rst | 2 +- .../translations/zh_CN/{vm => mm}/damon/design.rst | 2 +- .../translations/zh_CN/{vm => mm}/damon/faq.rst | 2 +- .../translations/zh_CN/{vm => mm}/damon/index.rst | 5 ++--- .../zh_CN/{vm => mm}/free_page_reporting.rst | 2 +- .../translations/zh_CN/{vm => mm}/frontswap.rst | 2 +- .../translations/zh_CN/{vm => mm}/highmem.rst | 2 +- Documentation/translations/zh_CN/{vm => mm}/hmm.rst | 2 +- .../zh_CN/{vm => mm}/hugetlbfs_reserv.rst | 2 +- .../translations/zh_CN/{vm => mm}/hwpoison.rst | 2 +- .../translations/zh_CN/{vm => mm}/index.rst | 2 +- Documentation/translations/zh_CN/{vm => mm}/ksm.rst | 2 +- .../translations/zh_CN/{vm => mm}/memory-model.rst | 4 ++-- .../translations/zh_CN/{vm => mm}/mmu_notifier.rst | 2 +- Documentation/translations/zh_CN/{vm => mm}/numa.rst | 2 +- .../zh_CN/{vm => mm}/overcommit-accounting.rst | 2 +- .../translations/zh_CN/{vm => mm}/page_frags.rst | 2 +- .../translations/zh_CN/{vm => 
mm}/page_owner.rst | 2 +- .../zh_CN/{vm => mm}/page_table_check.rst | 2 +- .../zh_CN/{vm => mm}/remap_file_pages.rst | 2 +- .../zh_CN/{vm => mm}/split_page_table_lock.rst | 2 +- .../translations/zh_CN/{vm => mm}/z3fold.rst | 2 +- .../translations/zh_CN/{vm => mm}/zsmalloc.rst | 2 +- Documentation/translations/zh_TW/index.rst | 2 +- Documentation/vm/.gitignore | 3 --- MAINTAINERS | 12 ++++++------ arch/loongarch/Kconfig | 2 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +- include/linux/hmm.h | 4 ++-- include/linux/memremap.h | 2 +- include/linux/mmu_notifier.h | 2 +- include/linux/sched/mm.h | 4 ++-- include/linux/swap.h | 2 +- mm/Kconfig | 2 +- mm/debug_vm_pgtable.c | 2 +- mm/frontswap.c | 2 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 6 +++--- mm/hugetlb_vmemmap.c | 2 +- mm/ksm.c | 4 ++-- mm/mmap.c | 2 +- mm/rmap.c | 8 ++++---- mm/sparse-vmemmap.c | 2 +- mm/util.c | 2 +- tools/vm/page_owner_sort.c | 2 +- 107 files changed, 89 insertions(+), 93 deletions(-) rename Documentation/{vm => mm}/active_mm.rst (100%) rename Documentation/{vm => mm}/arch_pgtable_helpers.rst (100%) rename Documentation/{vm => mm}/balance.rst (100%) rename Documentation/{vm => mm}/bootmem.rst (100%) rename Documentation/{vm => mm}/damon/api.rst (100%) rename Documentation/{vm => mm}/damon/design.rst (100%) rename Documentation/{vm => mm}/damon/faq.rst (100%) rename Documentation/{vm => mm}/damon/index.rst (100%) rename Documentation/{vm => mm}/free_page_reporting.rst (100%) rename Documentation/{vm => mm}/frontswap.rst (100%) rename Documentation/{vm => mm}/highmem.rst (100%) rename Documentation/{vm => mm}/hmm.rst (100%) rename Documentation/{vm => mm}/hugetlbfs_reserv.rst (100%) rename Documentation/{vm => mm}/hwpoison.rst (100%) rename Documentation/{vm => mm}/index.rst (100%) rename Documentation/{vm => mm}/ksm.rst (100%) rename Documentation/{vm => mm}/memory-model.rst (99%) rename Documentation/{vm => mm}/mmu_notifier.rst (100%) rename Documentation/{vm => mm}/numa.rst (100%) rename 
Documentation/{vm => mm}/oom.rst (100%) rename Documentation/{vm => mm}/overcommit-accounting.rst (100%) rename Documentation/{vm => mm}/page_allocation.rst (100%) rename Documentation/{vm => mm}/page_cache.rst (100%) rename Documentation/{vm => mm}/page_frags.rst (100%) rename Documentation/{vm => mm}/page_migration.rst (100%) rename Documentation/{vm => mm}/page_owner.rst (100%) rename Documentation/{vm => mm}/page_reclaim.rst (100%) rename Documentation/{vm => mm}/page_table_check.rst (100%) rename Documentation/{vm => mm}/page_tables.rst (100%) rename Documentation/{vm => mm}/physical_memory.rst (100%) rename Documentation/{vm => mm}/process_addrs.rst (100%) rename Documentation/{vm => mm}/remap_file_pages.rst (100%) rename Documentation/{vm => mm}/shmfs.rst (100%) rename Documentation/{vm => mm}/slab.rst (100%) rename Documentation/{vm => mm}/slub.rst (100%) rename Documentation/{vm => mm}/split_page_table_lock.rst (100%) rename Documentation/{vm => mm}/swap.rst (100%) rename Documentation/{vm => mm}/transhuge.rst (100%) rename Documentation/{vm => mm}/unevictable-lru.rst (100%) rename Documentation/{vm => mm}/vmalloc.rst (100%) rename Documentation/{vm => mm}/vmalloced-kernel-stacks.rst (100%) rename Documentation/{vm => mm}/vmemmap_dedup.rst (100%) rename Documentation/{vm => mm}/z3fold.rst (100%) rename Documentation/{vm => mm}/zsmalloc.rst (100%) rename Documentation/translations/zh_CN/{vm => mm}/active_mm.rst (98%) rename Documentation/translations/zh_CN/{vm => mm}/balance.rst (99%) rename Documentation/translations/zh_CN/{vm => mm}/damon/api.rst (91%) rename Documentation/translations/zh_CN/{vm => mm}/damon/design.rst (99%) rename Documentation/translations/zh_CN/{vm => mm}/damon/faq.rst (98%) rename Documentation/translations/zh_CN/{vm => mm}/damon/index.rst (90%) rename Documentation/translations/zh_CN/{vm => mm}/free_page_reporting.rst (97%) rename Documentation/translations/zh_CN/{vm => mm}/frontswap.rst (99%) rename 
Documentation/translations/zh_CN/{vm => mm}/highmem.rst (99%) rename Documentation/translations/zh_CN/{vm => mm}/hmm.rst (99%) rename Documentation/translations/zh_CN/{vm => mm}/hugetlbfs_reserv.rst (99%) rename Documentation/translations/zh_CN/{vm => mm}/hwpoison.rst (99%) rename Documentation/translations/zh_CN/{vm => mm}/index.rst (96%) rename Documentation/translations/zh_CN/{vm => mm}/ksm.rst (98%) rename Documentation/translations/zh_CN/{vm => mm}/memory-model.rst (98%) rename Documentation/translations/zh_CN/{vm => mm}/mmu_notifier.rst (98%) rename Documentation/translations/zh_CN/{vm => mm}/numa.rst (99%) rename Documentation/translations/zh_CN/{vm => mm}/overcommit-accounting.rst (98%) rename Documentation/translations/zh_CN/{vm => mm}/page_frags.rst (97%) rename Documentation/translations/zh_CN/{vm => mm}/page_owner.rst (99%) rename Documentation/translations/zh_CN/{vm => mm}/page_table_check.rst (97%) rename Documentation/translations/zh_CN/{vm => mm}/remap_file_pages.rst (97%) rename Documentation/translations/zh_CN/{vm => mm}/split_page_table_lock.rst (98%) rename Documentation/translations/zh_CN/{vm => mm}/z3fold.rst (96%) rename Documentation/translations/zh_CN/{vm => mm}/zsmalloc.rst (98%) delete mode 100644 Documentation/vm/.gitignore diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-ksm b/Documentation/ABI/testing/sysfs-kernel-mm-ksm index 1c9bed5595f5..d244674a9480 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm +++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm @@ -41,7 +41,7 @@ Description: Kernel Samepage Merging daemon sysfs interface sleep_millisecs: how many milliseconds ksm should sleep between scans. - See Documentation/vm/ksm.rst for more information. + See Documentation/mm/ksm.rst for more information. 
What: /sys/kernel/mm/ksm/merge_across_nodes Date: January 2013 diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab index c440f4946e12..cd5fb8fa3ddf 100644 --- a/Documentation/ABI/testing/sysfs-kernel-slab +++ b/Documentation/ABI/testing/sysfs-kernel-slab @@ -37,7 +37,7 @@ Description: The alloc_calls file is read-only and lists the kernel code locations from which allocations for this cache were performed. The alloc_calls file only contains information if debugging is - enabled for that cache (see Documentation/vm/slub.rst). + enabled for that cache (see Documentation/mm/slub.rst). What: /sys/kernel/slab//alloc_fastpath Date: February 2008 @@ -219,7 +219,7 @@ Contact: Pekka Enberg , Description: The free_calls file is read-only and lists the locations of object frees if slab debugging is enabled (see - Documentation/vm/slub.rst). + Documentation/mm/slub.rst). What: /sys/kernel/slab//free_fastpath Date: February 2008 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2522b11e593f..8c0ea6b6c6a9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5442,7 +5442,7 @@ cache (risks via metadata attacks are mostly unchanged). Debug options disable merging on their own. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slab_max_order= [MM, SLAB] Determines the maximum allowed order for slabs. @@ -5456,13 +5456,13 @@ slub_debug can create guard zones around objects and may poison objects when not in use. Also tracks the last alloc / free. For more information see - Documentation/vm/slub.rst. + Documentation/mm/slub.rst. slub_max_order= [MM, SLUB] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory fragmentation. For more information see - Documentation/vm/slub.rst. + Documentation/mm/slub.rst. 
slub_min_objects= [MM, SLUB] The minimum number of objects per slab. SLUB will @@ -5471,12 +5471,12 @@ the number of objects indicated. The higher the number of objects the smaller the overhead of tracking slabs and the less frequently locks need to be acquired. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slub_min_order= [MM, SLUB] Determines the minimum page order for slabs. Must be lower than slub_max_order. - For more information see Documentation/vm/slub.rst. + For more information see Documentation/mm/slub.rst. slub_merge [MM, SLUB] Same with slab_merge. diff --git a/Documentation/admin-guide/mm/concepts.rst b/Documentation/admin-guide/mm/concepts.rst index b966fcff993b..c79f1e336222 100644 --- a/Documentation/admin-guide/mm/concepts.rst +++ b/Documentation/admin-guide/mm/concepts.rst @@ -125,7 +125,7 @@ processor. Each bank is referred to as a `node` and for each node Linux constructs an independent memory management subsystem. A node has its own set of zones, lists of free and used pages and various statistics counters. You can find more details about NUMA in -:ref:`Documentation/vm/numa.rst ` and in +:ref:`Documentation/mm/numa.rst ` and in :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst `. Page cache diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 61aff88347f3..c4681fa69b9c 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -4,7 +4,7 @@ Monitoring Data Accesses ======================== -:doc:`DAMON ` allows light-weight data access monitoring. +:doc:`DAMON ` allows light-weight data access monitoring. Using DAMON, users can analyze the memory access patterns of their systems and optimize those. 
diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index 46306f1f34b1..a8bd3bd29959 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -268,4 +268,4 @@ granularity reclamation. :: .. [1] https://research.google/pubs/pub48551/ .. [2] https://lwn.net/Articles/787611/ -.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html +.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1bb7b72414b2..5540a3a40fc9 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -30,11 +30,11 @@ DAMON provides below interfaces for different users. `. This will be removed after next LTS kernel is released, so users should move to the :ref:`sysfs interface `. - *Kernel Space Programming Interface.* - :doc:`This ` is for kernel space programmers. Using this, + :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by writing kernel space DAMON application programs for you. You can even extend DAMON for various address spaces. For detail, please refer to the interface - :doc:`document `. + :doc:`document `. .. _sysfs_interface: @@ -185,7 +185,7 @@ controls the monitoring overhead, exist. You can set and get the values by writing to and rading from the files. For more details about the intervals and monitoring regions range, please refer -to the Design document (:doc:`/vm/damon/design`). +to the Design document (:doc:`/mm/damon/design`). 
contexts//targets/ --------------------- @@ -402,7 +402,7 @@ Attributes Users can get and set the ``sampling interval``, ``aggregation interval``, ``update interval``, and min/max number of monitoring target regions by reading from and writing to the ``attrs`` file. To know about the monitoring -attributes in detail, please refer to the :doc:`/vm/damon/design`. For +attributes in detail, please refer to the :doc:`/mm/damon/design`. For example, below commands set those values to 5 ms, 100 ms, 1,000 ms, 10 and 1000, and then check it again:: diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5c9aa171a0d3..4a440a7cfeb0 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -760,7 +760,7 @@ and don't use much of it. The default value is 0. -See Documentation/vm/overcommit-accounting.rst and +See Documentation/mm/overcommit-accounting.rst and mm/util.c::__vm_enough_memory() for more information. diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index dedd4d853329..5b1188494bcd 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -87,7 +87,7 @@ Memory management ================= How to allocate and use memory in the kernel. Note that there is a lot -more memory-management documentation in Documentation/vm/index.rst. +more memory-management documentation in Documentation/mm/index.rst. .. toctree:: :maxdepth: 1 diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 1bc91fb8c321..8543a59f288f 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1109,7 +1109,7 @@ CommitLimit yield a CommitLimit of 7.3G. For more details, see the memory overcommit documentation - in vm/overcommit-accounting. + in mm/overcommit-accounting. Committed_AS The amount of memory presently allocated on the system. 
The committed memory is a sum of all of the memory which diff --git a/Documentation/index.rst b/Documentation/index.rst index 67036a05b771..4737c18c97ff 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -128,7 +128,7 @@ needed). sound/index crypto/index filesystems/index - vm/index + mm/index bpf/index usb/index PCI/index diff --git a/Documentation/vm/active_mm.rst b/Documentation/mm/active_mm.rst similarity index 100% rename from Documentation/vm/active_mm.rst rename to Documentation/mm/active_mm.rst diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/mm/arch_pgtable_helpers.rst similarity index 100% rename from Documentation/vm/arch_pgtable_helpers.rst rename to Documentation/mm/arch_pgtable_helpers.rst diff --git a/Documentation/vm/balance.rst b/Documentation/mm/balance.rst similarity index 100% rename from Documentation/vm/balance.rst rename to Documentation/mm/balance.rst diff --git a/Documentation/vm/bootmem.rst b/Documentation/mm/bootmem.rst similarity index 100% rename from Documentation/vm/bootmem.rst rename to Documentation/mm/bootmem.rst diff --git a/Documentation/vm/damon/api.rst b/Documentation/mm/damon/api.rst similarity index 100% rename from Documentation/vm/damon/api.rst rename to Documentation/mm/damon/api.rst diff --git a/Documentation/vm/damon/design.rst b/Documentation/mm/damon/design.rst similarity index 100% rename from Documentation/vm/damon/design.rst rename to Documentation/mm/damon/design.rst diff --git a/Documentation/vm/damon/faq.rst b/Documentation/mm/damon/faq.rst similarity index 100% rename from Documentation/vm/damon/faq.rst rename to Documentation/mm/damon/faq.rst diff --git a/Documentation/vm/damon/index.rst b/Documentation/mm/damon/index.rst similarity index 100% rename from Documentation/vm/damon/index.rst rename to Documentation/mm/damon/index.rst diff --git a/Documentation/vm/free_page_reporting.rst b/Documentation/mm/free_page_reporting.rst similarity index 100% rename from 
Documentation/vm/free_page_reporting.rst rename to Documentation/mm/free_page_reporting.rst diff --git a/Documentation/vm/frontswap.rst b/Documentation/mm/frontswap.rst similarity index 100% rename from Documentation/vm/frontswap.rst rename to Documentation/mm/frontswap.rst diff --git a/Documentation/vm/highmem.rst b/Documentation/mm/highmem.rst similarity index 100% rename from Documentation/vm/highmem.rst rename to Documentation/mm/highmem.rst diff --git a/Documentation/vm/hmm.rst b/Documentation/mm/hmm.rst similarity index 100% rename from Documentation/vm/hmm.rst rename to Documentation/mm/hmm.rst diff --git a/Documentation/vm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst similarity index 100% rename from Documentation/vm/hugetlbfs_reserv.rst rename to Documentation/mm/hugetlbfs_reserv.rst diff --git a/Documentation/vm/hwpoison.rst b/Documentation/mm/hwpoison.rst similarity index 100% rename from Documentation/vm/hwpoison.rst rename to Documentation/mm/hwpoison.rst diff --git a/Documentation/vm/index.rst b/Documentation/mm/index.rst similarity index 100% rename from Documentation/vm/index.rst rename to Documentation/mm/index.rst diff --git a/Documentation/vm/ksm.rst b/Documentation/mm/ksm.rst similarity index 100% rename from Documentation/vm/ksm.rst rename to Documentation/mm/ksm.rst diff --git a/Documentation/vm/memory-model.rst b/Documentation/mm/memory-model.rst similarity index 99% rename from Documentation/vm/memory-model.rst rename to Documentation/mm/memory-model.rst index 30e8fbed6914..3779e562dc76 100644 --- a/Documentation/vm/memory-model.rst +++ b/Documentation/mm/memory-model.rst @@ -170,7 +170,7 @@ The users of `ZONE_DEVICE` are: * hmm: Extend `ZONE_DEVICE` with `->page_fault()` and `->page_free()` event callbacks to allow a device-driver to coordinate memory management events related to device-memory, typically GPU memory. See - Documentation/vm/hmm.rst. + Documentation/mm/hmm.rst. 
* p2pdma: Create `struct page` objects to allow peer devices in a PCI/-E topology to coordinate direct-DMA operations between themselves, diff --git a/Documentation/vm/mmu_notifier.rst b/Documentation/mm/mmu_notifier.rst similarity index 100% rename from Documentation/vm/mmu_notifier.rst rename to Documentation/mm/mmu_notifier.rst diff --git a/Documentation/vm/numa.rst b/Documentation/mm/numa.rst similarity index 100% rename from Documentation/vm/numa.rst rename to Documentation/mm/numa.rst diff --git a/Documentation/vm/oom.rst b/Documentation/mm/oom.rst similarity index 100% rename from Documentation/vm/oom.rst rename to Documentation/mm/oom.rst diff --git a/Documentation/vm/overcommit-accounting.rst b/Documentation/mm/overcommit-accounting.rst similarity index 100% rename from Documentation/vm/overcommit-accounting.rst rename to Documentation/mm/overcommit-accounting.rst diff --git a/Documentation/vm/page_allocation.rst b/Documentation/mm/page_allocation.rst similarity index 100% rename from Documentation/vm/page_allocation.rst rename to Documentation/mm/page_allocation.rst diff --git a/Documentation/vm/page_cache.rst b/Documentation/mm/page_cache.rst similarity index 100% rename from Documentation/vm/page_cache.rst rename to Documentation/mm/page_cache.rst diff --git a/Documentation/vm/page_frags.rst b/Documentation/mm/page_frags.rst similarity index 100% rename from Documentation/vm/page_frags.rst rename to Documentation/mm/page_frags.rst diff --git a/Documentation/vm/page_migration.rst b/Documentation/mm/page_migration.rst similarity index 100% rename from Documentation/vm/page_migration.rst rename to Documentation/mm/page_migration.rst diff --git a/Documentation/vm/page_owner.rst b/Documentation/mm/page_owner.rst similarity index 100% rename from Documentation/vm/page_owner.rst rename to Documentation/mm/page_owner.rst diff --git a/Documentation/vm/page_reclaim.rst b/Documentation/mm/page_reclaim.rst similarity index 100% rename from 
Documentation/vm/page_reclaim.rst rename to Documentation/mm/page_reclaim.rst diff --git a/Documentation/vm/page_table_check.rst b/Documentation/mm/page_table_check.rst similarity index 100% rename from Documentation/vm/page_table_check.rst rename to Documentation/mm/page_table_check.rst diff --git a/Documentation/vm/page_tables.rst b/Documentation/mm/page_tables.rst similarity index 100% rename from Documentation/vm/page_tables.rst rename to Documentation/mm/page_tables.rst diff --git a/Documentation/vm/physical_memory.rst b/Documentation/mm/physical_memory.rst similarity index 100% rename from Documentation/vm/physical_memory.rst rename to Documentation/mm/physical_memory.rst diff --git a/Documentation/vm/process_addrs.rst b/Documentation/mm/process_addrs.rst similarity index 100% rename from Documentation/vm/process_addrs.rst rename to Documentation/mm/process_addrs.rst diff --git a/Documentation/vm/remap_file_pages.rst b/Documentation/mm/remap_file_pages.rst similarity index 100% rename from Documentation/vm/remap_file_pages.rst rename to Documentation/mm/remap_file_pages.rst diff --git a/Documentation/vm/shmfs.rst b/Documentation/mm/shmfs.rst similarity index 100% rename from Documentation/vm/shmfs.rst rename to Documentation/mm/shmfs.rst diff --git a/Documentation/vm/slab.rst b/Documentation/mm/slab.rst similarity index 100% rename from Documentation/vm/slab.rst rename to Documentation/mm/slab.rst diff --git a/Documentation/vm/slub.rst b/Documentation/mm/slub.rst similarity index 100% rename from Documentation/vm/slub.rst rename to Documentation/mm/slub.rst diff --git a/Documentation/vm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst similarity index 100% rename from Documentation/vm/split_page_table_lock.rst rename to Documentation/mm/split_page_table_lock.rst diff --git a/Documentation/vm/swap.rst b/Documentation/mm/swap.rst similarity index 100% rename from Documentation/vm/swap.rst rename to Documentation/mm/swap.rst diff --git 
a/Documentation/vm/transhuge.rst b/Documentation/mm/transhuge.rst similarity index 100% rename from Documentation/vm/transhuge.rst rename to Documentation/mm/transhuge.rst diff --git a/Documentation/vm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst similarity index 100% rename from Documentation/vm/unevictable-lru.rst rename to Documentation/mm/unevictable-lru.rst diff --git a/Documentation/vm/vmalloc.rst b/Documentation/mm/vmalloc.rst similarity index 100% rename from Documentation/vm/vmalloc.rst rename to Documentation/mm/vmalloc.rst diff --git a/Documentation/vm/vmalloced-kernel-stacks.rst b/Documentation/mm/vmalloced-kernel-stacks.rst similarity index 100% rename from Documentation/vm/vmalloced-kernel-stacks.rst rename to Documentation/mm/vmalloced-kernel-stacks.rst diff --git a/Documentation/vm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst similarity index 100% rename from Documentation/vm/vmemmap_dedup.rst rename to Documentation/mm/vmemmap_dedup.rst diff --git a/Documentation/vm/z3fold.rst b/Documentation/mm/z3fold.rst similarity index 100% rename from Documentation/vm/z3fold.rst rename to Documentation/mm/z3fold.rst diff --git a/Documentation/vm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst similarity index 100% rename from Documentation/vm/zsmalloc.rst rename to Documentation/mm/zsmalloc.rst diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst index 0c8276109fc0..30c69e1f44fe 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/index.rst @@ -13,7 +13,7 @@ 监测数据访问 ============ -:doc:`DAMON </vm/damon/index>` 允许轻量级的数据访问监测。使用DAMON, +:doc:`DAMON </mm/damon/index>` 允许轻量级的数据访问监测。使用DAMON, 用户可以分析他们系统的内存访问模式,并优化它们。 .. 
toctree:: diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst index 1500bdbf338a..c976f3e33ffd 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/reclaim.rst @@ -229,4 +229,4 @@ DAMON_RECLAIM再次什么都不做,这样我们就可以退回到基于LRU列 .. [1] https://research.google/pubs/pub48551/ .. [2] https://lwn.net/Articles/787611/ -.. [3] https://www.kernel.org/doc/html/latest/vm/free_page_reporting.html +.. [3] https://www.kernel.org/doc/html/latest/mm/free_page_reporting.html diff --git a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst index eee0e8c5c368..cd41ada4fdad 100644 --- a/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst +++ b/Documentation/translations/zh_CN/admin-guide/mm/damon/usage.rst @@ -33,9 +33,9 @@ DAMON 为不同的用户提供了下面这些接口。 口相同。这将在下一个LTS内核发布后被移除,所以用户应该转移到 :ref:`sysfs interface `。 - *内核空间编程接口。* - :doc:`这 </vm/damon/api>` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 + :doc:`这 </mm/damon/api>` 这是为内核空间程序员准备的。使用它,用户可以通过为你编写内 核空间的DAMON应用程序,最灵活有效地利用DAMON的每一个功能。你甚至可以为各种地址空间扩展DAMON。 - 详细情况请参考接口 :doc:`文件 </vm/damon/api>`。 + 详细情况请参考接口 :doc:`文件 </mm/damon/api>`。 sysfs接口 ========= @@ -148,7 +148,7 @@ contexts//monitoring_attrs/ 在 ``nr_regions`` 目录下,有两个文件分别用于DAMON监测区域的下限和上限(``min`` 和 ``max`` ), 这两个文件控制着监测的开销。你可以通过向这些文件的写入和读出来设置和获取这些值。 -关于间隔和监测区域范围的更多细节,请参考设计文件 (:doc:`/vm/damon/design`)。 +关于间隔和监测区域范围的更多细节,请参考设计文件 (:doc:`/mm/damon/design`)。 contexts//targets/ --------------------- @@ -318,7 +318,7 @@ DAMON导出了八个文件, ``attrs``, ``target_ids``, ``init_regions``, ---- 用户可以通过读取和写入 ``attrs`` 文件获得和设置 ``采样间隔`` 、 ``聚集间隔`` 、 ``更新间隔`` -以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/vm/damon/design` 。例如, +以及监测目标区域的最小/最大数量。要详细了解监测属性,请参考 `:doc:/mm/damon/design` 。例如, 下面的命令将这些值设置为5ms、100ms、1000ms、10和1000,然后再次检查:: # cd /damon diff --git a/Documentation/translations/zh_CN/core-api/index.rst 
b/Documentation/translations/zh_CN/core-api/index.rst index 26d9913fc8b6..b03020c8b2ab 100644 --- a/Documentation/translations/zh_CN/core-api/index.rst +++ b/Documentation/translations/zh_CN/core-api/index.rst @@ -101,7 +101,7 @@ Todolist: ======== 如何在内核中分配和使用内存。请注意,在 -:doc:`/vm/index` 中有更多的内存管理文档。 +:doc:`/mm/index` 中有更多的内存管理文档。 .. toctree:: :maxdepth: 1 diff --git a/Documentation/translations/zh_CN/index.rst b/Documentation/translations/zh_CN/index.rst index ad7bb8c17562..bf85baca8b3e 100644 --- a/Documentation/translations/zh_CN/index.rst +++ b/Documentation/translations/zh_CN/index.rst @@ -118,7 +118,7 @@ TODOList: sound/index filesystems/index scheduler/index - vm/index + mm/index peci/index TODOList: diff --git a/Documentation/translations/zh_CN/vm/active_mm.rst b/Documentation/translations/zh_CN/mm/active_mm.rst similarity index 98% rename from Documentation/translations/zh_CN/vm/active_mm.rst rename to Documentation/translations/zh_CN/mm/active_mm.rst index 366609ea4f37..c2816f523bd7 100644 --- a/Documentation/translations/zh_CN/vm/active_mm.rst +++ b/Documentation/translations/zh_CN/mm/active_mm.rst @@ -1,6 +1,6 @@ .. include:: ../disclaimer-zh_CN.rst -:Original: Documentation/vm/active_mm.rst +:Original: Documentation/mm/active_mm.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/balance.rst b/Documentation/translations/zh_CN/mm/balance.rst similarity index 99% rename from Documentation/translations/zh_CN/vm/balance.rst rename to Documentation/translations/zh_CN/mm/balance.rst index e98a47ef24a8..6fd79209c307 100644 --- a/Documentation/translations/zh_CN/vm/balance.rst +++ b/Documentation/translations/zh_CN/mm/balance.rst @@ -1,6 +1,6 @@ .. 
include:: ../disclaimer-zh_CN.rst -:Original: Documentation/vm/balance.rst +:Original: Documentation/mm/balance.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/damon/api.rst b/Documentation/translations/zh_CN/mm/damon/api.rst similarity index 91% rename from Documentation/translations/zh_CN/vm/damon/api.rst rename to Documentation/translations/zh_CN/mm/damon/api.rst index 21143eea4ebe..5593a83c86bc 100644 --- a/Documentation/translations/zh_CN/vm/damon/api.rst +++ b/Documentation/translations/zh_CN/mm/damon/api.rst @@ -1,6 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0 -:Original: Documentation/vm/damon/api.rst +:Original: Documentation/mm/damon/api.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/damon/design.rst b/Documentation/translations/zh_CN/mm/damon/design.rst similarity index 99% rename from Documentation/translations/zh_CN/vm/damon/design.rst rename to Documentation/translations/zh_CN/mm/damon/design.rst index 46128b77c2b3..16e3db34a7dd 100644 --- a/Documentation/translations/zh_CN/vm/damon/design.rst +++ b/Documentation/translations/zh_CN/mm/damon/design.rst @@ -1,6 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0 -:Original: Documentation/vm/damon/design.rst +:Original: Documentation/mm/damon/design.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/damon/faq.rst b/Documentation/translations/zh_CN/mm/damon/faq.rst similarity index 98% rename from Documentation/translations/zh_CN/vm/damon/faq.rst rename to Documentation/translations/zh_CN/mm/damon/faq.rst index 07b4ac19407d..de4be417494a 100644 --- a/Documentation/translations/zh_CN/vm/damon/faq.rst +++ b/Documentation/translations/zh_CN/mm/damon/faq.rst @@ -1,6 +1,6 @@ .. 
SPDX-License-Identifier: GPL-2.0 -:Original: Documentation/vm/damon/faq.rst +:Original: Documentation/mm/damon/faq.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/damon/index.rst b/Documentation/translations/zh_CN/mm/damon/index.rst similarity index 90% rename from Documentation/translations/zh_CN/vm/damon/index.rst rename to Documentation/translations/zh_CN/mm/damon/index.rst index 84d36d90c9b0..b03bf307204f 100644 --- a/Documentation/translations/zh_CN/vm/damon/index.rst +++ b/Documentation/translations/zh_CN/mm/damon/index.rst @@ -1,6 +1,6 @@ .. SPDX-License-Identifier: GPL-2.0 -:Original: Documentation/vm/damon/index.rst +:Original: Documentation/mm/damon/index.rst :翻译: @@ -14,7 +14,7 @@ DAMON:数据访问监视器 ========================== DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心机制使其成为 -(该核心机制详见(Documentation/translations/zh_CN/vm/damon/design.rst)) +(该核心机制详见(Documentation/translations/zh_CN/mm/damon/design.rst)) - *准确度* (监测输出对DRAM级别的内存管理足够有用;但可能不适合CPU Cache级别), - *轻量级* (监控开销低到可以在线应用),以及 @@ -30,4 +30,3 @@ DAMON是Linux内核的一个数据访问监控框架子系统。DAMON的核心 faq design api - diff --git a/Documentation/translations/zh_CN/vm/free_page_reporting.rst b/Documentation/translations/zh_CN/mm/free_page_reporting.rst similarity index 97% rename from Documentation/translations/zh_CN/vm/free_page_reporting.rst rename to Documentation/translations/zh_CN/mm/free_page_reporting.rst index 31d6c34b956b..83b14cce9adf 100644 --- a/Documentation/translations/zh_CN/vm/free_page_reporting.rst +++ b/Documentation/translations/zh_CN/mm/free_page_reporting.rst @@ -1,6 +1,6 @@ .. 
include:: ../disclaimer-zh_CN.rst -:Original: Documentation/vm/_free_page_reporting.rst +:Original: Documentation/mm/_free_page_reporting.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/frontswap.rst b/Documentation/translations/zh_CN/mm/frontswap.rst similarity index 99% rename from Documentation/translations/zh_CN/vm/frontswap.rst rename to Documentation/translations/zh_CN/mm/frontswap.rst index 3eb07870e2ef..5c18ea2be04f 100644 --- a/Documentation/translations/zh_CN/vm/frontswap.rst +++ b/Documentation/translations/zh_CN/mm/frontswap.rst @@ -1,4 +1,4 @@ -:Original: Documentation/vm/_free_page_reporting.rst +:Original: Documentation/mm/_free_page_reporting.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/highmem.rst b/Documentation/translations/zh_CN/mm/highmem.rst similarity index 99% rename from Documentation/translations/zh_CN/vm/highmem.rst rename to Documentation/translations/zh_CN/mm/highmem.rst index 018838e58c3e..81202c65e000 100644 --- a/Documentation/translations/zh_CN/vm/highmem.rst +++ b/Documentation/translations/zh_CN/mm/highmem.rst @@ -1,6 +1,6 @@ .. include:: ../disclaimer-zh_CN.rst -:Original: Documentation/vm/highmem.rst +:Original: Documentation/mm/highmem.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/hmm.rst b/Documentation/translations/zh_CN/mm/hmm.rst similarity index 99% rename from Documentation/translations/zh_CN/vm/hmm.rst rename to Documentation/translations/zh_CN/mm/hmm.rst index 2379df95aa58..5024a8a15516 100644 --- a/Documentation/translations/zh_CN/vm/hmm.rst +++ b/Documentation/translations/zh_CN/mm/hmm.rst @@ -1,6 +1,6 @@ .. 
include:: ../disclaimer-zh_CN.rst -:Original: Documentation/vm/hmm.rst +:Original: Documentation/mm/hmm.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst similarity index 99% rename from Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst rename to Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst index c6d471ce2131..752e5696cd47 100644 --- a/Documentation/translations/zh_CN/vm/hugetlbfs_reserv.rst +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -1,6 +1,6 @@ .. include:: ../disclaimer-zh_CN.rst -:Original: Documentation/vm/hugetlbfs_reserv.rst +:Original: Documentation/mm/hugetlbfs_reserv.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/hwpoison.rst b/Documentation/translations/zh_CN/mm/hwpoison.rst similarity index 99% rename from Documentation/translations/zh_CN/vm/hwpoison.rst rename to Documentation/translations/zh_CN/mm/hwpoison.rst index c6e1e7bdb05b..310862edc937 100644 --- a/Documentation/translations/zh_CN/vm/hwpoison.rst +++ b/Documentation/translations/zh_CN/mm/hwpoison.rst @@ -1,5 +1,5 @@ -:Original: Documentation/vm/hwpoison.rst +:Original: Documentation/mm/hwpoison.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/index.rst b/Documentation/translations/zh_CN/mm/index.rst similarity index 96% rename from Documentation/translations/zh_CN/vm/index.rst rename to Documentation/translations/zh_CN/mm/index.rst index a1c6d529b6ff..4c8c6b7b72a3 100644 --- a/Documentation/translations/zh_CN/vm/index.rst +++ b/Documentation/translations/zh_CN/mm/index.rst @@ -1,6 +1,6 @@ .. 
include:: ../disclaimer-zh_CN.rst -:Original: Documentation/vm/index.rst +:Original: Documentation/mm/index.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/ksm.rst b/Documentation/translations/zh_CN/mm/ksm.rst similarity index 98% rename from Documentation/translations/zh_CN/vm/ksm.rst rename to Documentation/translations/zh_CN/mm/ksm.rst index 83b0c73984da..d1f82e857ad7 100644 --- a/Documentation/translations/zh_CN/vm/ksm.rst +++ b/Documentation/translations/zh_CN/mm/ksm.rst @@ -1,6 +1,6 @@ .. include:: ../disclaimer-zh_CN.rst -:Original: Documentation/vm/ksm.rst +:Original: Documentation/mm/ksm.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/memory-model.rst b/Documentation/translations/zh_CN/mm/memory-model.rst similarity index 98% rename from Documentation/translations/zh_CN/vm/memory-model.rst rename to Documentation/translations/zh_CN/mm/memory-model.rst index 013e30c88d72..77ec149a970c 100644 --- a/Documentation/translations/zh_CN/vm/memory-model.rst +++ b/Documentation/translations/zh_CN/mm/memory-model.rst @@ -1,6 +1,6 @@ .. 
SPDX-License-Identifier: GPL-2.0 -:Original: Documentation/vm/memory-model.rst +:Original: Documentation/mm/memory-model.rst :翻译: @@ -129,7 +129,7 @@ ZONE_DEVICE * pmem: 通过DAX映射将平台持久性内存作为直接I/O目标使用。 * hmm: 用 `->page_fault()` 和 `->page_free()` 事件回调扩展 `ZONE_DEVICE` , - 以允许设备驱动程序协调与设备内存相关的内存管理事件,通常是GPU内存。参见/vm/hmm.rst。 + 以允许设备驱动程序协调与设备内存相关的内存管理事件,通常是GPU内存。参见Documentation/mm/hmm.rst。 * p2pdma: 创建 `struct page` 对象,允许PCI/E拓扑结构中的peer设备协调它们之间的 直接DMA操作,即绕过主机内存。 diff --git a/Documentation/translations/zh_CN/vm/mmu_notifier.rst b/Documentation/translations/zh_CN/mm/mmu_notifier.rst similarity index 98% rename from Documentation/translations/zh_CN/vm/mmu_notifier.rst rename to Documentation/translations/zh_CN/mm/mmu_notifier.rst index b29a37b33628..ce3664d1a410 100644 --- a/Documentation/translations/zh_CN/vm/mmu_notifier.rst +++ b/Documentation/translations/zh_CN/mm/mmu_notifier.rst @@ -1,4 +1,4 @@ -:Original: Documentation/vm/mmu_notifier.rst +:Original: Documentation/mm/mmu_notifier.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/numa.rst b/Documentation/translations/zh_CN/mm/numa.rst similarity index 99% rename from Documentation/translations/zh_CN/vm/numa.rst rename to Documentation/translations/zh_CN/mm/numa.rst index 6af412b924ad..b15cfeeb6dfb 100644 --- a/Documentation/translations/zh_CN/vm/numa.rst +++ b/Documentation/translations/zh_CN/mm/numa.rst @@ -1,4 +1,4 @@ -:Original: Documentation/vm/numa.rst +:Original: Documentation/mm/numa.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst similarity index 98% rename from Documentation/translations/zh_CN/vm/overcommit-accounting.rst rename to Documentation/translations/zh_CN/mm/overcommit-accounting.rst index 8765cb118f24..d8452d8b7fbb 100644 --- a/Documentation/translations/zh_CN/vm/overcommit-accounting.rst +++ b/Documentation/translations/zh_CN/mm/overcommit-accounting.rst @@ -1,4 +1,4 @@ -:Original: 
Documentation/vm/overcommit-accounting.rst +:Original: Documentation/mm/overcommit-accounting.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/page_frags.rst b/Documentation/translations/zh_CN/mm/page_frags.rst similarity index 97% rename from Documentation/translations/zh_CN/vm/page_frags.rst rename to Documentation/translations/zh_CN/mm/page_frags.rst index ad27fed33634..320952ca93af 100644 --- a/Documentation/translations/zh_CN/vm/page_frags.rst +++ b/Documentation/translations/zh_CN/mm/page_frags.rst @@ -1,4 +1,4 @@ -:Original: Documentation/vm/page_frag.rst +:Original: Documentation/mm/page_frag.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/page_owner.rst b/Documentation/translations/zh_CN/mm/page_owner.rst similarity index 99% rename from Documentation/translations/zh_CN/vm/page_owner.rst rename to Documentation/translations/zh_CN/mm/page_owner.rst index 9e951fabba9d..03d9e613094a 100644 --- a/Documentation/translations/zh_CN/vm/page_owner.rst +++ b/Documentation/translations/zh_CN/mm/page_owner.rst @@ -1,4 +1,4 @@ -:Original: Documentation/vm/page_owner.rst +:Original: Documentation/mm/page_owner.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/page_table_check.rst b/Documentation/translations/zh_CN/mm/page_table_check.rst similarity index 97% rename from Documentation/translations/zh_CN/vm/page_table_check.rst rename to Documentation/translations/zh_CN/mm/page_table_check.rst index a29fc1b360e6..e8077310a76c 100644 --- a/Documentation/translations/zh_CN/vm/page_table_check.rst +++ b/Documentation/translations/zh_CN/mm/page_table_check.rst @@ -1,6 +1,6 @@ .. 
SPDX-License-Identifier: GPL-2.0 -:Original: Documentation/vm/page_table_check.rst +:Original: Documentation/mm/page_table_check.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/remap_file_pages.rst b/Documentation/translations/zh_CN/mm/remap_file_pages.rst similarity index 97% rename from Documentation/translations/zh_CN/vm/remap_file_pages.rst rename to Documentation/translations/zh_CN/mm/remap_file_pages.rst index af6b7e28af23..31e0c54dc36f 100644 --- a/Documentation/translations/zh_CN/vm/remap_file_pages.rst +++ b/Documentation/translations/zh_CN/mm/remap_file_pages.rst @@ -1,4 +1,4 @@ -:Original: Documentation/vm/remap_file_pages.rst +:Original: Documentation/mm/remap_file_pages.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst similarity index 98% rename from Documentation/translations/zh_CN/vm/split_page_table_lock.rst rename to Documentation/translations/zh_CN/mm/split_page_table_lock.rst index 50694d97c426..4fb7aa666037 100644 --- a/Documentation/translations/zh_CN/vm/split_page_table_lock.rst +++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst @@ -1,4 +1,4 @@ -:Original: Documentation/vm/split_page_table_lock.rst +:Original: Documentation/mm/split_page_table_lock.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/z3fold.rst b/Documentation/translations/zh_CN/mm/z3fold.rst similarity index 96% rename from Documentation/translations/zh_CN/vm/z3fold.rst rename to Documentation/translations/zh_CN/mm/z3fold.rst index 57204aa08caa..9569a6d88270 100644 --- a/Documentation/translations/zh_CN/vm/z3fold.rst +++ b/Documentation/translations/zh_CN/mm/z3fold.rst @@ -1,4 +1,4 @@ -:Original: Documentation/vm/z3fold.rst +:Original: Documentation/mm/z3fold.rst :翻译: diff --git a/Documentation/translations/zh_CN/vm/zsmalloc.rst b/Documentation/translations/zh_CN/mm/zsmalloc.rst similarity index 98% rename from 
Documentation/translations/zh_CN/vm/zsmalloc.rst rename to Documentation/translations/zh_CN/mm/zsmalloc.rst index 29e9c70a8eb6..b5596ea08ae4 100644 --- a/Documentation/translations/zh_CN/vm/zsmalloc.rst +++ b/Documentation/translations/zh_CN/mm/zsmalloc.rst @@ -1,4 +1,4 @@ -:Original: Documentation/vm/zs_malloc.rst +:Original: Documentation/mm/zs_malloc.rst :翻译: diff --git a/Documentation/translations/zh_TW/index.rst b/Documentation/translations/zh_TW/index.rst index e1ce9d8c06f8..e97d7d578751 100644 --- a/Documentation/translations/zh_TW/index.rst +++ b/Documentation/translations/zh_TW/index.rst @@ -128,7 +128,7 @@ TODOList: * security/index * sound/index * crypto/index -* vm/index +* mm/index * bpf/index * usb/index * PCI/index diff --git a/Documentation/vm/.gitignore b/Documentation/vm/.gitignore deleted file mode 100644 index bc74f5643008..000000000000 --- a/Documentation/vm/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -page-types -slabinfo diff --git a/MAINTAINERS b/MAINTAINERS index fe5daf141501..55fb1daa9057 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5526,7 +5526,7 @@ L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-damon F: Documentation/admin-guide/mm/damon/ -F: Documentation/vm/damon/ +F: Documentation/mm/damon/ F: include/linux/damon.h F: include/trace/events/damon.h F: mm/damon/ @@ -9037,7 +9037,7 @@ HMM - Heterogeneous Memory Management M: Jérôme Glisse L: linux-mm@kvack.org S: Maintained -F: Documentation/vm/hmm.rst +F: Documentation/mm/hmm.rst F: include/linux/hmm* F: lib/test_hmm* F: mm/hmm* @@ -9135,8 +9135,8 @@ L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages F: Documentation/admin-guide/mm/hugetlbpage.rst -F: Documentation/vm/hugetlbfs_reserv.rst -F: Documentation/vm/vmemmap_dedup.rst +F: Documentation/mm/hugetlbfs_reserv.rst +F: Documentation/mm/vmemmap_dedup.rst F: fs/hugetlbfs/ F: include/linux/hugetlb.h F: mm/hugetlb.c @@ 
-15072,7 +15072,7 @@ M: Pasha Tatashin M: Andrew Morton L: linux-mm@kvack.org S: Maintained -F: Documentation/vm/page_table_check.rst +F: Documentation/mm/page_table_check.rst F: include/linux/page_table_check.h F: mm/page_table_check.c @@ -22158,7 +22158,7 @@ M: Nitin Gupta R: Sergey Senozhatsky L: linux-mm@kvack.org S: Maintained -F: Documentation/vm/zsmalloc.rst +F: Documentation/mm/zsmalloc.rst F: include/linux/zsmalloc.h F: mm/zsmalloc.c diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 1920d52653b4..db2838cf8c02 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -410,7 +410,7 @@ config ARCH_SPARSEMEM_ENABLE Say Y to support efficient handling of sparse physical memory, for architectures which are either NUMA (Non-Uniform Memory Access) or have huge holes in the physical address space for other reasons. - See <file:Documentation/vm/numa.rst> for more. + See <file:Documentation/mm/numa.rst> for more. config ARCH_ENABLE_THP_MIGRATION def_bool y diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index cb9d5fd39d7f..392ff48f77df 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1273,7 +1273,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, * should return true. * We should not call this on a hugetlb entry. We should check for HugeTLB * entry using vma->vm_flags - * The page table walk rule is explained in Documentation/vm/transhuge.rst + * The page table walk rule is explained in Documentation/mm/transhuge.rst */ static inline int pmd_trans_huge(pmd_t pmd) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index d5a6f101f843..126a36571667 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -4,7 +4,7 @@ * * Authors: Jérôme Glisse * - * See Documentation/vm/hmm.rst for reasons and overview of what HMM is. + * See Documentation/mm/hmm.rst for reasons and overview of what HMM is. 
*/ #ifndef LINUX_HMM_H #define LINUX_HMM_H @@ -100,7 +100,7 @@ struct hmm_range { }; /* - * Please see Documentation/vm/hmm.rst for how to use the range API. + * Please see Documentation/mm/hmm.rst for how to use the range API. */ int hmm_range_fault(struct hmm_range *range); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 8af304f6b504..9f5ee49482de 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -39,7 +39,7 @@ struct vmem_altmap { * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in - * include/linux/hmm.h and Documentation/vm/hmm.rst. + * include/linux/hmm.h and Documentation/mm/hmm.rst. * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 45fc2c81e370..d6c06e140277 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -198,7 +198,7 @@ struct mmu_notifier_ops { * invalidate_range_start()/end() notifiers, as * invalidate_range() already catches the points in time when an * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/vm/mmu_notifier.rst + * discussion on this see Documentation/mm/mmu_notifier.rst * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8cd975a8bfeb..2a243616f222 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -29,7 +29,7 @@ extern struct mm_struct *mm_alloc(void); * * Use mmdrop() to release the reference acquired by mmgrab(). * - * See also <Documentation/vm/active_mm.rst> for an in-depth explanation + * See also <Documentation/mm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. 
*/ static inline void mmgrab(struct mm_struct *mm) @@ -92,7 +92,7 @@ static inline void mmdrop_sched(struct mm_struct *mm) * * Use mmput() to release the reference acquired by mmget(). * - * See also <Documentation/vm/active_mm.rst> for an in-depth explanation + * See also <Documentation/mm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) diff --git a/include/linux/swap.h b/include/linux/swap.h index 0c0fed1b348f..95a5b7aa1ae9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -74,7 +74,7 @@ static inline int current_is_kswapd(void) /* * Unaddressable device memory support. See include/linux/hmm.h and - * Documentation/vm/hmm.rst. Short description is we need struct pages for + * Documentation/mm/hmm.rst. Short description is we need struct pages for * device memory that is unaddressable (inaccessible) by CPU, so that we can * migrate part of a process memory to device memory. * diff --git a/mm/Kconfig b/mm/Kconfig index 169e64192e48..c1fa4993a56f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -663,7 +663,7 @@ config KSM the many instances by a single page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.rst for more information: KSM is inactive + See Documentation/mm/ksm.rst for more information: KSM is inactive until a program has madvised that an area is MADV_MERGEABLE, and root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1ab091f49fc0..dc7df1254f0a 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -35,7 +35,7 @@ #include /* - * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics + * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. 
All future changes in here * or the documentation need to be in sync. 
*/ diff --git a/mm/frontswap.c b/mm/frontswap.c index 6f69b044a8cc..1a97610308cb 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -4,7 +4,7 @@ * * This code provides the generic "frontend" layer to call a matching * "backend" driver implementation of frontswap. See - * Documentation/vm/frontswap.rst for more information. + * Documentation/mm/frontswap.rst for more information. * * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. * Author: Dan Magenheimer diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 834f288b3769..f9b90a8d7dfa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1937,7 +1937,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, * replacing a zero pmd write protected page with a zero pte write * protected page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ pmdp_huge_clear_flush(vma, haddr, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a57e1be41401..b36a4ef87a2e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4875,7 +4875,7 @@ again: * table protection not changing it to point * to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); entry = huge_pte_wrprotect(entry); @@ -6403,7 +6403,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * No need to call mmu_notifier_invalidate_range() we are downgrading * page table protection not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(&range); @@ -7102,7 +7102,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) i_mmap_unlock_write(vma->vm_file->f_mapping); /* * No need to call mmu_notifier_invalidate_range(), see - * Documentation/vm/mmu_notifier.rst. + * Documentation/mm/mmu_notifier.rst. 
*/ mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 1089ea8a9c98..ba29c15c53d6 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -6,7 +6,7 @@ * * Author: Muchun Song * - * See Documentation/vm/vmemmap_dedup.rst + * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt diff --git a/mm/ksm.c b/mm/ksm.c index 54f78c9eecae..8d2dc501c92c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1083,7 +1083,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are downgrading page table to read * only not changing it to point to a new page. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* @@ -1186,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * No need to notify as we are replacing a read only page with another * read only page with the same content. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); diff --git a/mm/mmap.c b/mm/mmap.c index 61e6135c54ef..c14d7286a379 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2944,7 +2944,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n", + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n", current->comm, current->pid); if (prot) diff --git a/mm/rmap.c b/mm/rmap.c index 5bcb334cd6f2..65e0a767b837 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -999,7 +999,7 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) * downgrading page table protection not changing it to point * to a new page. 
* - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ if (ret) cleaned++; @@ -1765,7 +1765,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * to point at a new folio while a device is * still using this folio. * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(&folio->page)); } @@ -1775,7 +1775,7 @@ discard: * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) @@ -2093,7 +2093,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * done above for all cases requiring it to happen under page * table lock before mmu_notifier_invalidate_range_end() * - * See Documentation/vm/mmu_notifier.rst + * See Documentation/mm/mmu_notifier.rst */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 652f11a05749..3ff88a2eefb8 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -752,7 +752,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, /* * Reuse the previous page for the rest of tail pages - * See layout diagram in Documentation/vm/vmemmap_dedup.rst + * See layout diagram in Documentation/mm/vmemmap_dedup.rst */ next += PAGE_SIZE; rc = vmemmap_populate_range(next, last, node, NULL, diff --git a/mm/util.c b/mm/util.c index 0837570c9225..5df8f2db7ca9 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1005,7 +1005,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * succeed and -ENOMEM implies there is not. * * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. 
See Documentation/vm/overcommit-accounting.rst + * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst * * Strict overcommit modes added 2002 Feb 26 by Alan Cox. * Additional code 2002 Jul 20 by Robert Love. diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index c149427eb1c9..74c3dcecf64d 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -8,7 +8,7 @@ * Or sort by total memory: * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt * - * See Documentation/vm/page_owner.rst + * See Documentation/mm/page_owner.rst */ #include From 507db7927cd181d409dd495c8384b8e14c21c600 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sun, 3 Jul 2022 18:08:36 -0700 Subject: [PATCH 033/282] mm: rmap: use the correct parameter name for DEFINE_PAGE_VMA_WALK The parameter used by DEFINE_PAGE_VMA_WALK is _page not page, fix the parameter name. It didn't cause any build error, probably because the only caller is write_protect_page() from ksm.c, which passes in page. 
Link: https://lkml.kernel.org/r/20220512174551.81279-1-shy828301@gmail.com Fixes: 2aff7a4755be ("mm: Convert page_vma_mapped_walk to work on PFNs") Signed-off-by: Yang Shi Reviewed-by: Muchun Song Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/rmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 9ec23138e410..bf80adca980b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -325,8 +325,8 @@ struct page_vma_mapped_walk { #define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = page_to_pfn(_page), \ - .nr_pages = compound_nr(page), \ - .pgoff = page_to_pgoff(page), \ + .nr_pages = compound_nr(_page), \ + .pgoff = page_to_pgoff(_page), \ .vma = _vma, \ .address = _address, \ .flags = _flags, \ From c453d8c7d1384d7e1d7f26d3ec0d527092edf801 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 13 May 2022 12:17:05 -0700 Subject: [PATCH 034/282] mm/page_vma_mapped.c: check possible huge PMD map with transhuge_vma_suitable() IIUC page_vma_mapped_walk() checks if the vma is possibly huge PMD mapped with transparent_hugepage_active() and "pvmw->nr_pages >= HPAGE_PMD_NR". Actually pvmw->nr_pages is returned by compound_nr() or folio_nr_pages(), so the page should be THP as long as "pvmw->nr_pages >= HPAGE_PMD_NR". And it is guaranteed THP is allocated for valid VMA in the first place. But it may not be PMD mapped if the VMA is file VMA and it is not properly aligned. The transhuge_vma_suitable() is used to do such check, so replace transparent_hugepage_active(), which is too heavy and overkill, with it. 
Link: https://lkml.kernel.org/r/20220513191705.457775-1-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Muchun Song Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 ++++++-- mm/page_vma_mapped.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index de29821231c9..648cb3ce7099 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,8 +117,10 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) + unsigned long addr) { + unsigned long haddr; + /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, @@ -126,6 +128,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return false; } + haddr = addr & HPAGE_PMD_MASK; + if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; @@ -342,7 +346,7 @@ static inline bool transparent_hugepage_active(struct vm_area_struct *vma) } static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) + unsigned long addr) { return false; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index c10f839fc410..e971a467fcdf 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -243,7 +243,7 @@ restart: * cleared *pmd but not decremented compound_mapcount(). 
*/ if ((pvmw->flags & PVMW_SYNC) && - transparent_hugepage_active(vma) && + transhuge_vma_suitable(vma, pvmw->address) && (pvmw->nr_pages >= HPAGE_PMD_NR)) { spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); From 160088b3b6d7946e456caa379dcdfc8702c66274 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 30 May 2022 19:30:14 +0800 Subject: [PATCH 035/282] mm/migration: remove unneeded lock page and PageMovable check When non-lru movable page was freed from under us, __ClearPageMovable must have been done. So we can remove unneeded lock page and PageMovable check here. Also free_pages_prepare() will clear PG_isolated for us, so we can further remove ClearPageIsolated as suggested by David. Link: https://lkml.kernel.org/r/20220530113016.16663-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Christoph Hellwig Reviewed-by: Oscar Salvador Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Lameter Cc: David Howells Cc: Huang Ying Cc: kernel test robot Cc: Mike Kravetz Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/migrate.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 6c1ea61f39d8..c83b3ae2e285 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1090,15 +1090,10 @@ static int unmap_and_move(new_page_t get_new_page, return -ENOSYS; if (page_count(page) == 1) { - /* page was freed from under us. So we are done. */ + /* Page was freed from under us. So we are done. */ ClearPageActive(page); ClearPageUnevictable(page); - if (unlikely(__PageMovable(page))) { - lock_page(page); - if (!PageMovable(page)) - ClearPageIsolated(page); - unlock_page(page); - } + /* free_pages_prepare() will clear PG_isolated. */ goto out; } From 7ce82f4c3f3ead13a9d9498768e3b1a79975c4d8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 30 May 2022 19:30:15 +0800 Subject: [PATCH 036/282] mm/migration: return errno when isolate_huge_page failed We might fail to isolate huge page due to e.g. 
the page is under migration which cleared HPageMigratable. We should return errno in this case rather than always return 1 which could confuse the user, i.e. the caller might think all of the memory is migrated while the hugetlb page is left behind. We make the prototype of isolate_huge_page consistent with isolate_lru_page as suggested by Huang Ying and rename isolate_huge_page to isolate_hugetlb as suggested by Muchun to improve the readability. Link: https://lkml.kernel.org/r/20220530113016.16663-4-linmiaohe@huawei.com Fixes: e8db67eb0ded ("mm: migrate: move_pages() supports thp migration") Signed-off-by: Miaohe Lin Suggested-by: Huang Ying Reported-by: kernel test robot (build error) Cc: Alistair Popple Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Howells Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +++--- mm/gup.c | 2 +- mm/hugetlb.c | 11 +++++------ mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 7 ++++--- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e4cff27d1198..756b66ff025e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -170,7 +170,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_huge_page(struct page *page, struct list_head *list); +int isolate_hugetlb(struct page *page, struct list_head *list); int get_hwpoison_huge_page(struct page *page, bool *hugetlb); int get_huge_page_for_hwpoison(unsigned long pfn, int flags); void putback_active_hugepage(struct page *page); @@ -376,9 +376,9 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_huge_page(struct page *page, struct list_head *list) +static 
inline int isolate_hugetlb(struct page *page, struct list_head *list) { - return false; + return -EBUSY; } static inline int get_hwpoison_huge_page(struct page *page, bool *hugetlb) diff --git a/mm/gup.c b/mm/gup.c index 407a81d5ca03..3129b754ade3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1930,7 +1930,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, * Try to move out any movable page before pinning the range. */ if (folio_test_hugetlb(folio)) { - if (!isolate_huge_page(&folio->page, + if (isolate_hugetlb(&folio->page, &movable_page_list)) isolation_error_count++; continue; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b36a4ef87a2e..dd9a46ccb79c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2766,8 +2766,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - if (!isolate_huge_page(old_page, list)) - ret = -EBUSY; + ret = isolate_hugetlb(old_page, list); spin_lock_irq(&hugetlb_lock); goto free_new; } else if (!HPageFreed(old_page)) { @@ -2843,7 +2842,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (page_count(head) && isolate_huge_page(head, list)) + if (page_count(head) && !isolate_hugetlb(head, list)) ret = 0; else if (!page_count(head)) ret = alloc_and_dissolve_huge_page(h, head, list); @@ -6960,15 +6959,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); } -bool isolate_huge_page(struct page *page, struct list_head *list) +int isolate_hugetlb(struct page *page, struct list_head *list) { - bool ret = true; + int ret = 0; spin_lock_irq(&hugetlb_lock); if (!PageHeadHuge(page) || !HPageMigratable(page) || !get_page_unless_zero(page)) { - ret = false; + ret = -EBUSY; goto unlock; } ClearHPageMigratable(page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index da39ec8afca8..845369f839e1 100644 --- 
a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2178,7 +2178,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) bool lru = PageLRU(page); if (PageHuge(page)) { - isolated = isolate_huge_page(page, pagelist); + isolated = !isolate_hugetlb(page, pagelist); } else { if (lru) isolated = !isolate_lru_page(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f1a730c4499..84990a14d51a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1641,7 +1641,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { pfn = page_to_pfn(head) + compound_nr(head) - 1; - isolate_huge_page(head, &source); + isolate_hugetlb(head, &source); continue; } else if (PageTransHuge(page)) pfn = page_to_pfn(head) + thp_nr_pages(page) - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d39b01fd52fe..9689919a2829 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -602,7 +602,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { - if (!isolate_huge_page(page, qp->pagelist) && + if (isolate_hugetlb(page, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages diff --git a/mm/migrate.c b/mm/migrate.c index c83b3ae2e285..1d036dec1328 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -133,7 +133,7 @@ static void putback_movable_page(struct page *page) * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() - * and isolate_huge_page(). + * and isolate_hugetlb(). 
*/ void putback_movable_pages(struct list_head *l) { @@ -1628,8 +1628,9 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, if (PageHuge(page)) { if (PageHead(page)) { - isolate_huge_page(page, pagelist); - err = 1; + err = isolate_hugetlb(page, pagelist); + if (!err) + err = 1; } } else { struct page *head; From ad1ac596e8a8c4b06715dfbd89853eb73c9886b2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 30 May 2022 19:30:16 +0800 Subject: [PATCH 037/282] mm/migration: fix potential pte_unmap on an not mapped pte __migration_entry_wait and migration_entry_wait_on_locked assume pte is always mapped from caller. But this is not the case when it's called from migration_entry_wait_huge and follow_huge_pmd. Add a hugetlbfs variant that calls hugetlb_migration_entry_wait(ptep == NULL) to fix this issue. Link: https://lkml.kernel.org/r/20220530113016.16663-5-linmiaohe@huawei.com Fixes: 30dad30922cc ("mm: migration: add migrate_entry_wait_huge()") Signed-off-by: Miaohe Lin Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Howells Cc: Huang Ying Cc: kernel test robot Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swapops.h | 12 ++++++++---- mm/hugetlb.c | 4 ++-- mm/migrate.c | 23 +++++++++++++++++++---- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index f24775b41880..bb7afd03a324 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -244,8 +244,10 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte); +#ifdef CONFIG_HUGETLB_PAGE +extern void __migration_entry_wait_huge(pte_t *ptep, 
spinlock_t *ptl); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); +#endif #else static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -271,8 +273,10 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } -static inline void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte) { } +#ifdef CONFIG_HUGETLB_PAGE +static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } +static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } +#endif static inline int is_writable_migration_entry(swp_entry_t entry) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd9a46ccb79c..ed202d29ca46 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5702,7 +5702,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(vma, mm, ptep); + migration_entry_wait_huge(vma, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -6927,7 +6927,7 @@ retry: } else { if (is_hugetlb_entry_migration(pte)) { spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pmd, ptl); + __migration_entry_wait_huge((pte_t *)pmd, ptl); goto retry; } /* diff --git a/mm/migrate.c b/mm/migrate.c index 1d036dec1328..7934eebf1689 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -315,13 +315,28 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, __migration_entry_wait(mm, ptep, ptl); } -void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte) +#ifdef CONFIG_HUGETLB_PAGE +void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { - spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); - 
__migration_entry_wait(mm, pte, ptl); + pte_t pte; + + spin_lock(ptl); + pte = huge_ptep_get(ptep); + + if (unlikely(!is_hugetlb_entry_migration(pte))) + spin_unlock(ptl); + else + migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl); } +void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) +{ + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte); + + __migration_entry_wait_huge(pte, ptl); +} +#endif + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) { From 205498012513f9a1209d9335bf3766080c587a33 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:05 +0000 Subject: [PATCH 038/282] Docs/admin-guide/damon/reclaim: remove a paragraph that been obsolete due to online tuning support Patch series "mm/damon: trivial cleanups". This patchset contains trivial cleanups for DAMON code. This patch (of 6): Commit 81a84182c343 ("Docs/admin-guide/mm/damon/reclaim: document 'commit_inputs' parameter") has documented the 'commit_inputs' parameter which allows online parameter update, but it didn't remove a paragraph saying the online parameter update is impossible. This commit removes the obsolete paragraph. Link: https://lkml.kernel.org/r/20220606182310.48781-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220606182310.48781-2-sj@kernel.org Fixes: 81a84182c343 ("Docs/admin-guide/mm/damon/reclaim: document 'commit_inputs' parameter") Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/reclaim.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index a8bd3bd29959..4f1479a11e63 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -48,12 +48,6 @@ DAMON_RECLAIM utilizes module parameters. 
That is, you can put ``damon_reclaim.=`` on the kernel boot command line or write proper values to ``/sys/modules/damon_reclaim/parameters/`` files. -Note that the parameter values except ``enabled`` are applied only when -DAMON_RECLAIM starts. Therefore, if you want to apply new parameter values in -runtime and DAMON_RECLAIM is already enabled, you should disable and re-enable -it via ``enabled`` parameter file. Writing of the new values to proper -parameter values should be done before the re-enablement. - Below are the description of each parameter. enabled From c9e124e0382d83d458db204f929002ea98daa6a8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:06 +0000 Subject: [PATCH 039/282] mm/damon/{dbgfs,sysfs}: move target_has_pid() from dbgfs to damon.h The function for knowing if given monitoring context's targets will have pid or not is defined and used in dbgfs only. However, the logic is also needed for sysfs. This commit moves the code to damon.h and makes both dbgfs and sysfs to use it. 
Link: https://lkml.kernel.org/r/20220606182310.48781-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/dbgfs.c | 15 +++++---------- mm/damon/sysfs.c | 8 +++----- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2765c7d99beb..b9aae19fab3e 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -525,6 +525,12 @@ bool damon_is_registered_ops(enum damon_ops_id id); int damon_register_ops(struct damon_operations *ops); int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id); +static inline bool damon_target_has_pid(const struct damon_ctx *ctx) +{ + return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR; +} + + int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index a0dab8b5e45f..5ae810927309 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -275,11 +275,6 @@ out: return ret; } -static inline bool target_has_pid(const struct damon_ctx *ctx) -{ - return ctx->ops.id == DAMON_OPS_VADDR; -} - static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) { struct damon_target *t; @@ -288,7 +283,7 @@ static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len) int rc; damon_for_each_target(t, ctx) { - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) /* Show pid numbers to debugfs users */ id = pid_vnr(t->pid); else @@ -415,7 +410,7 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) put_pid(t->pid); damon_destroy_target(t); } @@ -425,11 +420,11 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets, if (!t) { damon_for_each_target_safe(t, 
next, ctx) damon_destroy_target(t); - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) dbgfs_put_pids(pids, nr_targets); return -ENOMEM; } - if (target_has_pid(ctx)) + if (damon_target_has_pid(ctx)) t->pid = pids[i]; damon_add_target(ctx, t); } @@ -722,7 +717,7 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (!target_has_pid(ctx)) + if (!damon_target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 09f9e8ca3d1f..8810e6abdb06 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2136,8 +2136,7 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) struct damon_target *t, *next; damon_for_each_target_safe(t, next, ctx) { - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) + if (damon_target_has_pid(ctx)) put_pid(t->pid); damon_destroy_target(t); } @@ -2181,8 +2180,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, if (!t) return -ENOMEM; - if (ctx->ops.id == DAMON_OPS_VADDR || - ctx->ops.id == DAMON_OPS_FVADDR) { + if (damon_target_has_pid(ctx)) { t->pid = find_get_pid(sys_target->pid); if (!t->pid) goto destroy_targets_out; @@ -2210,7 +2208,7 @@ static struct damon_target *damon_sysfs_existing_target( struct pid *pid; struct damon_target *t; - if (ctx->ops.id == DAMON_OPS_PADDR) { + if (!damon_target_has_pid(ctx)) { /* Up to only one target for paddr could exist */ damon_for_each_target(t, ctx) return t; From f25ab3bdfb6f8548a721d4592b2c56cb2ac90ce6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:07 +0000 Subject: [PATCH 040/282] mm/damon/reclaim: deduplicate 'commit_inputs' handling DAMON_RECLAIM's handling of 'commit_inputs' parameter is duplicated in 'after_aggregation()' and 'after_wmarks_check()' callbacks. This commit deduplicates the code for better maintenance. 
Link: https://lkml.kernel.org/r/20220606182310.48781-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 4b07c29effe9..c2ed962db23f 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -403,10 +403,21 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); MODULE_PARM_DESC(enabled, "Enable or disable DAMON_RECLAIM (default: disabled)"); +static int damon_reclaim_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_reclaim_apply_parameters(); + commit_inputs = false; + return err; +} + static int damon_reclaim_after_aggregation(struct damon_ctx *c) { struct damos *s; - int err = 0; /* update the stats parameter */ damon_for_each_scheme(s, c) { @@ -417,22 +428,12 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) nr_quota_exceeds = s->stat.qt_exceeds; } - if (commit_inputs) { - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - } - return err; + return damon_reclaim_handle_commit_inputs(); } static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) { - int err = 0; - - if (commit_inputs) { - err = damon_reclaim_apply_parameters(); - commit_inputs = false; - } - return err; + return damon_reclaim_handle_commit_inputs(); } static int __init damon_reclaim_init(void) From a79b68ee3e21f3e5e39b6bd07759c1b4d5267bc4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:08 +0000 Subject: [PATCH 041/282] mm/damon/sysfs: deduplicate inputs applying DAMON sysfs interface's DAMON context building and its online parameter update have duplicated code. This commit removes the duplicate. 
Link: https://lkml.kernel.org/r/20220606182310.48781-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 65 ++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 8810e6abdb06..c35809c6087c 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2357,25 +2357,10 @@ static inline bool damon_sysfs_kdamond_running( damon_sysfs_ctx_running(kdamond->damon_ctx); } -/* - * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. - * @kdamond: The kobject wrapper for the associated kdamond. - * - * If the sysfs input is wrong, the kdamond will be terminated. - */ -static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, + struct damon_sysfs_context *sys_ctx) { - struct damon_ctx *ctx = kdamond->damon_ctx; - struct damon_sysfs_context *sys_ctx; - int err = 0; - - if (!damon_sysfs_kdamond_running(kdamond)) - return -EINVAL; - /* TODO: Support multiple contexts per kdamond */ - if (kdamond->contexts->nr != 1) - return -EINVAL; - - sys_ctx = kdamond->contexts->contexts_arr[0]; + int err; err = damon_select_ops(ctx, sys_ctx->ops_id); if (err) @@ -2386,10 +2371,25 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) err = damon_sysfs_set_targets(ctx, sys_ctx->targets); if (err) return err; - err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); - if (err) - return err; - return err; + return damon_sysfs_set_schemes(ctx, sys_ctx->schemes); +} + +/* + * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. + * @kdamond: The kobject wrapper for the associated kdamond. + * + * If the sysfs input is wrong, the kdamond will be terminated. 
+ */ +static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) +{ + if (!damon_sysfs_kdamond_running(kdamond)) + return -EINVAL; + /* TODO: Support multiple contexts per kdamond */ + if (kdamond->contexts->nr != 1) + return -EINVAL; + + return damon_sysfs_apply_inputs(kdamond->damon_ctx, + kdamond->contexts->contexts_arr[0]); } /* @@ -2436,27 +2436,16 @@ static struct damon_ctx *damon_sysfs_build_ctx( if (!ctx) return ERR_PTR(-ENOMEM); - err = damon_select_ops(ctx, sys_ctx->ops_id); - if (err) - goto out; - err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); - if (err) - goto out; - err = damon_sysfs_set_targets(ctx, sys_ctx->targets); - if (err) - goto out; - err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes); - if (err) - goto out; + err = damon_sysfs_apply_inputs(ctx, sys_ctx); + if (err) { + damon_destroy_ctx(ctx); + return ERR_PTR(err); + } ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; - -out: - damon_destroy_ctx(ctx); - return ERR_PTR(err); } static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) From f943e7e3a4c6202eeb8b3c4bf2889778450eaf7e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:09 +0000 Subject: [PATCH 042/282] mm/damon/reclaim: make 'enabled' checking timer simpler DAMON_RECLAIM's 'enabled' parameter store callback ('enabled_store()') schedules the parameter check timer ('damon_reclaim_timer') if the parameter is set as 'Y'. Then, the timer schedules itself to check if user has set the parameter as 'N'. It's unnecessarily complex. This commit makes it simpler by making the parameter store callback to schedule the timer regardless of the parameter value and disabling the timer's self scheduling. 
Link: https://lkml.kernel.org/r/20220606182310.48781-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index c2ed962db23f..38da28803d75 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -353,7 +353,6 @@ static int damon_reclaim_turn(bool on) return 0; } -#define ENABLE_CHECK_INTERVAL_MS 1000 static struct delayed_work damon_reclaim_timer; static void damon_reclaim_timer_fn(struct work_struct *work) { @@ -367,10 +366,6 @@ static void damon_reclaim_timer_fn(struct work_struct *work) else enabled = last_enabled; } - - if (enabled) - schedule_delayed_work(&damon_reclaim_timer, - msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS)); } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); @@ -388,9 +383,7 @@ static int enabled_store(const char *val, if (!damon_reclaim_initialized) return rc; - if (enabled) - schedule_delayed_work(&damon_reclaim_timer, 0); - + schedule_delayed_work(&damon_reclaim_timer, 0); return 0; } From d79905c77f96988284e3b286e4b50ae8762eebca Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 6 Jun 2022 18:23:10 +0000 Subject: [PATCH 043/282] mm/damon/reclaim: add 'damon_reclaim_' prefix to 'enabled_store()' This commit adds 'damon_reclaim_' prefix to 'enabled_store()', so that we can distinguish it easily from the stack trace using 'faddr2line.sh' like tools. 
Link: https://lkml.kernel.org/r/20220606182310.48781-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 38da28803d75..e69b807fefe4 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -371,7 +371,7 @@ static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); static bool damon_reclaim_initialized; -static int enabled_store(const char *val, +static int damon_reclaim_enabled_store(const char *val, const struct kernel_param *kp) { int rc = param_set_bool(val, kp); @@ -388,7 +388,7 @@ static int enabled_store(const char *val, } static const struct kernel_param_ops enabled_param_ops = { - .set = enabled_store, + .set = damon_reclaim_enabled_store, .get = param_get_bool, }; From aeaec8e27eddc147b96fe32df2671980ce7ca87c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 9 Jun 2022 20:18:45 +0200 Subject: [PATCH 044/282] mm: rename kernel_init_free_pages to kernel_init_pages Rename kernel_init_free_pages() to kernel_init_pages(). This function is not only used for free pages but also for pages that were just allocated. 
Link: https://lkml.kernel.org/r/1ecaffc0a9c1404d4d7cf52efe0b2dc8a0c681d8.1654798516.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Muchun Song Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 81fadb266973..9234863f2488 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1296,7 +1296,7 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) PageSkipKASanPoison(page); } -static void kernel_init_free_pages(struct page *page, int numpages) +static void kernel_init_pages(struct page *page, int numpages) { int i; @@ -1396,7 +1396,7 @@ static __always_inline bool free_pages_prepare(struct page *page, init = false; } if (init) - kernel_init_free_pages(page, 1 << order); + kernel_init_pages(page, 1 << order); /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -2441,7 +2441,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } /* If memory is still not initialized, do it now. */ if (init) - kernel_init_free_pages(page, 1 << order); + kernel_init_pages(page, 1 << order); /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) SetPageSkipKASanPoison(page); From d9da8f6cf55eeca642c021912af1890002464c64 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 9 Jun 2022 20:18:46 +0200 Subject: [PATCH 045/282] mm: introduce clear_highpage_kasan_tagged Add a clear_highpage_kasan_tagged() helper that does clear_highpage() on a page potentially tagged by KASAN. This helper is used by the following patch. 
Link: https://lkml.kernel.org/r/4471979b46b2c487787ddcd08b9dc5fedd1b6ffd.1654798516.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- include/linux/highmem.h | 10 ++++++++++ mm/page_alloc.c | 8 ++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fee9835e3793..22379a63e293 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -243,6 +243,16 @@ static inline void clear_highpage(struct page *page) kunmap_local(kaddr); } +static inline void clear_highpage_kasan_tagged(struct page *page) +{ + u8 tag; + + tag = page_kasan_tag(page); + page_kasan_tag_reset(page); + clear_highpage(page); + page_kasan_tag_set(page, tag); +} + #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9234863f2488..248469134962 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1302,12 +1302,8 @@ static void kernel_init_pages(struct page *page, int numpages) /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) { - u8 tag = page_kasan_tag(page + i); - page_kasan_tag_reset(page + i); - clear_highpage(page + i); - page_kasan_tag_set(page + i, tag); - } + for (i = 0; i < numpages; i++) + clear_highpage_kasan_tagged(page + i); kasan_enable_current(); } From 6c2f761dad7851d8088b91063ccaea3c970efe78 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 9 Jun 2022 20:18:47 +0200 Subject: [PATCH 046/282] kasan: fix zeroing vmalloc memory with HW_TAGS HW_TAGS KASAN skips zeroing page_alloc allocations backing vmalloc mappings via __GFP_SKIP_ZERO. Instead, these pages are zeroed via kasan_unpoison_vmalloc() by passing the KASAN_VMALLOC_INIT flag. 
The problem is that __kasan_unpoison_vmalloc() does not zero pages when either kasan_vmalloc_enabled() or is_vmalloc_or_module_addr() fail. Thus: 1. Change __vmalloc_node_range() to only set KASAN_VMALLOC_INIT when __GFP_SKIP_ZERO is set. 2. Change __kasan_unpoison_vmalloc() to always zero pages when the KASAN_VMALLOC_INIT flag is set. 3. Add WARN_ON() asserts to check that KASAN_VMALLOC_INIT cannot be set in other early return paths of __kasan_unpoison_vmalloc(). Also clean up the comment in __kasan_unpoison_vmalloc. Link: https://lkml.kernel.org/r/4bc503537efdc539ffc3f461c1b70162eea31cf6.1654798516.git.andreyknvl@google.com Fixes: 23689e91fb22 ("kasan, vmalloc: add vmalloc tagging for HW_TAGS") Signed-off-by: Andrey Konovalov Cc: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton --- mm/kasan/hw_tags.c | 32 +++++++++++++++++++++++--------- mm/vmalloc.c | 10 +++++----- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9e1b6544bfa8..9ad8eff71b28 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -257,27 +257,37 @@ static void unpoison_vmalloc_pages(const void *addr, u8 tag) } } +static void init_vmalloc_pages(const void *start, unsigned long size) +{ + const void *addr; + + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + struct page *page = virt_to_page(addr); + + clear_highpage_kasan_tagged(page); + } +} + void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, kasan_vmalloc_flags_t flags) { u8 tag; unsigned long redzone_start, redzone_size; - if (!kasan_vmalloc_enabled()) - return (void *)start; - - if (!is_vmalloc_or_module_addr(start)) + if (!kasan_vmalloc_enabled() || !is_vmalloc_or_module_addr(start)) { + if (flags & KASAN_VMALLOC_INIT) + init_vmalloc_pages(start, size); return (void *)start; + } /* - * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC - * mappings as: + * Don't tag non-VM_ALLOC 
mappings, as: * * 1. Unlike the software KASAN modes, hardware tag-based KASAN only * supports tagging physical memory. Therefore, it can only tag a * single mapping of normal physical pages. * 2. Hardware tag-based KASAN can only tag memory mapped with special - * mapping protection bits, see arch_vmalloc_pgprot_modify(). + * mapping protection bits, see arch_vmap_pgprot_tagged(). * As non-VM_ALLOC mappings can be mapped outside of vmalloc code, * providing these bits would require tracking all non-VM_ALLOC * mappers. @@ -289,15 +299,19 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * * For non-VM_ALLOC allocations, page_alloc memory is tagged as usual. */ - if (!(flags & KASAN_VMALLOC_VM_ALLOC)) + if (!(flags & KASAN_VMALLOC_VM_ALLOC)) { + WARN_ON(flags & KASAN_VMALLOC_INIT); return (void *)start; + } /* * Don't tag executable memory. * The kernel doesn't tolerate having the PC register tagged. */ - if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) + if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) { + WARN_ON(flags & KASAN_VMALLOC_INIT); return (void *)start; + } tag = kasan_random_tag(); start = set_tag(start, tag); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5977b178694d..37939f96d2f4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3161,15 +3161,15 @@ again: /* * Mark the pages as accessible, now that they are mapped. - * The init condition should match the one in post_alloc_hook() - * (except for the should_skip_init() check) to make sure that memory - * is initialized under the same conditions regardless of the enabled - * KASAN mode. + * The condition for setting KASAN_VMALLOC_INIT should complement the + * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check + * to make sure that memory is initialized under the same conditions. * Tag-based KASAN modes only assign tags to normal non-executable * allocations, see __kasan_unpoison_vmalloc(). 
*/ kasan_flags |= KASAN_VMALLOC_VM_ALLOC; - if (!want_init_on_free() && want_init_on_alloc(gfp_mask)) + if (!want_init_on_free() && want_init_on_alloc(gfp_mask) && + (gfp_mask & __GFP_SKIP_ZERO)) kasan_flags |= KASAN_VMALLOC_INIT; /* KASAN_VMALLOC_PROT_NORMAL already set if required. */ area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags); From 8edaec0756005a3f286c9272e909dff07d12cf75 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 27 May 2022 10:01:35 +0800 Subject: [PATCH 047/282] mm/hugetlb: remove unnecessary huge_ptep_set_access_flags() in hugetlb_mcopy_atomic_pte() There is no need to update the hugetlb access flags after just setting the hugetlb page table entry by set_huge_pte_at(), since the page table entry value has no changes. Thus remove the unnecessary huge_ptep_set_access_flags() in hugetlb_mcopy_atomic_pte(). Link: https://lkml.kernel.org/r/f3e28b897b53a69967a8b98a6fdcda3be80c9229.1653616175.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed202d29ca46..70d2763f92ea 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6039,8 +6039,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); - (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, - dst_vma->vm_flags & VM_WRITE); hugetlb_count_add(pages_per_huge_page(h), dst_mm); /* No need to invalidate - it was non-present before */ From 000eca5d044d1ee23b4ca311793cf3fc528da6c6 Mon Sep 17 00:00:00 2001 From: Tianyu Li Date: Wed, 1 Jun 2022 17:32:11 +0800 Subject: [PATCH 048/282] mm/mempolicy: fix get_nodes out of bound access When user specified more nodes than supported, get_nodes will access nmask array out of bounds. 
Link: https://lkml.kernel.org/r/20220601093211.2970565-1-tianyu.li@arm.com Fixes: e130242dc351 ("mm: simplify compat numa syscalls") Signed-off-by: Tianyu Li Cc: Arnd Bergmann Cc: Mark Rutland Signed-off-by: Andrew Morton --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9689919a2829..f4cd963550c1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1388,7 +1388,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG); unsigned long t; - if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits)) + if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits)) return -EFAULT; if (maxnode - bits >= MAX_NUMNODES) { From c15187a4a2d660bf490f7873afd0de5288f65c8f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:22 -0700 Subject: [PATCH 049/282] mm: memcontrol: introduce mem_cgroup_ino() and mem_cgroup_get_from_ino() Patch series "mm: introduce shrinker debugfs interface", v5. The only existing debugging mechanism is a couple of tracepoints in do_shrink_slab(): mm_shrink_slab_start and mm_shrink_slab_end. They aren't covering everything though: shrinkers which report 0 objects will never show up, there is no support for memcg-aware shrinkers. Shrinkers are identified by their scan function, which is not always enough (e.g. hard to guess which super block's shrinker it is having only "super_cache_scan"). To provide a better visibility and debug options for memory shrinkers this patchset introduces a /sys/kernel/debug/shrinker interface, to some extent similar to /sys/kernel/slab. For each shrinker registered in the system a directory is created. As now, the directory will contain only a "scan" file, which allows to get the number of managed objects for each memory cgroup (for memcg-aware shrinkers) and each numa node (for numa-aware shrinkers on a numa machine). 
Other interfaces might be added in the future. To make debugging more pleasant, the patchset also names all shrinkers, so that debugfs entries can have meaningful names. This patch (of 5): Shrinker debugfs requires a way to represent memory cgroups without using full paths, both for displaying information and getting input from a user. Cgroup inode number is a perfect way, already used by bpf. This commit adds a couple of helper functions which will be used to handle memcg-aware shrinkers. Link: https://lkml.kernel.org/r/20220601032227.4076670-1-roman.gushchin@linux.dev Link: https://lkml.kernel.org/r/20220601032227.4076670-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Muchun Song Cc: Dave Chinner Cc: Kent Overstreet Cc: Hillf Danton Cc: Christophe JAILLET Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 21 +++++++++++++++++++++ mm/memcontrol.c | 23 +++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 04f2f33607e9..4d31ce55b1c0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -837,6 +837,15 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +#ifdef CONFIG_SHRINKER_DEBUG +static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +{ + return memcg ? 
cgroup_ino(memcg->css.cgroup) : 0; +} + +struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino); +#endif + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); @@ -1343,6 +1352,18 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +#ifdef CONFIG_SHRINKER_DEBUG +static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg) +{ + return 0; +} + +static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +{ + return NULL; +} +#endif + static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 655c09393ad5..1497affe08c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5088,6 +5088,29 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return idr_find(&mem_cgroup_idr, id); } +#ifdef CONFIG_SHRINKER_DEBUG +struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + + cgrp = cgroup_get_from_id(ino); + if (!cgrp) + return ERR_PTR(-ENOENT); + + css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys); + if (css) + memcg = container_of(css, struct mem_cgroup, css); + else + memcg = ERR_PTR(-ENOENT); + + cgroup_put(cgrp); + + return memcg; +} +#endif + static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; From 5035ebc644aec92d55d1bbfe042f35341e4bffb5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:23 -0700 Subject: [PATCH 050/282] mm: shrinkers: introduce debugfs interface for memory shrinkers This commit introduces the /sys/kernel/debug/shrinker debugfs interface which provides an ability to observe the state of individual kernel memory shrinkers. 
Because the feature adds some memory overhead (which shouldn't be large unless there is a huge amount of registered shrinkers), it's guarded by a config option (enabled by default). This commit introduces the "count" interface for each shrinker registered in the system. The output is in the following format: <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ... <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ... ... To reduce the size of output on machines with many thousands of cgroups, if the total number of objects on all nodes is 0, the line is omitted. If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is printed as cgroup inode id. If the shrinker is not numa-aware, 0's are printed for all nodes except the first one. This commit gives debugfs entries simple numeric names, which are not very convenient. The following commit in the series will provide shrinkers with more meaningful names. [akpm@linux-foundation.org: remove WARN_ON_ONCE(), per Roman] Reported-by: syzbot+300d27c79fe6d4cbcc39@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/20220601032227.4076670-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reviewed-by: Kent Overstreet Acked-by: Muchun Song Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 19 ++++- lib/Kconfig.debug | 9 +++ mm/Makefile | 1 + mm/shrinker_debug.c | 168 +++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 6 +- 5 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 mm/shrinker_debug.c diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 76fbf92b04d9..2ced8149c513 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -72,6 +72,10 @@ struct shrinker { #ifdef CONFIG_MEMCG /* ID in shrinker_idr */ int id; +#endif +#ifdef CONFIG_SHRINKER_DEBUG + int debugfs_id; + struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; @@ -94,4 +98,17 @@ extern int register_shrinker(struct shrinker *shrinker); extern void unregister_shrinker(struct shrinker 
*shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); -#endif + +#ifdef CONFIG_SHRINKER_DEBUG +extern int shrinker_debugfs_add(struct shrinker *shrinker); +extern void shrinker_debugfs_remove(struct shrinker *shrinker); +#else /* CONFIG_SHRINKER_DEBUG */ +static inline int shrinker_debugfs_add(struct shrinker *shrinker) +{ + return 0; +} +static inline void shrinker_debugfs_remove(struct shrinker *shrinker) +{ +} +#endif /* CONFIG_SHRINKER_DEBUG */ +#endif /* _LINUX_SHRINKER_H */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2e24db4bff19..0b483a8da409 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -699,6 +699,15 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT help Debug objects boot parameter default value +config SHRINKER_DEBUG + default y + bool "Enable shrinker debugging support" + depends on DEBUG_FS + help + Say Y to enable the shrinker debugfs interface which provides + visibility into the kernel memory shrinkers subsystem. + Disable it to avoid an extra memory footprint. 
+ config HAVE_DEBUG_KMEMLEAK bool diff --git a/mm/Makefile b/mm/Makefile index 6f9ffa968a1a..9a564f836403 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o +obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c new file mode 100644 index 000000000000..1a70556bd46c --- /dev/null +++ b/mm/shrinker_debug.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +/* defined in vmscan.c */ +extern struct rw_semaphore shrinker_rwsem; +extern struct list_head shrinker_list; + +static DEFINE_IDA(shrinker_debugfs_ida); +static struct dentry *shrinker_debugfs_root; + +static unsigned long shrinker_count_objects(struct shrinker *shrinker, + struct mem_cgroup *memcg, + unsigned long *count_per_node) +{ + unsigned long nr, total = 0; + int nid; + + for_each_node(nid) { + if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) { + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, + .nid = nid, + .memcg = memcg, + }; + + nr = shrinker->count_objects(shrinker, &sc); + if (nr == SHRINK_EMPTY) + nr = 0; + } else { + nr = 0; + } + + count_per_node[nid] = nr; + total += nr; + } + + return total; +} + +static int shrinker_debugfs_count_show(struct seq_file *m, void *v) +{ + struct shrinker *shrinker = m->private; + unsigned long *count_per_node; + struct mem_cgroup *memcg; + unsigned long total; + bool memcg_aware; + int ret, nid; + + count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); + if (!count_per_node) + return -ENOMEM; + + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + kfree(count_per_node); + return ret; + } + rcu_read_lock(); + + memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + 
do { + if (memcg && !mem_cgroup_online(memcg)) + continue; + + total = shrinker_count_objects(shrinker, + memcg_aware ? memcg : NULL, + count_per_node); + if (total) { + seq_printf(m, "%lu", mem_cgroup_ino(memcg)); + for_each_node(nid) + seq_printf(m, " %lu", count_per_node[nid]); + seq_putc(m, '\n'); + } + + if (!memcg_aware) { + mem_cgroup_iter_break(NULL, memcg); + break; + } + + if (signal_pending(current)) { + mem_cgroup_iter_break(NULL, memcg); + ret = -EINTR; + break; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + + rcu_read_unlock(); + up_read(&shrinker_rwsem); + + kfree(count_per_node); + return ret; +} +DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count); + +int shrinker_debugfs_add(struct shrinker *shrinker) +{ + struct dentry *entry; + char buf[16]; + int id; + + lockdep_assert_held(&shrinker_rwsem); + + /* debugfs isn't initialized yet, add debugfs entries later. */ + if (!shrinker_debugfs_root) + return 0; + + id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL); + if (id < 0) + return id; + shrinker->debugfs_id = id; + + snprintf(buf, sizeof(buf), "%d", id); + + /* create debugfs entry */ + entry = debugfs_create_dir(buf, shrinker_debugfs_root); + if (IS_ERR(entry)) { + ida_free(&shrinker_debugfs_ida, id); + return PTR_ERR(entry); + } + shrinker->debugfs_entry = entry; + + debugfs_create_file("count", 0220, entry, shrinker, + &shrinker_debugfs_count_fops); + return 0; +} + +void shrinker_debugfs_remove(struct shrinker *shrinker) +{ + lockdep_assert_held(&shrinker_rwsem); + + if (!shrinker->debugfs_entry) + return; + + debugfs_remove_recursive(shrinker->debugfs_entry); + ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id); +} + +static int __init shrinker_debugfs_init(void) +{ + struct shrinker *shrinker; + struct dentry *dentry; + int ret = 0; + + dentry = debugfs_create_dir("shrinker", NULL); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + shrinker_debugfs_root = dentry; + + /* Create debugfs entries for shrinkers registered at 
boot */ + down_write(&shrinker_rwsem); + list_for_each_entry(shrinker, &shrinker_list, list) + if (!shrinker->debugfs_entry) { + ret = shrinker_debugfs_add(shrinker); + if (ret) + break; + } + up_write(&shrinker_rwsem); + + return ret; +} +late_initcall(shrinker_debugfs_init); diff --git a/mm/vmscan.c b/mm/vmscan.c index f7d9a683e3a7..35dedff79eb4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -190,8 +190,8 @@ static void set_task_reclaim_state(struct task_struct *task, task->reclaim_state = rs; } -static LIST_HEAD(shrinker_list); -static DECLARE_RWSEM(shrinker_rwsem); +LIST_HEAD(shrinker_list); +DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -650,6 +650,7 @@ void register_shrinker_prepared(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; + shrinker_debugfs_add(shrinker); up_write(&shrinker_rwsem); } @@ -677,6 +678,7 @@ void unregister_shrinker(struct shrinker *shrinker) shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); + shrinker_debugfs_remove(shrinker); up_write(&shrinker_rwsem); kfree(shrinker->nr_deferred); From e33c267ab70de4249d22d7eab1cc7d68a889bac2 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:24 -0700 Subject: [PATCH 051/282] mm: shrinkers: provide shrinkers with names Currently shrinkers are anonymous objects. For debugging purposes they can be identified by count/scan function names, but it's not always useful: e.g. for superblock's shrinkers it's nice to have at least an idea of to which superblock the shrinker belongs. This commit adds names to shrinkers. register_shrinker() and prealloc_shrinker() functions are extended to take a format and arguments to master a name. In some cases it's not possible to determine a good name at the time when a shrinker is allocated. For such cases shrinker_debugfs_rename() is provided. 
The expected format is: <subsystem>-<shrinker_type>[:<instance>]-<id> For some shrinkers an instance can be encoded as (MAJOR:MINOR) pair. After this change the shrinker debugfs directory looks like: $ cd /sys/kernel/debug/shrinker/ $ ls dquota-cache-16 sb-devpts-28 sb-proc-47 sb-tmpfs-42 mm-shadow-18 sb-devtmpfs-5 sb-proc-48 sb-tmpfs-43 mm-zspool:zram0-34 sb-hugetlbfs-17 sb-pstore-31 sb-tmpfs-44 rcu-kfree-0 sb-hugetlbfs-33 sb-rootfs-2 sb-tmpfs-49 sb-aio-20 sb-iomem-12 sb-securityfs-6 sb-tracefs-13 sb-anon_inodefs-15 sb-mqueue-21 sb-selinuxfs-22 sb-xfs:vda1-36 sb-bdev-3 sb-nsfs-4 sb-sockfs-8 sb-zsmalloc-19 sb-bpf-32 sb-pipefs-14 sb-sysfs-26 thp-deferred_split-10 sb-btrfs:vda2-24 sb-proc-25 sb-tmpfs-1 thp-zero-9 sb-cgroup2-30 sb-proc-39 sb-tmpfs-27 xfs-buf:vda1-37 sb-configfs-23 sb-proc-41 sb-tmpfs-29 xfs-inodegc:vda1-38 sb-dax-11 sb-proc-45 sb-tmpfs-35 sb-debugfs-7 sb-proc-46 sb-tmpfs-40 [roman.gushchin@linux.dev: fix build warnings] Link: https://lkml.kernel.org/r/Yr+ZTnLb9lJk6fJO@castle Reported-by: kernel test robot Link: https://lkml.kernel.org/r/20220601032227.4076670-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Cc: Kent Overstreet Cc: Muchun Song Signed-off-by: Andrew Morton --- arch/x86/kvm/mmu/mmu.c | 2 +- drivers/android/binder_alloc.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_shrinker.c | 3 +- drivers/gpu/drm/msm/msm_gem_shrinker.c | 2 +- .../gpu/drm/panfrost/panfrost_gem_shrinker.c | 2 +- drivers/gpu/drm/ttm/ttm_pool.c | 2 +- drivers/md/bcache/btree.c | 2 +- drivers/md/dm-bufio.c | 3 +- drivers/md/dm-zoned-metadata.c | 4 +- drivers/md/raid5.c | 2 +- drivers/misc/vmw_balloon.c | 2 +- drivers/virtio/virtio_balloon.c | 2 +- drivers/xen/xenbus/xenbus_probe_backend.c | 2 +- fs/btrfs/super.c | 2 + fs/erofs/utils.c | 2 +- fs/ext4/extents_status.c | 3 +- fs/f2fs/super.c | 2 +- fs/gfs2/glock.c | 2 +- fs/gfs2/main.c | 2 +- fs/jbd2/journal.c | 3 +- fs/mbcache.c | 2 +- fs/nfs/nfs42xattr.c | 7 ++- fs/nfs/super.c | 2 +- fs/nfsd/filecache.c 
| 2 +- fs/nfsd/nfscache.c | 3 +- fs/quota/dquot.c | 2 +- fs/super.c | 6 +- fs/ubifs/super.c | 2 +- fs/xfs/xfs_buf.c | 3 +- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_qm.c | 3 +- include/linux/shrinker.h | 14 ++++- kernel/rcu/tree.c | 2 +- mm/huge_memory.c | 4 +- mm/shrinker_debug.c | 47 ++++++++++++++- mm/vmscan.c | 58 ++++++++++++++++++- mm/workingset.c | 2 +- mm/zsmalloc.c | 3 +- net/sunrpc/auth.c | 2 +- 39 files changed, 167 insertions(+), 45 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 17252f39bd7c..797d3286ecc1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6317,7 +6317,7 @@ int kvm_mmu_vendor_module_init(void) if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto out; - ret = register_shrinker(&mmu_shrinker); + ret = register_shrinker(&mmu_shrinker, "x86-mmu"); if (ret) goto out; diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 5649a0371a1f..51b502217d00 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1084,7 +1084,7 @@ int binder_alloc_shrinker_init(void) int ret = list_lru_init(&binder_alloc_lru); if (ret == 0) { - ret = register_shrinker(&binder_shrinker); + ret = register_shrinker(&binder_shrinker, "android-binder"); if (ret) list_lru_destroy(&binder_alloc_lru); } diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c index 6a6ff98a8746..e43577e03067 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c @@ -426,7 +426,8 @@ void i915_gem_driver_register__shrinker(struct drm_i915_private *i915) i915->mm.shrinker.count_objects = i915_gem_shrinker_count; i915->mm.shrinker.seeks = DEFAULT_SEEKS; i915->mm.shrinker.batch = 4096; - drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker)); + drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker, + "drm-i915_gem")); i915->mm.oom_notifier.notifier_call = 
i915_gem_shrinker_oom; drm_WARN_ON(&i915->drm, register_oom_notifier(&i915->mm.oom_notifier)); diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c index 086dacf2f26a..26e84d2ea6ae 100644 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c @@ -221,7 +221,7 @@ void msm_gem_shrinker_init(struct drm_device *dev) priv->shrinker.count_objects = msm_gem_shrinker_count; priv->shrinker.scan_objects = msm_gem_shrinker_scan; priv->shrinker.seeks = DEFAULT_SEEKS; - WARN_ON(register_shrinker(&priv->shrinker)); + WARN_ON(register_shrinker(&priv->shrinker, "drm-msm_gem")); priv->vmap_notifier.notifier_call = msm_gem_shrinker_vmap; WARN_ON(register_vmap_purge_notifier(&priv->vmap_notifier)); diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c index 77e7cb6d1ae3..bf0170782f25 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c @@ -103,7 +103,7 @@ void panfrost_gem_shrinker_init(struct drm_device *dev) pfdev->shrinker.count_objects = panfrost_gem_shrinker_count; pfdev->shrinker.scan_objects = panfrost_gem_shrinker_scan; pfdev->shrinker.seeks = DEFAULT_SEEKS; - WARN_ON(register_shrinker(&pfdev->shrinker)); + WARN_ON(register_shrinker(&pfdev->shrinker, "drm-panfrost")); } /** diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 1bba0a0ed3f9..21b61631f73a 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -722,7 +722,7 @@ int ttm_pool_mgr_init(unsigned long num_pages) mm_shrinker.count_objects = ttm_pool_shrinker_count; mm_shrinker.scan_objects = ttm_pool_shrinker_scan; mm_shrinker.seeks = 1; - return register_shrinker(&mm_shrinker); + return register_shrinker(&mm_shrinker, "drm-ttm_pool"); } /** diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e136d6edc1ed..147c493a989a 100644 --- 
a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -812,7 +812,7 @@ int bch_btree_cache_alloc(struct cache_set *c) c->shrink.seeks = 4; c->shrink.batch = c->btree_pages * 2; - if (register_shrinker(&c->shrink)) + if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) pr_warn("bcache: %s: could not register shrinker\n", __func__); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 5ffa1dcf84cf..3ff571b20f14 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1806,7 +1806,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign c->shrinker.scan_objects = dm_bufio_shrink_scan; c->shrinker.seeks = 1; c->shrinker.batch = 0; - r = register_shrinker(&c->shrinker); + r = register_shrinker(&c->shrinker, "md-%s:(%u:%u)", slab_name, + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); if (r) goto bad; diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index d1ea66114d14..46648f6100fb 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -2944,7 +2944,9 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; /* Metadata cache shrinker */ - ret = register_shrinker(&zmd->mblk_shrinker); + ret = register_shrinker(&zmd->mblk_shrinker, "md-meta:(%u:%u)", + MAJOR(dev->bdev->bd_dev), + MINOR(dev->bdev->bd_dev)); if (ret) { dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); goto err; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5d09256d7f81..780ae66840b7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7414,7 +7414,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.count_objects = raid5_cache_count; conf->shrinker.batch = 128; conf->shrinker.flags = 0; - ret = register_shrinker(&conf->shrinker); + ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev)); if (ret) { pr_warn("md/raid:%s: couldn't register shrinker.\n", mdname(mddev)); diff --git 
a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 086ce77d9074..c2d2fa114e65 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1587,7 +1587,7 @@ static int vmballoon_register_shrinker(struct vmballoon *b) b->shrinker.count_objects = vmballoon_shrinker_count; b->shrinker.seeks = DEFAULT_SEEKS; - r = register_shrinker(&b->shrinker); + r = register_shrinker(&b->shrinker, "vmw-balloon"); if (r == 0) b->shrinker_registered = true; diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index b9737da6c4dd..cba57b1f382f 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -875,7 +875,7 @@ static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) vb->shrinker.count_objects = virtio_balloon_shrinker_count; vb->shrinker.seeks = DEFAULT_SEEKS; - return register_shrinker(&vb->shrinker); + return register_shrinker(&vb->shrinker, "virtio-balloon"); } static int virtballoon_probe(struct virtio_device *vdev) diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 5abded97e1a7..9c09f89d8278 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -305,7 +305,7 @@ static int __init xenbus_probe_backend_init(void) register_xenstore_notifier(&xenstore_notifier); - if (register_shrinker(&backend_memory_shrinker)) + if (register_shrinker(&backend_memory_shrinker, "xen-backend")) pr_warn("shrinker registration failed\n"); return 0; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6627dd7875ee..eee3e96d877f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1815,6 +1815,8 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + s->s_id); btrfs_sb(s)->bdev_holder = fs_type; if (!strstr(crc32c_impl(), "generic")) 
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags); diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index ec9a1d780dc1..46627cb69abe 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -282,7 +282,7 @@ static struct shrinker erofs_shrinker_info = { int __init erofs_init_shrinker(void) { - return register_shrinker(&erofs_shrinker_info); + return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); } void erofs_exit_shrinker(void) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9a3a8996aacf..23167efda95e 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1654,7 +1654,8 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - err = register_shrinker(&sbi->s_es_shrinker); + err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s", + sbi->s_sb->s_id); if (err) goto err4; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 37221e94e5ef..bce02306f7a0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4579,7 +4579,7 @@ static int __init init_f2fs_fs(void) err = f2fs_init_sysfs(); if (err) goto free_garbage_collection_cache; - err = register_shrinker(&f2fs_shrinker_info); + err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker"); if (err) goto free_sysfs; err = register_filesystem(&f2fs_fs_type); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c992d53013d3..dca842379cab 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2533,7 +2533,7 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - ret = register_shrinker(&glock_shrinker); + ret = register_shrinker(&glock_shrinker, "gfs2-glock"); if (ret) { destroy_workqueue(gfs2_delete_workqueue); destroy_workqueue(glock_workqueue); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 244187e3e70f..b66a3e1ec152 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -148,7 +148,7 @@ static int __init init_gfs2_fs(void) if 
(!gfs2_trans_cachep) goto fail_cachep8; - error = register_shrinker(&gfs2_qd_shrinker); + error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd"); if (error) goto fail_shrinker; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c0cbeeaec2d1..45e4655c8033 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1418,7 +1418,8 @@ static journal_t *journal_init_common(struct block_device *bdev, if (percpu_counter_init(&journal->j_checkpoint_jh_count, 0, GFP_KERNEL)) goto err_cleanup; - if (register_shrinker(&journal->j_shrinker)) { + if (register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev))) { percpu_counter_destroy(&journal->j_checkpoint_jh_count); goto err_cleanup; } diff --git a/fs/mbcache.c b/fs/mbcache.c index 97c54d3a2227..0b833da0a9a5 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -367,7 +367,7 @@ struct mb_cache *mb_cache_create(int bucket_bits) cache->c_shrink.count_objects = mb_cache_count; cache->c_shrink.scan_objects = mb_cache_scan; cache->c_shrink.seeks = DEFAULT_SEEKS; - if (register_shrinker(&cache->c_shrink)) { + if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) { kfree(cache->c_hash); kfree(cache); goto err_out; diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index e7b34f7e0614..a9bf09fdf2c3 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -1017,15 +1017,16 @@ int __init nfs4_xattr_cache_init(void) if (ret) goto out2; - ret = register_shrinker(&nfs4_xattr_cache_shrinker); + ret = register_shrinker(&nfs4_xattr_cache_shrinker, "nfs-xattr_cache"); if (ret) goto out1; - ret = register_shrinker(&nfs4_xattr_entry_shrinker); + ret = register_shrinker(&nfs4_xattr_entry_shrinker, "nfs-xattr_entry"); if (ret) goto out; - ret = register_shrinker(&nfs4_xattr_large_entry_shrinker); + ret = register_shrinker(&nfs4_xattr_large_entry_shrinker, + "nfs-xattr_large_entry"); if (!ret) return 0; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 6ab5eeb000dc..82944e14fcea 
100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -149,7 +149,7 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - ret = register_shrinker(&acl_shrinker); + ret = register_shrinker(&acl_shrinker, "nfs-acl"); if (ret < 0) goto error_3; #ifdef CONFIG_NFS_V4_2 diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9cb2d590c036..a605c0e39b09 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -670,7 +670,7 @@ nfsd_file_cache_init(void) goto out_err; } - ret = register_shrinker(&nfsd_file_shrinker); + ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache"); if (ret) { pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); goto out_lru; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 7da88bdc0d6c..9b31e1103e7b 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -176,7 +176,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; nn->nfsd_reply_cache_shrinker.seeks = 1; - status = register_shrinker(&nn->nfsd_reply_cache_shrinker); + status = register_shrinker(&nn->nfsd_reply_cache_shrinker, + "nfsd-reply:%s", nn->nfsd_name); if (status) goto out_stats_destroy; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 09d1307959d0..e0b659900e70 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2995,7 +2995,7 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - if (register_shrinker(&dqcache_shrinker)) + if (register_shrinker(&dqcache_shrinker, "dquota-cache")) panic("Cannot register dquot shrinker"); return 0; diff --git a/fs/super.c b/fs/super.c index 60f57c7bc0a6..4fca6657f442 100644 --- a/fs/super.c +++ b/fs/super.c @@ -265,7 +265,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, 
s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; - if (prealloc_shrinker(&s->s_shrink)) + if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name)) goto fail; if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) goto fail; @@ -1288,6 +1288,8 @@ int get_tree_bdev(struct fs_context *fc, } else { s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", + fc->fs_type->name, s->s_id); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, fc); if (error) { @@ -1363,6 +1365,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } else { s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", + fs_type->name, s->s_id); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0978d01b0ea4..d0c9a09988bc 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2430,7 +2430,7 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - err = register_shrinker(&ubifs_shrinker_info); + err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab"); if (err) goto out_slab; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bf4e60871068..4aa9c9cf5b6e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1986,7 +1986,8 @@ xfs_alloc_buftarg( btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; btp->bt_shrinker.seeks = DEFAULT_SEEKS; btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - if (register_shrinker(&btp->bt_shrinker)) + if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s", + mp->m_super->s_id)) goto error_pcpu; return btp; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 5269354b1b69..a1941c8b8630 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2201,5 +2201,5 @@ xfs_inodegc_register_shrinker( 
shrink->flags = SHRINKER_NONSLAB; shrink->batch = XFS_INODEGC_SHRINKER_BATCH; - return register_shrinker(shrink); + return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id); } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index abf08bbf34a9..c31d57453ceb 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -677,7 +677,8 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.seeks = DEFAULT_SEEKS; qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - error = register_shrinker(&qinf->qi_shrinker); + error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s", + mp->m_super->s_id); if (error) goto out_free_inos; diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 2ced8149c513..08e6054e061f 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -75,6 +75,7 @@ struct shrinker { #endif #ifdef CONFIG_SHRINKER_DEBUG int debugfs_id; + const char *name; struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ @@ -92,9 +93,11 @@ struct shrinker { */ #define SHRINKER_NONSLAB (1 << 3) -extern int prealloc_shrinker(struct shrinker *shrinker); +extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, + const char *fmt, ...); extern void register_shrinker_prepared(struct shrinker *shrinker); -extern int register_shrinker(struct shrinker *shrinker); +extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, + const char *fmt, ...); extern void unregister_shrinker(struct shrinker *shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); @@ -102,6 +105,8 @@ extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); extern void shrinker_debugfs_remove(struct shrinker *shrinker); +extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, + const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { 
@@ -110,5 +115,10 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker) static inline void shrinker_debugfs_remove(struct shrinker *shrinker) { } +static inline __printf(2, 3) +int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) +{ + return 0; +} #endif /* CONFIG_SHRINKER_DEBUG */ #endif /* _LINUX_SHRINKER_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c25ba442044a..4b3bf6ebb1eb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4884,7 +4884,7 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } - if (register_shrinker(&kfree_rcu_shrinker)) + if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) pr_err("Failed to register kfree_rcu() shrinker!\n"); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9b90a8d7dfa..60d742c33de3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -423,10 +423,10 @@ static int __init hugepage_init(void) if (err) goto err_slab; - err = register_shrinker(&huge_zero_page_shrinker); + err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); if (err) goto err_hzp_shrinker; - err = register_shrinker(&deferred_split_shrinker); + err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split"); if (err) goto err_split_shrinker; diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 1a70556bd46c..781ecbd3d608 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -102,7 +102,7 @@ DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count); int shrinker_debugfs_add(struct shrinker *shrinker) { struct dentry *entry; - char buf[16]; + char buf[128]; int id; lockdep_assert_held(&shrinker_rwsem); @@ -116,7 +116,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) return id; shrinker->debugfs_id = id; - snprintf(buf, sizeof(buf), "%d", id); + snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, id); /* create debugfs entry */ entry = debugfs_create_dir(buf, 
shrinker_debugfs_root); @@ -131,10 +131,53 @@ int shrinker_debugfs_add(struct shrinker *shrinker) return 0; } +int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) +{ + struct dentry *entry; + char buf[128]; + const char *new, *old; + va_list ap; + int ret = 0; + + va_start(ap, fmt); + new = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + + if (!new) + return -ENOMEM; + + down_write(&shrinker_rwsem); + + old = shrinker->name; + shrinker->name = new; + + if (shrinker->debugfs_entry) { + snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, + shrinker->debugfs_id); + + entry = debugfs_rename(shrinker_debugfs_root, + shrinker->debugfs_entry, + shrinker_debugfs_root, buf); + if (IS_ERR(entry)) + ret = PTR_ERR(entry); + else + shrinker->debugfs_entry = entry; + } + + up_write(&shrinker_rwsem); + + kfree_const(old); + + return ret; +} +EXPORT_SYMBOL(shrinker_debugfs_rename); + void shrinker_debugfs_remove(struct shrinker *shrinker) { lockdep_assert_held(&shrinker_rwsem); + kfree_const(shrinker->name); + if (!shrinker->debugfs_entry) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index 35dedff79eb4..97ac6c6c026d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -608,7 +608,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, /* * Add a shrinker callback to be called from the vm. */ -int prealloc_shrinker(struct shrinker *shrinker) +static int __prealloc_shrinker(struct shrinker *shrinker) { unsigned int size; int err; @@ -632,8 +632,36 @@ int prealloc_shrinker(struct shrinker *shrinker) return 0; } +#ifdef CONFIG_SHRINKER_DEBUG +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
+{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __prealloc_shrinker(shrinker); + if (err) + kfree_const(shrinker->name); + + return err; +} +#else +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + return __prealloc_shrinker(shrinker); +} +#endif + void free_prealloced_shrinker(struct shrinker *shrinker) { +#ifdef CONFIG_SHRINKER_DEBUG + kfree_const(shrinker->name); +#endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); @@ -654,15 +682,39 @@ void register_shrinker_prepared(struct shrinker *shrinker) up_write(&shrinker_rwsem); } -int register_shrinker(struct shrinker *shrinker) +static int __register_shrinker(struct shrinker *shrinker) { - int err = prealloc_shrinker(shrinker); + int err = __prealloc_shrinker(shrinker); if (err) return err; register_shrinker_prepared(shrinker); return 0; } + +#ifdef CONFIG_SHRINKER_DEBUG +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __register_shrinker(shrinker); + if (err) + kfree_const(shrinker->name); + return err; +} +#else +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
+{ + return __register_shrinker(shrinker); +} +#endif EXPORT_SYMBOL(register_shrinker); /* diff --git a/mm/workingset.c b/mm/workingset.c index 592569a8974c..a5e84862fc86 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -625,7 +625,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = prealloc_shrinker(&workingset_shadow_shrinker); + ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow"); if (ret) goto err; ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d5fc04385b8..f24b71568e83 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -2217,7 +2217,8 @@ static int zs_register_shrinker(struct zs_pool *pool) pool->shrinker.batch = 0; pool->shrinker.seeks = DEFAULT_SEEKS; - return register_shrinker(&pool->shrinker); + return register_shrinker(&pool->shrinker, "mm-zspool:%s", + pool->name); } /** diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 682fcd24bf43..04e7b55fe0d9 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -874,7 +874,7 @@ int __init rpcauth_init_module(void) err = rpc_init_authunix(); if (err < 0) goto out1; - err = register_shrinker(&rpc_cred_shrinker); + err = register_shrinker(&rpc_cred_shrinker, "sunrpc_cred"); if (err < 0) goto out2; return 0; From 7507f0991d59169b75a09aa21ffa8ffeda58116f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:25 -0700 Subject: [PATCH 052/282] mm: docs: document shrinker debugfs Add a document describing the shrinker debugfs interface. 
Link: https://lkml.kernel.org/r/20220601032227.4076670-5-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reviewed-by: Muchun Song Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Cc: Kent Overstreet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/index.rst | 1 + .../admin-guide/mm/shrinker_debugfs.rst | 104 ++++++++++++++++++ 2 files changed, 105 insertions(+) create mode 100644 Documentation/admin-guide/mm/shrinker_debugfs.rst diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst index c21b5823f126..1bd11118dfb1 100644 --- a/Documentation/admin-guide/mm/index.rst +++ b/Documentation/admin-guide/mm/index.rst @@ -36,6 +36,7 @@ the Linux memory management. numa_memory_policy numaperf pagemap + shrinker_debugfs soft-dirty swap_numa transhuge diff --git a/Documentation/admin-guide/mm/shrinker_debugfs.rst b/Documentation/admin-guide/mm/shrinker_debugfs.rst new file mode 100644 index 000000000000..1e0e5bdb8179 --- /dev/null +++ b/Documentation/admin-guide/mm/shrinker_debugfs.rst @@ -0,0 +1,104 @@ +.. _shrinker_debugfs: + +========================== +Shrinker Debugfs Interface +========================== + +Shrinker debugfs interface provides a visibility into the kernel memory +shrinkers subsystem and allows to get information about individual shrinkers. + +For each shrinker registered in the system a directory in **/shrinker/** +is created. The directory's name is composed from the shrinker's name and an +unique id: e.g. *kfree_rcu-0* or *sb-xfs:vda1-36*. + +Each shrinker directory contains the **count** file, which allows to trigger +the *count_objects()* callback for each memcg and numa node (if applicable). + +Usage: +------ + +1. 
*List registered shrinkers* + + :: + + $ cd /sys/kernel/debug/shrinker/ + $ ls + dquota-cache-16 sb-devpts-28 sb-proc-47 sb-tmpfs-42 + mm-shadow-18 sb-devtmpfs-5 sb-proc-48 sb-tmpfs-43 + mm-zspool:zram0-34 sb-hugetlbfs-17 sb-pstore-31 sb-tmpfs-44 + rcu-kfree-0 sb-hugetlbfs-33 sb-rootfs-2 sb-tmpfs-49 + sb-aio-20 sb-iomem-12 sb-securityfs-6 sb-tracefs-13 + sb-anon_inodefs-15 sb-mqueue-21 sb-selinuxfs-22 sb-xfs:vda1-36 + sb-bdev-3 sb-nsfs-4 sb-sockfs-8 sb-zsmalloc-19 + sb-bpf-32 sb-pipefs-14 sb-sysfs-26 thp-deferred_split-10 + sb-btrfs:vda2-24 sb-proc-25 sb-tmpfs-1 thp-zero-9 + sb-cgroup2-30 sb-proc-39 sb-tmpfs-27 xfs-buf:vda1-37 + sb-configfs-23 sb-proc-41 sb-tmpfs-29 xfs-inodegc:vda1-38 + sb-dax-11 sb-proc-45 sb-tmpfs-35 + sb-debugfs-7 sb-proc-46 sb-tmpfs-40 + +2. *Get information about a specific shrinker* + + :: + + $ cd sb-btrfs\:vda2-24/ + $ ls + count + +3. *Count objects* + + Each line in the output has the following format:: + + <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ... + <cgroup inode id> <nr of objects on node 0> <nr of objects on node 1> ... + ... + + If there are no objects on all numa nodes, a line is omitted. If there + are no objects at all, the output might be empty. + + If the shrinker is not memcg-aware or CONFIG_MEMCG is off, 0 is printed + as cgroup inode id. If the shrinker is not numa-aware, 0's are printed + for all nodes except the first one.
+ :: + + $ cat count + 1 224 2 + 21 98 0 + 55 818 10 + 2367 2 0 + 2401 30 0 + 225 13 0 + 599 35 0 + 939 124 0 + 1041 3 0 + 1075 1 0 + 1109 1 0 + 1279 60 0 + 1313 7 0 + 1347 39 0 + 1381 3 0 + 1449 14 0 + 1483 63 0 + 1517 53 0 + 1551 6 0 + 1585 1 0 + 1619 6 0 + 1653 40 0 + 1687 11 0 + 1721 8 0 + 1755 4 0 + 1789 52 0 + 1823 888 0 + 1857 1 0 + 1925 2 0 + 1959 32 0 + 2027 22 0 + 2061 9 0 + 2469 799 0 + 2537 861 0 + 2639 1 0 + 2707 70 0 + 2775 4 0 + 2877 84 0 + 293 1 0 + 735 8 0 From d261ea23533b5da113b0779f32d39c3803dddb02 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:26 -0700 Subject: [PATCH 053/282] tools: add memcg_shrinker.py Add a simple tool which prints a sorted list of shrinker lists in the following format: (number of objects, shrinker name, cgroup). Example: $ ./memcg_shrinker.py -n 10 2090 sb-sysfs-26 /sys/fs/cgroup/system.slice 1809 sb-sysfs-26 /sys/fs/cgroup/system.slice/systemd-udevd.service 1044 sb-btrfs:vda2-24 /sys/fs/cgroup/system.slice/system-dbus\x2d:1.3\... 861 sb-btrfs:vda2-24 /sys/fs/cgroup/system.slice/system-dbus\x2d:1.3\... 
804 sb-btrfs:vda2-24 /sys/fs/cgroup/system.slice 643 sb-btrfs:vda2-24 /sys/fs/cgroup/system.slice/firewalld.service 616 sb-cgroup2-30 /sys/fs/cgroup/init.scope 275 sb-sysfs-26 / 238 sb-proc-25 /sys/fs/cgroup/system.slice/systemd-journald.service 225 sb-proc-25 /sys/fs/cgroup/system.slice/abrtd.service Link: https://lkml.kernel.org/r/20220601032227.4076670-6-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Cc: Kent Overstreet Cc: Muchun Song Signed-off-by: Andrew Morton --- tools/cgroup/memcg_shrinker.py | 71 ++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 tools/cgroup/memcg_shrinker.py diff --git a/tools/cgroup/memcg_shrinker.py b/tools/cgroup/memcg_shrinker.py new file mode 100644 index 000000000000..706ab27666a4 --- /dev/null +++ b/tools/cgroup/memcg_shrinker.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# +# Copyright (C) 2022 Roman Gushchin +# Copyright (C) 2022 Meta + +import os +import argparse +import sys + + +def scan_cgroups(cgroup_root): + cgroups = {} + + for root, subdirs, _ in os.walk(cgroup_root): + for cgroup in subdirs: + path = os.path.join(root, cgroup) + ino = os.stat(path).st_ino + cgroups[ino] = path + + # (memcg ino, path) + return cgroups + + +def scan_shrinkers(shrinker_debugfs): + shrinkers = [] + + for root, subdirs, _ in os.walk(shrinker_debugfs): + for shrinker in subdirs: + count_path = os.path.join(root, shrinker, "count") + with open(count_path) as f: + for line in f.readlines(): + items = line.split(' ') + ino = int(items[0]) + # (count, shrinker, memcg ino) + shrinkers.append((int(items[1]), shrinker, ino)) + return shrinkers + + +def main(): + parser = argparse.ArgumentParser(description='Display biggest shrinkers') + parser.add_argument('-n', '--lines', type=int, help='Number of lines to print') + + args = parser.parse_args() + + cgroups = scan_cgroups("/sys/fs/cgroup/") + shrinkers = scan_shrinkers("/sys/kernel/debug/shrinker/") + 
shrinkers = sorted(shrinkers, reverse = True, key = lambda x: x[0]) + + n = 0 + for s in shrinkers: + count, name, ino = (s[0], s[1], s[2]) + if count == 0: + break + + if ino == 0 or ino == 1: + cg = "/" + else: + try: + cg = cgroups[ino] + except KeyError: + cg = "unknown (%d)" % ino + + print("%-8s %-20s %s" % (count, name, cg)) + + n += 1 + if args.lines and n >= args.lines: + break + + +if __name__ == '__main__': + main() From bbf535fd6f06b94b9d07ed6f09397a936d4a58d8 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 31 May 2022 20:22:27 -0700 Subject: [PATCH 054/282] mm: shrinkers: add scan interface for shrinker debugfs Add a scan interface which allows to trigger scanning of a particular shrinker and specify memcg and numa node. It's useful for testing, debugging and profiling of a specific scan_objects() callback. Unlike alternatives (creating a real memory pressure and dropping caches via /proc/sys/vm/drop_caches) this interface allows to interact with only one shrinker at once. Also, if a shrinker is misreporting the number of objects (as some do), it doesn't affect scanning. 
[roman.gushchin@linux.dev: improve typing, fix arg count checking] Link: https://lkml.kernel.org/r/YpgKttTowT22mKPQ@carbon [akpm@linux-foundation.org: fix arg count checking] Link: https://lkml.kernel.org/r/20220601032227.4076670-7-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Acked-by: Muchun Song Cc: Christophe JAILLET Cc: Dave Chinner Cc: Hillf Danton Cc: Kent Overstreet Signed-off-by: Andrew Morton --- .../admin-guide/mm/shrinker_debugfs.rst | 39 +++++++++- mm/shrinker_debug.c | 74 +++++++++++++++++++ 2 files changed, 109 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/shrinker_debugfs.rst b/Documentation/admin-guide/mm/shrinker_debugfs.rst index 1e0e5bdb8179..3887f0b294fe 100644 --- a/Documentation/admin-guide/mm/shrinker_debugfs.rst +++ b/Documentation/admin-guide/mm/shrinker_debugfs.rst @@ -5,14 +5,16 @@ Shrinker Debugfs Interface ========================== Shrinker debugfs interface provides a visibility into the kernel memory -shrinkers subsystem and allows to get information about individual shrinkers. +shrinkers subsystem and allows to get information about individual shrinkers +and interact with them. For each shrinker registered in the system a directory in **/shrinker/** is created. The directory's name is composed from the shrinker's name and an unique id: e.g. *kfree_rcu-0* or *sb-xfs:vda1-36*. -Each shrinker directory contains the **count** file, which allows to trigger -the *count_objects()* callback for each memcg and numa node (if applicable). +Each shrinker directory contains **count** and **scan** files, which allow to +trigger *count_objects()* and *scan_objects()* callbacks for each memcg and +numa node (if applicable). Usage: ------ @@ -43,7 +45,7 @@ Usage: $ cd sb-btrfs\:vda2-24/ $ ls - count + count scan 3. *Count objects* @@ -102,3 +104,32 @@ Usage: 2877 84 0 293 1 0 735 8 0 + +4. 
*Scan objects* + + The expected input format:: + + <cgroup inode id> <numa id> <number of objects to scan> + + For a non-memcg-aware shrinker or on a system with no memory + cgroups **0** should be passed as cgroup id. + :: + + $ cd /sys/kernel/debug/shrinker/ + $ cd sb-btrfs\:vda2-24/ + + $ cat count | head -n 5 + 1 212 0 + 21 97 0 + 55 802 5 + 2367 2 0 + 225 13 0 + + $ echo "55 0 200" > scan + + $ cat count | head -n 5 + 1 212 0 + 21 96 0 + 55 752 5 + 2367 2 0 + 225 13 0 diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 781ecbd3d608..e5b40c43221d 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -99,6 +99,78 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count); +static int shrinker_debugfs_scan_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return nonseekable_open(inode, file); +} + +static ssize_t shrinker_debugfs_scan_write(struct file *file, + const char __user *buf, + size_t size, loff_t *pos) +{ + struct shrinker *shrinker = file->private_data; + unsigned long nr_to_scan = 0, ino, read_len; + struct shrink_control sc = { + .gfp_mask = GFP_KERNEL, + }; + struct mem_cgroup *memcg = NULL; + int nid; + char kbuf[72]; + ssize_t ret; + + read_len = size < (sizeof(kbuf) - 1) ?
size : (sizeof(kbuf) - 1); + if (copy_from_user(kbuf, buf, read_len)) + return -EFAULT; + kbuf[read_len] = '\0'; + + if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3) + return -EINVAL; + + if (nid < 0 || nid >= nr_node_ids) + return -EINVAL; + + if (nr_to_scan == 0) + return size; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + memcg = mem_cgroup_get_from_ino(ino); + if (!memcg || IS_ERR(memcg)) + return -ENOENT; + + if (!mem_cgroup_online(memcg)) { + mem_cgroup_put(memcg); + return -ENOENT; + } + } else if (ino != 0) { + return -EINVAL; + } + + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + mem_cgroup_put(memcg); + return ret; + } + + sc.nid = nid; + sc.memcg = memcg; + sc.nr_to_scan = nr_to_scan; + sc.nr_scanned = nr_to_scan; + + shrinker->scan_objects(shrinker, &sc); + + up_read(&shrinker_rwsem); + mem_cgroup_put(memcg); + + return size; +} + +static const struct file_operations shrinker_debugfs_scan_fops = { + .owner = THIS_MODULE, + .open = shrinker_debugfs_scan_open, + .write = shrinker_debugfs_scan_write, +}; + int shrinker_debugfs_add(struct shrinker *shrinker) { struct dentry *entry; @@ -128,6 +200,8 @@ int shrinker_debugfs_add(struct shrinker *shrinker) debugfs_create_file("count", 0220, entry, shrinker, &shrinker_debugfs_count_fops); + debugfs_create_file("scan", 0220, entry, shrinker, + &shrinker_debugfs_scan_fops); return 0; } From 8eb510db2125ab471967819d1f8749162588bba9 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Jun 2022 11:34:45 +0200 Subject: [PATCH 055/282] mm/vmalloc: make link_va()/unlink_va() common to different rb_root Patch series "Reduce a vmalloc internal lock contention preparation work". This small series is preparation work to implement per-cpu vmalloc allocation in order to reduce a high internal lock contention. This series does not introduce any functional changes, it is only about preparation.
This patch (of 5): Currently link_va() and unlink_va(), in order to figure out a tree type, compare a passed root value with a global free_vmap_area_root variable to distinguish the augmented rb-tree from a regular one. It is hard coded since such functions can manipulate only with specific "free_vmap_area_root" tree that represents a global free vmap space. Make it common by introducing "_augment" versions of both internal functions, so it is possible to deal with different trees. There is no functional change as a result of this patch. Link: https://lkml.kernel.org/r/20220607093449.3100-1-urezki@gmail.com Link: https://lkml.kernel.org/r/20220607093449.3100-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 60 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 37939f96d2f4..2504c83814fe 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -910,8 +910,9 @@ get_va_next_sibling(struct rb_node *parent, struct rb_node **link) } static __always_inline void -link_va(struct vmap_area *va, struct rb_root *root, - struct rb_node *parent, struct rb_node **link, struct list_head *head) +__link_va(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, + struct list_head *head, bool augment) { /* * VA is still not in the list, but we can @@ -925,7 +926,7 @@ link_va(struct vmap_area *va, struct rb_root *root, /* Insert to the rb-tree */ rb_link_node(&va->rb_node, parent, link); - if (root == &free_vmap_area_root) { + if (augment) { /* * Some explanation here. Just perform simple insertion * to the tree.
We do not set va->subtree_max_size to @@ -949,12 +950,28 @@ link_va(struct vmap_area *va, struct rb_root *root, } static __always_inline void -unlink_va(struct vmap_area *va, struct rb_root *root) +link_va(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, + struct list_head *head) +{ + __link_va(va, root, parent, link, head, false); +} + +static __always_inline void +link_va_augment(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, + struct list_head *head) +{ + __link_va(va, root, parent, link, head, true); +} + +static __always_inline void +__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment) { if (WARN_ON(RB_EMPTY_NODE(&va->rb_node))) return; - if (root == &free_vmap_area_root) + if (augment) rb_erase_augmented(&va->rb_node, root, &free_vmap_area_rb_augment_cb); else @@ -964,6 +981,18 @@ unlink_va(struct vmap_area *va, struct rb_root *root) RB_CLEAR_NODE(&va->rb_node); } +static __always_inline void +unlink_va(struct vmap_area *va, struct rb_root *root) +{ + __unlink_va(va, root, false); +} + +static __always_inline void +unlink_va_augment(struct vmap_area *va, struct rb_root *root) +{ + __unlink_va(va, root, true); +} + #if DEBUG_AUGMENT_PROPAGATE_CHECK /* * Gets called when remove the node and rotate. @@ -1059,7 +1088,7 @@ insert_vmap_area_augment(struct vmap_area *va, link = find_va_links(va, root, NULL, &parent); if (link) { - link_va(va, root, parent, link, head); + link_va_augment(va, root, parent, link, head); augment_tree_propagate_from(va); } } @@ -1076,8 +1105,8 @@ insert_vmap_area_augment(struct vmap_area *va, * ongoing. 
*/ static __always_inline struct vmap_area * -merge_or_add_vmap_area(struct vmap_area *va, - struct rb_root *root, struct list_head *head) +__merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head, bool augment) { struct vmap_area *sibling; struct list_head *next; @@ -1139,7 +1168,7 @@ merge_or_add_vmap_area(struct vmap_area *va, * "normalized" because of rotation operations. */ if (merged) - unlink_va(va, root); + __unlink_va(va, root, augment); sibling->va_end = va->va_end; @@ -1154,16 +1183,23 @@ merge_or_add_vmap_area(struct vmap_area *va, insert: if (!merged) - link_va(va, root, parent, link, head); + __link_va(va, root, parent, link, head, augment); return va; } +static __always_inline struct vmap_area * +merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + return __merge_or_add_vmap_area(va, root, head, false); +} + static __always_inline struct vmap_area * merge_or_add_vmap_area_augment(struct vmap_area *va, struct rb_root *root, struct list_head *head) { - va = merge_or_add_vmap_area(va, root, head); + va = __merge_or_add_vmap_area(va, root, head, true); if (va) augment_tree_propagate_from(va); @@ -1347,7 +1383,7 @@ adjust_va_to_fit_type(struct vmap_area *va, * V NVA V * |---------------| */ - unlink_va(va, &free_vmap_area_root); + unlink_va_augment(va, &free_vmap_area_root); kmem_cache_free(vmap_area_cachep, va); } else if (type == LE_FIT_TYPE) { /* From f9863be49312aa1f566dca12603e33487965e6a4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Jun 2022 11:34:46 +0200 Subject: [PATCH 056/282] mm/vmalloc: extend __alloc_vmap_area() with extra arguments It implies that __alloc_vmap_area() allocates only from the global vmap space, therefore a list-head and rb-tree, which represent a free vmap space, are not passed as parameters to this function and are accessed directly from this function. 
Extend the __alloc_vmap_area() and other dependent functions to have a possibility to allocate from different trees making an interface common and not specific. There is no functional change as a result of this patch. Link: https://lkml.kernel.org/r/20220607093449.3100-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2504c83814fe..5dce7593c075 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1233,15 +1233,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size, * overhead. */ static __always_inline struct vmap_area * -find_vmap_lowest_match(unsigned long size, unsigned long align, - unsigned long vstart, bool adjust_search_size) +find_vmap_lowest_match(struct rb_root *root, unsigned long size, + unsigned long align, unsigned long vstart, bool adjust_search_size) { struct vmap_area *va; struct rb_node *node; unsigned long length; /* Start from the root. */ - node = free_vmap_area_root.rb_node; + node = root->rb_node; /* Adjust the search size for alignment overhead. */ length = adjust_search_size ? 
size + align - 1 : size; @@ -1369,8 +1369,9 @@ classify_va_fit_type(struct vmap_area *va, } static __always_inline int -adjust_va_to_fit_type(struct vmap_area *va, - unsigned long nva_start_addr, unsigned long size) +adjust_va_to_fit_type(struct rb_root *root, struct list_head *head, + struct vmap_area *va, unsigned long nva_start_addr, + unsigned long size) { struct vmap_area *lva = NULL; enum fit_type type = classify_va_fit_type(va, nva_start_addr, size); @@ -1383,7 +1384,7 @@ adjust_va_to_fit_type(struct vmap_area *va, * V NVA V * |---------------| */ - unlink_va_augment(va, &free_vmap_area_root); + unlink_va_augment(va, root); kmem_cache_free(vmap_area_cachep, va); } else if (type == LE_FIT_TYPE) { /* @@ -1461,8 +1462,7 @@ adjust_va_to_fit_type(struct vmap_area *va, augment_tree_propagate_from(va); if (lva) /* type == NE_FIT_TYPE */ - insert_vmap_area_augment(lva, &va->rb_node, - &free_vmap_area_root, &free_vmap_area_list); + insert_vmap_area_augment(lva, &va->rb_node, root, head); } return 0; @@ -1473,7 +1473,8 @@ adjust_va_to_fit_type(struct vmap_area *va, * Otherwise a vend is returned that indicates failure. */ static __always_inline unsigned long -__alloc_vmap_area(unsigned long size, unsigned long align, +__alloc_vmap_area(struct rb_root *root, struct list_head *head, + unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend) { bool adjust_search_size = true; @@ -1493,7 +1494,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align, if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size)) adjust_search_size = false; - va = find_vmap_lowest_match(size, align, vstart, adjust_search_size); + va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size); if (unlikely(!va)) return vend; @@ -1507,7 +1508,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align, return vend; /* Update the free vmap_area. 
*/ - ret = adjust_va_to_fit_type(va, nva_start_addr, size); + ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size); if (WARN_ON_ONCE(ret)) return vend; @@ -1598,7 +1599,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, retry: preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node); - addr = __alloc_vmap_area(size, align, vstart, vend); + addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, + size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); /* @@ -3874,7 +3876,9 @@ retry: /* It is a BUG(), but trigger recovery instead. */ goto recovery; - ret = adjust_va_to_fit_type(va, start, size); + ret = adjust_va_to_fit_type(&free_vmap_area_root, + &free_vmap_area_list, + va, start, size); if (WARN_ON_ONCE(unlikely(ret))) /* It is a BUG(), but trigger recovery instead. */ goto recovery; From 5d7a7c54d3d7ff2f54725881dc7e06a7f5c94dc2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Jun 2022 11:34:47 +0200 Subject: [PATCH 057/282] mm/vmalloc: initialize VA's list node after unlink A vmap_area can travel between different places. For example attached/detached to/from different rb-trees. In order to prevent fancy bugs, initialize a VA's list node after it is removed from the list, so it pairs with VA's rb_node which is also initialized. There is no functional change as a result of this patch. 
Link: https://lkml.kernel.org/r/20220607093449.3100-4-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5dce7593c075..58cfecb2ec26 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -977,7 +977,7 @@ __unlink_va(struct vmap_area *va, struct rb_root *root, bool augment) else rb_erase(&va->rb_node, root); - list_del(&va->list); + list_del_init(&va->list); RB_CLEAR_NODE(&va->rb_node); } From 899c6efe58dbe8cb9768057ffc206d03e5a89ce8 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Jun 2022 11:34:48 +0200 Subject: [PATCH 058/282] mm/vmalloc: extend __find_vmap_area() with one more argument __find_vmap_area() finds a "vmap_area" based on passed address. It scan the specific "vmap_area_root" rb-tree. Extend the function with one extra argument, so any tree can be specified where the search has to be done. There is no functional change as a result of this patch. 
Link: https://lkml.kernel.org/r/20220607093449.3100-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 58cfecb2ec26..dd6cdb201195 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -815,9 +815,9 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr) return va; } -static struct vmap_area *__find_vmap_area(unsigned long addr) +static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root) { - struct rb_node *n = vmap_area_root.rb_node; + struct rb_node *n = root->rb_node; addr = (unsigned long)kasan_reset_tag((void *)addr); @@ -1834,7 +1834,7 @@ struct vmap_area *find_vmap_area(unsigned long addr) struct vmap_area *va; spin_lock(&vmap_area_lock); - va = __find_vmap_area(addr); + va = __find_vmap_area(addr, &vmap_area_root); spin_unlock(&vmap_area_lock); return va; @@ -2577,7 +2577,7 @@ struct vm_struct *remove_vm_area(const void *addr) might_sleep(); spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr); + va = __find_vmap_area((unsigned long)addr, &vmap_area_root); if (va && va->vm) { struct vm_struct *vm = va->vm; From 5e21f2d577cf174ced5fe9bdff67dcb70190d9f8 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Jun 2022 11:34:49 +0200 Subject: [PATCH 059/282] lib/test_vmalloc: switch to prandom_u32() A get_random_bytes() function can cause a high contention if it is called across CPUs simultaneously. 
Because it shares one lock per all CPUs: class name con-bounces contentions waittime-min waittime-max waittime-total waittime-avg acq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg &crng->lock: 663145 665886 0.05 8.85 261966.66 0.39 7188152 13731279 0.04 11.89 2181582.30 0.16 ----------- &crng->lock 307835 [<00000000acba59cd>] _extract_crng+0x48/0x90 &crng->lock 358051 [<00000000f0075abc>] _crng_backtrack_protect+0x32/0x90 ----------- &crng->lock 234241 [<00000000f0075abc>] _crng_backtrack_protect+0x32/0x90 &crng->lock 431645 [<00000000acba59cd>] _extract_crng+0x48/0x90 Switch from the get_random_bytes() to prandom_u32() that does not have any internal contention when a random value is needed for the tests. The reason is to minimize CPU cycles introduced by the test-suite itself from the vmalloc performance metrics. Link: https://lkml.kernel.org/r/20220607093449.3100-6-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Christoph Hellwig Cc: Matthew Wilcox Cc: Nicholas Piggin Cc: Oleksiy Avramchenko Signed-off-by: Andrew Morton --- lib/test_vmalloc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c index cf41fd6df42a..4f2f2d1bac56 100644 --- a/lib/test_vmalloc.c +++ b/lib/test_vmalloc.c @@ -74,12 +74,13 @@ test_report_one_done(void) static int random_size_align_alloc_test(void) { - unsigned long size, align, rnd; + unsigned long size, align; + unsigned int rnd; void *ptr; int i; for (i = 0; i < test_loop_count; i++) { - get_random_bytes(&rnd, sizeof(rnd)); + rnd = prandom_u32(); /* * Maximum 1024 pages, if PAGE_SIZE is 4096. 
@@ -150,7 +151,7 @@ static int random_size_alloc_test(void) int i; for (i = 0; i < test_loop_count; i++) { - get_random_bytes(&n, sizeof(i)); + n = prandom_u32(); n = (n % 100) + 1; p = vmalloc(n * PAGE_SIZE); @@ -294,14 +295,14 @@ pcpu_alloc_test(void) for (i = 0; i < 35000; i++) { unsigned int r; - get_random_bytes(&r, sizeof(i)); + r = prandom_u32(); size = (r % (PAGE_SIZE / 4)) + 1; /* * Maximum PAGE_SIZE */ - get_random_bytes(&r, sizeof(i)); - align = 1 << ((i % 11) + 1); + r = prandom_u32(); + align = 1 << ((r % 11) + 1); pcpu[i] = __alloc_percpu(size, align); if (!pcpu[i]) @@ -396,7 +397,7 @@ static void shuffle_array(int *arr, int n) int i, j; for (i = n - 1; i > 0; i--) { - get_random_bytes(&rnd, sizeof(rnd)); + rnd = prandom_u32(); /* Cut the range. */ j = rnd % i; From c8945306976f497802b208cf8f2cad4543868bc6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 8 Jun 2022 22:40:30 +0800 Subject: [PATCH 060/282] mm/swapfile: fix possible data races of inuse_pages si->inuse_pages could still be accessed concurrently now. The plain reads outside si->lock critical section, i.e. swap_show and si_swapinfo, which results in data races. READ_ONCE and WRITE_ONCE is used to fix such data races. Note these data races should be ok because they're just used for showing swap info. 
[linmiaohe@huawei.com: use WRITE_ONCE to pair with READ_ONCE] Link: https://lkml.kernel.org/r/20220625093346.48894-2-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220608144031.829-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: "Huang, Ying" Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/swapfile.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index a2e66d855b19..6a7579951fa4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -695,7 +695,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, si->lowest_bit += nr_entries; if (end == si->highest_bit) WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries); - si->inuse_pages += nr_entries; + WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries); if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; @@ -732,7 +732,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, add_to_avail_list(si); } atomic_long_add(nr_entries, &nr_swap_pages); - si->inuse_pages -= nr_entries; + WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); if (si->flags & SWP_BLKDEV) swap_slot_free_notify = si->bdev->bd_disk->fops->swap_slot_free_notify; @@ -2640,7 +2640,7 @@ static int swap_show(struct seq_file *swap, void *v) } bytes = si->pages << (PAGE_SHIFT - 10); - inuse = si->inuse_pages << (PAGE_SHIFT - 10); + inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); @@ -3259,7 +3259,7 @@ void si_swapinfo(struct sysinfo *val) struct swap_info_struct *si = swap_info[type]; if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) - nr_to_be_unused += si->inuse_pages; + nr_to_be_unused += READ_ONCE(si->inuse_pages); } val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; From 
442701e7058bd8dcb00c7885de99a43f5d0a0d47 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 8 Jun 2022 22:40:31 +0800 Subject: [PATCH 061/282] mm/swap: remove swap_cache_info statistics swap_cache_info are not statistics that could be easily used to tune system performance because they are not easily accessible. Also they can't provide really useful info when OOM occurs. Removing these statistics can also help mitigate unneeded global swap_cache_info cacheline contention. Link: https://lkml.kernel.org/r/20220608144031.829-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Acked-by: "Huang, Ying" Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/swap_state.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 778d57d2d92d..f6568765aef7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -59,24 +59,11 @@ static bool enable_vma_readahead __read_mostly = true; #define GET_SWAP_RA_VAL(vma) \ (atomic_long_read(&(vma)->swap_readahead_info) ?
: 4) -#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++) -#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr)) - -static struct { - unsigned long add_total; - unsigned long del_total; - unsigned long find_success; - unsigned long find_total; -} swap_cache_info; - static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); - printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", - swap_cache_info.add_total, swap_cache_info.del_total, - swap_cache_info.find_success, swap_cache_info.find_total); printk("Free swap = %ldkB\n", get_nr_swap_pages() << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); @@ -133,7 +120,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); - ADD_CACHE_INFO(add_total, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -172,7 +158,6 @@ void __delete_from_swap_cache(struct page *page, address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); - ADD_CACHE_INFO(del_total, nr); } /** @@ -348,12 +333,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, page = find_get_page(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); - INC_CACHE_INFO(find_total); if (page) { bool vma_ra = swap_use_vma_readahead(); bool readahead; - INC_CACHE_INFO(find_success); /* * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. 
From c364f9af299f2a3d16e5bae94e8f155dc8a721a4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:53 +0000 Subject: [PATCH 062/282] mm/damon/dbgfs: add and use mappings between 'schemes' action inputs and 'damos_action' values Patch series "Extend DAMOS for Proactive LRU-lists Sorting". Introduction ============ In short, this patchset 1) extends DAMON-based Operation Schemes (DAMOS) for low overhead data access pattern based LRU-lists sorting, and 2) implements a static kernel module for easy use of conservatively-tuned version of that using the extended DAMOS capability. Background ---------- As page-granularity access checking overhead could be significant on huge systems, LRU lists are normally not proactively sorted but partially and reactively sorted for special events including specific user requests, system calls and memory pressure. As a result, LRU lists are sometimes not so perfectly prepared to be used as a trustworthy access pattern source for some situations including reclamation target pages selection under sudden memory pressure. DAMON-based Proactive LRU-lists Sorting --------------------------------------- Because DAMON can identify access patterns of best-effort accuracy while inducing only user-specified range of overhead, using DAMON for Proactive LRU-lists Sorting (PLRUS) could be helpful for this situation. The idea is quite simple. Find hot pages and cold pages using DAMON, and prioritize hot pages while deprioritizing cold pages on their LRU-lists. This patchset extends DAMON to support such schemes by introducing a couple of new DAMOS actions for prioritizing and deprioritizing memory regions of specific access patterns on their LRU-lists. In detail, this patchset simply uses 'mark_page_accessed()' and 'deactivate_page()' functions for prioritization and deprioritization of pages on their LRU lists, respectively. 
To make the scheme easy to use without complex tuning for common situations, this patchset further implements a static kernel module called 'DAMON_LRU_SORT' using the extended DAMOS functionality. It proactively sorts LRU-lists using DAMON with conservatively chosen default hotness/coldness thresholds and small CPU usage quota limit. That is, the module under its default parameters will do no harm in common situations but provide some level of benefit for systems having clear hot/cold access pattern under only memory pressure while consuming only limited small portion of CPU time. Related Works ------------- Proactive reclamation is well known to be helpful for reducing non-optimal reclamation target selection caused performance drops. However, proactive reclamation is not the best option for some cases, because it could incur additional I/O. For example, it could be prohibitive for systems using storage devices whose total number of writes is limited, or cloud block storage that charges for every I/O. Some proactive reclamation approaches[1,2] induce a level of memory pressure using memcg files or swappiness while monitoring PSI. As reclamation target selection is still relying on the original LRU-lists mechanism, using DAMON-based proactive reclamation before inducing the proactive reclamation could allow more memory saving with same level of performance overhead, or less performance overhead with same level of memory saving. [1] https://blogs.oracle.com/linux/post/anticipating-your-memory-needs [2] https://www.pdl.cmu.edu/ftp/NVM/tmo_asplos22.pdf Evaluation ========== In short, PLRUS achieves 10% memory PSI (some) reduction, 14% major page faults reduction, and 3.74% speedup under memory pressure. Setup ----- To show the effect of PLRUS, I run PARSEC3 and SPLASH-2X benchmarks under below variant systems and measure a few metrics including the runtime of each workload, number of system-wide major page faults, and system-wide memory PSI (some).
- orig: v5.18-rc4 based mm-unstable kernel + this patchset, but no DAMON scheme applied. - mprs: Same as 'orig' but artificial memory pressure is induced. - plrus: Same as 'mprs' but a radically tuned PLRUS scheme is applied to the entire physical address space of the system. For the artificial memory pressure, I set 'memory.limit_in_bytes' to 75% of the running workload's peak RSS, wait 1 second, remove the pressure by setting it to 200% of the peak RSS, wait 10 seconds, and repeat the procedure until the workload finishes[1]. I use zram based swap device. The tests are automated[2]. [1] https://github.com/awslabs/damon-tests/blob/next/perf/runners/back/0009_memcg_pressure.sh [2] https://github.com/awslabs/damon-tests/blob/next/perf/full_once_config.sh Radically Tuned PLRUS --------------------- To show the effect of PLRUS on the PARSEC3/SPLASH-2X workloads, which do not run for a long time, we use a radically tuned version of PLRUS. The version asks DAMON to do the proactive LRU-lists sorting as below. 1. Find any memory regions that showed some accesses (approximately >=20 accesses per 100 sampling) and prioritize pages of the regions on their LRU lists using up to 2% CPU time. Under the CPU time limit, prioritize regions that have higher access frequency and have kept that frequency longer first. 2. Find any memory regions that showed no access for at least 5 seconds and deprioritize pages of the regions on their LRU lists using up to 2% CPU time. Under the CPU time limit, deprioritize regions that have not been accessed for a longer time first. Results ------- I repeat the tests 25 times and calculate average of the measured numbers. The results are as below: metric orig mprs plrus plrus/mprs runtime_seconds 190.06 292.83 281.87 0.96 pgmajfaults 852.55 8769420.00 7525040.00 0.86 memory_psi_some_us 106911.00 6943420.00 6220920.00 0.90 The first row is for legend. The first cell shows the metric that the following cells of the row show.
Second, third, and fourth cells show the metrics under the configs shown at the first row of the cell, and the fifth cell shows the metric under 'plrus' divided by the metric under 'mprs'. Second row shows the averaged runtime of the workloads in seconds. Third row shows the number of system-wide major page faults while the test was ongoing. Fourth row shows the system-wide memory pressure stall for some processes in microseconds while the test was ongoing. In short, PLRUS achieves 10% memory PSI (some) reduction, 14% major page faults reduction, and 3.74% speedup under memory pressure. We also confirmed the CPU usage of kdamond was 2.61% of a single CPU, which is below 4% as expected. Sequence of Patches =================== The first and second patches clean up DAMON debugfs interface and DAMOS_PAGEOUT handling code of physical address space monitoring operations implementation for easier extension of the code. The third and fourth patches implement a new DAMOS action called 'lru_prio', which prioritizes pages under memory regions which have a user-specified access pattern, and document it, respectively. The fifth and sixth patches implement yet another new DAMOS action called 'lru_deprio', which deprioritizes pages under memory regions which have a user-specified access pattern, and document it, respectively. The seventh patch implements a static kernel module called 'damon_lru_sort', which utilizes the DAMON-based proactive LRU-lists sorting under conservatively chosen default parameter. Finally, the eighth patch documents 'damon_lru_sort'. This patch (of 8): DAMON debugfs interface assumes users will write 'damos_action' value directly to the 'schemes' file. This makes adding new 'damos_action' in the middle of its definition break the backward compatibility of DAMON debugfs interface, as values of some 'damos_action' could be changed.
To mitigate the situation, this commit adds mappings between the user inputs and 'damos_action' value and makes DAMON debugfs code uses those. Link: https://lkml.kernel.org/r/20220613192301.8817-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220613192301.8817-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/dbgfs.c | 64 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 14 deletions(-) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 5ae810927309..cb8a7e9926a4 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -97,6 +97,31 @@ out: return ret; } +/* + * Return corresponding dbgfs' scheme action value (int) for the given + * damos_action if the given damos_action value is valid and supported by + * dbgfs, negative error code otherwise. + */ +static int damos_action_to_dbgfs_scheme_action(enum damos_action action) +{ + switch (action) { + case DAMOS_WILLNEED: + return 0; + case DAMOS_COLD: + return 1; + case DAMOS_PAGEOUT: + return 2; + case DAMOS_HUGEPAGE: + return 3; + case DAMOS_NOHUGEPAGE: + return 4; + case DAMOS_STAT: + return 5; + default: + return -EINVAL; + } +} + static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) { struct damos *s; @@ -109,7 +134,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, - s->action, + damos_action_to_dbgfs_scheme_action(s->action), s->quota.ms, s->quota.sz, s->quota.reset_interval, s->quota.weight_sz, @@ -160,18 +185,27 @@ static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes) kfree(schemes); } -static bool damos_action_valid(int action) +/* + * Return corresponding damos_action for the given dbgfs input for a scheme + * action if the input is valid, negative error code otherwise. 
+ */ +static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action) { - switch (action) { - case DAMOS_WILLNEED: - case DAMOS_COLD: - case DAMOS_PAGEOUT: - case DAMOS_HUGEPAGE: - case DAMOS_NOHUGEPAGE: - case DAMOS_STAT: - return true; + switch (dbgfs_action) { + case 0: + return DAMOS_WILLNEED; + case 1: + return DAMOS_COLD; + case 2: + return DAMOS_PAGEOUT; + case 3: + return DAMOS_HUGEPAGE; + case 4: + return DAMOS_NOHUGEPAGE; + case 5: + return DAMOS_STAT; default: - return false; + return -EINVAL; } } @@ -189,7 +223,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, int pos = 0, parsed, ret; unsigned long min_sz, max_sz; unsigned int min_nr_a, max_nr_a, min_age, max_age; - unsigned int action; + unsigned int action_input; + enum damos_action action; schemes = kmalloc_array(max_nr_schemes, sizeof(scheme), GFP_KERNEL); @@ -204,7 +239,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", &min_sz, &max_sz, &min_nr_a, &max_nr_a, - &min_age, &max_age, &action, &quota.ms, + &min_age, &max_age, &action_input, &quota.ms, &quota.sz, &quota.reset_interval, &quota.weight_sz, &quota.weight_nr_accesses, &quota.weight_age, &wmarks.metric, @@ -212,7 +247,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, &wmarks.low, &parsed); if (ret != 18) break; - if (!damos_action_valid(action)) + action = dbgfs_scheme_action_to_damos_action(action_input); + if ((int)action < 0) goto fail; if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) From 0e93e8bfd0b8b953038785109d0bab72280823f6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:55 +0000 Subject: [PATCH 063/282] mm/damon/paddr: use a separate function for 'DAMOS_PAGEOUT' handling This commit moves code for 'DAMOS_PAGEOUT' handling of the physical address space monitoring operations set to a separate function so that its caller, 'damon_pa_apply_scheme()', can
be more easily extended for additional DAMOS actions later. Link: https://lkml.kernel.org/r/20220613192301.8817-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index b40ff5811bb2..7bcd48066b43 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -204,16 +204,11 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) return max_nr_accesses; } -static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, - struct damon_target *t, struct damon_region *r, - struct damos *scheme) +static unsigned long damon_pa_pageout(struct damon_region *r) { unsigned long addr, applied; LIST_HEAD(page_list); - if (scheme->action != DAMOS_PAGEOUT) - return 0; - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); @@ -238,6 +233,19 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return applied * PAGE_SIZE; } +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) +{ + switch (scheme->action) { + case DAMOS_PAGEOUT: + return damon_pa_pageout(r); + default: + break; + } + return 0; +} + static int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme) From 8cdcc532268df0893d9756f537cbce479f4c4831 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:56 +0000 Subject: [PATCH 064/282] mm/damon/schemes: add 'LRU_PRIO' DAMOS action This commit adds a new DAMOS action called 'LRU_PRIO' for the physical address space. The action prioritizes pages in the memory regions of the user-specified target access pattern on their LRU lists. 
This is hence supposed to be used for frequently accessed (hot) memory regions so that hot pages could be more likely protected under memory pressure. Internally, it simply calls 'mark_page_accessed()'. Link: https://lkml.kernel.org/r/20220613192301.8817-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/ops-common.c | 42 ++++++++++++++++++++++++++++++++++++++++++ mm/damon/ops-common.h | 2 ++ mm/damon/paddr.c | 20 ++++++++++++++++++++ mm/damon/sysfs.c | 1 + 5 files changed, 67 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index b9aae19fab3e..4c64e03e94d8 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -86,6 +86,7 @@ struct damon_target { * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. + * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat. 
* @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ @@ -95,6 +96,7 @@ enum damos_action { DAMOS_PAGEOUT, DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, + DAMOS_LRU_PRIO, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 10ef20b2003f..b1335de200e7 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -130,3 +130,45 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, /* Return coldness of the region */ return DAMOS_MAX_SCORE - hotness; } + +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s) +{ + unsigned int max_nr_accesses; + int freq_subscore; + unsigned int age_in_sec; + int age_in_log, age_subscore; + unsigned int freq_weight = s->quota.weight_nr_accesses; + unsigned int age_weight = s->quota.weight_age; + int hotness; + + max_nr_accesses = c->aggr_interval / c->sample_interval; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + + age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; + age_in_log++, age_in_sec >>= 1) + ; + + /* If frequency is 0, higher age means it's colder */ + if (freq_subscore == 0) + age_in_log *= -1; + + /* + * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG]. + * Scale it to be in [0, 100] and set it as age subscore. 
+ */ + age_in_log += DAMON_MAX_AGE_IN_LOG; + age_subscore = age_in_log * DAMON_MAX_SUBSCORE / + DAMON_MAX_AGE_IN_LOG / 2; + + hotness = (freq_weight * freq_subscore + age_weight * age_subscore); + if (freq_weight + age_weight) + hotness /= freq_weight + age_weight; + /* + * Transform it to fit in [0, DAMOS_MAX_SCORE] + */ + hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE; + + return hotness; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index e790cb5f8fe0..52329ff361cd 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -14,3 +14,5 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); +int damon_hot_score(struct damon_ctx *c, struct damon_region *r, + struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 7bcd48066b43..f145b1d51e13 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -233,6 +233,22 @@ static unsigned long damon_pa_pageout(struct damon_region *r) return applied * PAGE_SIZE; } +static unsigned long damon_pa_mark_accessed(struct damon_region *r) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + mark_page_accessed(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -240,6 +256,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_pa_pageout(r); + case DAMOS_LRU_PRIO: + return damon_pa_mark_accessed(r); default: break; } @@ -253,6 +271,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: return damon_pageout_score(context, r, scheme); + case 
DAMOS_LRU_PRIO: + return damon_hot_score(context, r, scheme); default: break; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index c35809c6087c..86c69f980927 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -762,6 +762,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { "pageout", "hugepage", "nohugepage", + "lru_prio", "stat", }; From 0bcba960b1fa30607f3a0b566c88cd4a8a44ebaf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:57 +0000 Subject: [PATCH 065/282] Docs/admin-guide/damon/sysfs: document 'LRU_PRIO' scheme action This commit documents the 'lru_prio' scheme action for DAMON sysfs interface. Link: https://lkml.kernel.org/r/20220613192301.8817-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 5540a3a40fc9..cb4a0fe8e7af 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -264,6 +264,7 @@ that can be written to and read from the file and their meaning are as below. - ``pageout``: Call ``madvise()`` for the region with ``MADV_PAGEOUT`` - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` + - ``lru_prio``: Prioritize the region on its LRU lists. - ``stat``: Do nothing but count the statistics schemes//access_pattern/ From 99cdc2cd180a7adc87badc9ca92f8af803d8bf3b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:58 +0000 Subject: [PATCH 066/282] mm/damon/schemes: add 'LRU_DEPRIO' action This commit adds a new DAMON-based operation scheme action called 'LRU_DEPRIO' for physical address space. The action deprioritizes pages in the memory area of the target access pattern on their LRU lists. 
This is hence supposed to be used for rarely accessed (cold) memory regions so that cold pages could be more likely reclaimed first under memory pressure. Internally, it simply calls 'deactivate_page()'. Using this with 'LRU_PRIO' action for hot pages, users can proactively sort LRU lists based on the access pattern. That is, it can make the LRU lists a somewhat more trustworthy source of access temperature. As a result, efficiency of LRU-lists based mechanisms including the reclamation target selection could be improved. Link: https://lkml.kernel.org/r/20220613192301.8817-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/paddr.c | 20 ++++++++++++++++++++ mm/damon/sysfs.c | 1 + 3 files changed, 23 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 4c64e03e94d8..7b1f4a488230 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -87,6 +87,7 @@ struct damon_target { * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. + * @DAMOS_LRU_DEPRIO: Deprioritize the region on its LRU lists. * @DAMOS_STAT: Do nothing but count the stat.
* @NR_DAMOS_ACTIONS: Total number of DAMOS actions */ @@ -97,6 +98,7 @@ enum damos_action { DAMOS_HUGEPAGE, DAMOS_NOHUGEPAGE, DAMOS_LRU_PRIO, + DAMOS_LRU_DEPRIO, DAMOS_STAT, /* Do nothing but only record the stat */ NR_DAMOS_ACTIONS, }; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index f145b1d51e13..dc131c6a5403 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -249,6 +249,22 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r) return applied * PAGE_SIZE; } +static unsigned long damon_pa_deactivate_pages(struct damon_region *r) +{ + unsigned long addr, applied = 0; + + for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + struct page *page = damon_get_page(PHYS_PFN(addr)); + + if (!page) + continue; + deactivate_page(page); + put_page(page); + applied++; + } + return applied * PAGE_SIZE; +} + static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) @@ -258,6 +274,8 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return damon_pa_pageout(r); case DAMOS_LRU_PRIO: return damon_pa_mark_accessed(r); + case DAMOS_LRU_DEPRIO: + return damon_pa_deactivate_pages(r); default: break; } @@ -273,6 +291,8 @@ static int damon_pa_scheme_score(struct damon_ctx *context, return damon_pageout_score(context, r, scheme); case DAMOS_LRU_PRIO: return damon_hot_score(context, r, scheme); + case DAMOS_LRU_DEPRIO: + return damon_pageout_score(context, r, scheme); default: break; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 86c69f980927..7488e27c87c3 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -763,6 +763,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { "hugepage", "nohugepage", "lru_prio", + "lru_deprio", "stat", }; From b57e39a743e4efc6945523f121691ac76f9161de Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:22:59 +0000 Subject: [PATCH 067/282] Docs/admin-guide/damon/sysfs: document 
'LRU_DEPRIO' scheme action This commit documents the 'LRU_DEPRIO' scheme action for DAMON sysfs interface. Link: https://lkml.kernel.org/r/20220613192301.8817-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index cb4a0fe8e7af..d52f572a9029 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -265,6 +265,7 @@ that can be written to and read from the file and their meaning are as below. - ``hugepage``: Call ``madvise()`` for the region with ``MADV_HUGEPAGE`` - ``nohugepage``: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - ``lru_prio``: Prioritize the region on its LRU lists. + - ``lru_deprio``: Deprioritize the region on its LRU lists. - ``stat``: Do nothing but count the statistics schemes//access_pattern/ From 40e983cca9274e177bd5b9379299b44d9536ac68 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:23:00 +0000 Subject: [PATCH 068/282] mm/damon: introduce DAMON-based LRU-lists Sorting Users can do data access-aware LRU-lists sorting using 'LRU_PRIO' and 'LRU_DEPRIO' DAMOS actions. However, finding best parameters including the hotness/coldness thresholds, CPU quota, and watermarks could be challenging for some users. To make the scheme easy to be used without complex tuning for common situations, this commit implements a static kernel module called 'DAMON_LRU_SORT' using the 'LRU_PRIO' and 'LRU_DEPRIO' DAMOS actions. It proactively sorts LRU-lists using DAMON with conservatively chosen default values of the parameters.
That is, the module under its default parameters will make no harm for common situations but provide some level of efficiency improvements for systems having clear hot/cold access pattern under a level of memory pressure while consuming only a limited small portion of CPU time. Link: https://lkml.kernel.org/r/20220613192301.8817-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 8 + mm/damon/Makefile | 1 + mm/damon/lru_sort.c | 546 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 555 insertions(+) create mode 100644 mm/damon/lru_sort.c diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 9b559c76d6dd..66265e3a9c65 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -92,4 +92,12 @@ config DAMON_RECLAIM reclamation under light memory pressure, while the traditional page scanning-based reclamation is used for heavy pressure. +config DAMON_LRU_SORT + bool "Build DAMON-based LRU-lists sorting (DAMON_LRU_SORT)" + depends on DAMON_PADDR + help + This builds the DAMON-based LRU-lists sorting subsystem. It tries to + protect frequently accessed (hot) pages while rarely accessed (cold) + pages reclaimed first under memory pressure. 
+ endmenu diff --git a/mm/damon/Makefile b/mm/damon/Makefile index dbf7190b4144..3e6b8ad73858 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o obj-$(CONFIG_DAMON_SYSFS) += sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c new file mode 100644 index 000000000000..c276736a071c --- /dev/null +++ b/mm/damon/lru_sort.c @@ -0,0 +1,546 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * DAMON-based LRU-lists Sorting + * + * Author: SeongJae Park + */ + +#define pr_fmt(fmt) "damon-lru-sort: " fmt + +#include +#include +#include +#include +#include + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "damon_lru_sort." + +/* + * Enable or disable DAMON_LRU_SORT. + * + * You can enable DAMON_LRU_SORT by setting the value of this parameter as + * ``Y``. Setting it as ``N`` disables DAMON_LRU_SORT. Note that + * DAMON_LRU_SORT could do no real monitoring and LRU-lists sorting due to the + * watermarks-based activation condition. Refer to below descriptions for the + * watermarks parameter for this. + */ +static bool enabled __read_mostly; + +/* + * Make DAMON_LRU_SORT read the input parameters again, except ``enabled``. + * + * Input parameters that are updated while DAMON_LRU_SORT is running are not + * applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT + * reads values of parameters except ``enabled`` again. Once the re-reading is + * done, this parameter is set as ``N``. If invalid parameters are found while + * the re-reading, DAMON_LRU_SORT will be disabled. + */ +static bool commit_inputs __read_mostly; +module_param(commit_inputs, bool, 0600); + +/* + * Access frequency threshold for hot memory regions identification in permil.
+ * + * If a memory region is accessed in frequency of this or higher, + * DAMON_LRU_SORT identifies the region as hot, and mark it as accessed on the + * LRU list, so that it could not be reclaimed under memory pressure. 50% by + * default. + */ +static unsigned long hot_thres_access_freq = 500; +module_param(hot_thres_access_freq, ulong, 0600); + +/* + * Time threshold for cold memory regions identification in microseconds. + * + * If a memory region is not accessed for this or longer time, DAMON_LRU_SORT + * identifies the region as cold, and mark it as unaccessed on the LRU list, so + * that it could be reclaimed first under memory pressure. 120 seconds by + * default. + */ +static unsigned long cold_min_age __read_mostly = 120000000; +module_param(cold_min_age, ulong, 0600); + +/* + * Limit of time for trying the LRU lists sorting in milliseconds. + * + * DAMON_LRU_SORT tries to use only up to this time within a time window + * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used + * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the + * limit is disabled. + * + * 10 ms by default. + */ +static unsigned long quota_ms __read_mostly = 10; +module_param(quota_ms, ulong, 0600); + +/* + * The time quota charge reset interval in milliseconds. + * + * The charge reset interval for the quota of time (quota_ms). That is, + * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms + * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. + * + * 1 second by default. + */ +static unsigned long quota_reset_interval_ms __read_mostly = 1000; +module_param(quota_reset_interval_ms, ulong, 0600); + +/* + * The watermarks check time interval in microseconds. + * + * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is + * enabled but inactive due to its watermarks rule. 5 seconds by default. 
+ */ +static unsigned long wmarks_interval __read_mostly = 5000000; +module_param(wmarks_interval, ulong, 0600); + +/* + * Free memory rate (per thousand) for the high watermark. + * + * If free memory of the system in bytes per thousand bytes is higher than + * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically + * checks the watermarks. 200 (20%) by default. + */ +static unsigned long wmarks_high __read_mostly = 200; +module_param(wmarks_high, ulong, 0600); + +/* + * Free memory rate (per thousand) for the middle watermark. + * + * If free memory of the system in bytes per thousand bytes is between this and + * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring + * and the LRU-lists sorting. 150 (15%) by default. + */ +static unsigned long wmarks_mid __read_mostly = 150; +module_param(wmarks_mid, ulong, 0600); + +/* + * Free memory rate (per thousand) for the low watermark. + * + * If free memory of the system in bytes per thousand bytes is lower than this, + * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks + * the watermarks. 50 (5%) by default. + */ +static unsigned long wmarks_low __read_mostly = 50; +module_param(wmarks_low, ulong, 0600); + +/* + * Sampling interval for the monitoring in microseconds. + * + * The sampling interval of DAMON for the hot/cold memory monitoring. Please + * refer to the DAMON documentation for more detail. 5 ms by default. + */ +static unsigned long sample_interval __read_mostly = 5000; +module_param(sample_interval, ulong, 0600); + +/* + * Aggregation interval for the monitoring in microseconds. + * + * The aggregation interval of DAMON for the hot/cold memory monitoring. + * Please refer to the DAMON documentation for more detail. 100 ms by default. + */ +static unsigned long aggr_interval __read_mostly = 100000; +module_param(aggr_interval, ulong, 0600); + +/* + * Minimum number of monitoring regions. 
+ * + * The minimal number of monitoring regions of DAMON for the hot/cold memory + * monitoring. This can be used to set lower-bound of the monitoring quality. + * But, setting this too high could result in increased monitoring overhead. + * Please refer to the DAMON documentation for more detail. 10 by default. + */ +static unsigned long min_nr_regions __read_mostly = 10; +module_param(min_nr_regions, ulong, 0600); + +/* + * Maximum number of monitoring regions. + * + * The maximum number of monitoring regions of DAMON for the hot/cold memory + * monitoring. This can be used to set upper-bound of the monitoring overhead. + * However, setting this too low could result in bad monitoring quality. + * Please refer to the DAMON documentation for more detail. 1000 by default. + */ +static unsigned long max_nr_regions __read_mostly = 1000; +module_param(max_nr_regions, ulong, 0600); + +/* + * Start of the target memory region in physical address. + * + * The start physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_start __read_mostly; +module_param(monitor_region_start, ulong, 0600); + +/* + * End of the target memory region in physical address. + * + * The end physical address of memory region that DAMON_LRU_SORT will do work + * against. By default, biggest System RAM is used as the region. + */ +static unsigned long monitor_region_end __read_mostly; +module_param(monitor_region_end, ulong, 0600); + +/* + * PID of the DAMON thread + * + * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. + * Else, -1. + */ +static int kdamond_pid __read_mostly = -1; +module_param(kdamond_pid, int, 0400); + +/* + * Number of hot memory regions that tried to be LRU-sorted. 
+ */ +static unsigned long nr_lru_sort_tried_hot_regions __read_mostly; +module_param(nr_lru_sort_tried_hot_regions, ulong, 0400); + +/* + * Total bytes of hot memory regions that tried to be LRU-sorted. + */ +static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly; +module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400); + +/* + * Number of hot memory regions that successfully be LRU-sorted. + */ +static unsigned long nr_lru_sorted_hot_regions __read_mostly; +module_param(nr_lru_sorted_hot_regions, ulong, 0400); + +/* + * Total bytes of hot memory regions that successfully be LRU-sorted. + */ +static unsigned long bytes_lru_sorted_hot_regions __read_mostly; +module_param(bytes_lru_sorted_hot_regions, ulong, 0400); + +/* + * Number of times that the time quota limit for hot regions have exceeded + */ +static unsigned long nr_hot_quota_exceeds __read_mostly; +module_param(nr_hot_quota_exceeds, ulong, 0400); + +/* + * Number of cold memory regions that tried to be LRU-sorted. + */ +static unsigned long nr_lru_sort_tried_cold_regions __read_mostly; +module_param(nr_lru_sort_tried_cold_regions, ulong, 0400); + +/* + * Total bytes of cold memory regions that tried to be LRU-sorted. + */ +static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly; +module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400); + +/* + * Number of cold memory regions that successfully be LRU-sorted. + */ +static unsigned long nr_lru_sorted_cold_regions __read_mostly; +module_param(nr_lru_sorted_cold_regions, ulong, 0400); + +/* + * Total bytes of cold memory regions that successfully be LRU-sorted. 
+ */ +static unsigned long bytes_lru_sorted_cold_regions __read_mostly; +module_param(bytes_lru_sorted_cold_regions, ulong, 0400); + +/* + * Number of times that the time quota limit for cold regions have exceeded + */ +static unsigned long nr_cold_quota_exceeds __read_mostly; +module_param(nr_cold_quota_exceeds, ulong, 0400); + +static struct damon_ctx *ctx; +static struct damon_target *target; + +struct damon_lru_sort_ram_walk_arg { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_lru_sort_ram_walk_arg *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +static bool get_monitoring_region(unsigned long *start, unsigned long *end) +{ + struct damon_lru_sort_ram_walk_arg arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + +/* Create a DAMON-based operation scheme for hot memory regions */ +static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) +{ + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = wmarks_interval, + .high = wmarks_high, + .mid = wmarks_mid, + .low = wmarks_low, + }; + struct damos_quota quota = { + /* + * Do not try LRU-lists sorting of hot pages for more than half + * of quota_ms milliseconds within quota_reset_interval_ms. + */ + .ms = quota_ms / 2, + .sz = 0, + .reset_interval = quota_reset_interval_ms, + /* Within the quota, mark hotter regions accessed first. 
*/ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, + }; + struct damos *scheme = damon_new_scheme( + /* Find regions having PAGE_SIZE or larger size */ + PAGE_SIZE, ULONG_MAX, + /* and accessed for more than the threshold */ + hot_thres, UINT_MAX, + /* no matter its age */ + 0, UINT_MAX, + /* prioritize those on LRU lists, as soon as found */ + DAMOS_LRU_PRIO, + /* under the quota. */ + &quota, + /* (De)activate this according to the watermarks. */ + &wmarks); + + return scheme; +} + +/* Create a DAMON-based operation scheme for cold memory regions */ +static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) +{ + struct damos_watermarks wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = wmarks_interval, + .high = wmarks_high, + .mid = wmarks_mid, + .low = wmarks_low, + }; + struct damos_quota quota = { + /* + * Do not try LRU-lists sorting of cold pages for more than + * half of quota_ms milliseconds within + * quota_reset_interval_ms. + */ + .ms = quota_ms / 2, + .sz = 0, + .reset_interval = quota_reset_interval_ms, + /* Within the quota, mark colder regions not accessed first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1, + }; + struct damos *scheme = damon_new_scheme( + /* Find regions having PAGE_SIZE or larger size */ + PAGE_SIZE, ULONG_MAX, + /* and not accessed at all */ + 0, 0, + /* for cold_thres or more micro-seconds, and */ + cold_thres, UINT_MAX, + /* mark those as not accessed, as soon as found */ + DAMOS_LRU_DEPRIO, + /* under the quota. */ + &quota, + /* (De)activate this according to the watermarks.
*/ + &wmarks); + + return scheme; +} + +static int damon_lru_sort_apply_parameters(void) +{ + struct damos *scheme, *next_scheme; + struct damon_addr_range addr_range; + unsigned int hot_thres, cold_thres; + int err = 0; + + err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, + min_nr_regions, max_nr_regions); + if (err) + return err; + + /* free previously set schemes */ + damon_for_each_scheme_safe(scheme, next_scheme, ctx) + damon_destroy_scheme(scheme); + + /* aggr_interval / sample_interval is the maximum nr_accesses */ + hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / + 1000; + scheme = damon_lru_sort_new_hot_scheme(hot_thres); + if (!scheme) + return -ENOMEM; + damon_add_scheme(ctx, scheme); + + cold_thres = cold_min_age / aggr_interval; + scheme = damon_lru_sort_new_cold_scheme(cold_thres); + if (!scheme) + return -ENOMEM; + damon_add_scheme(ctx, scheme); + + if (monitor_region_start > monitor_region_end) + return -EINVAL; + if (!monitor_region_start && !monitor_region_end && + !get_monitoring_region(&monitor_region_start, + &monitor_region_end)) + return -EINVAL; + addr_range.start = monitor_region_start; + addr_range.end = monitor_region_end; + return damon_set_regions(target, &addr_range, 1); +} + +static int damon_lru_sort_turn(bool on) +{ + int err; + + if (!on) { + err = damon_stop(&ctx, 1); + if (!err) + kdamond_pid = -1; + return err; + } + + err = damon_lru_sort_apply_parameters(); + if (err) + return err; + + err = damon_start(&ctx, 1, true); + if (err) + return err; + kdamond_pid = ctx->kdamond->pid; + return 0; +} + +static struct delayed_work damon_lru_sort_timer; +static void damon_lru_sort_timer_fn(struct work_struct *work) +{ + static bool last_enabled; + bool now_enabled; + + now_enabled = enabled; + if (last_enabled != now_enabled) { + if (!damon_lru_sort_turn(now_enabled)) + last_enabled = now_enabled; + else + enabled = last_enabled; + } +} +static DECLARE_DELAYED_WORK(damon_lru_sort_timer, 
damon_lru_sort_timer_fn); + +static bool damon_lru_sort_initialized; + +static int damon_lru_sort_enabled_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + + if (!damon_lru_sort_initialized) + return rc; + + schedule_delayed_work(&damon_lru_sort_timer, 0); + + return 0; +} + +static const struct kernel_param_ops enabled_param_ops = { + .set = damon_lru_sort_enabled_store, + .get = param_get_bool, +}; + +module_param_cb(enabled, &enabled_param_ops, &enabled, 0600); +MODULE_PARM_DESC(enabled, + "Enable or disable DAMON_LRU_SORT (default: disabled)"); + +static int damon_lru_sort_handle_commit_inputs(void) +{ + int err; + + if (!commit_inputs) + return 0; + + err = damon_lru_sort_apply_parameters(); + commit_inputs = false; + return err; +} + +static int damon_lru_sort_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + if (s->action == DAMOS_LRU_PRIO) { + nr_lru_sort_tried_hot_regions = s->stat.nr_tried; + bytes_lru_sort_tried_hot_regions = s->stat.sz_tried; + nr_lru_sorted_hot_regions = s->stat.nr_applied; + bytes_lru_sorted_hot_regions = s->stat.sz_applied; + nr_hot_quota_exceeds = s->stat.qt_exceeds; + } else if (s->action == DAMOS_LRU_DEPRIO) { + nr_lru_sort_tried_cold_regions = s->stat.nr_tried; + bytes_lru_sort_tried_cold_regions = s->stat.sz_tried; + nr_lru_sorted_cold_regions = s->stat.nr_applied; + bytes_lru_sorted_cold_regions = s->stat.sz_applied; + nr_cold_quota_exceeds = s->stat.qt_exceeds; + } + } + + return damon_lru_sort_handle_commit_inputs(); +} + +static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) +{ + return damon_lru_sort_handle_commit_inputs(); +} + +static int __init damon_lru_sort_init(void) +{ + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + return -EINVAL; + + ctx->callback.after_wmarks_check = 
damon_lru_sort_after_wmarks_check; + ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + schedule_delayed_work(&damon_lru_sort_timer, 0); + + damon_lru_sort_initialized = true; + return 0; +} + +module_init(damon_lru_sort_init); From 6acfcd0d75244178a4a101fe0da888fa3bff29fb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 13 Jun 2022 19:23:01 +0000 Subject: [PATCH 069/282] Docs/admin-guide/damon: add a document for DAMON_LRU_SORT This commit documents the usage of DAMON_LRU_SORT for admins. Link: https://lkml.kernel.org/r/20220613192301.8817-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/index.rst | 1 + .../admin-guide/mm/damon/lru_sort.rst | 294 ++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 Documentation/admin-guide/mm/damon/lru_sort.rst diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index c4681fa69b9c..05500042f777 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -14,3 +14,4 @@ optimize those. start usage reclaim + lru_sort diff --git a/Documentation/admin-guide/mm/damon/lru_sort.rst b/Documentation/admin-guide/mm/damon/lru_sort.rst new file mode 100644 index 000000000000..c09cace80651 --- /dev/null +++ b/Documentation/admin-guide/mm/damon/lru_sort.rst @@ -0,0 +1,294 @@ +.. SPDX-License-Identifier: GPL-2.0 + +============================= +DAMON-based LRU-lists Sorting +============================= + +DAMON-based LRU-lists Sorting (DAMON_LRU_SORT) is a static kernel module that is +aimed to be used for proactive and lightweight data access pattern based +(de)prioritization of pages on their LRU-lists for making LRU-lists a more +trustworthy data access pattern source.
+ +Where Proactive LRU-lists Sorting is Required? +============================================== + +As page-granularity access checking overhead could be significant on huge +systems, LRU lists are normally not proactively sorted but partially and +reactively sorted for special events including specific user requests, system +calls and memory pressure. As a result, LRU lists are sometimes not so +perfectly prepared to be used as a trustworthy access pattern source for some +situations including reclamation target pages selection under sudden memory +pressure. + +Because DAMON can identify access patterns of best-effort accuracy while +inducing only user-specified range of overhead, proactively running +DAMON_LRU_SORT could be helpful for making LRU lists more trustworthy access +pattern source with low and controlled overhead. + +How It Works? +============= + +DAMON_LRU_SORT finds hot pages (pages of memory regions that showing access +rates that higher than a user-specified threshold) and cold pages (pages of +memory regions that showing no access for a time that longer than a +user-specified threshold) using DAMON, and prioritizes hot pages while +deprioritizing cold pages on their LRU-lists. To avoid it consuming too much +CPU for the prioritizations, a CPU time usage limit can be configured. Under +the limit, it prioritizes and deprioritizes more hot and cold pages first, +respectively. System administrators can also configure under what situation +this scheme should automatically activated and deactivated with three memory +pressure watermarks. + +Its default parameters for hotness/coldness thresholds and CPU quota limit are +conservatively chosen. That is, the module under its default parameters could +be widely used without harm for common situations while providing a level of +benefits for systems having clear hot/cold access patterns under memory +pressure while consuming only a limited small portion of CPU time. 
+ +Interface: Module Parameters +============================ + +To use this feature, you should first ensure your system is running on a kernel +that is built with ``CONFIG_DAMON_LRU_SORT=y``. + +To let sysadmins enable or disable it and tune for the given system, +DAMON_LRU_SORT utilizes module parameters. That is, you can put +``damon_lru_sort.<parameter>=<value>`` on the kernel boot command line or write +proper values to ``/sys/module/damon_lru_sort/parameters/<parameter>`` files. + +Below are the descriptions of each parameter. + +enabled +------- + +Enable or disable DAMON_LRU_SORT. + +You can enable DAMON_LRU_SORT by setting the value of this parameter as ``Y``. +Setting it as ``N`` disables DAMON_LRU_SORT. Note that DAMON_LRU_SORT could do +no real monitoring and LRU-lists sorting due to the watermarks-based activation +condition. Refer to below descriptions for the watermarks parameter for this. + +commit_inputs +------------- + +Make DAMON_LRU_SORT read the input parameters again, except ``enabled``. + +Input parameters that are updated while DAMON_LRU_SORT is running are not applied +by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT reads values +of parameters except ``enabled`` again. Once the re-reading is done, this +parameter is set as ``N``. If invalid parameters are found while the +re-reading, DAMON_LRU_SORT will be disabled. + +hot_thres_access_freq +--------------------- + +Access frequency threshold for hot memory regions identification in permil. + +If a memory region is accessed in frequency of this or higher, DAMON_LRU_SORT +identifies the region as hot, and mark it as accessed on the LRU list, so that +it could not be reclaimed under memory pressure. 50% by default. + +cold_min_age +------------ + +Time threshold for cold memory regions identification in microseconds.
+ +If a memory region is not accessed for this or longer time, DAMON_LRU_SORT +identifies the region as cold, and mark it as unaccessed on the LRU list, so +that it could be reclaimed first under memory pressure. 120 seconds by +default. + +quota_ms +-------- + +Limit of time for trying the LRU lists sorting in milliseconds. + +DAMON_LRU_SORT tries to use only up to this time within a time window +(quota_reset_interval_ms) for trying LRU lists sorting. This can be used +for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the +limit is disabled. + +10 ms by default. + +quota_reset_interval_ms +----------------------- + +The time quota charge reset interval in milliseconds. + +The charge reset interval for the quota of time (quota_ms). That is, +DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms +milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. + +1 second by default. + +wmarks_interval +--------------- + +The watermarks check time interval in microseconds. + +Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is +enabled but inactive due to its watermarks rule. 5 seconds by default. + +wmarks_high +----------- + +Free memory rate (per thousand) for the high watermark. + +If free memory of the system in bytes per thousand bytes is higher than this, +DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks the +watermarks. 200 (20%) by default. + +wmarks_mid +---------- + +Free memory rate (per thousand) for the middle watermark. + +If free memory of the system in bytes per thousand bytes is between this and +the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring and +the LRU-lists sorting. 150 (15%) by default. + +wmarks_low +---------- + +Free memory rate (per thousand) for the low watermark. 
+ +If free memory of the system in bytes per thousand bytes is lower than this, +DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks the +watermarks. 50 (5%) by default. + +sample_interval +--------------- + +Sampling interval for the monitoring in microseconds. + +The sampling interval of DAMON for the cold memory monitoring. Please refer to +the DAMON documentation (:doc:`usage`) for more detail. 5ms by default. + +aggr_interval +------------- + +Aggregation interval for the monitoring in microseconds. + +The aggregation interval of DAMON for the cold memory monitoring. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. 100ms by +default. + +min_nr_regions +-------------- + +Minimum number of monitoring regions. + +The minimal number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set lower-bound of the monitoring quality. +But, setting this too high could result in increased monitoring overhead. +Please refer to the DAMON documentation (:doc:`usage`) for more detail. 10 by +default. + +max_nr_regions +-------------- + +Maximum number of monitoring regions. + +The maximum number of monitoring regions of DAMON for the cold memory +monitoring. This can be used to set upper-bound of the monitoring overhead. +However, setting this too low could result in bad monitoring quality. Please +refer to the DAMON documentation (:doc:`usage`) for more detail. 1000 by +default. + +monitor_region_start +-------------------- + +Start of target memory region in physical address. + +The start physical address of memory region that DAMON_LRU_SORT will do work +against. By default, biggest System RAM is used as the region. + +monitor_region_end +------------------ + +End of target memory region in physical address. + +The end physical address of memory region that DAMON_LRU_SORT will do work +against. By default, biggest System RAM is used as the region.
+ +kdamond_pid +----------- + +PID of the DAMON thread. + +If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread. Else, +-1. + +nr_lru_sort_tried_hot_regions +----------------------------- + +Number of hot memory regions that were tried to be LRU-sorted. + +bytes_lru_sort_tried_hot_regions +-------------------------------- + +Total bytes of hot memory regions that were tried to be LRU-sorted. + +nr_lru_sorted_hot_regions +------------------------- + +Number of hot memory regions that were successfully LRU-sorted. + +bytes_lru_sorted_hot_regions +---------------------------- + +Total bytes of hot memory regions that were successfully LRU-sorted. + +nr_hot_quota_exceeds +-------------------- + +Number of times that the time quota limit for hot regions has been exceeded. + +nr_lru_sort_tried_cold_regions +------------------------------ + +Number of cold memory regions that were tried to be LRU-sorted. + +bytes_lru_sort_tried_cold_regions +--------------------------------- + +Total bytes of cold memory regions that were tried to be LRU-sorted. + +nr_lru_sorted_cold_regions +-------------------------- + +Number of cold memory regions that were successfully LRU-sorted. + +bytes_lru_sorted_cold_regions +----------------------------- + +Total bytes of cold memory regions that were successfully LRU-sorted. + +nr_cold_quota_exceeds +--------------------- + +Number of times that the time quota limit for cold regions has been exceeded. + +Example +======= + +Below runtime example commands make DAMON_LRU_SORT find memory regions +having >=50% access frequency and LRU-prioritize them while LRU-deprioritizing +memory regions that are not accessed for 120 seconds. The prioritization and +deprioritization are limited to be done using only up to 1% CPU time to avoid +DAMON_LRU_SORT consuming too much CPU time for the (de)prioritization. It also +asks DAMON_LRU_SORT to do nothing if the system's free memory rate is more than +50%, but start the real work if it becomes lower than 40%.
If DAMON_RECLAIM +doesn't make progress and therefore the free memory rate becomes lower than +20%, it asks DAMON_LRU_SORT to do nothing again, so that we can fall back to +the LRU-list based page granularity reclamation. :: + + # cd /sys/module/damon_lru_sort/parameters + # echo 500 > hot_thres_access_freq + # echo 120000000 > cold_min_age + # echo 10 > quota_ms + # echo 1000 > quota_reset_interval_ms + # echo 500 > wmarks_high + # echo 400 > wmarks_mid + # echo 200 > wmarks_low + # echo Y > enabled From 50b0f797cab6cb7dff418777e1acf82dfd3568f8 Mon Sep 17 00:00:00 2001 From: Edward Liaw Date: Mon, 13 Jun 2022 23:33:21 +0000 Subject: [PATCH 070/282] userfaultfd: selftests: infinite loop in faulting_process On Android this test is getting stuck in an infinite loop due to indeterminate behavior: The local variables steps and signalled were being reset to 1 and 0 respectively after every jump back to sigsetjmp by siglongjmp in the signal handler. The test was incrementing them and expecting them to retain their incremented values. The documentation for siglongjmp says: All accessible objects have values as of the time sigsetjmp() was called, except that the values of objects of automatic storage duration which are local to the function containing the invocation of the corresponding sigsetjmp() which do not have volatile-qualified type and which are changed between the sigsetjmp() invocation and siglongjmp() call are indeterminate. Tagging steps and signalled with volatile enabled the test to pass.
Link: https://lkml.kernel.org/r/20220613233321.431282-1-edliaw@google.com Signed-off-by: Edward Liaw Reviewed-by: Axel Rasmussen Cc: Shuah Khan Cc: Peter Xu Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/userfaultfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 4bc24581760d..7c3f1b0ab468 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -931,7 +931,7 @@ static int faulting_process(int signal_test) unsigned long split_nr_pages; unsigned long lastnr; struct sigaction act; - unsigned long signalled = 0; + volatile unsigned long signalled = 0; split_nr_pages = (nr_pages + 1) / 2; @@ -946,7 +946,7 @@ static int faulting_process(int signal_test) } for (nr = 0; nr < split_nr_pages; nr++) { - int steps = 1; + volatile int steps = 1; unsigned long offset = nr * page_size; if (signal_test) { From 64fe24a3e05e5f3ac56fcd45afd2fd1d9cc8fcb6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 14 Jun 2022 11:36:29 +0200 Subject: [PATCH 071/282] mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection Similar to our MM_CP_DIRTY_ACCT handling for shared, writable mappings, we can try mapping anonymous pages in a private writable mapping writable if they are exclusive, the PTE is already dirty, and no special handling applies. Mapping the anonymous page writable is essentially the same thing the write fault handler would do in this case. Special handling is required for uffd-wp and softdirty tracking, so take care of that properly. Also, leave PROT_NONE handling alone for now; in the future, we could similarly extend the logic in do_numa_page() or use pte_mk_savedwrite() here. While this improves mprotect(PROT_READ)+mprotect(PROT_READ|PROT_WRITE) performance, it should also be a valuable optimization for uffd-wp, when un-protecting. 
This has been previously suggested by Peter Collingbourne in [1], relevant in the context of the Scudo memory allocator, before we had PageAnonExclusive. This commit doesn't add the same handling for PMDs (i.e., anonymous THP, anonymous hugetlb); benchmark results from Andrea indicate that there are minor performance gains, so it might still be valuable to streamline that logic for all anonymous pages in the future. As we now also set MM_CP_DIRTY_ACCT for private mappings, let's rename it to MM_CP_TRY_CHANGE_WRITABLE, to make it clearer what's actually happening. Micro-benchmark courtesy of Andrea: === #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sys/mman.h> #define SIZE (1024*1024*1024) int main(int argc, char *argv[]) { char *p; if (posix_memalign((void **)&p, sysconf(_SC_PAGESIZE)*512, SIZE)) perror("posix_memalign"), exit(1); if (madvise(p, SIZE, argc > 1 ? MADV_HUGEPAGE : MADV_NOHUGEPAGE)) perror("madvise"); explicit_bzero(p, SIZE); for (int loops = 0; loops < 40; loops++) { if (mprotect(p, SIZE, PROT_READ)) perror("mprotect"), exit(1); if (mprotect(p, SIZE, PROT_READ|PROT_WRITE)) perror("mprotect"), exit(1); explicit_bzero(p, SIZE); } } === Results on my Ryzen 9 3900X: Stock 10 runs (lower is better): AVG 6.398s, STDEV 0.043 Patched 10 runs (lower is better): AVG 3.780s, STDEV 0.026 === [1] https://lkml.kernel.org/r/20210429214801.2583336-1-pcc@google.com Link: https://lkml.kernel.org/r/20220614093629.76309-1-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Peter Collingbourne Acked-by: Peter Xu Cc: Nadav Amit Cc: Dave Hansen Cc: Andrea Arcangeli Cc: Yang Shi Cc: Hugh Dickins Cc: Mel Gorman Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 +++-- mm/mprotect.c | 77 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index cf3d0d673f6b..09ea26056e2f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1962,8 +1962,12 @@
extern unsigned long move_page_tables(struct vm_area_struct *vma, * for now all the callers are only use one of the flags at the same * time. */ -/* Whether we should allow dirty bit accounting */ -#define MM_CP_DIRTY_ACCT (1UL << 0) +/* + * Whether we should manually check if we can map individual PTEs writable, + * because something (e.g., COW, uffd-wp) blocks that from happening for all + * PTEs automatically in a writable mapping. + */ +#define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ diff --git a/mm/mprotect.c b/mm/mprotect.c index ba5592655ee3..996a97e213ad 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -38,6 +38,39 @@ #include "internal.h" +static inline bool can_change_pte_writable(struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + struct page *page; + + VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte)); + + if (pte_protnone(pte) || !pte_dirty(pte)) + return false; + + /* Do we need write faults for softdirty tracking? */ + if ((vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte)) + return false; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_pte_wp(vma, pte)) + return false; + + if (!(vma->vm_flags & VM_SHARED)) { + /* + * We can only special-case on exclusive anonymous pages, + * because we know that our write-fault handler similarly would + * map them writable without any additional checks while holding + * the PT lock. 
+ */ + page = vm_normal_page(vma, addr, pte); + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + } + + return true; +} + static unsigned long change_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -46,7 +79,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; unsigned long pages = 0; int target_node = NUMA_NO_NODE; - bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; @@ -137,21 +169,27 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, ptent = pte_wrprotect(ptent); ptent = pte_mkuffd_wp(ptent); } else if (uffd_wp_resolve) { - /* - * Leave the write bit to be handled - * by PF interrupt handler, then - * things like COW could be properly - * handled. - */ ptent = pte_clear_uffd_wp(ptent); } - /* Avoid taking write faults for known dirty pages */ - if (dirty_accountable && pte_dirty(ptent) && - (pte_soft_dirty(ptent) || - !(vma->vm_flags & VM_SOFTDIRTY))) { + /* + * In some writable, shared mappings, we might want + * to catch actual write access -- see + * vma_wants_writenotify(). + * + * In all writable, private mappings, we have to + * properly handle COW. + * + * In both cases, we can sometimes still change PTEs + * writable and avoid the write-fault handler, for + * example, if a PTE is already dirty and no other + * COW or special handling is required. 
+ */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && + !pte_write(ptent) && + can_change_pte_writable(vma, addr, ptent)) ptent = pte_mkwrite(ptent); - } + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); if (pte_needs_flush(oldpte, ptent)) tlb_flush_pte_range(tlb, addr, PAGE_SIZE); @@ -505,9 +543,9 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; unsigned long charged = 0; + bool try_change_writable; pgoff_t pgoff; int error; - int dirty_accountable = 0; if (newflags == oldflags) { *pprev = vma; @@ -583,11 +621,20 @@ success: * held in write mode. */ vma->vm_flags = newflags; - dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); + /* + * We want to check manually if we can change individual PTEs writable + * if we can't do that automatically for all PTEs in a mapping. For + * private mappings, that's always the case when we have write + * permissions as we properly have to handle COW. + */ + if (vma->vm_flags & VM_SHARED) + try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot); + else + try_change_writable = !!(vma->vm_flags & VM_WRITE); vma_set_page_prot(vma); change_protection(tlb, vma, start, end, vma->vm_page_prot, - dirty_accountable ? MM_CP_DIRTY_ACCT : 0); + try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major From b8cecb9376b9d3031cf62b476a0db087b6b01072 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 16:42:44 +0100 Subject: [PATCH 072/282] mm/vmscan: convert reclaim_clean_pages_from_list() to folios Patch series "Convert much of vmscan to folios". vmscan always operates on folios since it puts the pages on the LRU list. Switching all of these functions from pages to folios saves 1483 bytes of text from removing all the baggage around calling compound_head() and similar functions.
This patch (of 5): This is a straightforward conversion which removes several hidden calls to compound_head, saving 330 bytes of kernel text. Link: https://lkml.kernel.org/r/20220617154248.700416-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220617154248.700416-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++++++ mm/vmscan.c | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e66f7aa3191d..f32aade2a6e0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -670,6 +670,12 @@ static __always_inline bool PageAnon(struct page *page) return folio_test_anon(page_folio(page)); } +static __always_inline bool __folio_test_movable(const struct folio *folio) +{ + return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == + PAGE_MAPPING_MOVABLE; +} + static __always_inline int __PageMovable(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == diff --git a/mm/vmscan.c b/mm/vmscan.c index 97ac6c6c026d..2ecca45672e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2041,7 +2041,7 @@ keep: } unsigned int reclaim_clean_pages_from_list(struct zone *zone, - struct list_head *page_list) + struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -2049,16 +2049,16 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, }; struct reclaim_stat stat; unsigned int nr_reclaimed; - struct page *page, *next; - LIST_HEAD(clean_pages); + struct folio *folio, *next; + LIST_HEAD(clean_folios); unsigned int noreclaim_flag; - list_for_each_entry_safe(page, next, page_list, lru) { - if (!PageHuge(page) && page_is_file_lru(page) && - !PageDirty(page) && !__PageMovable(page) && - !PageUnevictable(page)) { - ClearPageActive(page); - list_move(&page->lru, &clean_pages); + 
list_for_each_entry_safe(folio, next, folio_list, lru) { + if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) && + !folio_test_dirty(folio) && !__folio_test_movable(folio) && + !folio_test_unevictable(folio)) { + folio_clear_active(folio); + list_move(&folio->lru, &clean_folios); } } @@ -2069,11 +2069,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); - nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, + nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true); memalloc_noreclaim_restore(noreclaim_flag); - list_splice(&clean_pages, page_list); + list_splice(&clean_folios, folio_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); /* From 166e3d32276f4c9ffd290f92b9df55b255f5fed7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 16:42:45 +0100 Subject: [PATCH 073/282] mm/vmscan: convert isolate_lru_pages() to use a folio Remove a few hidden calls to compound_head, saving 279 bytes of text. 
Link: https://lkml.kernel.org/r/20220617154248.700416-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 66 ++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2ecca45672e2..3ed0acc02e00 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -160,17 +160,17 @@ struct scan_control { }; #ifdef ARCH_HAS_PREFETCHW -#define prefetchw_prev_lru_page(_page, _base, _field) \ +#define prefetchw_prev_lru_folio(_folio, _base, _field) \ do { \ - if ((_page)->lru.prev != _base) { \ - struct page *prev; \ + if ((_folio)->lru.prev != _base) { \ + struct folio *prev; \ \ - prev = lru_to_page(&(_page->lru)); \ + prev = lru_to_folio(&(_folio->lru)); \ prefetchw(&prev->_field); \ } \ } while (0) #else -#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) +#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif /* @@ -2139,72 +2139,72 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; unsigned long skipped = 0; unsigned long scan, total_scan, nr_pages; - LIST_HEAD(pages_skipped); + LIST_HEAD(folios_skipped); total_scan = 0; scan = 0; while (scan < nr_to_scan && !list_empty(src)) { struct list_head *move_to = src; - struct page *page; + struct folio *folio; - page = lru_to_page(src); - prefetchw_prev_lru_page(page, src, flags); + folio = lru_to_folio(src); + prefetchw_prev_lru_folio(folio, src, flags); - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); total_scan += nr_pages; - if (page_zonenum(page) > sc->reclaim_idx) { - nr_skipped[page_zonenum(page)] += nr_pages; - move_to = &pages_skipped; + if (folio_zonenum(folio) > sc->reclaim_idx) { + nr_skipped[folio_zonenum(folio)] += nr_pages; + move_to = &folios_skipped; goto move; } /* - * Do not count skipped pages because that makes the 
function - * return with no isolated pages if the LRU mostly contains - * ineligible pages. This causes the VM to not reclaim any - * pages, triggering a premature OOM. - * Account all tail pages of THP. + * Do not count skipped folios because that makes the function + * return with no isolated folios if the LRU mostly contains + * ineligible folios. This causes the VM to not reclaim any + * folios, triggering a premature OOM. + * Account all pages in a folio. */ scan += nr_pages; - if (!PageLRU(page)) + if (!folio_test_lru(folio)) goto move; - if (!sc->may_unmap && page_mapped(page)) + if (!sc->may_unmap && folio_mapped(folio)) goto move; /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. + * Be careful not to clear the lru flag until after we're + * sure the folio is not being freed elsewhere -- the + * folio release code relies on it. */ - if (unlikely(!get_page_unless_zero(page))) + if (unlikely(!folio_try_get(folio))) goto move; - if (!TestClearPageLRU(page)) { - /* Another thread is already isolating this page */ - put_page(page); + if (!folio_test_clear_lru(folio)) { + /* Another thread is already isolating this folio */ + folio_put(folio); goto move; } nr_taken += nr_pages; - nr_zone_taken[page_zonenum(page)] += nr_pages; + nr_zone_taken[folio_zonenum(folio)] += nr_pages; move_to = dst; move: - list_move(&page->lru, move_to); + list_move(&folio->lru, move_to); } /* - * Splice any skipped pages to the start of the LRU list. Note that + * Splice any skipped folios to the start of the LRU list. Note that * this disrupts the LRU order when reclaiming for lower zones but * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX - * scanning would soon rescan the same pages to skip and waste lots + * scanning would soon rescan the same folios to skip and waste lots * of cpu cycles. 
*/ - if (!list_empty(&pages_skipped)) { + if (!list_empty(&folios_skipped)) { int zid; - list_splice(&pages_skipped, src); + list_splice(&folios_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; From ff00a170d950309f9daef836caa3d54671b883b8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 16:42:46 +0100 Subject: [PATCH 074/282] mm/vmscan: convert move_pages_to_lru() to use a folio Remove a few hidden calls to compound_head, saving 387 bytes of text on my test configuration. Link: https://lkml.kernel.org/r/20220617154248.700416-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 54 ++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ed0acc02e00..2ec5bcb13e74 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2308,8 +2308,8 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, } /* - * move_pages_to_lru() moves pages from private @list to appropriate LRU list. - * On return, @list is reused as a list of pages to be freed by the caller. + * move_pages_to_lru() moves folios from private @list to appropriate LRU list. + * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. 
*/ @@ -2317,42 +2317,42 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; - LIST_HEAD(pages_to_free); - struct page *page; + LIST_HEAD(folios_to_free); while (!list_empty(list)) { - page = lru_to_page(list); - VM_BUG_ON_PAGE(PageLRU(page), page); - list_del(&page->lru); - if (unlikely(!page_evictable(page))) { + struct folio *folio = lru_to_folio(list); + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + list_del(&folio->lru); + if (unlikely(!folio_evictable(folio))) { spin_unlock_irq(&lruvec->lru_lock); - putback_lru_page(page); + folio_putback_lru(folio); spin_lock_irq(&lruvec->lru_lock); continue; } /* - * The SetPageLRU needs to be kept here for list integrity. + * The folio_set_lru needs to be kept here for list integrity. * Otherwise: * #0 move_pages_to_lru #1 release_pages - * if !put_page_testzero - * if (put_page_testzero()) - * !PageLRU //skip lru_lock - * SetPageLRU() - * list_add(&page->lru,) - * list_add(&page->lru,) + * if (!folio_put_testzero()) + * if (folio_put_testzero()) + * !lru //skip lru_lock + * folio_set_lru() + * list_add(&folio->lru,) + * list_add(&folio->lru,) */ - SetPageLRU(page); + folio_set_lru(folio); - if (unlikely(put_page_testzero(page))) { - __clear_page_lru_flags(page); + if (unlikely(folio_put_testzero(folio))) { + __folio_clear_lru_flags(folio); - if (unlikely(PageCompound(page))) { + if (unlikely(folio_test_large(folio))) { spin_unlock_irq(&lruvec->lru_lock); - destroy_compound_page(page); + destroy_compound_page(&folio->page); spin_lock_irq(&lruvec->lru_lock); } else - list_add(&page->lru, &pages_to_free); + list_add(&folio->lru, &folios_to_free); continue; } @@ -2361,18 +2361,18 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, * All pages were isolated from the same lruvec (and isolation * inhibits memcg migration). 
*/ - VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page); - add_page_to_lru_list(page, lruvec); - nr_pages = thp_nr_pages(page); + VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio); + lruvec_add_folio(lruvec, folio); + nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; - if (PageActive(page)) + if (folio_test_active(folio)) workingset_age_nonresident(lruvec, nr_pages); } /* * To save our caller's stack, now use input list for pages to free. */ - list_splice(&pages_to_free, list); + list_splice(&folios_to_free, list); return nr_moved; } From 07f67a8dedc0788f3f91d945bc6e987cf9cccd4a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 16:42:47 +0100 Subject: [PATCH 075/282] mm/vmscan: convert shrink_active_list() to use a folio Remove a few hidden calls to compound_head, saving 411 bytes of text. Link: https://lkml.kernel.org/r/20220617154248.700416-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 61 +++++++++++++++++++++++++---------------------------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2ec5bcb13e74..8e22a1192ac8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -26,8 +26,7 @@ #include #include #include -#include /* for try_to_release_page(), - buffer_heads_over_limit */ +#include /* for buffer_heads_over_limit */ #include #include #include @@ -2483,21 +2482,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, } /* - * shrink_active_list() moves pages from the active LRU to the inactive LRU. + * shrink_active_list() moves folios from the active LRU to the inactive LRU. * - * We move them the other way if the page is referenced by one or more + * We move them the other way if the folio is referenced by one or more * processes. 
* - * If the pages are mostly unmapped, the processing is fast and it is + * If the folios are mostly unmapped, the processing is fast and it is * appropriate to hold lru_lock across the whole operation. But if - * the pages are mapped, the processing is slow (folio_referenced()), so - * we should drop lru_lock around each page. It's impossible to balance - * this, so instead we remove the pages from the LRU while processing them. - * It is safe to rely on PG_active against the non-LRU pages in here because - * nobody will play with that bit on a non-LRU page. + * the folios are mapped, the processing is slow (folio_referenced()), so + * we should drop lru_lock around each folio. It's impossible to balance + * this, so instead we remove the folios from the LRU while processing them. + * It is safe to rely on the active flag against the non-LRU folios in here + * because nobody will play with that bit on a non-LRU folio. * - * The downside is that we have to touch page->_refcount against each page. - * But we had to alter page->flags anyway. + * The downside is that we have to touch folio->_refcount against each folio. + * But we had to alter folio->flags anyway. 
*/ static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, @@ -2507,7 +2506,7 @@ static void shrink_active_list(unsigned long nr_to_scan, unsigned long nr_taken; unsigned long nr_scanned; unsigned long vm_flags; - LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); unsigned nr_deactivate, nr_activate; @@ -2532,23 +2531,21 @@ static void shrink_active_list(unsigned long nr_to_scan, while (!list_empty(&l_hold)) { struct folio *folio; - struct page *page; cond_resched(); folio = lru_to_folio(&l_hold); list_del(&folio->lru); - page = &folio->page; - if (unlikely(!page_evictable(page))) { - putback_lru_page(page); + if (unlikely(!folio_evictable(folio))) { + folio_putback_lru(folio); continue; } if (unlikely(buffer_heads_over_limit)) { - if (page_has_private(page) && trylock_page(page)) { - if (page_has_private(page)) - try_to_release_page(page, 0); - unlock_page(page); + if (folio_get_private(folio) && folio_trylock(folio)) { + if (folio_get_private(folio)) + filemap_release_folio(folio, 0); + folio_unlock(folio); } } @@ -2556,34 +2553,34 @@ static void shrink_active_list(unsigned long nr_to_scan, if (folio_referenced(folio, 0, sc->target_mem_cgroup, &vm_flags) != 0) { /* - * Identify referenced, file-backed active pages and + * Identify referenced, file-backed active folios and * give them one more trip around the active list. So * that executable code get better chances to stay in - * memory under moderate memory pressure. Anon pages + * memory under moderate memory pressure. Anon folios * are not likely to be evicted by use-once streaming - * IO, plus JVM can create lots of anon VM_EXEC pages, + * IO, plus JVM can create lots of anon VM_EXEC folios, * so we ignore them here. 
*/ - if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { - nr_rotated += thp_nr_pages(page); - list_add(&page->lru, &l_active); + if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) { + nr_rotated += folio_nr_pages(folio); + list_add(&folio->lru, &l_active); continue; } } - ClearPageActive(page); /* we are de-activating */ - SetPageWorkingset(page); - list_add(&page->lru, &l_inactive); + folio_clear_active(folio); /* we are de-activating */ + folio_set_workingset(folio); + list_add(&folio->lru, &l_inactive); } /* - * Move pages back to the lru list. + * Move folios back to the lru list. */ spin_lock_irq(&lruvec->lru_lock); nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); - /* Keep all free pages in l_active list */ + /* Keep all free folios in l_active list */ list_splice(&l_inactive, &l_active); __count_vm_events(PGDEACTIVATE, nr_deactivate); From a83f0551f49682c81444d682053d49f9dfcbe5fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 16:42:48 +0100 Subject: [PATCH 076/282] mm/vmscan: convert reclaim_pages() to use a folio Remove a few hidden calls to compound_head, saving 76 bytes of text. 
Link: https://lkml.kernel.org/r/20220617154248.700416-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/vmscan.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e22a1192ac8..e7d3db64a4e0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2619,34 +2619,33 @@ static unsigned int reclaim_page_list(struct list_head *page_list, return nr_reclaimed; } -unsigned long reclaim_pages(struct list_head *page_list) +unsigned long reclaim_pages(struct list_head *folio_list) { int nid; unsigned int nr_reclaimed = 0; - LIST_HEAD(node_page_list); - struct page *page; + LIST_HEAD(node_folio_list); unsigned int noreclaim_flag; - if (list_empty(page_list)) + if (list_empty(folio_list)) return nr_reclaimed; noreclaim_flag = memalloc_noreclaim_save(); - nid = page_to_nid(lru_to_page(page_list)); + nid = folio_nid(lru_to_folio(folio_list)); do { - page = lru_to_page(page_list); + struct folio *folio = lru_to_folio(folio_list); - if (nid == page_to_nid(page)) { - ClearPageActive(page); - list_move(&page->lru, &node_page_list); + if (nid == folio_nid(folio)) { + folio_clear_active(folio); + list_move(&folio->lru, &node_folio_list); continue; } - nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); - nid = page_to_nid(lru_to_page(page_list)); - } while (!list_empty(page_list)); + nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nid = folio_nid(lru_to_folio(folio_list)); + } while (!list_empty(folio_list)); - nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); From e3c4cebf3f9db8c9150eb1982da7e353d9938bed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:49:59 +0100 Subject: [PATCH 077/282] mm: add folios_put() Patch series "Convert the swap 
code to be more folio-based". There's still more to do with the swap code, but this reaps a lot of the folio benefit. More than 4kB of kernel text saved (with the UEK7 kernel config). I don't know how much that's going to translate into CPU savings, but some of those compound_head() calls are on every page free, so it should be noticeable. It might even be noticeable just from an I-cache consumption perspective. This patch (of 22): This is just a wrapper around release_pages() for now. Place the prototype in mm.h along with folio_put() and folio_put_refs(). Link: https://lkml.kernel.org/r/20220617175020.717127-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220617175020.717127-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 19 +++++++++++++++++++ include/linux/pagemap.h | 2 -- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 09ea26056e2f..09670ccb94e7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1220,6 +1220,25 @@ static inline void folio_put_refs(struct folio *folio, int refs) __put_page(&folio->page); } +void release_pages(struct page **pages, int nr); + +/** + * folios_put - Decrement the reference count on an array of folios. + * @folios: The folios. + * @nr: How many folios there are. + * + * Like folio_put(), but for an array of folios. This is more efficient + * than writing the loop yourself as it will optimise the locks which + * need to be taken if the folios are freed. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock.
+ */ +static inline void folios_put(struct folio **folios, unsigned int nr) +{ + release_pages((struct page **)folios, nr); +} + static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index ce96866fbec4..c399a9c5da7d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -345,8 +345,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -void release_pages(struct page **pages, int nr); - struct address_space *page_mapping(struct page *); struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); From c2bc16817aa0dcd5d4b452661840be976f5d5c65 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:00 +0100 Subject: [PATCH 078/282] mm/swap: add folio_batch_move_lru() Start converting the LRU from pagevecs to folio_batches. Combine the functionality of pagevec_add_and_need_flush() with pagevec_lru_move_fn() in the new folio_batch_add_and_move(). Convert the lru_rotate pagevec to a folio_batch. Adds 223 bytes total to kernel text, because we're duplicating infrastructure. This will be more than made up for in future patches. Link: https://lkml.kernel.org/r/20220617175020.717127-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 78 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 22 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 034bb24879a3..4265bee41bbd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -46,10 +46,10 @@ /* How many pages do we try to swap or page in/out together? 
*/ int page_cluster; -/* Protecting only lru_rotate.pvec which requires disabling interrupts */ +/* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { local_lock_t lock; - struct pagevec pvec; + struct folio_batch fbatch; }; static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { .lock = INIT_LOCAL_LOCK(lock), @@ -214,18 +214,6 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, pagevec_reinit(pvec); } -static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) -{ - struct folio *folio = page_folio(page); - - if (!folio_test_unevictable(folio)) { - lruvec_del_folio(lruvec, folio); - folio_clear_active(folio); - lruvec_add_folio_tail(lruvec, folio); - __count_vm_events(PGROTATED, folio_nr_pages(folio)); - } -} - /* return true if pagevec needs to drain */ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) { @@ -238,6 +226,52 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) return ret; } +typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); + +static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) +{ + int i; + struct lruvec *lruvec = NULL; + unsigned long flags = 0; + + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; + + /* block memcg migration while the folio moves between lru */ + if (!folio_test_clear_lru(folio)) + continue; + + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + move_fn(lruvec, folio); + + folio_set_lru(folio); + } + + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + folios_put(fbatch->folios, folio_batch_count(fbatch)); + folio_batch_init(fbatch); +} + +static void folio_batch_add_and_move(struct folio_batch *fbatch, + struct folio *folio, move_fn_t move_fn) +{ + if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) && + !lru_cache_disabled()) + return; + folio_batch_move_lru(fbatch, move_fn); +} + 
+static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio) +{ + if (!folio_test_unevictable(folio)) { + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + lruvec_add_folio_tail(lruvec, folio); + __count_vm_events(PGROTATED, folio_nr_pages(folio)); + } +} + /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it @@ -249,14 +283,13 @@ void folio_rotate_reclaimable(struct folio *folio) { if (!folio_test_locked(folio) && !folio_test_dirty(folio) && !folio_test_unevictable(folio) && folio_test_lru(folio)) { - struct pagevec *pvec; + struct folio_batch *fbatch; unsigned long flags; folio_get(folio); local_lock_irqsave(&lru_rotate.lock, flags); - pvec = this_cpu_ptr(&lru_rotate.pvec); - if (pagevec_add_and_need_flush(pvec, &folio->page)) - pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); + fbatch = this_cpu_ptr(&lru_rotate.fbatch); + folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } } @@ -595,19 +628,20 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) */ void lru_add_drain_cpu(int cpu) { + struct folio_batch *fbatch; struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_rotate.pvec, cpu); + fbatch = &per_cpu(lru_rotate.fbatch, cpu); /* Disabling interrupts below acts as a compiler barrier. 
*/ - if (data_race(pagevec_count(pvec))) { + if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&lru_rotate.lock, flags); - pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); + folio_batch_move_lru(fbatch, lru_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } @@ -824,7 +858,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus) struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || - data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) || + data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || From 7d80dd096f8f889128f67a2d452e4dadeed71e63 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:01 +0100 Subject: [PATCH 079/282] mm/swap: make __pagevec_lru_add static __pagevec_lru_add has no callers outside swap.c, so make it static, and move it to a more logical position in the file. 
Link: https://lkml.kernel.org/r/20220617175020.717127-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 1 - mm/swap.c | 126 ++++++++++++++++++++-------------------- 2 files changed, 63 insertions(+), 64 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 67b1246f136b..b0e3540f3a4c 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -26,7 +26,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void __pagevec_lru_add(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, pgoff_t *start, pgoff_t end); diff --git a/mm/swap.c b/mm/swap.c index 4265bee41bbd..cab77a5c64c7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -228,6 +228,69 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); +static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) +{ + int was_unevictable = folio_test_clear_unevictable(folio); + long nr_pages = folio_nr_pages(folio); + + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + + folio_set_lru(folio); + /* + * Is an smp_mb__after_atomic() still required here, before + * folio_evictable() tests PageMlocked, to rule out the possibility + * of stranding an evictable folio on an unevictable LRU? I think + * not, because __munlock_page() only clears PageMlocked while the LRU + * lock is held. + * + * (That is not true of __page_cache_release(), and not necessarily + * true of release_pages(): but those only clear PageMlocked after + * put_page_testzero() has excluded any other users of the page.) + */ + if (folio_evictable(folio)) { + if (was_unevictable) + __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); + } else { + folio_clear_active(folio); + folio_set_unevictable(folio); + /* + * folio->mlock_count = !!folio_test_mlocked(folio)? 
+ * But that leaves __mlock_page() in doubt whether another + * actor has already counted the mlock or not. Err on the + * safe side, underestimate, let page reclaim fix it, rather + * than leaving a page on the unevictable LRU indefinitely. + */ + folio->mlock_count = 0; + if (!was_unevictable) + __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); + } + + lruvec_add_folio(lruvec, folio); + trace_mm_lru_insertion(folio); +} + +/* + * Add the passed pages to the LRU, then drop the caller's refcount + * on them. Reinitialises the caller's pagevec. + */ +static void __pagevec_lru_add(struct pagevec *pvec) +{ + int i; + struct lruvec *lruvec = NULL; + unsigned long flags = 0; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct folio *folio = page_folio(pvec->pages[i]); + + lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); + __pagevec_lru_add_fn(folio, lruvec); + } + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); +} + static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; @@ -1036,69 +1099,6 @@ void __pagevec_release(struct pagevec *pvec) } EXPORT_SYMBOL(__pagevec_release); -static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) -{ - int was_unevictable = folio_test_clear_unevictable(folio); - long nr_pages = folio_nr_pages(folio); - - VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - - folio_set_lru(folio); - /* - * Is an smp_mb__after_atomic() still required here, before - * folio_evictable() tests PageMlocked, to rule out the possibility - * of stranding an evictable folio on an unevictable LRU? I think - * not, because __munlock_page() only clears PageMlocked while the LRU - * lock is held. - * - * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear PageMlocked after - * put_page_testzero() has excluded any other users of the page.) 
- */ - if (folio_evictable(folio)) { - if (was_unevictable) - __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); - } else { - folio_clear_active(folio); - folio_set_unevictable(folio); - /* - * folio->mlock_count = !!folio_test_mlocked(folio)? - * But that leaves __mlock_page() in doubt whether another - * actor has already counted the mlock or not. Err on the - * safe side, underestimate, let page reclaim fix it, rather - * than leaving a page on the unevictable LRU indefinitely. - */ - folio->mlock_count = 0; - if (!was_unevictable) - __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); - } - - lruvec_add_folio(lruvec, folio); - trace_mm_lru_insertion(folio); -} - -/* - * Add the passed pages to the LRU, then drop the caller's refcount - * on them. Reinitialises the caller's pagevec. - */ -void __pagevec_lru_add(struct pagevec *pvec) -{ - int i; - struct lruvec *lruvec = NULL; - unsigned long flags = 0; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct folio *folio = page_folio(pvec->pages[i]); - - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); - __pagevec_lru_add_fn(folio, lruvec); - } - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - release_pages(pvec->pages, pvec->nr); - pagevec_reinit(pvec); -} - /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune From 70dea5346ea327499f9a71e77bec2732e4d422ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:02 +0100 Subject: [PATCH 080/282] mm/swap: convert lru_add to a folio_batch When adding folios to the LRU for the first time, the LRU flag will already be clear, so skip the test-and-clear part of moving from one LRU to another. Removes 285 bytes from kernel text, mostly due to removing __pagevec_lru_add(). 
Link: https://lkml.kernel.org/r/20220617175020.717127-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 75 +++++++++++++++++++------------------------------------ 1 file changed, 26 insertions(+), 49 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index cab77a5c64c7..cb7669bea85b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -61,7 +61,7 @@ static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { */ struct lru_pvecs { local_lock_t lock; - struct pagevec lru_add; + struct folio_batch lru_add; struct pagevec lru_deactivate_file; struct pagevec lru_deactivate; struct pagevec lru_lazyfree; @@ -228,14 +228,13 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); -static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) +static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - folio_set_lru(folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests PageMlocked, to rule out the possibility @@ -269,28 +268,6 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec) trace_mm_lru_insertion(folio); } -/* - * Add the passed pages to the LRU, then drop the caller's refcount - * on them. Reinitialises the caller's pagevec. 
- */ -static void __pagevec_lru_add(struct pagevec *pvec) -{ - int i; - struct lruvec *lruvec = NULL; - unsigned long flags = 0; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct folio *folio = page_folio(pvec->pages[i]); - - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); - __pagevec_lru_add_fn(folio, lruvec); - } - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - release_pages(pvec->pages, pvec->nr); - pagevec_reinit(pvec); -} - static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; @@ -301,7 +278,7 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) struct folio *folio = fbatch->folios[i]; /* block memcg migration while the folio moves between lru */ - if (!folio_test_clear_lru(folio)) + if (move_fn != lru_add_fn && !folio_test_clear_lru(folio)) continue; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); @@ -473,26 +450,26 @@ static void folio_activate(struct folio *folio) static void __lru_cache_activate_folio(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; int i; local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); + fbatch = this_cpu_ptr(&lru_pvecs.lru_add); /* - * Search backwards on the optimistic assumption that the page being - * activated has just been added to this pagevec. Note that only - * the local pagevec is examined as a !PageLRU page could be in the + * Search backwards on the optimistic assumption that the folio being + * activated has just been added to this batch. Note that only + * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote - * pagevec that is currently being drained. Furthermore, marking - * a remote pagevec's page PageActive potentially hits a race where - * a page is marked PageActive just after it is added to the inactive + * batch that is currently being drained. 
Furthermore, marking + * a remote batch's folio active potentially hits a race where + * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. */ - for (i = pagevec_count(pvec) - 1; i >= 0; i--) { - struct page *pagevec_page = pvec->pages[i]; + for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { + struct folio *batch_folio = fbatch->folios[i]; - if (pagevec_page == &folio->page) { + if (batch_folio == folio) { folio_set_active(folio); break; } @@ -551,16 +528,16 @@ EXPORT_SYMBOL(folio_mark_accessed); */ void folio_add_lru(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; - VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); + VM_BUG_ON_FOLIO(folio_test_active(folio) && + folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); folio_get(folio); local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_add); - if (pagevec_add_and_need_flush(pvec, &folio->page)) - __pagevec_lru_add(pvec); + fbatch = this_cpu_ptr(&lru_pvecs.lru_add); + folio_batch_add_and_move(fbatch, folio, lru_add_fn); local_unlock(&lru_pvecs.lock); } EXPORT_SYMBOL(folio_add_lru); @@ -691,11 +668,11 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) */ void lru_add_drain_cpu(int cpu) { - struct folio_batch *fbatch; - struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu); + struct folio_batch *fbatch = &per_cpu(lru_pvecs.lru_add, cpu); + struct pagevec *pvec; - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_add_fn); fbatch = &per_cpu(lru_rotate.fbatch, cpu); /* Disabling interrupts below acts as a compiler barrier. 
*/ @@ -920,7 +897,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + if (folio_batch_count(&per_cpu(lru_pvecs.lru_add, cpu)) || data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || @@ -1084,8 +1061,8 @@ EXPORT_SYMBOL(release_pages); * OK from a correctness point of view but is inefficient - those pages may be * cache-warm and we want to give them back to the page allocator ASAP. * - * So __pagevec_release() will drain those queues here. __pagevec_lru_add() - * and __pagevec_lru_add_active() call release_pages() directly to avoid + * So __pagevec_release() will drain those queues here. + * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __pagevec_release(struct pagevec *pvec) From 7a3dbfe8a52b5d7a1639aa0bf7b3a3271d9e6e05 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:03 +0100 Subject: [PATCH 081/282] mm/swap: convert lru_deactivate_file to a folio_batch Use a folio throughout lru_deactivate_file_fn(), removing many hidden calls to compound_head(). Shrinks the kernel by 864 bytes of text. 
Link: https://lkml.kernel.org/r/20220617175020.717127-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 82 ++++++++++++++++++++++++++----------------------------- 1 file changed, 39 insertions(+), 43 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index cb7669bea85b..75c72d235479 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -62,7 +62,7 @@ static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { struct lru_pvecs { local_lock_t lock; struct folio_batch lru_add; - struct pagevec lru_deactivate_file; + struct folio_batch lru_deactivate_file; struct pagevec lru_deactivate; struct pagevec lru_lazyfree; #ifdef CONFIG_SMP @@ -562,56 +562,57 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, } /* - * If the page can not be invalidated, it is moved to the + * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * - * If the page isn't page_mapped and dirty/writeback, the page - * could reclaim asap using PG_reclaim. + * If the folio isn't mapped and dirty/writeback, the folio + * could be reclaimed asap using the reclaim flag. * - * 1. active, mapped page -> none - * 2. active, dirty/writeback page -> inactive, head, PG_reclaim - * 3. inactive, mapped page -> none - * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim + * 1. active, mapped folio -> none + * 2. active, dirty/writeback folio -> inactive, head, reclaim + * 3. inactive, mapped folio -> none + * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. 
Others -> none * - * In 4, why it moves inactive's head, the VM expects the page would - * be write it out by flusher threads as this is much more effective + * In 4, it moves to the head of the inactive list so the folio is + * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ -static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) +static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) { - bool active = PageActive(page); - int nr_pages = thp_nr_pages(page); + bool active = folio_test_active(folio); + long nr_pages = folio_nr_pages(folio); - if (PageUnevictable(page)) + if (folio_test_unevictable(folio)) return; - /* Some processes are using the page */ - if (page_mapped(page)) + /* Some processes are using the folio */ + if (folio_mapped(folio)) return; - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - ClearPageReferenced(page); + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); - if (PageWriteback(page) || PageDirty(page)) { + if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* - * PG_reclaim could be raced with end_page_writeback - * It can make readahead confusing. But race window - * is _really_ small and it's non-critical problem. + * Setting the reclaim flag could race with + * folio_end_writeback() and confuse readahead. But the + * race window is _really_ small and it's not a critical + * problem. */ - add_page_to_lru_list(page, lruvec); - SetPageReclaim(page); + lruvec_add_folio(lruvec, folio); + folio_set_reclaim(folio); } else { /* - * The page's writeback ends up during pagevec - * We move that page into tail of inactive. + * The folio's writeback ended while it was in the batch. + * We move that folio to the tail of the inactive list. 
*/ - add_page_to_lru_list_tail(page, lruvec); + lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } @@ -685,9 +686,9 @@ void lru_add_drain_cpu(int cpu) local_unlock_irqrestore(&lru_rotate.lock, flags); } - pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); - if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); + fbatch = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_deactivate_file_fn); pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); if (pagevec_count(pvec)) @@ -701,32 +702,27 @@ void lru_add_drain_cpu(int cpu) } /** - * deactivate_file_folio() - Forcefully deactivate a file folio. + * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * - * Context: Caller holds a reference on the page. + * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { - struct pagevec *pvec; + struct folio_batch *fbatch; - /* - * In a workload with many unevictable pages such as mprotect, - * unevictable folio deactivation for accelerating reclaim is pointless. 
- */ + /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; folio_get(folio); local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); - - if (pagevec_add_and_need_flush(pvec, &folio->page)) - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); + fbatch = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); + folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); local_unlock(&lru_pvecs.lock); } @@ -899,7 +895,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus) if (folio_batch_count(&per_cpu(lru_pvecs.lru_add, cpu)) || data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || - pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || + folio_batch_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu) || From 85cd7791a809156e562df6381a7c6d4ab12c7280 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:04 +0100 Subject: [PATCH 082/282] mm/swap: convert lru_deactivate to a folio_batch Using folios instead of pages shrinks deactivate_page() and lru_deactivate_fn() by 778 bytes between them. 
Link: https://lkml.kernel.org/r/20220617175020.717127-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 75c72d235479..f23a549c4966 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -63,7 +63,7 @@ struct lru_pvecs { local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; - struct pagevec lru_deactivate; + struct folio_batch lru_deactivate; struct pagevec lru_lazyfree; #ifdef CONFIG_SMP struct pagevec activate_page; @@ -623,15 +623,15 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio) } } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) +static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) { - if (PageActive(page) && !PageUnevictable(page)) { - int nr_pages = thp_nr_pages(page); + if (folio_test_active(folio) && !folio_test_unevictable(folio)) { + long nr_pages = folio_nr_pages(folio); - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec); + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); + lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, @@ -690,9 +690,9 @@ void lru_add_drain_cpu(int cpu) if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file_fn); - pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); - if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn); + fbatch = &per_cpu(lru_pvecs.lru_deactivate, cpu); + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_deactivate_fn); pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); if (pagevec_count(pvec)) @@ -736,14 +736,16 @@ void deactivate_file_folio(struct folio *folio) */ void 
deactivate_page(struct page *page) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec; + struct folio *folio = page_folio(page); + if (folio_test_lru(folio) && folio_test_active(folio) && + !folio_test_unevictable(folio)) { + struct folio_batch *fbatch; + + folio_get(folio); local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); - get_page(page); - if (pagevec_add_and_need_flush(pvec, page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn); + fbatch = this_cpu_ptr(&lru_pvecs.lru_deactivate); + folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); local_unlock(&lru_pvecs.lock); } } @@ -896,7 +898,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus) if (folio_batch_count(&per_cpu(lru_pvecs.lru_add, cpu)) || data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || folio_batch_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || - pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || + folio_batch_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu) || need_mlock_page_drain(cpu) || From cec394bafab5d921d21e273b0db94a4802d9a991 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:05 +0100 Subject: [PATCH 083/282] mm/swap: convert lru_lazyfree to a folio_batch Using folios instead of pages removes several calls to compound_head(), shrinking the kernel by 1089 bytes of text. 
Link: https://lkml.kernel.org/r/20220617175020.717127-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index f23a549c4966..4a2f866c8878 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -64,7 +64,7 @@ struct lru_pvecs { struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; - struct pagevec lru_lazyfree; + struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct pagevec activate_page; #endif @@ -639,22 +639,22 @@ static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio) } } -static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) +static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) { - if (PageAnon(page) && PageSwapBacked(page) && - !PageSwapCache(page) && !PageUnevictable(page)) { - int nr_pages = thp_nr_pages(page); + if (folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) { + long nr_pages = folio_nr_pages(folio); - del_page_from_lru_list(page, lruvec); - ClearPageActive(page); - ClearPageReferenced(page); + lruvec_del_folio(lruvec, folio); + folio_clear_active(folio); + folio_clear_referenced(folio); /* - * Lazyfree pages are clean anonymous pages. They have - * PG_swapbacked flag cleared, to distinguish them from normal - * anonymous pages + * Lazyfree folios are clean anonymous folios. 
They have + * the swapbacked flag cleared, to distinguish them from normal + * anonymous folios */ - ClearPageSwapBacked(page); - add_page_to_lru_list(page, lruvec); + folio_clear_swapbacked(folio); + lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, @@ -670,7 +670,6 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) void lru_add_drain_cpu(int cpu) { struct folio_batch *fbatch = &per_cpu(lru_pvecs.lru_add, cpu); - struct pagevec *pvec; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add_fn); @@ -694,9 +693,9 @@ void lru_add_drain_cpu(int cpu) if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_fn); - pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); - if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_lazyfree_fn); + fbatch = &per_cpu(lru_pvecs.lru_lazyfree, cpu); + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, lru_lazyfree_fn); activate_page_drain(cpu); } @@ -759,15 +758,17 @@ void deactivate_page(struct page *page) */ void mark_page_lazyfree(struct page *page) { - if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && - !PageSwapCache(page) && !PageUnevictable(page)) { - struct pagevec *pvec; + struct folio *folio = page_folio(page); + if (folio_test_lru(folio) && folio_test_anon(folio) && + folio_test_swapbacked(folio) && !folio_test_swapcache(folio) && + !folio_test_unevictable(folio)) { + struct folio_batch *fbatch; + + folio_get(folio); local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); - get_page(page); - if (pagevec_add_and_need_flush(pvec, page)) - pagevec_lru_move_fn(pvec, lru_lazyfree_fn); + fbatch = this_cpu_ptr(&lru_pvecs.lru_lazyfree); + folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); local_unlock(&lru_pvecs.lock); } } @@ -899,7 +900,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus) data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, 
cpu))) || folio_batch_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || folio_batch_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || - pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || + folio_batch_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || need_activate_page_drain(cpu) || need_mlock_page_drain(cpu) || has_bh_in_lru(cpu, NULL)) { From 3a44610b126399021095b4495941926a7dd756c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:06 +0100 Subject: [PATCH 084/282] mm/swap: convert activate_page to a folio_batch Rename it to just 'activate', saving 696 bytes of text from removals of compound_head() and the pagevec_lru_move_fn() infrastructure. Inline need_activate_page_drain() into its only caller. Link: https://lkml.kernel.org/r/20220617175020.717127-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 81 +++++++++++-------------------------------------------- 1 file changed, 16 insertions(+), 65 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 4a2f866c8878..3f402d351ad5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -66,7 +66,7 @@ struct lru_pvecs { struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP - struct pagevec activate_page; + struct folio_batch activate; #endif }; static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { @@ -188,44 +188,6 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, } EXPORT_SYMBOL_GPL(get_kernel_pages); -static void pagevec_lru_move_fn(struct pagevec *pvec, - void (*move_fn)(struct page *page, struct lruvec *lruvec)) -{ - int i; - struct lruvec *lruvec = NULL; - unsigned long flags = 0; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct folio *folio = page_folio(page); - - /* block memcg migration during page moving between lru */ - if (!TestClearPageLRU(page)) - continue; - - lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags); -
(*move_fn)(page, lruvec); - - SetPageLRU(page); - } - if (lruvec) - unlock_page_lruvec_irqrestore(lruvec, flags); - release_pages(pvec->pages, pvec->nr); - pagevec_reinit(pvec); -} - -/* return true if pagevec needs to drain */ -static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page) -{ - bool ret = false; - - if (!pagevec_add(pvec, page) || PageCompound(page) || - lru_cache_disabled()) - ret = true; - - return ret; -} - typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) @@ -380,7 +342,7 @@ void lru_note_cost_folio(struct folio *folio) folio_nr_pages(folio)); } -static void __folio_activate(struct folio *folio, struct lruvec *lruvec) +static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) { if (!folio_test_active(folio) && !folio_test_unevictable(folio)) { long nr_pages = folio_nr_pages(folio); @@ -397,41 +359,30 @@ static void __folio_activate(struct folio *folio, struct lruvec *lruvec) } #ifdef CONFIG_SMP -static void __activate_page(struct page *page, struct lruvec *lruvec) +static void folio_activate_drain(int cpu) { - return __folio_activate(page_folio(page), lruvec); -} + struct folio_batch *fbatch = &per_cpu(lru_pvecs.activate, cpu); -static void activate_page_drain(int cpu) -{ - struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); - - if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, __activate_page); -} - -static bool need_activate_page_drain(int cpu) -{ - return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0; + if (folio_batch_count(fbatch)) + folio_batch_move_lru(fbatch, folio_activate_fn); } static void folio_activate(struct folio *folio) { if (folio_test_lru(folio) && !folio_test_active(folio) && !folio_test_unevictable(folio)) { - struct pagevec *pvec; + struct folio_batch *fbatch; folio_get(folio); local_lock(&lru_pvecs.lock); - pvec = this_cpu_ptr(&lru_pvecs.activate_page); - if 
(pagevec_add_and_need_flush(pvec, &folio->page)) - pagevec_lru_move_fn(pvec, __activate_page); + fbatch = this_cpu_ptr(&lru_pvecs.activate); + folio_batch_add_and_move(fbatch, folio, folio_activate_fn); local_unlock(&lru_pvecs.lock); } } #else -static inline void activate_page_drain(int cpu) +static inline void folio_activate_drain(int cpu) { } @@ -441,7 +392,7 @@ static void folio_activate(struct folio *folio) if (folio_test_clear_lru(folio)) { lruvec = folio_lruvec_lock_irq(folio); - __folio_activate(folio, lruvec); + folio_activate_fn(lruvec, folio); unlock_page_lruvec_irq(lruvec); folio_set_lru(folio); } @@ -500,9 +451,9 @@ void folio_mark_accessed(struct folio *folio) */ } else if (!folio_test_active(folio)) { /* - * If the page is on the LRU, queue it for activation via - * lru_pvecs.activate_page. Otherwise, assume the page is on a - * pagevec, mark it active and it'll be moved to the active + * If the folio is on the LRU, queue it for activation via + * lru_pvecs.activate. Otherwise, assume the folio is in a + * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. 
*/ if (folio_test_lru(folio)) @@ -697,7 +648,7 @@ void lru_add_drain_cpu(int cpu) if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree_fn); - activate_page_drain(cpu); + folio_activate_drain(cpu); } /** @@ -901,7 +852,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus) folio_batch_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || folio_batch_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || folio_batch_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || - need_activate_page_drain(cpu) || + folio_batch_count(&per_cpu(lru_pvecs.activate, cpu)) || need_mlock_page_drain(cpu) || has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); From 82ac64d86fb079431e3af618a074e77be398299b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:07 +0100 Subject: [PATCH 085/282] mm/swap: rename lru_pvecs to cpu_fbatches No change to generated code, but this struct no longer contains any pagevecs, and not all the folio batches it contains are lru. Link: https://lkml.kernel.org/r/20220617175020.717127-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 90 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 3f402d351ad5..01e4e9c7d7a3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -56,10 +56,10 @@ static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = { }; /* - * The following struct pagevec are grouped together because they are protected + * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). 
*/ -struct lru_pvecs { +struct cpu_fbatches { local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; @@ -69,7 +69,7 @@ struct lru_pvecs { struct folio_batch activate; #endif }; -static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { +static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), }; @@ -361,7 +361,7 @@ static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { - struct folio_batch *fbatch = &per_cpu(lru_pvecs.activate, cpu); + struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, folio_activate_fn); @@ -374,10 +374,10 @@ static void folio_activate(struct folio *folio) struct folio_batch *fbatch; folio_get(folio); - local_lock(&lru_pvecs.lock); - fbatch = this_cpu_ptr(&lru_pvecs.activate); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.activate); folio_batch_add_and_move(fbatch, folio, folio_activate_fn); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); } } @@ -404,8 +404,8 @@ static void __lru_cache_activate_folio(struct folio *folio) struct folio_batch *fbatch; int i; - local_lock(&lru_pvecs.lock); - fbatch = this_cpu_ptr(&lru_pvecs.lru_add); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being @@ -426,7 +426,7 @@ static void __lru_cache_activate_folio(struct folio *folio) } } - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); } /* @@ -452,7 +452,7 @@ void folio_mark_accessed(struct folio *folio) } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via - * lru_pvecs.activate. Otherwise, assume the folio is in a + * cpu_fbatches.activate. 
Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ @@ -474,7 +474,7 @@ EXPORT_SYMBOL(folio_mark_accessed); * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the - * pagevec is drained. This gives a chance for the caller of folio_add_lru() + * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). */ void folio_add_lru(struct folio *folio) @@ -486,10 +486,10 @@ void folio_add_lru(struct folio *folio) VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); folio_get(folio); - local_lock(&lru_pvecs.lock); - fbatch = this_cpu_ptr(&lru_pvecs.lru_add); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); folio_batch_add_and_move(fbatch, folio, lru_add_fn); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); } EXPORT_SYMBOL(folio_add_lru); @@ -614,13 +614,13 @@ static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) } /* - * Drain pages out of the cpu's pagevecs. + * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. 
*/ void lru_add_drain_cpu(int cpu) { - struct folio_batch *fbatch = &per_cpu(lru_pvecs.lru_add, cpu); + struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_add, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add_fn); @@ -636,15 +636,15 @@ void lru_add_drain_cpu(int cpu) local_unlock_irqrestore(&lru_rotate.lock, flags); } - fbatch = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); + fbatch = &per_cpu(cpu_fbatches.lru_deactivate_file, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file_fn); - fbatch = &per_cpu(lru_pvecs.lru_deactivate, cpu); + fbatch = &per_cpu(cpu_fbatches.lru_deactivate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_fn); - fbatch = &per_cpu(lru_pvecs.lru_lazyfree, cpu); + fbatch = &per_cpu(cpu_fbatches.lru_lazyfree, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree_fn); @@ -670,10 +670,10 @@ void deactivate_file_folio(struct folio *folio) return; folio_get(folio); - local_lock(&lru_pvecs.lock); - fbatch = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file); folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); } /* @@ -693,10 +693,10 @@ void deactivate_page(struct page *page) struct folio_batch *fbatch; folio_get(folio); - local_lock(&lru_pvecs.lock); - fbatch = this_cpu_ptr(&lru_pvecs.lru_deactivate); + local_lock(&cpu_fbatches.lock); + fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate); folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); } } @@ -717,18 +717,18 @@ void mark_page_lazyfree(struct page *page) struct folio_batch *fbatch; folio_get(folio); - local_lock(&lru_pvecs.lock); - fbatch = this_cpu_ptr(&lru_pvecs.lru_lazyfree); + local_lock(&cpu_fbatches.lock); + fbatch = 
this_cpu_ptr(&cpu_fbatches.lru_lazyfree); folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); } } void lru_add_drain(void) { - local_lock(&lru_pvecs.lock); + local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); mlock_page_drain_local(); } @@ -740,19 +740,19 @@ void lru_add_drain(void) */ static void lru_add_and_bh_lrus_drain(void) { - local_lock(&lru_pvecs.lock); + local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_page_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { - local_lock(&lru_pvecs.lock); + local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); - local_unlock(&lru_pvecs.lock); + local_unlock(&cpu_fbatches.lock); mlock_page_drain_local(); } @@ -797,8 +797,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus) return; /* - * Guarantee pagevec counter stores visible by this CPU are visible to - * other CPUs before loading the current drain generation. + * Guarantee folio_batch counter stores visible by this CPU + * are visible to other CPUs before loading the current drain + * generation. */ smp_mb(); @@ -824,8 +825,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus) * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical - * section. Use a full memory barrier to guarantee that the new global - * drain generation number is stored before loading pagevec counters. + * section. Use a full memory barrier to guarantee that the + * new global drain generation number is stored before loading + * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. 
@@ -847,12 +849,12 @@ static inline void __lru_add_drain_all(bool force_all_cpus) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (folio_batch_count(&per_cpu(lru_pvecs.lru_add, cpu)) || + if (folio_batch_count(&per_cpu(cpu_fbatches.lru_add, cpu)) || data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || - folio_batch_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) || - folio_batch_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) || - folio_batch_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) || - folio_batch_count(&per_cpu(lru_pvecs.activate, cpu)) || + folio_batch_count(&per_cpu(cpu_fbatches.lru_deactivate_file, cpu)) || + folio_batch_count(&per_cpu(cpu_fbatches.lru_deactivate, cpu)) || + folio_batch_count(&per_cpu(cpu_fbatches.lru_lazyfree, cpu)) || + folio_batch_count(&per_cpu(cpu_fbatches.activate, cpu)) || need_mlock_page_drain(cpu) || has_bh_in_lru(cpu, NULL)) { INIT_WORK(work, lru_add_drain_per_cpu); From 4864545a4669781f75aa711ebf7b25e6f0f37d13 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:08 +0100 Subject: [PATCH 086/282] mm/swap: pull the CPU conditional out of __lru_add_drain_all() The function is too long, so pull this complicated conditional out into cpu_needs_drain(). This ends up shrinking the text by 14 bytes, by allowing GCC to cache the result of calling per_cpu() instead of relocating each lookup individually. 
Link: https://lkml.kernel.org/r/20220617175020.717127-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 01e4e9c7d7a3..df78c4c4dbeb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -765,6 +765,21 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) lru_add_and_bh_lrus_drain(); } +static bool cpu_needs_drain(unsigned int cpu) +{ + struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); + + /* Check these in order of likelihood that they're not zero */ + return folio_batch_count(&fbatches->lru_add) || + data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || + folio_batch_count(&fbatches->lru_deactivate_file) || + folio_batch_count(&fbatches->lru_deactivate) || + folio_batch_count(&fbatches->lru_lazyfree) || + folio_batch_count(&fbatches->activate) || + need_mlock_page_drain(cpu) || + has_bh_in_lru(cpu, NULL); +} + /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is @@ -849,14 +864,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus) for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (folio_batch_count(&per_cpu(cpu_fbatches.lru_add, cpu)) || - data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) || - folio_batch_count(&per_cpu(cpu_fbatches.lru_deactivate_file, cpu)) || - folio_batch_count(&per_cpu(cpu_fbatches.lru_deactivate, cpu)) || - folio_batch_count(&per_cpu(cpu_fbatches.lru_lazyfree, cpu)) || - folio_batch_count(&per_cpu(cpu_fbatches.activate, cpu)) || - need_mlock_page_drain(cpu) || - has_bh_in_lru(cpu, NULL)) { + if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); From a2d33b5dd674c21e75ca47e3d791d070cba267dd Mon Sep 17 00:00:00 
2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:09 +0100 Subject: [PATCH 087/282] mm/swap: optimise lru_add_drain_cpu() Do the per-cpu dereferencing of the fbatches once which saves 14 bytes of text and several percpu relocations. Link: https://lkml.kernel.org/r/20220617175020.717127-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index df78c4c4dbeb..84318692db6a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -620,7 +620,8 @@ static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio) */ void lru_add_drain_cpu(int cpu) { - struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_add, cpu); + struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); + struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add_fn); @@ -636,15 +637,15 @@ void lru_add_drain_cpu(int cpu) local_unlock_irqrestore(&lru_rotate.lock, flags); } - fbatch = &per_cpu(cpu_fbatches.lru_deactivate_file, cpu); + fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file_fn); - fbatch = &per_cpu(cpu_fbatches.lru_deactivate, cpu); + fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_fn); - fbatch = &per_cpu(cpu_fbatches.lru_lazyfree, cpu); + fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree_fn); From 2397f780e1e05c7bd7ffebd931931b5d8a45ad8c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:10 +0100 Subject: [PATCH 088/282] mm/swap: convert try_to_free_swap to use a folio Save a few calls to compound_head by converting the passed page to a folio. Reduces kernel text size by 74 bytes. 
Link: https://lkml.kernel.org/r/20220617175020.717127-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6a7579951fa4..af6e83911ad2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1568,16 +1568,15 @@ unlock_out: return ret; } -static bool page_swapped(struct page *page) +static bool folio_swapped(struct folio *folio) { swp_entry_t entry; struct swap_info_struct *si; - if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) - return page_swapcount(page) != 0; + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) + return page_swapcount(&folio->page) != 0; - page = compound_head(page); - entry.val = page_private(page); + entry = folio_swap_entry(folio); si = _swap_info_get(entry); if (si) return swap_page_trans_huge_swapped(si, entry); @@ -1590,13 +1589,14 @@ static bool page_swapped(struct page *page) */ int try_to_free_swap(struct page *page) { - VM_BUG_ON_PAGE(!PageLocked(page), page); + struct folio *folio = page_folio(page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) return 0; - if (PageWriteback(page)) + if (folio_test_writeback(folio)) return 0; - if (page_swapped(page)) + if (folio_swapped(folio)) return 0; /* @@ -1617,9 +1617,8 @@ int try_to_free_swap(struct page *page) if (pm_suspended_storage()) return 0; - page = compound_head(page); - delete_from_swap_cache(page); - SetPageDirty(page); + delete_from_swap_cache(&folio->page); + folio_set_dirty(folio); return 1; } From ab5e653ee810024a1e170c75f973a252053f7467 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:11 +0100 Subject: [PATCH 089/282] mm/swap: convert release_pages to use a folio internally This function was already calling compound_head(), but now it can cache the result of 
calling compound_head() and avoid calling it again. Saves 299 bytes of text by avoiding various calls to compound_head() and avoiding checks of PageTail. Link: https://lkml.kernel.org/r/20220617175020.717127-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 84318692db6a..417dc32534c1 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -941,8 +941,7 @@ void release_pages(struct page **pages, int nr) unsigned int lock_batch; for (i = 0; i < nr; i++) { - struct page *page = pages[i]; - struct folio *folio = page_folio(page); + struct folio *folio = page_folio(pages[i]); /* * Make sure the IRQ-safe lock-holding time does not get @@ -954,35 +953,34 @@ void release_pages(struct page **pages, int nr) lruvec = NULL; } - page = &folio->page; - if (is_huge_zero_page(page)) + if (is_huge_zero_page(&folio->page)) continue; - if (is_zone_device_page(page)) { + if (folio_is_zone_device(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - if (put_devmap_managed_page(page)) + if (put_devmap_managed_page(&folio->page)) continue; - if (put_page_testzero(page)) - free_zone_device_page(page); + if (folio_put_testzero(folio)) + free_zone_device_page(&folio->page); continue; } - if (!put_page_testzero(page)) + if (!folio_put_testzero(folio)) continue; - if (PageCompound(page)) { + if (folio_test_large(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - __put_compound_page(page); + __put_compound_page(&folio->page); continue; } - if (PageLRU(page)) { + if (folio_test_lru(folio)) { struct lruvec *prev_lruvec = lruvec; lruvec = folio_lruvec_relock_irqsave(folio, lruvec, @@ -990,8 +988,8 @@ void release_pages(struct page **pages, int nr) if (prev_lruvec != lruvec) lock_batch = 0; - del_page_from_lru_list(page, lruvec); -
__clear_page_lru_flags(page); + lruvec_del_folio(lruvec, folio); + __folio_clear_lru_flags(folio); } /* @@ -1000,13 +998,13 @@ void release_pages(struct page **pages, int nr) * found set here. This does not indicate a problem, unless * "unevictable_pgs_cleared" appears worryingly large. */ - if (unlikely(PageMlocked(page))) { - __ClearPageMlocked(page); - dec_zone_page_state(page, NR_MLOCK); + if (unlikely(folio_test_mlocked(folio))) { + __folio_clear_mlocked(folio); + zone_stat_sub_folio(folio, NR_MLOCK); count_vm_event(UNEVICTABLE_PGCLEARED); } - list_add(&page->lru, &pages_to_free); + list_add(&folio->lru, &pages_to_free); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); From 2f58e5de662726312fd98259a07ab945210999d1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:12 +0100 Subject: [PATCH 090/282] mm/swap: convert put_pages_list to use folios Pages linked through the LRU list cannot be tail pages as ->compound_head is in a union with one of the words of the list_head, and they cannot be ZONE_DEVICE pages as ->pgmap is in a union with the same word. Saves 60 bytes of text by removing a call to page_is_fake_head(). 
Link: https://lkml.kernel.org/r/20220617175020.717127-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 417dc32534c1..a5a91aec83da 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -138,19 +138,19 @@ EXPORT_SYMBOL(__put_page); */ void put_pages_list(struct list_head *pages) { - struct page *page, *next; + struct folio *folio, *next; - list_for_each_entry_safe(page, next, pages, lru) { - if (!put_page_testzero(page)) { - list_del(&page->lru); + list_for_each_entry_safe(folio, next, pages, lru) { + if (!folio_put_testzero(folio)) { + list_del(&folio->lru); continue; } - if (PageHead(page)) { - list_del(&page->lru); - __put_compound_page(page); + if (folio_test_large(folio)) { + list_del(&folio->lru); + __put_compound_page(&folio->page); continue; } - /* Cannot be PageLRU because it's passed to us using the lru */ + /* LRU flag must be clear because it's passed using the lru */ } free_unref_page_list(pages); From 8d29c7036f5ff360ea1f51b9fed5d909be7c8094 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:13 +0100 Subject: [PATCH 091/282] mm/swap: convert __put_page() to __folio_put() Saves 11 bytes of text by removing a check of PageTail. 
Link: https://lkml.kernel.org/r/20220617175020.717127-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- mm/swap.c | 14 +++++++------- net/core/page_pool.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 09670ccb94e7..3fb49aec13fd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -855,7 +855,7 @@ static inline struct folio *virt_to_folio(const void *x) return page_folio(page); } -void __put_page(struct page *page); +void __folio_put(struct folio *folio); void put_pages_list(struct list_head *pages); @@ -1197,7 +1197,7 @@ static inline __must_check bool try_get_page(struct page *page) static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) - __put_page(&folio->page); + __folio_put(folio); } /** @@ -1217,7 +1217,7 @@ static inline void folio_put(struct folio *folio) static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) - __put_page(&folio->page); + __folio_put(folio); } void release_pages(struct page **pages, int nr); diff --git a/mm/swap.c b/mm/swap.c index a5a91aec83da..d09e9ac53809 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -119,16 +119,16 @@ static void __put_compound_page(struct page *page) destroy_compound_page(page); } -void __put_page(struct page *page) +void __folio_put(struct folio *folio) { - if (unlikely(is_zone_device_page(page))) - free_zone_device_page(page); - else if (unlikely(PageCompound(page))) - __put_compound_page(page); + if (unlikely(folio_is_zone_device(folio))) + free_zone_device_page(&folio->page); + else if (unlikely(folio_test_large(folio))) + __put_compound_page(&folio->page); else - __put_single_page(page); + __put_single_page(&folio->page); } -EXPORT_SYMBOL(__put_page); +EXPORT_SYMBOL(__folio_put); /** * put_pages_list() - release a list of pages diff --git a/net/core/page_pool.c b/net/core/page_pool.c 
index f18e6e771993..db70e94c8df2 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -16,7 +16,7 @@ #include #include #include -#include /* for __put_page() */ +#include /* for put_page() */ #include #include From 83d9965995408c450c7ee8c2c8bc26abe21c311b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:14 +0100 Subject: [PATCH 092/282] mm/swap: convert __put_single_page() to __folio_put_small() Saves 56 bytes of text by removing a call to compound_head(). Link: https://lkml.kernel.org/r/20220617175020.717127-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index d09e9ac53809..ceed884e90cf 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -99,11 +99,11 @@ static void __page_cache_release(struct page *page) } } -static void __put_single_page(struct page *page) +static void __folio_put_small(struct folio *folio) { - __page_cache_release(page); - mem_cgroup_uncharge(page_folio(page)); - free_unref_page(page, 0); + __page_cache_release(&folio->page); + mem_cgroup_uncharge(folio); + free_unref_page(&folio->page, 0); } static void __put_compound_page(struct page *page) @@ -126,7 +126,7 @@ void __folio_put(struct folio *folio) else if (unlikely(folio_test_large(folio))) __put_compound_page(&folio->page); else - __put_single_page(&folio->page); + __folio_put_small(folio); } EXPORT_SYMBOL(__folio_put); From 5ef82fe7f6bca2827c3d1457e9ecd6219da29ede Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:15 +0100 Subject: [PATCH 093/282] mm/swap: convert __put_compound_page() to __folio_put_large() All the callers now have a folio, so pass it in. This doesn't save any text, but it does save a call to compound_head() as folio_test_hugetlb() does not contain a call like PageHuge() does. 
Link: https://lkml.kernel.org/r/20220617175020.717127-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index ceed884e90cf..b709f3ece57f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -106,7 +106,7 @@ static void __folio_put_small(struct folio *folio) free_unref_page(&folio->page, 0); } -static void __put_compound_page(struct page *page) +static void __folio_put_large(struct folio *folio) { /* * __page_cache_release() is supposed to be called for thp, not for @@ -114,9 +114,9 @@ static void __put_compound_page(struct page *page) * (it's never listed to any LRU lists) and no memcg routines should * be called for hugetlb (it has a separate hugetlb_cgroup.) */ - if (!PageHuge(page)) - __page_cache_release(page); - destroy_compound_page(page); + if (!folio_test_hugetlb(folio)) + __page_cache_release(&folio->page); + destroy_compound_page(&folio->page); } void __folio_put(struct folio *folio) @@ -124,7 +124,7 @@ void __folio_put(struct folio *folio) if (unlikely(folio_is_zone_device(folio))) free_zone_device_page(&folio->page); else if (unlikely(folio_test_large(folio))) - __put_compound_page(&folio->page); + __folio_put_large(folio); else __folio_put_small(folio); } @@ -147,7 +147,7 @@ void put_pages_list(struct list_head *pages) } if (folio_test_large(folio)) { list_del(&folio->lru); - __put_compound_page(&folio->page); + __folio_put_large(folio); continue; } /* LRU flag must be clear because it's passed using the lru */ @@ -976,7 +976,7 @@ void release_pages(struct page **pages, int nr) unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } - __put_compound_page(&folio->page); + __folio_put_large(folio); continue; } From 188e8caee968def9fb67c7536c270b5b463c3461 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:16 +0100 Subject: [PATCH 094/282] mm/swap: convert 
__page_cache_release() to use a folio All the callers now have a folio. Saves several calls to compound_head, totalling 502 bytes of text. Link: https://lkml.kernel.org/r/20220617175020.717127-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index b709f3ece57f..5f6caa651599 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -77,31 +77,30 @@ static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { * This path almost never happens for VM activity - pages are normally freed * via pagevecs. But it gets used by networking - and for compound pages. */ -static void __page_cache_release(struct page *page) +static void __page_cache_release(struct folio *folio) { - if (PageLRU(page)) { - struct folio *folio = page_folio(page); + if (folio_test_lru(folio)) { struct lruvec *lruvec; unsigned long flags; lruvec = folio_lruvec_lock_irqsave(folio, &flags); - del_page_from_lru_list(page, lruvec); - __clear_page_lru_flags(page); + lruvec_del_folio(lruvec, folio); + __folio_clear_lru_flags(folio); unlock_page_lruvec_irqrestore(lruvec, flags); } - /* See comment on PageMlocked in release_pages() */ - if (unlikely(PageMlocked(page))) { - int nr_pages = thp_nr_pages(page); + /* See comment on folio_test_mlocked in release_pages() */ + if (unlikely(folio_test_mlocked(folio))) { + long nr_pages = folio_nr_pages(folio); - __ClearPageMlocked(page); - mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + __folio_clear_mlocked(folio); + zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages); count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); } } static void __folio_put_small(struct folio *folio) { - __page_cache_release(&folio->page); + __page_cache_release(folio); mem_cgroup_uncharge(folio); free_unref_page(&folio->page, 0); } @@ -115,7 +114,7 @@ static void __folio_put_large(struct folio *folio) * be called for hugetlb 
(it has a separate hugetlb_cgroup.) */ if (!folio_test_hugetlb(folio)) - __page_cache_release(&folio->page); + __page_cache_release(folio); destroy_compound_page(&folio->page); } @@ -199,14 +198,14 @@ static void lru_add_fn(struct lruvec *lruvec, struct folio *folio) /* * Is an smp_mb__after_atomic() still required here, before - * folio_evictable() tests PageMlocked, to rule out the possibility + * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think - * not, because __munlock_page() only clears PageMlocked while the LRU - * lock is held. + * not, because __munlock_page() only clears the mlocked flag + * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily - * true of release_pages(): but those only clear PageMlocked after - * put_page_testzero() has excluded any other users of the page.) + * true of release_pages(): but those only clear the mlocked flag after + * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) From 5375336c8c42a343c3b440b6f1e21c65e7b174b9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:17 +0100 Subject: [PATCH 095/282] mm: convert destroy_compound_page() to destroy_large_folio() All callers now have a folio, so push the folio->page conversion down to this function. 
[akpm@linux-foundation.org: uninline destroy_large_folio() to fix build issue] Link: https://lkml.kernel.org/r/20220617175020.717127-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +----- mm/page_alloc.c | 8 ++++++++ mm/swap.c | 2 +- mm/vmscan.c | 4 ++-- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3fb49aec13fd..9cc02a7e503b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -892,11 +892,7 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } -static inline void destroy_compound_page(struct page *page) -{ - VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); - compound_page_dtors[page[1].compound_dtor](page); -} +void destroy_large_folio(struct folio *folio); static inline int head_compound_pincount(struct page *head) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 248469134962..52fd92b2c1fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -744,6 +744,14 @@ void prep_compound_page(struct page *page, unsigned int order) prep_compound_head(page, order); } +void destroy_large_folio(struct folio *folio) +{ + enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; + + VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); + compound_page_dtors[dtor](&folio->page); +} + #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; diff --git a/mm/swap.c b/mm/swap.c index 5f6caa651599..1f563d857768 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -115,7 +115,7 @@ static void __folio_put_large(struct folio *folio) */ if (!folio_test_hugetlb(folio)) __page_cache_release(folio); - destroy_compound_page(&folio->page); + destroy_large_folio(folio); } void __folio_put(struct folio *folio) diff --git a/mm/vmscan.c b/mm/vmscan.c index e7d3db64a4e0..e660d7205f47 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1979,7 +1979,7 @@ free_it: * appear not as the 
counts should be low */ if (unlikely(folio_test_large(folio))) - destroy_compound_page(&folio->page); + destroy_large_folio(folio); else list_add(&folio->lru, &free_pages); continue; @@ -2348,7 +2348,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, if (unlikely(folio_test_large(folio))) { spin_unlock_irq(&lruvec->lru_lock); - destroy_compound_page(&folio->page); + destroy_large_folio(folio); spin_lock_irq(&lruvec->lru_lock); } else list_add(&folio->lru, &folios_to_free); From b98c359f1d921deae04bb5dbbbbbb9d8705b7c4c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:18 +0100 Subject: [PATCH 096/282] mm: convert page_swap_flags to folio_swap_flags The only caller already has a folio, so push the folio->page conversion down a level. Link: https://lkml.kernel.org/r/20220617175020.717127-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.h | 6 +++--- mm/vmscan.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 0193797b0c92..85453110be8e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -61,9 +61,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); -static inline unsigned int page_swap_flags(struct page *page) +static inline unsigned int folio_swap_flags(struct folio *folio) { - return page_swap_info(page)->flags; + return page_swap_info(&folio->page)->flags; } #else /* CONFIG_SWAP */ struct swap_iocb; @@ -149,7 +149,7 @@ static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, { } -static inline unsigned int page_swap_flags(struct page *page) +static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index e660d7205f47..13a28b94cd56 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1572,7 +1572,7 @@ static bool may_enter_fs(struct folio *folio, 
gfp_t gfp_mask) * but that will never affect SWP_FS_OPS, so the data_race * is safe. */ - return !data_race(page_swap_flags(&folio->page) & SWP_FS_OPS); + return !data_race(folio_swap_flags(folio) & SWP_FS_OPS); } /* From 75fa68a5d89871a35246aa2759c95d6dfaf1b582 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:19 +0100 Subject: [PATCH 097/282] mm/swap: convert delete_from_swap_cache() to take a folio All but one caller already has a folio, so convert it to use a folio. Link: https://lkml.kernel.org/r/20220617175020.717127-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory-failure.c | 5 +++-- mm/shmem.c | 4 ++-- mm/swap.h | 4 ++-- mm/swap_state.c | 16 ++++++++-------- mm/swapfile.c | 2 +- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 845369f839e1..f7612ccdb299 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1007,12 +1007,13 @@ static int me_swapcache_dirty(struct page_state *ps, struct page *p) static int me_swapcache_clean(struct page_state *ps, struct page *p) { + struct folio *folio = page_folio(p); int ret; - delete_from_swap_cache(p); + delete_from_swap_cache(folio); ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; - unlock_page(p); + folio_unlock(folio); if (has_extra_refcount(ps, p, false)) ret = MF_FAILED; diff --git a/mm/shmem.c b/mm/shmem.c index 12d45a03f7fc..12ac67dc831f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1691,7 +1691,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, return; folio_wait_writeback(folio); - delete_from_swap_cache(&folio->page); + delete_from_swap_cache(folio); spin_lock_irq(&info->lock); /* * Don't treat swapin error folio as alloced. 
Otherwise inode->i_blocks won't @@ -1789,7 +1789,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); - delete_from_swap_cache(&folio->page); + delete_from_swap_cache(folio); folio_mark_dirty(folio); swap_free(swap); diff --git a/mm/swap.h b/mm/swap.h index 85453110be8e..3684f7f4eac9 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -38,7 +38,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp, void **shadowp); void __delete_from_swap_cache(struct page *page, swp_entry_t entry, void *shadow); -void delete_from_swap_cache(struct page *page); +void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); void free_swap_cache(struct page *page); @@ -140,7 +140,7 @@ static inline void __delete_from_swap_cache(struct page *page, { } -static inline void delete_from_swap_cache(struct page *page) +static inline void delete_from_swap_cache(struct folio *folio) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index f6568765aef7..06a08e698c9f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -222,22 +222,22 @@ fail: } /* - * This must be called only on pages that have + * This must be called only on folios that have * been verified to be in the swap cache and locked. - * It will never put the page into the free list, - * the caller has a reference on the page. + * It will never put the folio into the free list, + * the caller has a reference on the folio. 
*/ -void delete_from_swap_cache(struct page *page) +void delete_from_swap_cache(struct folio *folio) { - swp_entry_t entry = { .val = page_private(page) }; + swp_entry_t entry = folio_swap_entry(folio); struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(page, entry, NULL); + __delete_from_swap_cache(&folio->page, entry, NULL); xa_unlock_irq(&address_space->i_pages); - put_swap_page(page, entry); - page_ref_sub(page, thp_nr_pages(page)); + put_swap_page(&folio->page, entry); + folio_ref_sub(folio, folio_nr_pages(folio)); } void clear_shadow_from_swap_cache(int type, unsigned long begin, diff --git a/mm/swapfile.c b/mm/swapfile.c index af6e83911ad2..1fdccd2f1422 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1617,7 +1617,7 @@ int try_to_free_swap(struct page *page) if (pm_suspended_storage()) return 0; - delete_from_swap_cache(&folio->page); + delete_from_swap_cache(folio); folio_set_dirty(folio); return 1; } From ceff9d3354e95ca17e12ad869acea5407cc467f9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 17 Jun 2022 18:50:20 +0100 Subject: [PATCH 098/282] mm/swap: convert __delete_from_swap_cache() to a folio All callers now have a folio, so convert the entire function to operate on folios. 
Link: https://lkml.kernel.org/r/20220617175020.717127-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.h | 4 ++-- mm/swap_state.c | 25 +++++++++++++------------ mm/vmscan.c | 2 +- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 3684f7f4eac9..fa0816af4712 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -36,7 +36,7 @@ bool add_to_swap(struct folio *folio); void *get_shadow_from_swap_cache(swp_entry_t entry); int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp, void **shadowp); -void __delete_from_swap_cache(struct page *page, +void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow); void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, @@ -135,7 +135,7 @@ static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, return -1; } -static inline void __delete_from_swap_cache(struct page *page, +static inline void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 06a08e698c9f..cc9c061c2579 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -133,31 +133,32 @@ unlock: } /* - * This must be called only on pages that have + * This must be called only on folios that have * been verified to be in the swap cache. 
*/ -void __delete_from_swap_cache(struct page *page, +void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); - int i, nr = thp_nr_pages(page); + int i; + long nr = folio_nr_pages(folio); pgoff_t idx = swp_offset(entry); XA_STATE(xas, &address_space->i_pages, idx); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapCache(page), page); - VM_BUG_ON_PAGE(PageWriteback(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); - VM_BUG_ON_PAGE(entry != page, entry); - set_page_private(page + i, 0); + VM_BUG_ON_FOLIO(entry != folio, folio); + set_page_private(folio_page(folio, i), 0); xas_next(&xas); } - ClearPageSwapCache(page); + folio_clear_swapcache(folio); address_space->nrpages -= nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); - __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); + __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } /** @@ -233,7 +234,7 @@ void delete_from_swap_cache(struct folio *folio) struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(&folio->page, entry, NULL); + __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); put_swap_page(&folio->page, entry); diff --git a/mm/vmscan.c b/mm/vmscan.c index 13a28b94cd56..161096d9311a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1329,7 +1329,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, mem_cgroup_swapout(folio, swap); if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __delete_from_swap_cache(&folio->page, swap, shadow); + 
__delete_from_swap_cache(folio, swap, shadow); xa_unlock_irq(&mapping->i_pages); put_swap_page(&folio->page, swap); } else { From ed7802dd48f7a507213cbb95bb4c6f1fe134eb5d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 17 Jun 2022 21:56:49 +0800 Subject: [PATCH 099/282] mm: memory_hotplug: enumerate all supported section flags Patch series "make hugetlb_optimize_vmemmap compatible with memmap_on_memory", v3. This series makes hugetlb_optimize_vmemmap compatible with memmap_on_memory. This patch (of 2): We are almost running out of section flags: only one bit is available in the worst case (powerpc with 256k pages). However, there are still some free bits (in ->section_mem_map) on other architectures (e.g. x86_64 has 10 bits available, arm64 has 8 bits available with the worst case of 64K pages). Because those numbers are hard-coded, it is inconvenient to use the extra bits on architectures other than powerpc. So convert those section flags to an enumeration to make it easy to add new section flags in the future. Also, move SECTION_TAINT_ZONE_DEVICE into the scope of CONFIG_ZONE_DEVICE to save a bit in the non-zone-device case. [songmuchun@bytedance.com: replace enum with defines per David] Link: https://lkml.kernel.org/r/20220620110616.12056-2-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220617135650.74901-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220617135650.74901-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: David Hildenbrand Cc: Jonathan Corbet Cc: Mike Kravetz Cc: Oscar Salvador Cc: Paul E.
McKenney Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 41 ++++++++++++++++++++++++++++++++--------- mm/memory_hotplug.c | 6 ++++++ mm/sparse.c | 2 +- 3 files changed, 39 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index aab70355d64f..2b5757752333 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1418,16 +1418,32 @@ extern size_t mem_section_usage_size(void); * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. - * To sum it up, at least 6 bits are available. + * To sum it up, at least 6 bits are available on all architectures. + * However, we can exceed 6 bits on some other architectures except + * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available + * with the worst case of 64K pages on arm64) if we make sure the + * exceeded bit is not applicable to powerpc. */ -#define SECTION_MARKED_PRESENT (1UL<<0) -#define SECTION_HAS_MEM_MAP (1UL<<1) -#define SECTION_IS_ONLINE (1UL<<2) -#define SECTION_IS_EARLY (1UL<<3) -#define SECTION_TAINT_ZONE_DEVICE (1UL<<4) -#define SECTION_MAP_LAST_BIT (1UL<<5) -#define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 6 +enum { + SECTION_MARKED_PRESENT_BIT, + SECTION_HAS_MEM_MAP_BIT, + SECTION_IS_ONLINE_BIT, + SECTION_IS_EARLY_BIT, +#ifdef CONFIG_ZONE_DEVICE + SECTION_TAINT_ZONE_DEVICE_BIT, +#endif + SECTION_MAP_LAST_BIT, +}; + +#define SECTION_MARKED_PRESENT BIT(SECTION_MARKED_PRESENT_BIT) +#define SECTION_HAS_MEM_MAP BIT(SECTION_HAS_MEM_MAP_BIT) +#define SECTION_IS_ONLINE BIT(SECTION_IS_ONLINE_BIT) +#define SECTION_IS_EARLY BIT(SECTION_IS_EARLY_BIT) +#ifdef CONFIG_ZONE_DEVICE +#define SECTION_TAINT_ZONE_DEVICE BIT(SECTION_TAINT_ZONE_DEVICE_BIT) +#endif +#define SECTION_MAP_MASK (~(BIT(SECTION_MAP_LAST_BIT) - 1)) +#define SECTION_NID_SHIFT SECTION_MAP_LAST_BIT static inline struct page 
*__section_mem_map_addr(struct mem_section *section) { @@ -1466,12 +1482,19 @@ static inline int online_section(struct mem_section *section) return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } +#ifdef CONFIG_ZONE_DEVICE static inline int online_device_section(struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } +#else +static inline int online_device_section(struct mem_section *section) +{ + return 0; +} +#endif static inline int online_section_nr(unsigned long nr) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 84990a14d51a..a2a6d280054f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -670,12 +670,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon } +#ifdef CONFIG_ZONE_DEVICE static void section_taint_zone_device(unsigned long pfn) { struct mem_section *ms = __pfn_to_section(pfn); ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE; } +#else +static inline void section_taint_zone_device(unsigned long pfn) +{ +} +#endif /* * Associate the pfn range with the given zone, initializing the memmaps diff --git a/mm/sparse.c b/mm/sparse.c index cb3bfae64036..e5a8a3a0edd7 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -281,7 +281,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p { unsigned long coded_mem_map = (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); - BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT)); + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT); BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); return coded_mem_map; } From 66361095129b3b5d065e6c09cf0c085ef4a8c40f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 17 Jun 2022 21:56:50 +0800 Subject: [PATCH 100/282] mm: memory_hotplug: make hugetlb_optimize_vmemmap compatible with memmap_on_memory For now, the feature of hugetlb_free_vmemmap is not compatible with the feature of memory_hotplug.memmap_on_memory, and
hugetlb_free_vmemmap takes precedence over memory_hotplug.memmap_on_memory. However, some users want memory_hotplug.memmap_on_memory to take precedence over hugetlb_free_vmemmap, since memmap_on_memory makes memory hotplug more likely to succeed in close-to-OOM situations. So the decision to make hugetlb_free_vmemmap take precedence is neither wise nor elegant. The proper approach is to have hugetlb_vmemmap.c check whether the section to which the HugeTLB pages belong can be optimized. If the section's vmemmap pages are allocated from the added memory block itself, hugetlb_free_vmemmap should refuse to optimize the vmemmap; otherwise, it does the optimization. Then both kernel parameters are compatible. So this patch introduces VmemmapSelfHosted to mark any non-optimizable vmemmap pages. hugetlb_vmemmap can use this flag to detect whether a vmemmap page can be optimized. [songmuchun@bytedance.com: walk vmemmap page tables to avoid false-positive] Link: https://lkml.kernel.org/r/20220620110616.12056-3-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220617135650.74901-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Co-developed-by: Oscar Salvador Signed-off-by: Oscar Salvador Acked-by: David Hildenbrand Cc: Jonathan Corbet Cc: Mike Kravetz Cc: Paul E.
McKenney Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 22 +++---- Documentation/admin-guide/sysctl/vm.rst | 5 +- include/linux/memory_hotplug.h | 9 --- include/linux/page-flags.h | 11 ++++ mm/hugetlb_vmemmap.c | 66 ++++++++++++++++--- mm/memory_hotplug.c | 27 ++++---- 6 files changed, 93 insertions(+), 47 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 8c0ea6b6c6a9..2cacd4f8deb7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1722,9 +1722,11 @@ Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. - This is not compatible with memory_hotplug.memmap_on_memory. - If both parameters are enabled, hugetlb_free_vmemmap takes - precedence over memory_hotplug.memmap_on_memory. + Note that the vmemmap pages may be allocated from the added + memory block itself when memory_hotplug.memmap_on_memory is + enabled, those vmemmap pages cannot be optimized even if this + feature is enabled. Other vmemmap pages not allocated from + the added memory block itself do not be affected. hung_task_panic= [KNL] Should the hung task detector generate panics. @@ -3068,10 +3070,12 @@ [KNL,X86,ARM] Boolean flag to enable this feature. Format: {on | off (default)} When enabled, runtime hotplugged memory will - allocate its internal metadata (struct pages) - from the hotadded memory which will allow to - hotadd a lot of memory without requiring - additional memory to do so. + allocate its internal metadata (struct pages, + those vmemmap pages cannot be optimized even + if hugetlb_free_vmemmap is enabled) from the + hotadded memory which will allow to hotadd a + lot of memory without requiring additional + memory to do so. This feature is disabled by default because it has some implication on large (e.g. GB) allocations in some configurations (e.g. 
small @@ -3081,10 +3085,6 @@ Note that even when enabled, there are a few cases where the feature is not effective. - This is not compatible with hugetlb_free_vmemmap. If - both parameters are enabled, hugetlb_free_vmemmap takes - precedence over memory_hotplug.memmap_on_memory. - memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest Format: default : 0 diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 4a440a7cfeb0..f74f722ad702 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -565,9 +565,8 @@ See Documentation/admin-guide/mm/hugetlbpage.rst hugetlb_optimize_vmemmap ======================== -This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter) -is configured or the size of 'struct page' (a structure defined in -include/linux/mm_types.h) is not power of two (an unusual system config could +This knob is not available when the size of 'struct page' (a structure defined +in include/linux/mm_types.h) is not power of two (an unusual system config could result in this). Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 20d7edf62a6a..e0b2209ab71c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -351,13 +351,4 @@ void arch_remove_linear_mapping(u64 start, u64 size); extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY -bool mhp_memmap_on_memory(void); -#else -static inline bool mhp_memmap_on_memory(void) -{ - return false; -} -#endif - #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f32aade2a6e0..82719d33c0f1 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -193,6 +193,11 @@ enum pageflags { /* Only valid for buddy pages. 
Used to track pages that are reported */ PG_reported = PG_uptodate, + +#ifdef CONFIG_MEMORY_HOTPLUG + /* For self-hosted memmap pages */ + PG_vmemmap_self_hosted = PG_owner_priv_1, +#endif }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -628,6 +633,12 @@ PAGEFLAG_FALSE(SkipKASanPoison, skip_kasan_poison) */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) +#ifdef CONFIG_MEMORY_HOTPLUG +PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) +#else +PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) +#endif + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ba29c15c53d6..1362feb3c6c9 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -10,7 +10,7 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt -#include +#include #include "hugetlb_vmemmap.h" /* @@ -97,18 +97,68 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) return ret; } +static unsigned int vmemmap_optimizable_pages(struct hstate *h, + struct page *head) +{ + if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + return 0; + + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { + pmd_t *pmdp, pmd; + struct page *vmemmap_page; + unsigned long vaddr = (unsigned long)head; + + /* + * Only the vmemmap page's vmemmap page can be self-hosted. + * Walking the page tables to find the backing page of the + * vmemmap page. + */ + pmdp = pmd_off_k(vaddr); + /* + * The READ_ONCE() is used to stabilize *pmdp in a register or + * on the stack so that it will stop changing under the code. + * The only concurrent operation where it can be changed is + * split_vmemmap_huge_pmd() (*pmdp will be stable after this + * operation). 
+ */ + pmd = READ_ONCE(*pmdp); + if (pmd_leaf(pmd)) + vmemmap_page = pmd_page(pmd) + pte_index(vaddr); + else + vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr)); + /* + * Due to HugeTLB alignment requirements and the vmemmap pages + * being at the start of the hotplugged memory region in + * memory_hotplug.memmap_on_memory case. Checking any vmemmap + * page's vmemmap page if it is marked as VmemmapSelfHosted is + * sufficient. + * + * [ hotplugged memory ] + * [ section ][...][ section ] + * [ vmemmap ][ usable memory ] + * ^ | | | + * +---+ | | + * ^ | | + * +-------+ | + * ^ | + * +-------------------------------------------+ + */ + if (PageVmemmapSelfHosted(vmemmap_page)) + return 0; + } + + return hugetlb_optimize_vmemmap_pages(h); +} + void hugetlb_vmemmap_free(struct hstate *h, struct page *head) { unsigned long vmemmap_addr = (unsigned long)head; unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); + vmemmap_pages = vmemmap_optimizable_pages(h, head); if (!vmemmap_pages) return; - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) - return; - static_branch_inc(&hugetlb_optimize_vmemmap_key); vmemmap_addr += RESERVE_VMEMMAP_SIZE; @@ -199,10 +249,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = { static __init int hugetlb_vmemmap_sysctls_init(void) { /* - * If "memory_hotplug.memmap_on_memory" is enabled or "struct page" - * crosses page boundaries, the vmemmap pages cannot be optimized. + * If "struct page" crosses page boundaries, the vmemmap pages cannot + * be optimized. 
*/ - if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page))) + if (is_power_of_2(sizeof(struct page))) register_sysctl_init("vm", hugetlb_vmemmap_sysctls); return 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a2a6d280054f..99ecb2b3ff53 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -43,30 +43,22 @@ #include "shuffle.h" #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY -static int memmap_on_memory_set(const char *val, const struct kernel_param *kp) -{ - if (hugetlb_optimize_vmemmap_enabled()) - return 0; - return param_set_bool(val, kp); -} - -static const struct kernel_param_ops memmap_on_memory_ops = { - .flags = KERNEL_PARAM_OPS_FL_NOARG, - .set = memmap_on_memory_set, - .get = param_get_bool, -}; - /* * memory_hotplug.memmap_on_memory parameter */ static bool memmap_on_memory __ro_after_init; -module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444); +module_param(memmap_on_memory, bool, 0444); MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); -bool mhp_memmap_on_memory(void) +static inline bool mhp_memmap_on_memory(void) { return memmap_on_memory; } +#else +static inline bool mhp_memmap_on_memory(void) +{ + return false; +} #endif enum { @@ -1035,7 +1027,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, struct zone *zone) { unsigned long end_pfn = pfn + nr_pages; - int ret; + int ret, i; ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); if (ret) @@ -1043,6 +1035,9 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + for (i = 0; i < nr_pages; i++) + SetPageVmemmapSelfHosted(pfn_to_page(pfn + i)); + /* * It might be that the vmemmap_pages fully span sections. 
If that is * the case, mark those sections online here as otherwise they will be From dc2628f39582e79bce41842fc91235b70054838c Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 16 Jun 2022 11:38:46 +0800 Subject: [PATCH 101/282] mm: hugetlb: remove minimum_order variable commit 641844f5616d ("mm/hugetlb: introduce minimum hugepage order") fixed a static checker warning and introduced a global variable minimum_order to fix the warning. However, the local variable in dissolve_free_huge_pages() can be initialized to huge_page_order(&default_hstate) to fix the warning. So remove minimum_order to simplify the code. Link: https://lkml.kernel.org/r/20220616033846.96937-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Oscar Salvador Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 70d2763f92ea..65454896f174 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -66,12 +66,6 @@ static bool hugetlb_cma_page(struct page *page, unsigned int order) #endif static unsigned long hugetlb_cma_size __initdata; -/* - * Minimum page order among possible hugepage sizes, set to a proper value - * at boot time. 
- */ -static unsigned int minimum_order __read_mostly = UINT_MAX; - __initdata LIST_HEAD(huge_boot_pages); /* for command line parsing */ @@ -2152,11 +2146,17 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned long pfn; struct page *page; int rc = 0; + unsigned int order; + struct hstate *h; if (!hugepages_supported()) return rc; - for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) { + order = huge_page_order(&default_hstate); + for_each_hstate(h) + order = min(order, huge_page_order(h)); + + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { page = pfn_to_page(pfn); rc = dissolve_free_huge_page(page); if (rc) @@ -3148,9 +3148,6 @@ static void __init hugetlb_init_hstates(void) struct hstate *h, *h2; for_each_hstate(h) { - if (minimum_order > huge_page_order(h)) - minimum_order = huge_page_order(h); - /* oversize hugepages were init'ed in early boot */ if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); @@ -3175,7 +3172,6 @@ static void __init hugetlb_init_hstates(void) h->demote_order = h2->order; } } - VM_BUG_ON(minimum_order == UINT_MAX); } static void __init report_hugepages(void) From f7cc67ae7f6221abe57857bd6efd21e06a05bd45 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 18 Jun 2022 17:05:27 +0800 Subject: [PATCH 102/282] mm/madvise: minor cleanup for swapin_walk_pmd_entry() Passing index to pte_offset_map_lock() directly so the below calculation can be avoided. Rename orig_pte to ptep as it's not changed. Also use helper is_swap_pte() to improve the readability. No functional change intended. 
[akpm@linux-foundation.org: reduce scope of `ptep'] Link: https://lkml.kernel.org/r/20220618090527.37843-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/madvise.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 0316bbc6441b..e55108d4e4b2 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -195,7 +195,6 @@ success: static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *walk) { - pte_t *orig_pte; struct vm_area_struct *vma = walk->private; unsigned long index; struct swap_iocb *splug = NULL; @@ -208,12 +207,13 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, swp_entry_t entry; struct page *page; spinlock_t *ptl; + pte_t *ptep; - orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); - pte = *(orig_pte + ((index - start) / PAGE_SIZE)); - pte_unmap_unlock(orig_pte, ptl); + ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl); + pte = *ptep; + pte_unmap_unlock(ptep, ptl); - if (pte_present(pte) || pte_none(pte)) + if (!is_swap_pte(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) From 0506c31d0a8443a9f55eb69d81db426f2eb3296e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 20 Jun 2022 19:47:15 +0800 Subject: [PATCH 103/282] mm: rmap: simplify the hugetlb handling when unmapping or migration According to previous discussion [1], there are so many levels of indenting to handle the hugetlb case when unmapping or migration. We can combine folio_test_anon() and huge_pmd_unshare() to save one level of indenting, by adding a local variable and moving the VM_BUG_ON() a little forward. No intended functional changes in this patch. 
[1] https://lore.kernel.org/all/0b986dc4-5843-3e2d-c2df-5a2e9f13e6ab@oracle.com/ Link: https://lkml.kernel.org/r/28414b1b96f095e838c1e548074f8e0fc70d78cf.1655724713.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 90 +++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 46 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 65e0a767b837..56134cdc5ca3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1537,6 +1537,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { + bool anon = folio_test_anon(folio); + /* * The try_to_unmap() is only passed a hugetlb page * in the case where the hugetlb page is poisoned. @@ -1551,31 +1553,28 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ flush_cache_range(vma, range.start, range.end); - if (!folio_test_anon(folio)) { + /* + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. + */ + VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); + if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); + /* - * To call huge_pmd_unshare, i_mmap_rwsem must be - * held in write mode. Caller needs to explicitly - * do this outside rmap routines. + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. 
*/ - VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); - - if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; - } + page_vma_mapped_walk_done(&pvmw); + break; } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { @@ -1906,6 +1905,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { + bool anon = folio_test_anon(folio); + /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may @@ -1915,31 +1916,28 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ flush_cache_range(vma, range.start, range.end); - if (!folio_test_anon(folio)) { + /* + * To call huge_pmd_unshare, i_mmap_rwsem must be + * held in write mode. Caller needs to explicitly + * do this outside rmap routines. + */ + VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); + if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + flush_tlb_range(vma, range.start, range.end); + mmu_notifier_invalidate_range(mm, range.start, + range.end); + /* - * To call huge_pmd_unshare, i_mmap_rwsem must be - * held in write mode. Caller needs to explicitly - * do this outside rmap routines. + * The ref count of the PMD page was dropped + * which is part of the way map counting + * is done for shared PMDs. Return 'true' + * here. When there is no other sharing, + * huge_pmd_unshare returns false and we will + * unmap the actual page and drop map count + * to zero. 
*/ - VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); - - if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; - } + page_vma_mapped_walk_done(&pvmw); + break; } /* Nuke the hugetlb page table entry */ From 30934843019a18df975217e4a819f1c362994764 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 20 Jun 2022 10:12:50 +0200 Subject: [PATCH 104/282] mm/smaps: add Pss_Dirty Pss is the sum of the sizes of clean and dirty private pages, and the proportional sizes of clean and dirty shared pages: Private = Private_Dirty + Private_Clean Shared_Proportional = Shared_Dirty_Proportional + Shared_Clean_Proportional Pss = Private + Shared_Proportional The Shared*Proportional fields are not present in smaps, so it is not always possible to determine how much of the Pss is from dirty pages and how much is from clean pages. This information can be useful for measuring memory usage for the purpose of optimisation, since clean pages can usually be discarded by the kernel immediately while dirty pages cannot. The smaps routines in the kernel already have access to this data, so add a Pss_Dirty to show it to userspace. Pss_Clean is not added since it can be calculated from Pss and Pss_Dirty. 
Link: https://lkml.kernel.org/r/20220620081251.2928103-1-vincent.whitchurch@axis.com Signed-off-by: Vincent Whitchurch Cc: Jonathan Corbet Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- Documentation/ABI/testing/procfs-smaps_rollup | 1 + Documentation/filesystems/proc.rst | 5 ++++- fs/proc/task_mmu.c | 3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/testing/procfs-smaps_rollup b/Documentation/ABI/testing/procfs-smaps_rollup index a4e31c465194..b446a7154a1b 100644 --- a/Documentation/ABI/testing/procfs-smaps_rollup +++ b/Documentation/ABI/testing/procfs-smaps_rollup @@ -22,6 +22,7 @@ Description: MMUPageSize: 4 kB Rss: 884 kB Pss: 385 kB + Pss_Dirty: 68 kB Pss_Anon: 301 kB Pss_File: 80 kB Pss_Shmem: 4 kB diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 8543a59f288f..0b5120ff506c 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -448,6 +448,7 @@ Memory Area, or VMA) there is a series of lines such as the following:: MMUPageSize: 4 kB Rss: 892 kB Pss: 374 kB + Pss_Dirty: 0 kB Shared_Clean: 892 kB Shared_Dirty: 0 kB Private_Clean: 0 kB @@ -479,7 +480,9 @@ dirty shared and private pages in the mapping. The "proportional set size" (PSS) of a process is the count of pages it has in memory, where each page is divided by the number of processes sharing it. So if a process has 1000 pages all to itself, and 1000 shared with one other -process, its PSS will be 1500. +process, its PSS will be 1500. "Pss_Dirty" is the portion of PSS which +consists of dirty pages. ("Pss_Clean" is not included, but it can be +calculated by subtracting "Pss_Dirty" from "Pss".) Note that even a page which is part of a MAP_SHARED mapping, but has only a single pte mapped, i.e. 
is currently used by only one process, is accounted diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2d04e3470d4c..751c19d5bfdd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -406,6 +406,7 @@ struct mem_size_stats { u64 pss_anon; u64 pss_file; u64 pss_shmem; + u64 pss_dirty; u64 pss_locked; u64 swap_pss; }; @@ -427,6 +428,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, mss->pss_locked += pss; if (dirty || PageDirty(page)) { + mss->pss_dirty += pss; if (private) mss->private_dirty += size; else @@ -808,6 +810,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, { SEQ_PUT_DEC("Rss: ", mss->resident); SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nPss_Dirty: ", mss->pss_dirty >> PSS_SHIFT); if (rollup_mode) { /* * These are meaningful only for smaps_rollup, otherwise two of From e8da368a1e42a8056d1a6b419e1b91b6cf11d77e Mon Sep 17 00:00:00 2001 From: Yun-Ze Li Date: Mon, 20 Jun 2022 07:15:16 +0000 Subject: [PATCH 105/282] mm, docs: fix comments that mention mem_hotplug_end() Comments that mention mem_hotplug_end() are confusing as there is no function called mem_hotplug_end(). Fix them by replacing all the occurences of mem_hotplug_end() in the comments with mem_hotplug_done(). [akpm@linux-foundation.org: grammatical fixes] Link: https://lkml.kernel.org/r/20220620071516.1286101-1-p76091292@gs.ncku.edu.tw Signed-off-by: Yun-Ze Li Cc: Souptick Joarder Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 6 +++--- mm/compaction.c | 2 +- mm/vmscan.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2b5757752333..735bf5b37949 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -591,8 +591,8 @@ struct zone { * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by - * mem_hotplug_begin/end(). 
Any reader who can't tolerant drift of - * present_pages should get_online_mems() to get a stable value. + * mem_hotplug_begin/done(). Any reader who can't tolerant drift of + * present_pages should use get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; @@ -870,7 +870,7 @@ typedef struct pglist_data { unsigned long nr_reclaim_start; /* nr pages written while throttled * when throttling started. */ struct task_struct *kswapd; /* Protected by - mem_hotplug_begin/end() */ + mem_hotplug_begin/done() */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; diff --git a/mm/compaction.c b/mm/compaction.c index 1f89b969c12b..cd029ab03d0e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3011,7 +3011,7 @@ void kcompactd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold mem_hotplug_begin/end(). + * be holding mem_hotplug_begin/done(). */ void kcompactd_stop(int nid) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 161096d9311a..f58761cea0a0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4645,7 +4645,7 @@ void kswapd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold mem_hotplug_begin/end(). + * be holding mem_hotplug_begin/done(). */ void kswapd_stop(int nid) { From dc89997264de565999a1cb55db3f295d3a8e457b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 22 Jun 2022 11:35:01 +0900 Subject: [PATCH 106/282] zram: do not lookup algorithm in backends table Always use crypto_has_comp() so that crypto can lookup module, call usermodhelper to load the modules, wait for usermodhelper to finish and so on. Otherwise crypto will do all of these steps under CPU hot-plug lock and this looks like too much stuff to handle under the CPU hot-plug lock. Besides this can end up in a deadlock when usermodhelper triggers a code path that attempts to lock the CPU hot-plug lock, that zram already holds. 
An example of such deadlock: - path A. zram grabs CPU hot-plug lock, execs /sbin/modprobe from crypto and waits for modprobe to finish disksize_store zcomp_create __cpuhp_state_add_instance __cpuhp_state_add_instance_cpuslocked zcomp_cpu_up_prepare crypto_alloc_base crypto_alg_mod_lookup call_usermodehelper_exec wait_for_completion_killable do_wait_for_common schedule - path B. async work kthread that brings in scsi device. It wants to register CPUHP states at some point, and it needs the CPU hot-plug lock for that, which is owned by zram. async_run_entry_fn scsi_probe_and_add_lun scsi_mq_alloc_queue blk_mq_init_queue blk_mq_init_allocated_queue blk_mq_realloc_hw_ctxs __cpuhp_state_add_instance __cpuhp_state_add_instance_cpuslocked mutex_lock schedule - path C. modprobe sleeps, waiting for all async works to finish. load_module do_init_module async_synchronize_full async_synchronize_cookie_domain schedule [senozhatsky@chromium.org: add comment] Link: https://lkml.kernel.org/r/20220624060606.1014474-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20220622023501.517125-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zcomp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 052aa3f65514..0916de952e09 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -63,12 +63,6 @@ static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp) bool zcomp_available_algorithm(const char *comp) { - int i; - - i = sysfs_match_string(backends, comp); - if (i >= 0) - return true; - /* * Crypto does not ignore a trailing new line symbol, * so make sure you don't supply a string containing @@ -217,6 +211,11 @@ struct zcomp *zcomp_create(const char *compress) struct zcomp *comp; int error; + /* + * Crypto API will execute /sbin/modprobe if the compression module +
* is not loaded yet. We must do it here, otherwise we are about to + * call /sbin/modprobe under CPU hot-plug lock. + */ if (!zcomp_available_algorithm(compress)) return ERR_PTR(-EINVAL); From ade63b419c4e8d27f0642804b6c8c7a76ffc18ac Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Thu, 23 Jun 2022 02:08:34 +0000 Subject: [PATCH 107/282] mm/page_alloc: make the annotations of available memory more accurate Not all systems use swap, so estimating available memory would help to prevent swapping or OOM of system that not use swap. And we need to reserve some page cache to prevent swapping or thrashing. If somebody is accessing the pages in pagecache, and if too much would be freed, most accesses might mean reading data from disk, i.e. thrashing. Link: https://lkml.kernel.org/r/20220623020833.972979-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Signed-off-by: CGEL ZTE Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 52fd92b2c1fe..c9c02b23f02f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5800,14 +5800,14 @@ long si_mem_available(void) /* * Estimate the amount of memory available for userspace allocations, - * without causing swapping. + * without causing swapping or OOM. */ available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will - * start swapping. Assume at least half of the page cache, or the - * low watermark worth of cache, needs to stay. + * start swapping or thrashing. Assume at least half of the page + * cache, or the low watermark worth of cache, needs to stay. 
*/ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; pagecache -= min(pagecache / 2, wmark_low); From 18f3962953e40401b7ed98e8524167282c3e626e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Sun, 26 Jun 2022 22:57:17 +0800 Subject: [PATCH 108/282] mm: hugetlb: kill set_huge_swap_pte_at() Commit e5251fd43007 ("mm/hugetlb: introduce set_huge_swap_pte_at() helper") add set_huge_swap_pte_at() to handle swap entries on architectures that support hugepages consisting of contiguous ptes. And currently the set_huge_swap_pte_at() is only overridden by arm64. set_huge_swap_pte_at() provide a sz parameter to help determine the number of entries to be updated. But in fact, all hugetlb swap entries contain pfn information, so we can find the corresponding folio through the pfn recorded in the swap entry, then the folio_size() is the number of entries that need to be updated. And considering that users will easily cause bugs by ignoring the difference between set_huge_swap_pte_at() and set_huge_pte_at(). Let's handle swap entries in set_huge_pte_at() and remove the set_huge_swap_pte_at(), then we can call set_huge_pte_at() anywhere, which simplifies our coding. 
Link: https://lkml.kernel.org/r/20220626145717.53572-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Cc: Mike Kravetz Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 3 --- arch/arm64/mm/hugetlbpage.c | 34 ++++++++++++++++---------------- include/linux/hugetlb.h | 13 ------------ mm/hugetlb.c | 8 +++----- mm/rmap.c | 11 +++-------- 5 files changed, 23 insertions(+), 46 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 1fd2846dbefe..d20f5da2d76f 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -46,9 +46,6 @@ extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_GET extern pte_t huge_ptep_get(pte_t *ptep); -extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz); -#define set_huge_swap_pte_at set_huge_swap_pte_at void __init arm64_hugetlb_cma_reserve(void); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index e2a5ec9fdc0d..3be8f25aa5be 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -238,6 +238,13 @@ static void clear_flush(struct mm_struct *mm, flush_tlb_range(&vma, saddr, addr); } +static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry) +{ + VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry)); + + return page_folio(pfn_to_page(swp_offset(entry))); +} + void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -247,11 +254,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, unsigned long pfn, dpfn; pgprot_t hugeprot; - /* - * Code needs to be expanded to handle huge swap and migration - * entries. Needed for HUGETLB and MEMORY_FAILURE. 
- */ - WARN_ON(!pte_present(pte)); + if (!pte_present(pte)) { + struct folio *folio; + + folio = hugetlb_swap_entry_to_folio(pte_to_swp_entry(pte)); + ncontig = num_contig_ptes(folio_size(folio), &pgsize); + + for (i = 0; i < ncontig; i++, ptep++) + set_pte_at(mm, addr, ptep, pte); + return; + } if (!pte_cont(pte)) { set_pte_at(mm, addr, ptep, pte); @@ -269,18 +281,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); } -void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz) -{ - int i, ncontig; - size_t pgsize; - - ncontig = num_contig_ptes(sz, &pgsize); - - for (i = 0; i < ncontig; i++, ptep++) - set_pte(ptep, pte); -} - pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 756b66ff025e..c6cccfaf8708 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -903,14 +903,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) atomic_long_sub(l, &mm->hugetlb_usage); } -#ifndef set_huge_swap_pte_at -static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz) -{ - set_huge_pte_at(mm, addr, ptep, pte); -} -#endif - #ifndef huge_ptep_modify_prot_start #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, @@ -1094,11 +1086,6 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } -static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, unsigned long sz) -{ -} - static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65454896f174..064da8ffbac6 100644 --- a/mm/hugetlb.c +++ 
b/mm/hugetlb.c @@ -4798,12 +4798,11 @@ again: entry = swp_entry_to_pte(swp_entry); if (userfaultfd_wp(src_vma) && uffd_wp) entry = huge_pte_mkuffd_wp(entry); - set_huge_swap_pte_at(src, addr, src_pte, - entry, sz); + set_huge_pte_at(src, addr, src_pte, entry); } if (!userfaultfd_wp(dst_vma) && uffd_wp) entry = huge_pte_clear_uffd_wp(entry); - set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); + set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { /* * We copy the pte marker only if the dst vma has @@ -6344,8 +6343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, newpte = pte_swp_mkuffd_wp(newpte); else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); - set_huge_swap_pte_at(mm, address, ptep, - newpte, psize); + set_huge_pte_at(mm, address, ptep, newpte); pages++; } spin_unlock(ptl); diff --git a/mm/rmap.c b/mm/rmap.c index 56134cdc5ca3..83172ee0ea35 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1618,9 +1618,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); - set_huge_swap_pte_at(mm, address, - pvmw.pte, pteval, - vma_mmu_pagesize(vma)); + set_huge_pte_at(mm, address, pvmw.pte, pteval); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); @@ -2004,9 +2002,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); - set_huge_swap_pte_at(mm, address, - pvmw.pte, pteval, - vma_mmu_pagesize(vma)); + set_huge_pte_at(mm, address, pvmw.pte, pteval); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); @@ -2074,8 +2070,7 @@ static bool try_to_migrate_one(struct folio *folio, struct 
vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); if (folio_test_hugetlb(folio)) - set_huge_swap_pte_at(mm, address, pvmw.pte, - swp_pte, vma_mmu_pagesize(vma)); + set_huge_pte_at(mm, address, pvmw.pte, swp_pte); else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), From f673bd7c2654a0e2a1ec59417dcf9b7ceae9c14c Mon Sep 17 00:00:00 2001 From: XueBing Chen Date: Sat, 25 Jun 2022 16:51:35 +0800 Subject: [PATCH 109/282] mm: sparsemem: drop unexpected word 'a' in comments There is an unexpected word 'a' in the comments that needs to be dropped Link: https://lkml.kernel.org/r/24fbdae3.c86.1819a0f31b9.Coremail.chenxuebing@jari.cn Signed-off-by: XueBing Chen Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 3ff88a2eefb8..98529356b9cd 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -548,7 +548,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, } else { /* * When a PTE/PMD entry is freed from the init_mm - * there's a a free_pages() call to this page allocated + * there's a free_pages() call to this page allocated * above. Thus this get_page() is paired with the * put_page_testzero() on the freeing path. * This can only called by certain ZONE_DEVICE path, From dd5ff79d4ab85ac0cdb5f87e8fee4c4725255c4b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 25 Jun 2022 17:28:10 +0800 Subject: [PATCH 110/282] mm/khugepaged: remove unneeded shmem_huge_enabled() check Patch series "A few cleanup patches for khugepaged", v2. This series contains a few cleanup patches to remove unneeded return value, use helper macro, fix typos and so on. More details can be found in the respective changelogs.
This patch (of 7): If we reach here, khugepaged_scan_mm_slot() has already made sure that hugepage is enabled for shmem, via its call to hugepage_vma_check(). Remove this duplicated check. Link: https://lkml.kernel.org/r/20220625092816.4856-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220625092816.4856-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Andrea Arcangeli Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Cc: David Howells Cc: NeilBrown Cc: Alistair Popple Cc: David Hildenbrand Cc: Suren Baghdasaryan Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 16be62d493cd..34e6b4604aa1 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2145,8 +2145,6 @@ skip: if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); - if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma)) - goto skip; while (khugepaged_scan.address < hend) { int ret; From 4d928e20fd5b9fd97e1e631500386ef12a2858d7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 25 Jun 2022 17:28:11 +0800 Subject: [PATCH 111/282] mm/khugepaged: stop swapping in page when VM_FAULT_RETRY occurs When do_swap_page returns VM_FAULT_RETRY, we do not retry here and thus swap entry will remain in pagetable. This will result in later failure. So stop swapping in pages in this case to save cpu cycles. As a further optimization, mmap_lock is released when __collapse_huge_page_swapin() fails to avoid relocking mmap_lock. And "swapped_in++" is moved after error handling to make it more accurate.
Link: https://lkml.kernel.org/r/20220625092816.4856-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/khugepaged.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 34e6b4604aa1..16e98395f362 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -972,8 +972,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * Bring missing pages in from swap, to complete THP collapse. * Only done if khugepaged_scan_pmd believes it is worthwhile. * - * Called and returns without pte mapped or spinlocks held, - * but with mmap_lock held to protect against vma changes. + * Called and returns without pte mapped or spinlocks held. + * Note that if false is returned, mmap_lock will be released. */ static bool __collapse_huge_page_swapin(struct mm_struct *mm, @@ -1000,27 +1000,24 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, pte_unmap(vmf.pte); continue; } - swapped_in++; ret = do_swap_page(&vmf); - /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */ + /* + * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. + * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because + * we do not retry here and swap entry will remain in pagetable + * resulting in later failure. 
+ */ if (ret & VM_FAULT_RETRY) { - mmap_read_lock(mm); - if (hugepage_vma_revalidate(mm, haddr, &vma)) { - /* vma is no longer available, don't continue to swapin */ - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } - /* check if the pmd is still valid */ - if (mm_find_pmd(mm, haddr) != pmd) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } - } - if (ret & VM_FAULT_ERROR) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } + if (ret & VM_FAULT_ERROR) { + mmap_read_unlock(mm); + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); + return false; + } + swapped_in++; } /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ @@ -1086,13 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm, } /* - * __collapse_huge_page_swapin always returns with mmap_lock locked. - * If it fails, we release mmap_lock and jump out_nolock. + * __collapse_huge_page_swapin will return with mmap_lock released + * when it fails. So we jump out_nolock directly in that case. * Continuing to collapse causes inconsistency. */ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { - mmap_read_unlock(mm); goto out_nolock; } From 36ee2c784af0dcfa9bb442f7fa68c842d48371fc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 25 Jun 2022 17:28:12 +0800 Subject: [PATCH 112/282] mm/khugepaged: trivial typo and codestyle cleanup Fix some typos and tweak the code to meet codestyle. No functional change intended. 
Link: https://lkml.kernel.org/r/20220625092816.4856-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alistair Popple Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/khugepaged.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 16e98395f362..e237c5ec59bb 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -260,7 +260,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); - if (err || max_ptes_none > HPAGE_PMD_NR-1) + if (err || max_ptes_none > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; @@ -286,7 +286,7 @@ static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); - if (err || max_ptes_swap > HPAGE_PMD_NR-1) + if (err || max_ptes_swap > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; @@ -313,7 +313,7 @@ static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, unsigned long max_ptes_shared; err = kstrtoul(buf, 10, &max_ptes_shared); - if (err || max_ptes_shared > HPAGE_PMD_NR-1) + if (err || max_ptes_shared > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_shared = max_ptes_shared; @@ -599,7 +599,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, int none_or_zero = 0, shared = 0, result = 0, referenced = 0; bool writable = false; - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && @@ -1215,7 +1215,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, memset(khugepaged_node_load, 0, 
sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { @@ -1305,7 +1305,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Check if the page has any GUP (or other external) pins. * - * Here the check is racy it may see totmal_mapcount > refcount + * Here the check is racy it may see total_mapcount > refcount * in some cases. * For example, one process with one forked child process. * The parent has the PMD split due to MADV_DONTNEED, then @@ -1553,7 +1553,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * mmap_write_lock(mm) as PMD-mapping is likely to be split * later. * - * Not that vma->anon_vma check is racy: it can be set up after + * Note that vma->anon_vma check is racy: it can be set up after * the check but before we took mmap_lock by the fault path. * But page lock would prevent establishing any new ptes of the * page, so we are safe. From 2f55f070e5b80f130f5b161931ca91ce9cb2e625 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 25 Jun 2022 17:28:13 +0800 Subject: [PATCH 113/282] mm/khugepaged: minor cleanup for collapse_file nr_none is always 0 for the non-shmem case because the page can be read from the backend store. So when nr_none != 0, it must be in the is_shmem case. Also only adjust the nrpages and uncharge shmem when nr_none != 0 to save cpu cycles.
Link: https://lkml.kernel.org/r/20220625092816.4856-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zach O'Keefe Cc: Alistair Popple Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e237c5ec59bb..35f87bd2af28 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1881,8 +1881,8 @@ out_unlock: if (nr_none) { __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); - if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); + /* nr_none is always 0 for non-shmem. */ + __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); } /* Join all the small entries into a single multi-index entry */ @@ -1946,10 +1946,10 @@ xa_unlocked: /* Something went wrong: roll back page cache changes */ xas_lock_irq(&xas); - mapping->nrpages -= nr_none; - - if (is_shmem) + if (nr_none) { + mapping->nrpages -= nr_none; shmem_uncharge(mapping->host, nr_none); + } xas_set(&xas, start); xas_for_each(&xas, page, end - 1) { From 6dcdc94db1d45da0e40b6947888098b9daf9eda6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 25 Jun 2022 17:28:14 +0800 Subject: [PATCH 114/282] mm/khugepaged: use helper macro __ATTR_RW Use helper macro __ATTR_RW to define the khugepaged attributes. Minor readability improvement. 
Link: https://lkml.kernel.org/r/20220625092816.4856-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Yang Shi Cc: Alistair Popple Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/khugepaged.c | 67 ++++++++++++++++++++++--------------------------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 35f87bd2af28..6a969c0633a9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -147,8 +147,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, return count; } static struct kobj_attribute scan_sleep_millisecs_attr = - __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, - scan_sleep_millisecs_store); + __ATTR_RW(scan_sleep_millisecs); static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -175,8 +174,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, return count; } static struct kobj_attribute alloc_sleep_millisecs_attr = - __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, - alloc_sleep_millisecs_store); + __ATTR_RW(alloc_sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -200,8 +198,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, return count; } static struct kobj_attribute pages_to_scan_attr = - __ATTR(pages_to_scan, 0644, pages_to_scan_show, - pages_to_scan_store); + __ATTR_RW(pages_to_scan); static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -221,22 +218,21 @@ static ssize_t full_scans_show(struct kobject *kobj, static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); -static ssize_t khugepaged_defrag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t defrag_show(struct kobject *kobj, + 
struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } -static ssize_t khugepaged_defrag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static struct kobj_attribute khugepaged_defrag_attr = - __ATTR(defrag, 0644, khugepaged_defrag_show, - khugepaged_defrag_store); + __ATTR_RW(defrag); /* * max_ptes_none controls if khugepaged should collapse hugepages over @@ -246,15 +242,15 @@ static struct kobj_attribute khugepaged_defrag_attr = * runs. Increasing max_ptes_none will instead potentially reduce the * free memory in the system during the khugepaged scan. */ -static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static ssize_t max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } -static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int err; unsigned long max_ptes_none; @@ -268,19 +264,18 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, return count; } static struct kobj_attribute khugepaged_max_ptes_none_attr = - __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, - khugepaged_max_ptes_none_store); + __ATTR_RW(max_ptes_none); -static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static ssize_t max_ptes_swap_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { return 
sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } -static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t max_ptes_swap_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int err; unsigned long max_ptes_swap; @@ -295,19 +290,18 @@ static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, } static struct kobj_attribute khugepaged_max_ptes_swap_attr = - __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, - khugepaged_max_ptes_swap_store); + __ATTR_RW(max_ptes_swap); -static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static ssize_t max_ptes_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } -static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t max_ptes_shared_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int err; unsigned long max_ptes_shared; @@ -322,8 +316,7 @@ static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, } static struct kobj_attribute khugepaged_max_ptes_shared_attr = - __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show, - khugepaged_max_ptes_shared_store); + __ATTR_RW(max_ptes_shared); static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, From 081c32564bac58365be53b905721c9ba2300819a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 25 Jun 2022 17:28:15 +0800 Subject: [PATCH 115/282] mm/khugepaged: remove unneeded return value of khugepaged_add_pte_mapped_thp() The return value of khugepaged_add_pte_mapped_thp() is always 0 and also ignored. Remove it to clean up the code. 
Link: https://lkml.kernel.org/r/20220625092816.4856-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alistair Popple Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/khugepaged.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6a969c0633a9..08e885f28def 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1371,8 +1371,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot) * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. */ -static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) { struct mm_slot *mm_slot; @@ -1383,7 +1383,6 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; spin_unlock(&khugepaged_mm_lock); - return 0; } static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, From 1baec203b77cafa24610b5c9ae7a2aa380d74ef6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 25 Jun 2022 17:28:16 +0800 Subject: [PATCH 116/282] mm/khugepaged: try to free transhuge swapcache when possible Transhuge swapcaches won't be freed in __collapse_huge_page_copy(). It's because release_pte_page() is not called for these pages and thus free_page_and_swap_cache can't grab the page lock. These pages won't be freed from swap cache even if we are the only user until next time reclaim. It shouldn't hurt indeed, but we could try to free these pages to save more memory for system. 
Link: https://lkml.kernel.org/r/20220625092816.4856-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Alistair Popple Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: David Howells Cc: Matthew Wilcox (Oracle) Cc: NeilBrown Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ mm/khugepaged.c | 7 ++++++- mm/swap.h | 5 ----- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 95a5b7aa1ae9..6d11c51b2b62 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -455,6 +455,7 @@ static inline unsigned long total_swapcache_pages(void) return global_node_page_state(NR_SWAPCACHE); } +extern void free_swap_cache(struct page *page); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); /* linux/mm/swapfile.c */ @@ -539,6 +540,10 @@ static inline void put_swap_device(struct swap_info_struct *si) /* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */ #define free_swap_and_cache(e) is_pfn_swap_entry(e) +static inline void free_swap_cache(struct page *page) +{ +} + static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) { return 0; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 08e885f28def..01e0d6336754 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -755,7 +755,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) { list_del(&src_page->lru); - release_pte_page(src_page); + mod_node_page_state(page_pgdat(src_page), + NR_ISOLATED_ANON + page_is_file_lru(src_page), + -compound_nr(src_page)); + unlock_page(src_page); + free_swap_cache(src_page); + putback_lru_page(src_page); } } diff --git a/mm/swap.h b/mm/swap.h index fa0816af4712..17936e068c1c 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,7 +41,6 @@ void 
__delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); -void free_swap_cache(struct page *page); struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); @@ -81,10 +80,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } -static inline void free_swap_cache(struct page *page) -{ -} - static inline void show_swap_cache_info(void) { } From 4f2930c6718afbb6c5904cda6f6781a70c52a042 Mon Sep 17 00:00:00 2001 From: Adam Sindelar Date: Mon, 27 Jun 2022 18:39:12 +0200 Subject: [PATCH 117/282] selftests/vm: only run 128TBswitch with 5-level paging The test va_128TBswitch.c expects to be able to pass mmap an address hint and length that cross the address 1<<47. On x86_64, this is not possible without 5-level page tables, so the test fails. The test is already only run on 64-bit powerpc and x86_64 archs, but this patch adds an additional check on x86_64 that skips the test if PG_TABLE_LEVELS < 5. There is precedent for checking /proc/config.gz in selftests, e.g. in selftests/firmware. 
Running the tests produces the desired output: sudo make -C tools/testing/selftests TARGETS=vm run_tests --------------------------- running ./va_128TBswitch.sh --------------------------- ./va_128TBswitch.sh: PG_TABLE_LEVELS=4, must be >= 5 to run this test [SKIP] ------------------------------- [adam@wowsignal.io: restrict the check to x86_64] Link: https://lkml.kernel.org/r/20220628163654.337600-1-adam@wowsignal.io [adam@wowsignal.io: fix formatting issues, rename "die" to "fail"] Link: https://lkml.kernel.org/r/20220701163030.415735-1-adam@wowsignal.io Link: https://lkml.kernel.org/r/20220627163912.5581-1-adam@wowsignal.io Signed-off-by: Adam Sindelar Cc: Adam Sindelar Cc: David Vernet Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/run_vmtests.sh | 2 +- tools/testing/selftests/vm/va_128TBswitch.sh | 54 ++++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/vm/va_128TBswitch.sh diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 44f25acfbeca..6a34209379a4 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -93,6 +93,7 @@ TEST_PROGS := run_vmtests.sh TEST_FILES := test_vmalloc.sh TEST_FILES += test_hmm.sh +TEST_FILES += va_128TBswitch.sh KSFT_KHDR_INSTALL := 1 include ../lib.mk diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 41fce8bea929..27c01c35c7a9 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -151,7 +151,7 @@ if [ $VADDR64 -ne 0 ]; then run_test ./virtual_address_range # virtual address 128TB switch test - run_test ./va_128TBswitch + run_test ./va_128TBswitch.sh fi # VADDR64 # vmalloc stability smoke test diff --git a/tools/testing/selftests/vm/va_128TBswitch.sh b/tools/testing/selftests/vm/va_128TBswitch.sh new file mode 100644 
index 000000000000..41580751dc51 --- /dev/null +++ b/tools/testing/selftests/vm/va_128TBswitch.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2022 Adam Sindelar (Meta) +# +# This is a test for mmap behavior with 5-level paging. This script wraps the +# real test to check that the kernel is configured to support at least 5 +# pagetable levels. + +# 1 means the test failed +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +fail() +{ + echo "$1" + exit $exitcode +} + +check_supported_x86_64() +{ + local config="/proc/config.gz" + [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" + [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" + + # gzip -dcfq automatically handles both compressed and plaintext input. + # See man 1 gzip under '-f'. + local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) + + if [[ "${pg_table_levels}" -lt 5 ]]; then + echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" + exit $ksft_skip + fi +} + +check_test_requirements() +{ + # The test supports x86_64 and powerpc64. We currently have no useful + # eligibility check for powerpc64, and the test itself will reject other + # architectures. + case `uname -m` in + "x86_64") + check_supported_x86_64 + ;; + *) + return 0 + ;; + esac +} + +check_test_requirements +./va_128TBswitch From ec1658f0f90c5b855ad590930bf057514581521d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 14 Jul 2022 17:04:58 +0000 Subject: [PATCH 118/282] mm/damon/lru_sort: fix potential memory leak in damon_lru_sort_init() damon_lru_sort_init() returns an error when damon_select_ops() fails without freeing 'ctx' which allocated before. This commit fixes the potential memory leak by freeing 'ctx' under the situation. 
Link: https://lkml.kernel.org/r/20220714170458.49727-1-sj@kernel.org Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index c276736a071c..9de6f00a71c5 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -524,8 +524,10 @@ static int __init damon_lru_sort_init(void) if (!ctx) return -ENOMEM; - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); return -EINVAL; + } ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; From 30f6f8614a1d1f8ccae6642eaad629997aa33b3b Mon Sep 17 00:00:00 2001 From: Kalpana Shetty Date: Tue, 31 May 2022 15:55:56 +0530 Subject: [PATCH 119/282] selftests/vm: add protection_keys tests to run_vmtests Adding the "protection_keys" tests to "run_vmtests.sh" helps run all VM-related tests from a single shell script.
[kalpana.shetty@amd.com: Shuah Khan's review comments incorporated, added -x executable check] Link: https://lkml.kernel.org/r/20220617202931.357-1-kalpana.shetty@amd.com Link: https://lkml.kernel.org/r/20220610090704.296-1-kalpana.shetty@amd.com Link: https://lkml.kernel.org/r/20220531102556.388-1-kalpana.shetty@amd.com Signed-off-by: Kalpana Shetty Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/run_vmtests.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 27c01c35c7a9..2af563a9652e 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -179,4 +179,15 @@ run_test ./ksm_tests -N -m 1 # KSM test with 2 NUMA nodes and merge_across_nodes = 0 run_test ./ksm_tests -N -m 0 +# protection_keys tests +if [ -x ./protection_keys_32 ] +then + run_test ./protection_keys_32 +fi + +if [ -x ./protection_keys_64 ] +then + run_test ./protection_keys_64 +fi + exit $exitcode From 6077c943beee407168f72ece745b0aeaef6b896f Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:08 -0500 Subject: [PATCH 120/282] mm: rename is_pinnable_page() to is_longterm_pinnable_page() Patch series "Add MEMORY_DEVICE_COHERENT for coherent device memory mapping", v9. This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory owned by a device that can be mapped into CPU page tables like MEMORY_DEVICE_GENERIC and can also be migrated like MEMORY_DEVICE_PRIVATE. This patch series is mostly self-contained except for a few places where it needs to update other subsystems to handle the new memory type. System stability and performance are not affected according to our ongoing testing, including xfstests. How it works: The system BIOS advertises the GPU device memory (aka VRAM) as SPM (special purpose memory) in the UEFI system address map. 
The amdgpu driver registers the memory with devmap as MEMORY_DEVICE_COHERENT using devm_memremap_pages(). The initial user for this hardware page migration capability is the Frontier supercomputer project. This functionality is not AMD-specific. We expect other GPU vendors to find this functionality useful, and possibly other hardware types in the future. Our test nodes in the lab are similar to the Frontier configuration, with 0.5 TB of system memory plus 256 GB of device memory split across 4 GPUs, all in a single coherent address space. Page migration is expected to improve application efficiency significantly. We will report empirical results as they become available. Coherent device type pages at gup are now migrated back to system memory if they are being pinned long-term (FOLL_LONGTERM). The reason is that long-term pinning would interfere with the device memory manager owning the device-coherent pages (e.g. evictions in TTM). This series incorporates Alistair Popple's patches to do this migration from pin_user_pages() calls. hmm_gup_test has been added to hmm-test to test different get user pages calls. This series includes handling of device-managed anonymous pages returned by vm_normal_page(). Although they behave like normal pages for purposes of mapping in CPU page tables and for COW, they do not support LRU lists, NUMA migration or THP. We also introduced a FOLL_LRU flag that adds the same behaviour to follow_page and related APIs, to allow callers to specify that they expect to put pages on an LRU list. This patch (of 14): is_pinnable_page() and folio_is_pinnable() are renamed to is_longterm_pinnable_page() and folio_is_longterm_pinnable() respectively. These functions are used in the FOLL_LONGTERM flag context.
Link: https://lkml.kernel.org/r/20220715150521.18165-1-alex.sierra@amd.com Link: https://lkml.kernel.org/r/20220715150521.18165-2-alex.sierra@amd.com Signed-off-by: Alex Sierra Reviewed-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Felix Kuehling Cc: Ralph Campbell Cc: Christoph Hellwig Cc: Jerome Glisse Cc: Alistair Popple Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- mm/gup.c | 4 ++-- mm/gup_test.c | 2 +- mm/hugetlb.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9cc02a7e503b..3c044e38958c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1607,7 +1607,7 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma, /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ #ifdef CONFIG_MIGRATION -static inline bool is_pinnable_page(struct page *page) +static inline bool is_longterm_pinnable_page(struct page *page) { #ifdef CONFIG_CMA int mt = get_pageblock_migratetype(page); @@ -1618,15 +1618,15 @@ static inline bool is_pinnable_page(struct page *page) return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)); } #else -static inline bool is_pinnable_page(struct page *page) +static inline bool is_longterm_pinnable_page(struct page *page) { return true; } #endif -static inline bool folio_is_pinnable(struct folio *folio) +static inline bool folio_is_longterm_pinnable(struct folio *folio) { - return is_pinnable_page(&folio->page); + return is_longterm_pinnable_page(&folio->page); } static inline void set_page_zone(struct page *page, enum zone_type zone) diff --git a/mm/gup.c b/mm/gup.c index 3129b754ade3..a9940e3b3181 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -133,7 +133,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) * path. 
*/ if (unlikely((flags & FOLL_LONGTERM) && - !is_pinnable_page(page))) + !is_longterm_pinnable_page(page))) return NULL; /* @@ -1923,7 +1923,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, continue; prev_folio = folio; - if (folio_is_pinnable(folio)) + if (folio_is_longterm_pinnable(folio)) continue; /* diff --git a/mm/gup_test.c b/mm/gup_test.c index d974dec19e1c..12b0a91767d3 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -53,7 +53,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, dump_page(page, "gup_test failure"); break; } else if (cmd == PIN_LONGTERM_BENCHMARK && - WARN(!is_pinnable_page(page), + WARN(!is_longterm_pinnable_page(page), "pages[%lu] is NOT pinnable but pinned\n", i)) { dump_page(page, "gup_test failure"); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 064da8ffbac6..ffdf3fc4a83f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1129,7 +1129,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (pin && !is_pinnable_page(page)) + if (pin && !is_longterm_pinnable_page(page)) continue; if (PageHWPoison(page)) From 5bb88dc571b1cbf0284100a317fb21ab7d03e40c Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:09 -0500 Subject: [PATCH 121/282] mm: move page zone helpers from mm.h to mmzone.h It makes more sense to have these helpers in zone specific header file, rather than the generic mm.h Link: https://lkml.kernel.org/r/20220715150521.18165-3-alex.sierra@amd.com Signed-off-by: Alex Sierra Cc: Alistair Popple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- include/linux/memremap.h | 2 +- include/linux/mm.h | 78 --------------------------------------- include/linux/mmzone.h | 80 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 
81 insertions(+), 79 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 9f5ee49482de..732dde5988fb 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -2,7 +2,7 @@ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ -#include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c044e38958c..a2d01e49253b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1045,84 +1045,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); * back into memory. */ -/* - * The zone field is never updated after free_area_init_core() - * sets it, so none of the operations on it need to be atomic. - */ - -/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ -#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) -#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) -#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) -#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) -#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) - -/* - * Define the bit shifts to access each section. For non-existent - * sections we define the shift as 0; that plus a 0 mask ensures - * the compiler will optimise away reference to them. - */ -#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) -#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) -#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) -#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) -#define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) - -/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ -#ifdef NODE_NOT_IN_PAGE_FLAGS -#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \ - SECTIONS_PGOFF : ZONES_PGOFF) -#else -#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) -#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? 
\ - NODES_PGOFF : ZONES_PGOFF) -#endif - -#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) - -#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) -#define NODES_MASK ((1UL << NODES_WIDTH) - 1) -#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) -#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) -#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) - -static inline enum zone_type page_zonenum(const struct page *page) -{ - ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; -} - -static inline enum zone_type folio_zonenum(const struct folio *folio) -{ - return page_zonenum(&folio->page); -} - -#ifdef CONFIG_ZONE_DEVICE -static inline bool is_zone_device_page(const struct page *page) -{ - return page_zonenum(page) == ZONE_DEVICE; -} -extern void memmap_init_zone_device(struct zone *, unsigned long, - unsigned long, struct dev_pagemap *); -#else -static inline bool is_zone_device_page(const struct page *page) -{ - return false; -} -#endif - -static inline bool folio_is_zone_device(const struct folio *folio) -{ - return is_zone_device_page(&folio->page); -} - -static inline bool is_zone_movable_page(const struct page *page) -{ - return page_zonenum(page) == ZONE_MOVABLE; -} - #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 735bf5b37949..5da1135e6755 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -730,6 +730,86 @@ static inline bool zone_is_empty(struct zone *zone) return zone->spanned_pages == 0; } +#ifndef BUILD_VDSO32_64 +/* + * The zone field is never updated after free_area_init_core() + * sets it, so none of the operations on it need to be atomic. + */ + +/* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... 
| FLAGS | */ +#define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) +#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) +#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) +#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) +#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) + +/* + * Define the bit shifts to access each section. For non-existent + * sections we define the shift as 0; that plus a 0 mask ensures + * the compiler will optimise away reference to them. + */ +#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) +#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) +#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) +#define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) +#define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) + +/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ +#ifdef NODE_NOT_IN_PAGE_FLAGS +#define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) +#define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF) ? \ + SECTIONS_PGOFF : ZONES_PGOFF) +#else +#define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) +#define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF) ? 
\ + NODES_PGOFF : ZONES_PGOFF) +#endif + +#define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) + +#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) +#define NODES_MASK ((1UL << NODES_WIDTH) - 1) +#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) +#define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) +#define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) +#define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) + +static inline enum zone_type page_zonenum(const struct page *page) +{ + ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); + return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; +} + +static inline enum zone_type folio_zonenum(const struct folio *folio) +{ + return page_zonenum(&folio->page); +} + +#ifdef CONFIG_ZONE_DEVICE +static inline bool is_zone_device_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_DEVICE; +} +extern void memmap_init_zone_device(struct zone *, unsigned long, + unsigned long, struct dev_pagemap *); +#else +static inline bool is_zone_device_page(const struct page *page) +{ + return false; +} +#endif + +static inline bool folio_is_zone_device(const struct folio *folio) +{ + return is_zone_device_page(&folio->page); +} + +static inline bool is_zone_movable_page(const struct page *page) +{ + return page_zonenum(page) == ZONE_MOVABLE; +} +#endif + /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone From f25cbb7a95a24ff9a2a3bebd308e303942ae6b2c Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:10 -0500 Subject: [PATCH 122/282] mm: add zone device coherent type memory support Device memory that is cache coherent from device and CPU point of view. This is used on platforms that have an advanced system bus (like CAPI or CXL). Any page of a process can be migrated to such memory. However, no one should be allowed to pin such memory so that it can always be evicted. 
[hch@lst.de: rebased on top of the refcount changes, remove is_dev_private_or_coherent_page] Link: https://lkml.kernel.org/r/20220715150521.18165-4-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Acked-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- include/linux/memremap.h | 19 +++++++++++++++++++ include/linux/mm.h | 5 ++++- mm/memcontrol.c | 7 ++++--- mm/memory-failure.c | 8 ++++++-- mm/memremap.c | 10 ++++++++++ mm/migrate_device.c | 16 +++++++--------- mm/rmap.c | 5 +++-- 7 files changed, 53 insertions(+), 17 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 732dde5988fb..09320b7f706c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -41,6 +41,13 @@ struct vmem_altmap { * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/mm/hmm.rst. * + * MEMORY_DEVICE_COHERENT: + * Device memory that is cache coherent from device and CPU point of view. This + * is used on platforms that have an advanced system bus (like CAPI or CXL). A + * driver can hotplug the device memory using ZONE_DEVICE and with that memory + * type. Any page of a process can be migrated to such memory. However no one + * should be allowed to pin such memory so that it can always be evicted. + * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning.
In support of coordinating page @@ -61,6 +68,7 @@ struct vmem_altmap { enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, + MEMORY_DEVICE_COHERENT, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, @@ -150,6 +158,17 @@ static inline bool is_pci_p2pdma_page(const struct page *page) page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } +static inline bool is_device_coherent_page(const struct page *page) +{ + return is_zone_device_page(page) && + page->pgmap->type == MEMORY_DEVICE_COHERENT; +} + +static inline bool folio_is_device_coherent(const struct folio *folio) +{ + return is_device_coherent_page(&folio->page); +} + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); diff --git a/include/linux/mm.h b/include/linux/mm.h index a2d01e49253b..64393ed3330a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -28,6 +28,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1537,7 +1538,9 @@ static inline bool is_longterm_pinnable_page(struct page *page) if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif - return !is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)); + return !(is_device_coherent_page(page) || + is_zone_movable_page(page) || + is_zero_pfn(page_to_pfn(page))); } #else static inline bool is_longterm_pinnable_page(struct page *page) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1497affe08c4..b1868784f895 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5716,8 +5716,8 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE - * (so ZONE_DEVICE page and thus not on the lru). 
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and + * thus not on the lru. * For now we such page is charge like a regular page would be as for all * intent and purposes it is just special memory taking the place of a * regular page. @@ -5755,7 +5755,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page)) + if (is_device_private_page(page) || + is_device_coherent_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f7612ccdb299..b7ca5db7e60e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1686,12 +1686,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto unlock; } - if (pgmap->type == MEMORY_DEVICE_PRIVATE) { + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: /* - * TODO: Handle HMM pages which may need coordination + * TODO: Handle device pages which may need coordination * with device-side memory. 
*/ goto unlock; + default: + break; } /* diff --git a/mm/memremap.c b/mm/memremap.c index 8b5c8fd4ea8e..f0955785150f 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -315,6 +315,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(-EINVAL); } break; + case MEMORY_DEVICE_COHERENT: + if (!pgmap->ops->page_free) { + WARN(1, "Missing page_free method\n"); + return ERR_PTR(-EINVAL); + } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return ERR_PTR(-EINVAL); + } + break; case MEMORY_DEVICE_FS_DAX: if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) { WARN(1, "File system DAX not supported\n"); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5052093d0262..a4847ad65da3 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -518,7 +518,7 @@ EXPORT_SYMBOL(migrate_vma_setup); * handle_pte_fault() * do_anonymous_page() * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE - * private page. + * private or coherent page. */ static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, @@ -594,11 +594,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, page_to_pfn(page)); entry = swp_entry_to_pte(swp_entry); } else { - /* - * For now we only support migrating to un-addressable device - * memory. - */ - if (is_zone_device_page(page)) { + if (is_zone_device_page(page) && + !is_device_coherent_page(page)) { pr_warn_once("Unsupported ZONE_DEVICE page type.\n"); goto abort; } @@ -701,10 +698,11 @@ void migrate_vma_pages(struct migrate_vma *migrate) mapping = page_mapping(page); - if (is_device_private_page(newpage)) { + if (is_device_private_page(newpage) || + is_device_coherent_page(newpage)) { /* - * For now only support private anonymous when migrating - * to un-addressable device memory. + * For now only support anonymous memory migrating to + * device private or coherent memory. 
*/ if (mapping) { migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; diff --git a/mm/rmap.c b/mm/rmap.c index 83172ee0ea35..0532fd92ecb3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1953,7 +1953,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (folio_is_zone_device(folio)) { + if (folio_is_device_private(folio)) { unsigned long pfn = folio_pfn(folio); swp_entry_t entry; pte_t swp_pte; @@ -2124,7 +2124,8 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) TTU_SYNC))) return; - if (folio_is_zone_device(folio) && !folio_is_device_private(folio)) + if (folio_is_zone_device(folio) && + (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) return; /* From 3218f8712d6bba1812efd5e0d66c1e15134f2a91 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:11 -0500 Subject: [PATCH 123/282] mm: handling Non-LRU pages returned by vm_normal_pages With DEVICE_COHERENT, we'll soon have vm_normal_page() return device-managed anonymous pages that are not LRU pages. Although they behave like normal pages for purposes of mapping in CPU page tables and for COW, they do not support LRU lists, NUMA migration or THP. Callers to follow_page() currently don't expect ZONE_DEVICE pages; however, with DEVICE_COHERENT we might now return ZONE_DEVICE. Check for ZONE_DEVICE pages in applicable users of follow_page() as well.
Link: https://lkml.kernel.org/r/20220715150521.18165-5-alex.sierra@amd.com Signed-off-by: Alex Sierra Acked-by: Felix Kuehling [v2] Reviewed-by: Alistair Popple [v6] Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- mm/huge_memory.c | 2 +- mm/khugepaged.c | 9 ++++++--- mm/ksm.c | 6 +++--- mm/madvise.c | 4 ++-- mm/memory.c | 10 +++++++++- mm/mempolicy.c | 2 +- mm/migrate.c | 4 ++-- mm/mlock.c | 2 +- mm/mprotect.c | 2 +- 10 files changed, 27 insertions(+), 16 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 751c19d5bfdd..1d7fd832123b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1795,7 +1795,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return NULL; page = vm_normal_page(vma, addr, pte); - if (!page) + if (!page || is_zone_device_page(page)) return NULL; if (PageReserved(page)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 60d742c33de3..a563de8234c1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2910,7 +2910,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (IS_ERR(page)) continue; - if (!page) + if (!page || is_zone_device_page(page)) continue; if (!is_transparent_hugepage(page)) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 01e0d6336754..dea102170ab3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -611,7 +611,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; } @@ -1261,7 +1261,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, writable = true; page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) { + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = 
SCAN_PAGE_NULL; goto out_unmap; } @@ -1472,7 +1472,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) goto abort; page = vm_normal_page(vma, addr, *pte); - + if (WARN_ON_ONCE(page && is_zone_device_page(page))) + page = NULL; /* * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. @@ -1490,6 +1491,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (pte_none(*pte)) continue; page = vm_normal_page(vma, addr, *pte); + if (WARN_ON_ONCE(page && is_zone_device_page(page))) + goto abort; page_remove_rmap(page, vma, false); } diff --git a/mm/ksm.c b/mm/ksm.c index 8d2dc501c92c..55f1d9634869 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) cond_resched(); page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page)) + if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma, addr, @@ -560,7 +560,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) goto out; page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page)) + if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) goto out; if (PageAnon(page)) { flush_anon_page(vma, page, addr); @@ -2308,7 +2308,7 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (IS_ERR_OR_NULL(*page)) { + if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) { ksm_scan.address += PAGE_SIZE; cond_resched(); continue; diff --git a/mm/madvise.c b/mm/madvise.c index e55108d4e4b2..5f0f0948a50e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -421,7 +421,7 @@ regular_page: continue; page = vm_normal_page(vma, addr, ptent); - if (!page) + if (!page || is_zone_device_page(page)) continue; /* @@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, } page = 
vm_normal_page(vma, addr, ptent); - if (!page) + if (!page || is_zone_device_page(page)) continue; /* diff --git a/mm/memory.c b/mm/memory.c index 580c62febe42..dce0b2e686eb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -624,6 +624,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (is_zero_pfn(pfn)) return NULL; if (pte_devmap(pte)) + /* + * NOTE: New users of ZONE_DEVICE will not set pte_devmap() + * and will have refcounts incremented on their struct pages + * when they are inserted into PTEs, thus they are safe to + * return here. Legacy ZONE_DEVICE pages that set pte_devmap() + * do not have refcounts. Example of legacy ZONE_DEVICE is + * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers. + */ return NULL; print_bad_pte(vma, addr, pte, NULL); @@ -4693,7 +4701,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) pte = pte_modify(old_pte, vma->vm_page_prot); page = vm_normal_page(vma, vmf->address, pte); - if (!page) + if (!page || is_zone_device_page(page)) goto out_map; /* TODO: handle PTE-mapped THP */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f4cd963550c1..88a5173c6ff0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -523,7 +523,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - if (!page) + if (!page || is_zone_device_page(page)) continue; /* * vm_normal_page() filters out zero pages, but there might diff --git a/mm/migrate.c b/mm/migrate.c index 7934eebf1689..1649270bc1a7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1630,7 +1630,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, goto out; err = -ENOENT; - if (!page) + if (!page || is_zone_device_page(page)) goto out; err = 0; @@ -1821,7 +1821,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (IS_ERR(page)) goto set_status; - if (page) { + if (page && !is_zone_device_page(page)) { err = 
page_to_nid(page); put_page(page); } else { diff --git a/mm/mlock.c b/mm/mlock.c index 716caf851043..b14e929084cc 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); - if (!page) + if (!page || is_zone_device_page(page)) continue; if (PageTransCompound(page)) continue; diff --git a/mm/mprotect.c b/mm/mprotect.c index 996a97e213ad..5ef478b06a7d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -127,7 +127,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, continue; page = vm_normal_page(vma, addr, oldpte); - if (!page || PageKsm(page)) + if (!page || is_zone_device_page(page) || PageKsm(page)) continue; /* Also skip shared copy-on-write pages */ From dd19e6d8ffaa1289d75d7833de97faf1b6b2c8e4 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:12 -0500 Subject: [PATCH 124/282] mm: add device coherent vma selection for memory migration This case is used to migrate pages from device memory, back to system memory. Device coherent type memory is cache coherent from device and CPU point of view. 
Link: https://lkml.kernel.org/r/20220715150521.18165-6-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Reviewed-by: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 + mm/migrate_device.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 069a89e847f3..b84908debe5c 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -148,6 +148,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn) enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, + MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, }; struct migrate_vma { diff --git a/mm/migrate_device.c b/mm/migrate_device.c index a4847ad65da3..18bc6483f63a 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -148,15 +148,21 @@ again: if (is_writable_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { - if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) - goto next; pfn = pte_pfn(pte); - if (is_zero_pfn(pfn)) { + if (is_zero_pfn(pfn) && + (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) { mpfn = MIGRATE_PFN_MIGRATE; migrate->cpages++; goto next; } page = vm_normal_page(migrate->vma, addr, pte); + if (page && !is_zone_device_page(page) && + !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) + goto next; + else if (page && is_device_coherent_page(page) && + (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) || + page->pgmap->owner != migrate->pgmap_owner)) + goto next; mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ?
MIGRATE_PFN_WRITE : 0; } From b05a79d4377f6dcc30683008ffd1c531ea965393 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 15 Jul 2022 10:05:13 -0500 Subject: [PATCH 125/282] mm/gup: migrate device coherent pages when pinning instead of failing Currently any attempts to pin a device coherent page will fail. This is because device coherent pages need to be managed by a device driver, and pinning them would prevent a driver from migrating them off the device. However this is no reason to fail pinning of these pages. These are coherent and accessible from the CPU so can be migrated just like pinning ZONE_MOVABLE pages. So instead of failing all attempts to pin them first try migrating them out of ZONE_DEVICE. [hch@lst.de: rebased to the split device memory checks, moved migrate_device_page to migrate_device.c] Link: https://lkml.kernel.org/r/20220715150521.18165-7-alex.sierra@amd.com Signed-off-by: Alistair Popple Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- mm/gup.c | 50 +++++++++++++++++++++++++++++++++++++------ mm/internal.h | 1 + mm/migrate_device.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 7 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index a9940e3b3181..ecf362688268 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1913,7 +1913,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, unsigned long isolation_error_count = 0, i; struct folio *prev_folio = NULL; LIST_HEAD(movable_page_list); - bool drain_allow = true; + bool drain_allow = true, coherent_pages = false; int ret = 0; for (i = 0; i < nr_pages; i++) { @@ -1923,9 +1923,38 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, continue; prev_folio = folio; + /* + * Device coherent pages are managed by a driver and should not + * be pinned indefinitely as it prevents the driver moving the + 
* page. So when trying to pin with FOLL_LONGTERM instead try + * to migrate the page out of device memory. + */ + if (folio_is_device_coherent(folio)) { + /* + * We always want a new GUP lookup with device coherent + * pages. + */ + pages[i] = 0; + coherent_pages = true; + + /* + * Migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference. + */ + if (gup_flags & FOLL_PIN) { + get_page(&folio->page); + unpin_user_page(&folio->page); + } + + ret = migrate_device_coherent_page(&folio->page); + if (ret) + goto unpin_pages; + + continue; + } + if (folio_is_longterm_pinnable(folio)) continue; - /* * Try to move out any movable page before pinning the range. */ @@ -1951,7 +1980,8 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, folio_nr_pages(folio)); } - if (!list_empty(&movable_page_list) || isolation_error_count) + if (!list_empty(&movable_page_list) || isolation_error_count + || coherent_pages) goto unpin_pages; /* @@ -1961,10 +1991,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, return nr_pages; unpin_pages: - if (gup_flags & FOLL_PIN) { - unpin_user_pages(pages, nr_pages); - } else { - for (i = 0; i < nr_pages; i++) + /* + * pages[i] might be NULL if any device coherent pages were found. 
+ */ + for (i = 0; i < nr_pages; i++) { + if (!pages[i]) + continue; + + if (gup_flags & FOLL_PIN) + unpin_user_page(pages[i]); + else put_page(pages[i]); } diff --git a/mm/internal.h b/mm/internal.h index c0f8fbe0445b..899dab512c5a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -853,6 +853,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); void free_zone_device_page(struct page *page); +int migrate_device_coherent_page(struct page *page); /* * mm/gup.c diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 18bc6483f63a..7feeb447e3b9 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -686,6 +686,12 @@ void migrate_vma_pages(struct migrate_vma *migrate) } if (!page) { + /* + * The only time there is no vma is when called from + * migrate_device_coherent_page(). However this isn't + * called if the page could not be unmapped. + */ + VM_BUG_ON(!migrate->vma); if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) continue; if (!notified) { @@ -794,3 +800,49 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } } EXPORT_SYMBOL(migrate_vma_finalize); + +/* + * Migrate a device coherent page back to normal memory. The caller should have + * a reference on page which will be copied to the new page if migration is + * successful or dropped on failure. + */ +int migrate_device_coherent_page(struct page *page) +{ + unsigned long src_pfn, dst_pfn = 0; + struct migrate_vma args; + struct page *dpage; + + WARN_ON_ONCE(PageCompound(page)); + + lock_page(page); + src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; + args.src = &src_pfn; + args.dst = &dst_pfn; + args.cpages = 1; + args.npages = 1; + args.vma = NULL; + + /* + * We don't have a VMA and don't need to walk the page tables to find + * the source page. So call migrate_vma_unmap() directly to unmap the + * page as migrate_vma_setup() will fail if args.vma == NULL. 
+ */ migrate_vma_unmap(&args); + if (!(src_pfn & MIGRATE_PFN_MIGRATE)) + return -EBUSY; + + dpage = alloc_page(GFP_USER | __GFP_NOWARN); + if (dpage) { + lock_page(dpage); + dst_pfn = migrate_pfn(page_to_pfn(dpage)); + } + + migrate_vma_pages(&args); + if (src_pfn & MIGRATE_PFN_MIGRATE) + copy_highpage(dpage, page); + migrate_vma_finalize(&args); + + if (src_pfn & MIGRATE_PFN_MIGRATE) + return 0; + return -EBUSY; +} From c83dee9b639469b6563a281f39deb99311f16bc4 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:14 -0500 Subject: [PATCH 126/282] drm/amdkfd: add SPM support for SVM When CPU is connected through XGMI, it has coherent access to VRAM resource. In this case that resource is taken from a table in the device gmc aperture base. This resource is used along with the device type, which could be DEVICE_PRIVATE or DEVICE_COHERENT to create the device page map region. Also, MIGRATE_VMA_SELECT_DEVICE_COHERENT flag is selected for coherent type case during migration to device.
Link: https://lkml.kernel.org/r/20220715150521.18165-8-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Reviewed-by: Felix Kuehling Cc: Alistair Popple Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 34 +++++++++++++++--------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index e44376c2ecdc..f73e3e340413 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -671,13 +671,15 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, migrate.vma = vma; migrate.start = start; migrate.end = end; - migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev); + if (adev->gmc.xgmi.connected_to_cpu) + migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT; + else + migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t), GFP_KERNEL); - if (!buf) goto out; @@ -947,7 +949,7 @@ int svm_migrate_init(struct amdgpu_device *adev) { struct kfd_dev *kfddev = adev->kfd.dev; struct dev_pagemap *pgmap; - struct resource *res; + struct resource *res = NULL; unsigned long size; void *r; @@ -962,28 +964,34 @@ int svm_migrate_init(struct amdgpu_device *adev) * should remove reserved size */ size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20); - res = devm_request_free_mem_region(adev->dev, &iomem_resource, size); - if (IS_ERR(res)) - return -ENOMEM; + if (adev->gmc.xgmi.connected_to_cpu) { + pgmap->range.start = adev->gmc.aper_base; + pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1; + pgmap->type = MEMORY_DEVICE_COHERENT; + } else { + res = devm_request_free_mem_region(adev->dev, &iomem_resource, size); + if (IS_ERR(res)) + return 
-ENOMEM; + pgmap->range.start = res->start; + pgmap->range.end = res->end; + pgmap->type = MEMORY_DEVICE_PRIVATE; + } - pgmap->type = MEMORY_DEVICE_PRIVATE; pgmap->nr_range = 1; - pgmap->range.start = res->start; - pgmap->range.end = res->end; pgmap->ops = &svm_migrate_pgmap_ops; pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev); - pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; - + pgmap->flags = 0; /* Device manager releases device-specific resources, memory region and * pgmap when driver disconnects from device. */ r = devm_memremap_pages(adev->dev, pgmap); if (IS_ERR(r)) { pr_err("failed to register HMM device memory\n"); - /* Disable SVM support capability */ pgmap->type = 0; - devm_release_mem_region(adev->dev, res->start, resource_size(res)); + if (pgmap->type == MEMORY_DEVICE_PRIVATE) + devm_release_mem_region(adev->dev, res->start, + res->end - res->start + 1); return PTR_ERR(r); } From 188f48268d04f3851cf30151af9025f1f661cc6f Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:15 -0500 Subject: [PATCH 127/282] lib: test_hmm add ioctl to get zone device type Add new ioctl cmd to query zone device type. This will be used once the test_hmm adds zone device coherent type. 
Link: https://lkml.kernel.org/r/20220715150521.18165-9-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- lib/test_hmm.c | 11 +++++++++-- lib/test_hmm_uapi.h | 14 ++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index f2c3015c5c82..ed737eae5959 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -87,6 +87,7 @@ struct dmirror_chunk { struct dmirror_device { struct cdev cdevice; struct hmm_devmem *devmem; + unsigned int zone_device_type; unsigned int devmem_capacity; unsigned int devmem_count; @@ -1266,14 +1267,20 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) static int __init hmm_dmirror_init(void) { int ret; - int id; + int id = 0; + int ndevices = 0; ret = alloc_chrdev_region(&dmirror_dev, 0, DMIRROR_NDEVICES, "HMM_DMIRROR"); if (ret) goto err_unreg; - for (id = 0; id < DMIRROR_NDEVICES; id++) { + memset(dmirror_devices, 0, DMIRROR_NDEVICES * sizeof(dmirror_devices[0])); + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + for (id = 0; id < ndevices; id++) { ret = dmirror_device_init(dmirror_devices + id, id); if (ret) goto err_chrdev; diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index f14dea5dcd06..0511af7464ee 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -31,10 +31,11 @@ struct hmm_dmirror_cmd { /* Expose the address space of the calling process through hmm device file */ #define HMM_DMIRROR_READ _IOWR('H', 0x00, struct hmm_dmirror_cmd) #define HMM_DMIRROR_WRITE _IOWR('H', 0x01, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_MIGRATE _IOWR('H', 0x02, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x03, struct
hmm_dmirror_cmd) -#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x04, struct hmm_dmirror_cmd) -#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_DEV _IOWR('H', 0x02, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_MIGRATE_TO_SYS _IOWR('H', 0x03, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_SNAPSHOT _IOWR('H', 0x04, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_EXCLUSIVE _IOWR('H', 0x05, struct hmm_dmirror_cmd) +#define HMM_DMIRROR_CHECK_EXCLUSIVE _IOWR('H', 0x06, struct hmm_dmirror_cmd) /* * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT. @@ -62,4 +63,9 @@ enum { HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, }; +enum { + /* 0 is reserved to catch uninitialized type fields */ + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1, +}; + #endif /* _LIB_TEST_HMM_UAPI_H */ From 25b80162d550408058b2d8fff2e63807bceda64c Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:16 -0500 Subject: [PATCH 128/282] lib: test_hmm add module param for zone device type In order to configure device coherent in test_hmm, two module parameters should be passed, which correspond to the SPM start address of each of the two devices: spm_addr_dev0 & spm_addr_dev1. If no parameters are passed, private device type is configured.
Link: https://lkml.kernel.org/r/20220715150521.18165-10-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Poppple Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- lib/test_hmm.c | 73 ++++++++++++++++++++++++++++++++------------- lib/test_hmm_uapi.h | 1 + 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index ed737eae5959..436124da00e6 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -37,6 +37,16 @@ #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) #define DEVMEM_CHUNKS_RESERVE 16 +static unsigned long spm_addr_dev0; +module_param(spm_addr_dev0, long, 0644); +MODULE_PARM_DESC(spm_addr_dev0, + "Specify start address for SPM (special purpose memory) used for device 0. By setting this Coherent device type will be used. Make sure spm_addr_dev1 is set too. Minimum SPM size should be DEVMEM_CHUNK_SIZE."); + +static unsigned long spm_addr_dev1; +module_param(spm_addr_dev1, long, 0644); +MODULE_PARM_DESC(spm_addr_dev1, + "Specify start address for SPM (special purpose memory) used for device 1. By setting this Coherent device type will be used. Make sure spm_addr_dev0 is set too. 
Minimum SPM size should be DEVMEM_CHUNK_SIZE."); + static const struct dev_pagemap_ops dmirror_devmem_ops; static const struct mmu_interval_notifier_ops dmirror_min_ops; static dev_t dmirror_dev; @@ -455,28 +465,44 @@ fini: return ret; } -static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, +static int dmirror_allocate_chunk(struct dmirror_device *mdevice, struct page **ppage) { struct dmirror_chunk *devmem; - struct resource *res; + struct resource *res = NULL; unsigned long pfn; unsigned long pfn_first; unsigned long pfn_last; void *ptr; + int ret = -ENOMEM; devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); if (!devmem) - return false; + return ret; - res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, - "hmm_dmirror"); - if (IS_ERR(res)) + switch (mdevice->zone_device_type) { + case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE: + res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE, + "hmm_dmirror"); + if (IS_ERR_OR_NULL(res)) + goto err_devmem; + devmem->pagemap.range.start = res->start; + devmem->pagemap.range.end = res->end; + devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; + break; + case HMM_DMIRROR_MEMORY_DEVICE_COHERENT: + devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ? 
+ spm_addr_dev0 : + spm_addr_dev1; + devmem->pagemap.range.end = devmem->pagemap.range.start + + DEVMEM_CHUNK_SIZE - 1; + devmem->pagemap.type = MEMORY_DEVICE_COHERENT; + break; + default: + ret = -EINVAL; goto err_devmem; + } - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.range.start = res->start; - devmem->pagemap.range.end = res->end; devmem->pagemap.nr_range = 1; devmem->pagemap.ops = &dmirror_devmem_ops; devmem->pagemap.owner = mdevice; @@ -497,10 +523,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, mdevice->devmem_capacity = new_capacity; mdevice->devmem_chunks = new_chunks; } - ptr = memremap_pages(&devmem->pagemap, numa_node_id()); - if (IS_ERR(ptr)) + if (IS_ERR_OR_NULL(ptr)) { + if (ptr) + ret = PTR_ERR(ptr); + else + ret = -EFAULT; goto err_release; + } devmem->mdevice = mdevice; pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT; @@ -529,15 +559,17 @@ static bool dmirror_allocate_chunk(struct dmirror_device *mdevice, } spin_unlock(&mdevice->lock); - return true; + return 0; err_release: mutex_unlock(&mdevice->devmem_lock); - release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range)); + if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); err_devmem: kfree(devmem); - return false; + return ret; } static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) @@ -562,7 +594,7 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) spin_unlock(&mdevice->lock); } else { spin_unlock(&mdevice->lock); - if (!dmirror_allocate_chunk(mdevice, &dpage)) + if (dmirror_allocate_chunk(mdevice, &dpage)) goto error; } @@ -1238,10 +1270,8 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) if (ret) return ret; - /* Build a list of free ZONE_DEVICE private struct pages */ - dmirror_allocate_chunk(mdevice, NULL); - - return 0; + /* Build a list of 
free ZONE_DEVICE struct pages */ + return dmirror_allocate_chunk(mdevice, NULL); } static void dmirror_device_remove(struct dmirror_device *mdevice) @@ -1254,8 +1284,9 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) mdevice->devmem_chunks[i]; memunmap_pages(&devmem->pagemap); - release_mem_region(devmem->pagemap.range.start, - range_len(&devmem->pagemap.range)); + if (devmem->pagemap.type == MEMORY_DEVICE_PRIVATE) + release_mem_region(devmem->pagemap.range.start, + range_len(&devmem->pagemap.range)); kfree(devmem); } kfree(mdevice->devmem_chunks); diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index 0511af7464ee..f700da7807c1 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -66,6 +66,7 @@ enum { enum { /* 0 is reserved to catch uninitialized type fields */ HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1, + HMM_DMIRROR_MEMORY_DEVICE_COHERENT, }; #endif /* _LIB_TEST_HMM_UAPI_H */ From 4c2e0f764eb4444272cfd2fa4afeb84c453a1a34 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:17 -0500 Subject: [PATCH 129/282] lib: add support for device coherent type in test_hmm Device Coherent type uses device memory that is coherently accesible by the CPU. This could be shown as SP (special purpose) memory range at the BIOS-e820 memory enumeration. If no SP memory is supported in system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP. Currently, test_hmm only supports two different SP ranges of at least 256MB size. This could be specified in the kernel parameter variable efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 & 0x140000000 physical address. Ex. efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000 Private and coherent device mirror instances can be created in the same probed. This is done by passing the module parameters spm_addr_dev0 & spm_addr_dev1. In this case, it will create four instances of device_mirror. The first two correspond to private device type, the last two to coherent type. 
Then, they can be easily accessed from user space through /dev/hmm_mirror. Usually num_device 0 and 1 are for private, and 2 and 3 for coherent types. If no module parameters are passed, two instances of private type device_mirror will be created only. Link: https://lkml.kernel.org/r/20220715150521.18165-11-alex.sierra@amd.com Signed-off-by: Alex Sierra Acked-by: Felix Kuehling Reviewed-by: Alistair Poppple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- lib/test_hmm.c | 253 +++++++++++++++++++++++++++++++++----------- lib/test_hmm_uapi.h | 4 + 2 files changed, 196 insertions(+), 61 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 436124da00e6..e3965cafd27c 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -32,11 +32,22 @@ #include "test_hmm_uapi.h" -#define DMIRROR_NDEVICES 2 +#define DMIRROR_NDEVICES 4 #define DMIRROR_RANGE_FAULT_TIMEOUT 1000 #define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) #define DEVMEM_CHUNKS_RESERVE 16 +/* + * For device_private pages, dpage is just a dummy struct page + * representing a piece of device memory. dmirror_devmem_alloc_page + * allocates a real system memory page as backing storage to fake a + * real device. zone_device_data points to that backing page. But + * for device_coherent memory, the struct page represents real + * physical CPU-accessible memory that we can use directly. + */ +#define BACKING_PAGE(page) (is_device_private_page((page)) ? \ + (page)->zone_device_data : (page)) + static unsigned long spm_addr_dev0; module_param(spm_addr_dev0, long, 0644); MODULE_PARM_DESC(spm_addr_dev0, @@ -125,6 +136,21 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce, return 0; } +static bool dmirror_is_private_zone(struct dmirror_device *mdevice) +{ + return (mdevice->zone_device_type == + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? 
true : false; +} + +static enum migrate_vma_direction +dmirror_select_device(struct dmirror *dmirror) +{ + return (dmirror->mdevice->zone_device_type == + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? + MIGRATE_VMA_SELECT_DEVICE_PRIVATE : + MIGRATE_VMA_SELECT_DEVICE_COHERENT; +} + static void dmirror_bounce_fini(struct dmirror_bounce *bounce) { vfree(bounce->ptr); @@ -575,16 +601,19 @@ err_devmem: static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) { struct page *dpage = NULL; - struct page *rpage; + struct page *rpage = NULL; /* - * This is a fake device so we alloc real system memory to store - * our device memory. + * For ZONE_DEVICE private type, this is a fake device so we allocate + * real system memory to store our device memory. + * For ZONE_DEVICE coherent type we use the actual dpage to store the + * data and ignore rpage. */ - rpage = alloc_page(GFP_HIGHUSER); - if (!rpage) - return NULL; - + if (dmirror_is_private_zone(mdevice)) { + rpage = alloc_page(GFP_HIGHUSER); + if (!rpage) + return NULL; + } spin_lock(&mdevice->lock); if (mdevice->free_pages) { @@ -603,7 +632,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) return dpage; error: - __free_page(rpage); + if (rpage) + __free_page(rpage); return NULL; } @@ -629,12 +659,16 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, * unallocated pte_none() or read-only zero page. 
*/ spage = migrate_pfn_to_page(*src); + if (WARN(spage && is_zone_device_page(spage), + "page already in device spage pfn: 0x%lx\n", + page_to_pfn(spage))) + continue; dpage = dmirror_devmem_alloc_page(mdevice); if (!dpage) continue; - rpage = dpage->zone_device_data; + rpage = BACKING_PAGE(dpage); if (spage) copy_highpage(rpage, spage); else @@ -648,6 +682,8 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, */ rpage->zone_device_data = dmirror; + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); *dst = migrate_pfn(page_to_pfn(dpage)); if ((*src & MIGRATE_PFN_WRITE) || (!spage && args->vma->vm_flags & VM_WRITE)) @@ -725,11 +761,7 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, if (!dpage) continue; - /* - * Store the page that holds the data so the page table - * doesn't have to deal with ZONE_DEVICE private pages. - */ - entry = dpage->zone_device_data; + entry = BACKING_PAGE(dpage); if (*dst & MIGRATE_PFN_WRITE) entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); @@ -815,15 +847,126 @@ static int dmirror_exclusive(struct dmirror *dmirror, return ret; } -static int dmirror_migrate(struct dmirror *dmirror, - struct hmm_dmirror_cmd *cmd) +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, + struct dmirror *dmirror) +{ + const unsigned long *src = args->src; + unsigned long *dst = args->dst; + unsigned long start = args->start; + unsigned long end = args->end; + unsigned long addr; + + for (addr = start; addr < end; addr += PAGE_SIZE, + src++, dst++) { + struct page *dpage, *spage; + + spage = migrate_pfn_to_page(*src); + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) + continue; + + if (WARN_ON(!is_device_private_page(spage) && + !is_device_coherent_page(spage))) + continue; + spage = BACKING_PAGE(spage); + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); + if 
(!dpage) + continue; + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", + page_to_pfn(spage), page_to_pfn(dpage)); + + lock_page(dpage); + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); + copy_highpage(dpage, spage); + *dst = migrate_pfn(page_to_pfn(dpage)); + if (*src & MIGRATE_PFN_WRITE) + *dst |= MIGRATE_PFN_WRITE; + } + return 0; +} + +static unsigned long +dmirror_successful_migrated_pages(struct migrate_vma *migrate) +{ + unsigned long cpages = 0; + unsigned long i; + + for (i = 0; i < migrate->npages; i++) { + if (migrate->src[i] & MIGRATE_PFN_VALID && + migrate->src[i] & MIGRATE_PFN_MIGRATE) + cpages++; + } + return cpages; +} + +static int dmirror_migrate_to_system(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) { unsigned long start, end, addr; unsigned long size = cmd->npages << PAGE_SHIFT; struct mm_struct *mm = dmirror->notifier.mm; struct vm_area_struct *vma; - unsigned long src_pfns[64]; - unsigned long dst_pfns[64]; + unsigned long src_pfns[64] = { 0 }; + unsigned long dst_pfns[64] = { 0 }; + struct migrate_vma args; + unsigned long next; + int ret; + + start = cmd->addr; + end = start + size; + if (end < start) + return -EINVAL; + + /* Since the mm is for the mirrored process, get a reference first. 
*/ + if (!mmget_not_zero(mm)) + return -EINVAL; + + cmd->cpages = 0; + mmap_read_lock(mm); + for (addr = start; addr < end; addr = next) { + vma = vma_lookup(mm, addr); + if (!vma || !(vma->vm_flags & VM_READ)) { + ret = -EINVAL; + goto out; + } + next = min(end, addr + (ARRAY_SIZE(src_pfns) << PAGE_SHIFT)); + if (next > vma->vm_end) + next = vma->vm_end; + + args.vma = vma; + args.src = src_pfns; + args.dst = dst_pfns; + args.start = addr; + args.end = next; + args.pgmap_owner = dmirror->mdevice; + args.flags = dmirror_select_device(dmirror); + + ret = migrate_vma_setup(&args); + if (ret) + goto out; + + pr_debug("Migrating from device mem to sys mem\n"); + dmirror_devmem_fault_alloc_and_copy(&args, dmirror); + + migrate_vma_pages(&args); + cmd->cpages += dmirror_successful_migrated_pages(&args); + migrate_vma_finalize(&args); + } +out: + mmap_read_unlock(mm); + mmput(mm); + + return ret; +} + +static int dmirror_migrate_to_device(struct dmirror *dmirror, + struct hmm_dmirror_cmd *cmd) +{ + unsigned long start, end, addr; + unsigned long size = cmd->npages << PAGE_SHIFT; + struct mm_struct *mm = dmirror->notifier.mm; + struct vm_area_struct *vma; + unsigned long src_pfns[64] = { 0 }; + unsigned long dst_pfns[64] = { 0 }; struct dmirror_bounce bounce; struct migrate_vma args; unsigned long next; @@ -860,6 +1003,7 @@ static int dmirror_migrate(struct dmirror *dmirror, if (ret) goto out; + pr_debug("Migrating from sys mem to device mem\n"); dmirror_migrate_alloc_and_copy(&args, dmirror); migrate_vma_pages(&args); dmirror_migrate_finalize_and_map(&args, dmirror); @@ -868,7 +1012,10 @@ static int dmirror_migrate(struct dmirror *dmirror, mmap_read_unlock(mm); mmput(mm); - /* Return the migrated data for verification. */ + /* + * Return the migrated data for verification. 
+ * Only for pages in device zone + */ ret = dmirror_bounce_init(&bounce, start, size); if (ret) return ret; @@ -911,6 +1058,12 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL; else *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE; + } else if (is_device_coherent_page(page)) { + /* Is the page migrated to this device or some other? */ + if (dmirror->mdevice == dmirror_page_to_device(page)) + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL; + else + *perm = HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE; } else if (is_zero_pfn(page_to_pfn(page))) *perm = HMM_DMIRROR_PROT_ZERO; else @@ -1098,8 +1251,12 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, ret = dmirror_write(dmirror, &cmd); break; - case HMM_DMIRROR_MIGRATE: - ret = dmirror_migrate(dmirror, &cmd); + case HMM_DMIRROR_MIGRATE_TO_DEV: + ret = dmirror_migrate_to_device(dmirror, &cmd); + break; + + case HMM_DMIRROR_MIGRATE_TO_SYS: + ret = dmirror_migrate_to_system(dmirror, &cmd); break; case HMM_DMIRROR_EXCLUSIVE: @@ -1161,14 +1318,13 @@ static const struct file_operations dmirror_fops = { static void dmirror_devmem_free(struct page *page) { - struct page *rpage = page->zone_device_data; + struct page *rpage = BACKING_PAGE(page); struct dmirror_device *mdevice; - if (rpage) + if (rpage != page) __free_page(rpage); mdevice = dmirror_page_to_device(page); - spin_lock(&mdevice->lock); mdevice->cfree++; page->zone_device_data = mdevice->free_pages; @@ -1176,43 +1332,11 @@ static void dmirror_devmem_free(struct page *page) spin_unlock(&mdevice->lock); } -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, - struct dmirror *dmirror) -{ - const unsigned long *src = args->src; - unsigned long *dst = args->dst; - unsigned long start = args->start; - unsigned long end = args->end; - unsigned long addr; - - for (addr = start; addr < end; addr += PAGE_SIZE, - src++, dst++) { - struct page *dpage, *spage; - - spage = 
migrate_pfn_to_page(*src); - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) - continue; - spage = spage->zone_device_data; - - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); - if (!dpage) - continue; - - lock_page(dpage); - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); - copy_highpage(dpage, spage); - *dst = migrate_pfn(page_to_pfn(dpage)); - if (*src & MIGRATE_PFN_WRITE) - *dst |= MIGRATE_PFN_WRITE; - } - return 0; -} - static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { struct migrate_vma args; - unsigned long src_pfns; - unsigned long dst_pfns; + unsigned long src_pfns = 0; + unsigned long dst_pfns = 0; struct page *rpage; struct dmirror *dmirror; vm_fault_t ret; @@ -1232,7 +1356,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.src = &src_pfns; args.dst = &dst_pfns; args.pgmap_owner = dmirror->mdevice; - args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + args.flags = dmirror_select_device(dmirror); if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; @@ -1311,6 +1435,12 @@ static int __init hmm_dmirror_init(void) HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; dmirror_devices[ndevices++].zone_device_type = HMM_DMIRROR_MEMORY_DEVICE_PRIVATE; + if (spm_addr_dev0 && spm_addr_dev1) { + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; + dmirror_devices[ndevices++].zone_device_type = + HMM_DMIRROR_MEMORY_DEVICE_COHERENT; + } for (id = 0; id < ndevices; id++) { ret = dmirror_device_init(dmirror_devices + id, id); if (ret) @@ -1333,7 +1463,8 @@ static void __exit hmm_dmirror_exit(void) int id; for (id = 0; id < DMIRROR_NDEVICES; id++) - dmirror_device_remove(dmirror_devices + id); + if (dmirror_devices[id].zone_device_type) + dmirror_device_remove(dmirror_devices + id); unregister_chrdev_region(dmirror_dev, DMIRROR_NDEVICES); } diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h index f700da7807c1..e31d58c9034a 100644 --- a/lib/test_hmm_uapi.h +++ b/lib/test_hmm_uapi.h @@ -50,6 +50,8 @@ struct 
hmm_dmirror_cmd { * device the ioctl() is made * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some * other device + * HMM_DMIRROR_PROT_DEV_COHERENT: Migrate device coherent page on the device + * the ioctl() is made */ enum { HMM_DMIRROR_PROT_ERROR = 0xFF, @@ -61,6 +63,8 @@ enum { HMM_DMIRROR_PROT_ZERO = 0x10, HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20, HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, + HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL = 0x40, + HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE = 0x50, }; enum { From f70dab3c015153ad4837f305e6d65cfaff573dac Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:18 -0500 Subject: [PATCH 130/282] tools: update hmm-test to support device coherent type Test cases such as migrate_fault and migrate_multiple were modified to explicitly migrate from device to sys memory without the need of page faults, when using device coherent type. The snapshot test case was updated to read the memory device type first and, based on that, get the proper returned results. The migrate_ping_pong test case was added to test explicit migration from device to sys memory for both private and coherent zone types. Helpers to migrate from device to sys memory and vice versa were also added.
Link: https://lkml.kernel.org/r/20220715150521.18165-12-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hmm-tests.c | 121 ++++++++++++++++++++----- 1 file changed, 100 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 203323967b50..4b547188ec40 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -46,6 +46,13 @@ struct hmm_buffer { uint64_t faults; }; +enum { + HMM_PRIVATE_DEVICE_ONE, + HMM_PRIVATE_DEVICE_TWO, + HMM_COHERENCE_DEVICE_ONE, + HMM_COHERENCE_DEVICE_TWO, +}; + #define TWOMEG (1 << 21) #define HMM_BUFFER_SIZE (1024 << 12) #define HMM_PATH_MAX 64 @@ -60,6 +67,21 @@ FIXTURE(hmm) unsigned int page_shift; }; +FIXTURE_VARIANT(hmm) +{ + int device_number; +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_private) +{ + .device_number = HMM_PRIVATE_DEVICE_ONE, +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent) +{ + .device_number = HMM_COHERENCE_DEVICE_ONE, +}; + FIXTURE(hmm2) { int fd0; @@ -68,6 +90,24 @@ FIXTURE(hmm2) unsigned int page_shift; }; +FIXTURE_VARIANT(hmm2) +{ + int device_number0; + int device_number1; +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private) +{ + .device_number0 = HMM_PRIVATE_DEVICE_ONE, + .device_number1 = HMM_PRIVATE_DEVICE_TWO, +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent) +{ + .device_number0 = HMM_COHERENCE_DEVICE_ONE, + .device_number1 = HMM_COHERENCE_DEVICE_TWO, +}; + static int hmm_open(int unit) { char pathname[HMM_PATH_MAX]; @@ -81,12 +121,19 @@ static int hmm_open(int unit) return fd; } +static bool hmm_is_coherent_type(int dev_num) +{ + return (dev_num >= HMM_COHERENCE_DEVICE_ONE); +} + FIXTURE_SETUP(hmm) { self->page_size = sysconf(_SC_PAGE_SIZE); 
self->page_shift = ffs(self->page_size) - 1; - self->fd = hmm_open(0); + self->fd = hmm_open(variant->device_number); + if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) + SKIP(exit(0), "DEVICE_COHERENT not available"); ASSERT_GE(self->fd, 0); } @@ -95,9 +142,11 @@ FIXTURE_SETUP(hmm2) self->page_size = sysconf(_SC_PAGE_SIZE); self->page_shift = ffs(self->page_size) - 1; - self->fd0 = hmm_open(0); + self->fd0 = hmm_open(variant->device_number0); + if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) + SKIP(exit(0), "DEVICE_COHERENT not available"); ASSERT_GE(self->fd0, 0); - self->fd1 = hmm_open(1); + self->fd1 = hmm_open(variant->device_number1); ASSERT_GE(self->fd1, 0); } @@ -211,6 +260,20 @@ static void hmm_nanosleep(unsigned int n) nanosleep(&t, NULL); } +static int hmm_migrate_sys_to_dev(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); +} + +static int hmm_migrate_dev_to_sys(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); +} + /* * Simple NULL test of device open/close. */ @@ -875,7 +938,7 @@ TEST_F(hmm, migrate) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -923,7 +986,7 @@ TEST_F(hmm, migrate_fault) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -936,7 +999,7 @@ TEST_F(hmm, migrate_fault) ASSERT_EQ(ptr[i], i); /* Migrate memory to the device again. 
*/ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -976,7 +1039,7 @@ TEST_F(hmm, migrate_shared) ASSERT_NE(buffer->ptr, MAP_FAILED); /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, -ENOENT); hmm_buffer_free(buffer); @@ -1015,7 +1078,7 @@ TEST_F(hmm2, migrate_mixed) p = buffer->ptr; /* Migrating a protected area should be an error. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); ASSERT_EQ(ret, -EINVAL); /* Punch a hole after the first page address. */ @@ -1023,7 +1086,7 @@ TEST_F(hmm2, migrate_mixed) ASSERT_EQ(ret, 0); /* We expect an error if the vma doesn't cover the range. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 3); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); ASSERT_EQ(ret, -EINVAL); /* Page 2 will be a read-only zero page. */ @@ -1055,13 +1118,13 @@ TEST_F(hmm2, migrate_mixed) /* Now try to migrate pages 2-5 to device 1. */ buffer->ptr = p + 2 * self->page_size; - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 4); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 4); /* Page 5 won't be migrated to device 0 because it's on device 1. */ buffer->ptr = p + 5 * self->page_size; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); ASSERT_EQ(ret, -ENOENT); buffer->ptr = p; @@ -1070,8 +1133,12 @@ TEST_F(hmm2, migrate_mixed) } /* - * Migrate anonymous memory to device private memory and fault it back to system - * memory multiple times. + * Migrate anonymous memory to device memory and back to system memory + * multiple times. 
In case of private zone configuration, this is done + * through fault pages accessed by CPU. In case of coherent zone configuration, + * the pages from the device should be explicitly migrated back to system memory. + * The reason is Coherent device zone has coherent access by CPU, therefore + * it will not generate any page fault. */ TEST_F(hmm, migrate_multiple) { @@ -1107,8 +1174,7 @@ TEST_F(hmm, migrate_multiple) ptr[i] = i; /* Migrate memory to device. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, - npages); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages); @@ -1116,7 +1182,13 @@ TEST_F(hmm, migrate_multiple) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); - /* Fault pages back to system memory and check them. */ + /* Migrate back to system memory and check them. */ + if (hmm_is_coherent_type(variant->device_number)) { + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + } + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); @@ -1354,13 +1426,13 @@ TEST_F(hmm2, snapshot) /* Page 5 will be migrated to device 0. */ buffer->ptr = p + 5 * self->page_size; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 1); /* Page 6 will be migrated to device 1. 
*/ buffer->ptr = p + 6 * self->page_size; - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 1); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, 1); @@ -1377,9 +1449,16 @@ TEST_F(hmm2, snapshot) ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | - HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + if (!hmm_is_coherent_type(variant->device_number0)) { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + } else { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | + HMM_DMIRROR_PROT_WRITE); + } hmm_buffer_free(buffer); } From e6474b1aeb2a0bb01c925f79cafa829b4b5e05c2 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:19 -0500 Subject: [PATCH 131/282] tools: update test_hmm script to support SP config Add two more parameters to set spm_addr_dev0 & spm_addr_dev1 addresses. These two parameters configure the start SP addresses for each device in test_hmm driver. Consequently, this configures zone device type as coherent. 
Link: https://lkml.kernel.org/r/20220715150521.18165-13-alex.sierra@amd.com Signed-off-by: Alex Sierra Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Alistair Popple Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/test_hmm.sh | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh index 0647b525a625..539c9371e592 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -40,11 +40,26 @@ check_test_requirements() load_driver() { - modprobe $DRIVER > /dev/null 2>&1 + if [ $# -eq 0 ]; then + modprobe $DRIVER > /dev/null 2>&1 + else + if [ $# -eq 2 ]; then + modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 + > /dev/null 2>&1 + else + echo "Missing module parameters. Make sure pass"\ + "spm_addr_dev0 and spm_addr_dev1" + usage + fi + fi if [ $? == 0 ]; then major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) mknod /dev/hmm_dmirror0 c $major 0 mknod /dev/hmm_dmirror1 c $major 1 + if [ $# -eq 2 ]; then + mknod /dev/hmm_dmirror2 c $major 2 + mknod /dev/hmm_dmirror3 c $major 3 + fi fi } @@ -58,7 +73,7 @@ run_smoke() { echo "Running smoke test. Note, this test provides basic coverage." 
- load_driver + load_driver $1 $2 $(dirname "${BASH_SOURCE[0]}")/hmm-tests unload_driver } @@ -75,6 +90,9 @@ usage() echo "# Smoke testing" echo "./${TEST_NAME}.sh smoke" echo + echo "# Smoke testing with SPM enabled" + echo "./${TEST_NAME}.sh smoke " + echo exit 0 } @@ -84,7 +102,7 @@ function run_test() usage else if [ "$1" = "smoke" ]; then - run_smoke + run_smoke $2 $3 else usage fi From 9e09b705fdb8f3892ec1efc2382f64adabc14c50 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:20 -0500 Subject: [PATCH 132/282] tools: add hmm gup tests for device coherent type The intention is to test hmm device coherent type under different get user pages paths. Also, test gup with FOLL_LONGTERM flag set in device coherent pages. These pages should get migrated back to system memory. Link: https://lkml.kernel.org/r/20220715150521.18165-14-alex.sierra@amd.com Signed-off-by: Alex Sierra Reviewed-by: Alistair Popple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hmm-tests.c | 110 +++++++++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 4b547188ec40..bb38b9777610 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -36,6 +36,7 @@ * in the usual include/uapi/... directory. 
*/ #include "../../../../lib/test_hmm_uapi.h" +#include "../../../../mm/gup_test.h" struct hmm_buffer { void *ptr; @@ -59,6 +60,9 @@ enum { #define NTIMES 10 #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) +/* Just the flags we need, copied from mm.h: */ +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */ FIXTURE(hmm) { @@ -1764,4 +1768,110 @@ TEST_F(hmm, exclusive_cow) hmm_buffer_free(buffer); } +static int gup_test_exec(int gup_fd, unsigned long addr, int cmd, + int npages, int size, int flags) +{ + struct gup_test gup = { + .nr_pages_per_call = npages, + .addr = addr, + .gup_flags = FOLL_WRITE | flags, + .size = size, + }; + + if (ioctl(gup_fd, cmd, &gup)) { + perror("ioctl on error\n"); + return errno; + } + + return 0; +} + +/* + * Test get user device pages through gup_test. Setting PIN_LONGTERM flag. + * This should trigger a migration back to system memory for both, private + * and coherent type pages. + * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added + * to your configuration before you run it. + */ +TEST_F(hmm, hmm_gup_test) +{ + struct hmm_buffer *buffer; + int gup_fd; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + + gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); + if (gup_fd == -1) + SKIP(return, "Skipping test, could not find gup_test driver"); + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. 
*/ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr, + GUP_BASIC_TEST, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 1 * self->page_size, + GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 2 * self->page_size, + PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0); + ASSERT_EQ(gup_test_exec(gup_fd, + (unsigned long)buffer->ptr + 3 * self->page_size, + PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + if (hmm_is_coherent_type(variant->device_number)) { + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]); + } else { + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]); + } + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]); + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]); + /* + * Check again the content on the pages. Make sure there's no + * corrupted data. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + close(gup_fd); + hmm_buffer_free(buffer); +} TEST_HARNESS_MAIN From 96c0657383fefdee04d4053bcc266bfa620d073d Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Fri, 15 Jul 2022 10:05:21 -0500 Subject: [PATCH 133/282] tools: add selftests to hmm for COW in device memory The objective is to test device migration mechanism in pages marked as COW, for private and coherent device type. 
In case of writing to COW private page(s), a page fault will migrate pages back to system memory first. Then, these pages will be duplicated. In case of COW device coherent type, pages are duplicated directly from device memory. Link: https://lkml.kernel.org/r/20220715150521.18165-15-alex.sierra@amd.com Signed-off-by: Alex Sierra Acked-by: Felix Kuehling Cc: Alistair Popple Cc: Christoph Hellwig Cc: David Hildenbrand Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hmm-tests.c | 80 ++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index bb38b9777610..716b62c05e3d 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1874,4 +1874,84 @@ TEST_F(hmm, hmm_gup_test) close(gup_fd); hmm_buffer_free(buffer); } + +/* + * Test copy-on-write in device pages. + * In case of writing to COW private page(s), a page fault will migrate pages + * back to system memory first. Then, these pages will be duplicated. In case + * of COW device coherent type, pages are duplicated directly from device + * memory. + */ +TEST_F(hmm, hmm_cow_in_device) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + unsigned char *m; + pid_t pid; + int status; + + npages = 4; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. 
*/ + + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (!pid) { + /* Child process waitd for SIGTERM from the parent. */ + while (1) { + } + perror("Should not reach this\n"); + exit(0); + } + /* Parent process writes to COW pages(s) and gets a + * new copy in system. In case of device private pages, + * this write causes a migration to system mem first. + */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Terminate child and wait */ + EXPECT_EQ(0, kill(pid, SIGTERM)); + EXPECT_EQ(pid, waitpid(pid, &status, 0)); + EXPECT_NE(0, WIFSIGNALED(status)); + EXPECT_EQ(SIGTERM, WTERMSIG(status)); + + /* Take snapshot to CPU pagetables */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + m = buffer->mirror; + for (i = 0; i < npages; i++) + ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]); + + hmm_buffer_free(buffer); +} TEST_HARNESS_MAIN From 8012b866085523758780850087102421dbcce522 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:25 +0800 Subject: [PATCH 134/282] dax: introduce holder for dax_device Patch series "v14 fsdax-rmap + v11 fsdax-reflink", v2. The patchset fsdax-rmap is aimed to support shared pages tracking for fsdax. It moves owner tracking from dax_associate_entry() to pmem device driver, by introducing an interface ->memory_failure() for struct pagemap. This interface is called by memory_failure() in mm, and implemented by pmem device. Then call holder operations to find the filesystem in which the corrupted data is located, and call filesystem handler to track files or metadata associated with this page. Finally we are able to try to fix the corrupted data in filesystem and do other necessary processing, such as killing processes who are using the files affected.
The call trace is like this: memory_failure() |* fsdax case |------------ |pgmap->ops->memory_failure() => pmem_pgmap_memory_failure() | dax_holder_notify_failure() => | dax_device->holder_ops->notify_failure() => | - xfs_dax_notify_failure() | |* xfs_dax_notify_failure() | |-------------------------- | | xfs_rmap_query_range() | | xfs_dax_failure_fn() | | * corrupted on metadata | | try to recover data, call xfs_force_shutdown() | | * corrupted on file data | | try to recover data, call mf_dax_kill_procs() |* normal case |------------- |mf_generic_kill_procs() The patchset fsdax-reflink attempts to add CoW support for fsdax, and takes XFS, which has both reflink and fsdax features, as an example. One of the key mechanisms needed to be implemented in fsdax is CoW. Copy the data from srcmap before we actually write data to the destination iomap. And we just copy range in which data won't be changed. Another mechanism is range comparison. In page cache case, readpage() is used to load data on disk to page cache in order to be able to compare data. In fsdax case, readpage() does not work. So, we need another way to compare data, with direct access support. With the two mechanisms implemented in fsdax, we are able to make reflink and fsdax work together in XFS. This patch (of 14): To easily track filesystem from a pmem device, we introduce a holder for dax_device structure, and also its operation. This holder is used to remember who is using this dax_device: - When it is the backend of a filesystem, the holder will be the instance of this filesystem. - When this pmem device is one of the targets in a mapped device, the holder will be this mapped device. In this case, the mapped device has its own dax_device and it will follow the first rule, so that we can finally track down the filesystem we need. The holder and holder_ops will be set when filesystem is being mounted, or a target device is being activated.
Link: https://lkml.kernel.org/r/20220603053738.1218681-1-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/20220603053738.1218681-2-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J. Wong Cc: Dave Chinner Cc: Jane Chu Cc: Goldwyn Rodrigues Cc: Al Viro Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Miaohe Lin Cc: Dan Williams Cc: Goldwyn Rodrigues Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- drivers/dax/super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++- drivers/md/dm.c | 2 +- fs/erofs/super.c | 10 ++++--- fs/ext2/super.c | 7 +++-- fs/ext4/super.c | 9 +++--- fs/xfs/xfs_buf.c | 5 ++-- include/linux/dax.h | 33 ++++++++++++++++------ 7 files changed, 110 insertions(+), 23 deletions(-) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 50a08b2ec247..9b5e2a5eb0ae 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -22,6 +22,8 @@ * @private: dax driver private data * @flags: state and boolean properties * @ops: operations for this device + * @holder_data: holder of a dax_device: could be filesystem or mapped device + * @holder_ops: operations for the inner holder */ struct dax_device { struct inode inode; @@ -29,6 +31,8 @@ struct dax_device { void *private; unsigned long flags; const struct dax_operations *ops; + void *holder_data; + const struct dax_holder_operations *holder_ops; }; static dev_t dax_devt; @@ -71,8 +75,11 @@ EXPORT_SYMBOL_GPL(dax_remove_host); * fs_dax_get_by_bdev() - temporary lookup mechanism for filesystem-dax * @bdev: block device to find a dax_device for * @start_off: returns the byte offset into the dax_device that @bdev starts + * @holder: filesystem or mapped device inside the dax_device + * @ops: operations for the inner holder */ -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct 
dax_holder_operations *ops) { struct dax_device *dax_dev; u64 part_size; @@ -92,11 +99,26 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off) dax_dev = xa_load(&dax_hosts, (unsigned long)bdev->bd_disk); if (!dax_dev || !dax_alive(dax_dev) || !igrab(&dax_dev->inode)) dax_dev = NULL; + else if (holder) { + if (!cmpxchg(&dax_dev->holder_data, NULL, holder)) + dax_dev->holder_ops = ops; + else + dax_dev = NULL; + } dax_read_unlock(id); return dax_dev; } EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); + +void fs_put_dax(struct dax_device *dax_dev, void *holder) +{ + if (dax_dev && holder && + cmpxchg(&dax_dev->holder_data, holder, NULL) == holder) + dax_dev->holder_ops = NULL; + put_dax(dax_dev); +} +EXPORT_SYMBOL_GPL(fs_put_dax); #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ enum dax_device_flags { @@ -204,6 +226,29 @@ size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, } EXPORT_SYMBOL_GPL(dax_recovery_write); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, + u64 len, int mf_flags) +{ + int rc, id; + + id = dax_read_lock(); + if (!dax_alive(dax_dev)) { + rc = -ENXIO; + goto out; + } + + if (!dax_dev->holder_ops) { + rc = -EOPNOTSUPP; + goto out; + } + + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); +out: + dax_read_unlock(id); + return rc; +} +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) @@ -277,8 +322,15 @@ void kill_dax(struct dax_device *dax_dev) if (!dax_dev) return; + if (dax_dev->holder_data != NULL) + dax_holder_notify_failure(dax_dev, 0, U64_MAX, 0); + clear_bit(DAXDEV_ALIVE, &dax_dev->flags); synchronize_srcu(&dax_srcu); + + /* clear holder data */ + dax_dev->holder_ops = NULL; + dax_dev->holder_data = NULL; } EXPORT_SYMBOL_GPL(kill_dax); @@ -420,6 +472,19 @@ void put_dax(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(put_dax); 
+/** + * dax_holder() - obtain the holder of a dax device + * @dax_dev: a dax_device instance + + * Return: the holder's data which represents the holder if registered, + * otherwize NULL. + */ +void *dax_holder(struct dax_device *dax_dev) +{ + return dax_dev->holder_data; +} +EXPORT_SYMBOL_GPL(dax_holder); + /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2b75f1ef7386..0177a4ce9a18 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -758,7 +758,7 @@ static int open_table_device(struct table_device *td, dev_t dev, } td->dm_dev.bdev = bdev; - td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off); + td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL); return 0; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 95addc5c9d34..3173debeaa5a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -255,7 +255,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, if (IS_ERR(bdev)) return PTR_ERR(bdev); dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off); + dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off, + NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -720,7 +721,8 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev, - &sbi->dax_part_off); + &sbi->dax_part_off, + NULL, NULL); } err = erofs_read_superblock(sb); @@ -812,7 +814,7 @@ static int erofs_release_device_info(int id, void *ptr, void *data) { struct erofs_device_info *dif = ptr; - fs_put_dax(dif->dax_dev); + fs_put_dax(dif->dax_dev, NULL); if (dif->bdev) blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); erofs_fscache_unregister_cookie(&dif->fscache); @@ -886,7 +888,7 @@ static void erofs_kill_sb(struct super_block *sb) return; erofs_free_dev_context(sbi->devs); - fs_put_dax(sbi->dax_dev); + fs_put_dax(sbi->dax_dev, NULL); 
erofs_fscache_unregister_cookie(&sbi->s_fscache); erofs_fscache_unregister_fs(sb); kfree(sbi->opt.fsid); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d..4638946251b9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -171,7 +171,7 @@ static void ext2_put_super (struct super_block * sb) brelse (sbi->s_sbh); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -835,7 +835,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); spin_lock_init(&sbi->s_lock); ret = -EINVAL; @@ -1204,7 +1205,7 @@ failed_mount_group_desc: failed_mount: brelse(bh); failed_sbi: - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 845f2f8aee5f..1f8bf507ba5a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1307,7 +1307,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #if IS_ENABLED(CONFIG_UNICODE) utf8_unload(sb->s_encoding); @@ -4272,7 +4272,7 @@ static void ext4_free_sbi(struct ext4_sb_info *sbi) return; kfree(sbi->s_blockgroup_lock); - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); } @@ -4284,7 +4284,8 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) if (!sbi) return NULL; - sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off); + sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off, + NULL, NULL); sbi->s_blockgroup_lock = 
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); @@ -4296,7 +4297,7 @@ static struct ext4_sb_info *ext4_alloc_sbi(struct super_block *sb) sbi->s_sb = sb; return sbi; err_out: - fs_put_dax(sbi->s_daxdev); + fs_put_dax(sbi->s_daxdev, NULL); kfree(sbi); return NULL; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4aa9c9cf5b6e..1ec2a7b6d44e 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1911,7 +1911,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); - fs_put_dax(btp->bt_daxdev); + fs_put_dax(btp->bt_daxdev, NULL); kmem_free(btp); } @@ -1964,7 +1964,8 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off); + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL, + NULL); /* * Buffer IO error rate limiting. Limit it to no more than 10 messages diff --git a/include/linux/dax.h b/include/linux/dax.h index e7b81634c52a..cf85fc36da5f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -43,8 +43,21 @@ struct dax_operations { void *addr, size_t bytes, struct iov_iter *iter); }; +struct dax_holder_operations { + /* + * notify_failure - notify memory failure into inner holder device + * @dax_dev: the dax device which contains the holder + * @offset: offset on this dax device where memory failure occurs + * @len: length of this memory failure event + * @flags: action flags for memory failure handler + */ + int (*notify_failure)(struct dax_device *dax_dev, u64 offset, + u64 len, int mf_flags); +}; + #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); +void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); @@ -66,6 +79,10 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return 
dax_synchronous(dax_dev); } #else +static inline void *dax_holder(struct dax_device *dax_dev) +{ + return NULL; +} static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops) { @@ -114,12 +131,9 @@ struct writeback_control; #if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX) int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk); void dax_remove_host(struct gendisk *disk); -struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off); -static inline void fs_put_dax(struct dax_device *dax_dev) -{ - put_dax(dax_dev); -} +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off, + void *holder, const struct dax_holder_operations *ops); +void fs_put_dax(struct dax_device *dax_dev, void *holder); #else static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk) { @@ -129,11 +143,12 @@ static inline void dax_remove_host(struct gendisk *disk) { } static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, - u64 *start_off) + u64 *start_off, void *holder, + const struct dax_holder_operations *ops) { return NULL; } -static inline void fs_put_dax(struct dax_device *dax_dev) +static inline void fs_put_dax(struct dax_device *dax_dev, void *holder) { } #endif /* CONFIG_BLOCK && CONFIG_FS_DAX */ @@ -203,6 +218,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, + int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, From 00cc790e00369387f6ab80c5724550c2c6340334 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:26 +0800 Subject: [PATCH 135/282] mm: factor helpers for memory_failure_dev_pagemap 
memory_failure_dev_pagemap code is a bit complex before introducing the RMAP feature for fsdax. So factor out some helper functions to simplify this code. [akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n build] [zhengbin13@huawei.com: fix redefinition of mf_generic_kill_procs] Link: https://lkml.kernel.org/r/20220628112143.1170473-1-zhengbin13@huawei.com Link: https://lkml.kernel.org/r/20220603053738.1218681-3-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Signed-off-by: Zheng Bin Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Miaohe Lin Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- mm/memory-failure.c | 171 ++++++++++++++++++++++++-------------------- 1 file changed, 95 insertions(+), 76 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b7ca5db7e60e..f8a8a5d45eba 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1499,6 +1499,95 @@ static int try_to_split_thp_page(struct page *page, const char *msg) return 0; } +static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, + struct address_space *mapping, pgoff_t index, int flags) +{ + struct to_kill *tk; + unsigned long size = 0; + + list_for_each_entry(tk, to_kill, nd) + if (tk->size_shift) + size = max(size, 1UL << tk->size_shift); + + if (size) { + /* + * Unmap the largest mapping to avoid breaking up device-dax + * mappings which are constant size.
The actual size of the + * mapping being torn down is communicated in siginfo, see + * kill_proc() + */ + loff_t start = (index << PAGE_SHIFT) & ~(size - 1); + + unmap_mapping_range(mapping, start, size, 0); + } + + kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags); +} + +static int mf_generic_kill_procs(unsigned long long pfn, int flags, + struct dev_pagemap *pgmap) +{ + struct page *page = pfn_to_page(pfn); + LIST_HEAD(to_kill); + dax_entry_t cookie; + int rc = 0; + + /* + * Pages instantiated by device-dax (not filesystem-dax) + * may be compound pages. + */ + page = compound_head(page); + + /* + * Prevent the inode from being freed while we are interrogating + * the address_space, typically this would be handled by + * lock_page(), but dax pages do not use the page lock. This + * also prevents changes to the mapping of this pfn until + * poison signaling is complete. + */ + cookie = dax_lock_page(page); + if (!cookie) + return -EBUSY; + + if (hwpoison_filter(page)) { + rc = -EOPNOTSUPP; + goto unlock; + } + + switch (pgmap->type) { + case MEMORY_DEVICE_PRIVATE: + case MEMORY_DEVICE_COHERENT: + /* + * TODO: Handle device pages which may need coordination + * with device-side memory. + */ + rc = -ENXIO; + goto unlock; + default: + break; + } + + /* + * Use this flag as an indication that the dax page has been + * remapped UC to prevent speculative consumption of poison. + */ + SetPageHWPoison(page); + + /* + * Unlike System-RAM there is no possibility to swap in a + * different physical page at a given virtual address, so all + * userspace consumption of ZONE_DEVICE memory necessitates + * SIGBUS (i.e. MF_MUST_KILL) + */ + flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + collect_procs(page, &to_kill, true); + + unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags); +unlock: + dax_unlock_page(page, cookie); + return rc; +} + /* * Called from hugetlb code with hugetlb_lock held. 
* @@ -1634,23 +1723,20 @@ out: unlock_page(head); return res; } + #else static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) { return 0; } -#endif + +#endif /* CONFIG_HUGETLB_PAGE */ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) { struct page *page = pfn_to_page(pfn); - unsigned long size = 0; - struct to_kill *tk; - LIST_HEAD(tokill); - int rc = -EBUSY; - loff_t start; - dax_entry_t cookie; + int rc = -ENXIO; if (flags & MF_COUNT_INCREASED) /* @@ -1659,77 +1745,10 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, put_page(page); /* device metadata space is not recoverable */ - if (!pgmap_pfn_valid(pgmap, pfn)) { - rc = -ENXIO; - goto out; - } - - /* - * Pages instantiated by device-dax (not filesystem-dax) - * may be compound pages. - */ - page = compound_head(page); - - /* - * Prevent the inode from being freed while we are interrogating - * the address_space, typically this would be handled by - * lock_page(), but dax pages do not use the page lock. This - * also prevents changes to the mapping of this pfn until - * poison signaling is complete. - */ - cookie = dax_lock_page(page); - if (!cookie) + if (!pgmap_pfn_valid(pgmap, pfn)) goto out; - if (hwpoison_filter(page)) { - rc = -EOPNOTSUPP; - goto unlock; - } - - switch (pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_COHERENT: - /* - * TODO: Handle device pages which may need coordination - * with device-side memory. - */ - goto unlock; - default: - break; - } - - /* - * Use this flag as an indication that the dax page has been - * remapped UC to prevent speculative consumption of poison. - */ - SetPageHWPoison(page); - - /* - * Unlike System-RAM there is no possibility to swap in a - * different physical page at a given virtual address, so all - * userspace consumption of ZONE_DEVICE memory necessitates - * SIGBUS (i.e. 
MF_MUST_KILL) - */ - flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; - collect_procs(page, &tokill, true); - - list_for_each_entry(tk, &tokill, nd) - if (tk->size_shift) - size = max(size, 1UL << tk->size_shift); - if (size) { - /* - * Unmap the largest mapping to avoid breaking up - * device-dax mappings which are constant size. The - * actual size of the mapping being torn down is - * communicated in siginfo, see kill_proc() - */ - start = (page->index << PAGE_SHIFT) & ~(size - 1); - unmap_mapping_range(page->mapping, start, size, 0); - } - kill_procs(&tokill, true, false, pfn, flags); - rc = 0; -unlock: - dax_unlock_page(page, cookie); + rc = mf_generic_kill_procs(pfn, flags, pgmap); out: /* drop pgmap ref acquired in caller */ put_dev_pagemap(pgmap); From 33a8f7f2b3a3437d016d1b4047a4fd37eb6951b3 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:27 +0800 Subject: [PATCH 136/282] pagemap,pmem: introduce ->memory_failure() When memory-failure occurs, we call this function which is implemented by each kind of devices. For the fsdax case, pmem device driver implements it. Pmem device driver will find out the filesystem in which the corrupted page is located. With dax_holder notify support, we are able to notify the memory failure from pmem driver to upper layers. If something is not supported in the notify routine, memory_failure will fall back to the generic handler. Link: https://lkml.kernel.org/r/20220603053738.1218681-4-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Darrick J.
Wong Reviewed-by: Naoya Horiguchi Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- drivers/nvdimm/pmem.c | 17 +++++++++++++++++ include/linux/memremap.h | 12 ++++++++++++ mm/memory-failure.c | 14 ++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 629d10fcf53b..107c9cb3d57d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -453,6 +453,21 @@ static void pmem_release_disk(void *__pmem) blk_cleanup_disk(pmem->disk); } +static int pmem_pagemap_memory_failure(struct dev_pagemap *pgmap, + unsigned long pfn, unsigned long nr_pages, int mf_flags) +{ + struct pmem_device *pmem = + container_of(pgmap, struct pmem_device, pgmap); + u64 offset = PFN_PHYS(pfn) - pmem->phys_addr - pmem->data_offset; + u64 len = nr_pages << PAGE_SHIFT; + + return dax_holder_notify_failure(pmem->dax_dev, offset, len, mf_flags); +} + +static const struct dev_pagemap_ops fsdax_pagemap_ops = { + .memory_failure = pmem_pagemap_memory_failure, +}; + static int pmem_attach_disk(struct device *dev, struct nd_namespace_common *ndns) { @@ -514,6 +529,7 @@ static int pmem_attach_disk(struct device *dev, pmem->pfn_flags = PFN_DEV; if (is_nd_pfn(dev)) { pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); @@ -527,6 +543,7 @@ static int pmem_attach_disk(struct device *dev, pmem->pgmap.range.end = res->end; pmem->pgmap.nr_range = 1; pmem->pgmap.type = MEMORY_DEVICE_FS_DAX; + pmem->pgmap.ops = &fsdax_pagemap_ops; addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; bb_range = pmem->pgmap.range; diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 09320b7f706c..19010491a603 100644 --- a/include/linux/memremap.h 
+++ b/include/linux/memremap.h @@ -87,6 +87,18 @@ struct dev_pagemap_ops { * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); + + /* + * Handle the memory failure happens on a range of pfns. Notify the + * processes who are using these pfns, and try to recover the data on + * them if necessary. The mf_flags is finally passed to the recover + * function through the whole notify routine. + * + * When this is not implemented, or it returns -EOPNOTSUPP, the caller + * will fall back to a common handler called mf_generic_kill_procs(). + */ + int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, + unsigned long nr_pages, int mf_flags); }; #define PGMAP_ALTMAP_VALID (1 << 0) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f8a8a5d45eba..46c77151f726 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1748,6 +1748,20 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, if (!pgmap_pfn_valid(pgmap, pfn)) goto out; + /* + * Call driver's implementation to handle the memory failure, otherwise + * fall back to generic handler. + */ + if (pgmap->ops->memory_failure) { + rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags); + /* + * Fall back to generic handler too if operation is not + * supported inside the driver/device/filesystem. + */ + if (rc != -EOPNOTSUPP) + goto out; + } + rc = mf_generic_kill_procs(pfn, flags, pgmap); out: /* drop pgmap ref acquired in caller */ From 2f437effc689ef913fbe5e31110580b4e7cf04be Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:28 +0800 Subject: [PATCH 137/282] fsdax: introduce dax_lock_mapping_entry() The current dax_lock_page() locks dax entry by obtaining mapping and index in page. To support 1-to-N RMAP in NVDIMM, we need a new function to lock a specific dax entry corresponding to this file's mapping,index. And output the page corresponding to the specific dax entry for caller use. 
Link: https://lkml.kernel.org/r/20220603053738.1218681-5-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 63 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 15 +++++++++++ 2 files changed, 78 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index 4155a6107fa1..65e44d78b3bb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -455,6 +455,69 @@ void dax_unlock_page(struct page *page, dax_entry_t cookie) dax_unlock_entry(&xas, (void *)cookie); } +/* + * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping + * @mapping: the file's mapping whose entry we want to lock + * @index: the offset within this file + * @page: output the dax page corresponding to this dax entry + * + * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry + * could not be locked. + */ +dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index, + struct page **page) +{ + XA_STATE(xas, NULL, 0); + void *entry; + + rcu_read_lock(); + for (;;) { + entry = NULL; + if (!dax_mapping(mapping)) + break; + + xas.xa = &mapping->i_pages; + xas_lock_irq(&xas); + xas_set(&xas, index); + entry = xas_load(&xas); + if (dax_is_locked(entry)) { + rcu_read_unlock(); + wait_entry_unlocked(&xas, entry); + rcu_read_lock(); + continue; + } + if (!entry || + dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + /* + * Because we are looking for entry from file's mapping + * and index, so the entry may not be inserted for now, + * or even a zero/empty entry. We don't think this is + * an error case. So, return a special value and do + * not output @page. 
+ */ + entry = (void *)~0UL; + } else { + *page = pfn_to_page(dax_to_pfn(entry)); + dax_lock_entry(&xas, entry); + } + xas_unlock_irq(&xas); + break; + } + rcu_read_unlock(); + return (dax_entry_t)entry; +} + +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index, + dax_entry_t cookie) +{ + XA_STATE(xas, &mapping->i_pages, index); + + if (cookie == ~0UL) + return; + + dax_unlock_entry(&xas, (void *)cookie); +} + /* * Find page cache entry at given index. If it is a DAX entry, return it * with the entry locked. If the page cache doesn't contain an entry at diff --git a/include/linux/dax.h b/include/linux/dax.h index cf85fc36da5f..7116681b48c0 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -161,6 +161,10 @@ struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); dax_entry_t dax_lock_page(struct page *page); void dax_unlock_page(struct page *page, dax_entry_t cookie); +dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page); +void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie); #else static inline struct page *dax_layout_busy_page(struct address_space *mapping) { @@ -188,6 +192,17 @@ static inline dax_entry_t dax_lock_page(struct page *page) static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) { } + +static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, + unsigned long index, struct page **page) +{ + return 0; +} + +static inline void dax_unlock_mapping_entry(struct address_space *mapping, + unsigned long index, dax_entry_t cookie) +{ +} #endif int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, From c36e2024957120566efd99395b5c8cc95b5175c1 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:29 +0800 Subject: [PATCH 138/282] mm: 
introduce mf_dax_kill_procs() for fsdax case This new function is a variant of mf_generic_kill_procs that accepts a file, offset pair instead of a struct to support multiple files sharing a DAX mapping. It is intended to be called by the file systems as part of the memory_failure handler after the file system performed a reverse mapping from the storage address to the file and file offset. Link: https://lkml.kernel.org/r/20220603053738.1218681-6-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Dan Williams Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Reviewed-by: Miaohe Lin Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 + mm/memory-failure.c | 96 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 88 insertions(+), 10 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 64393ed3330a..d4ebfc206e2b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3178,6 +3178,8 @@ enum mf_flags { MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, }; +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46c77151f726..c9931c676335 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -297,10 +297,9 @@ void shake_page(struct page *p) } EXPORT_SYMBOL_GPL(shake_page); -static unsigned long dev_pagemap_mapping_shift(struct page *page, - struct vm_area_struct *vma) +static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, + unsigned long address) { - unsigned long address = vma_address(page, vma); unsigned long ret = 0; pgd_t *pgd; p4d_t *p4d; @@ 
-340,10 +339,14 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * + * Notice: @fsdax_pgoff is used only when @p is a fsdax page. + * In other cases, such as anonymous and file-backed pages, the address to be + * killed can be calculated by @p itself. */ static void add_to_kill(struct task_struct *tsk, struct page *p, - struct vm_area_struct *vma, - struct list_head *to_kill) + pgoff_t fsdax_pgoff, struct vm_area_struct *vma, + struct list_head *to_kill) { struct to_kill *tk; @@ -354,9 +357,15 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } tk->addr = page_address_in_vma(p, vma); - if (is_zone_device_page(p)) - tk->size_shift = dev_pagemap_mapping_shift(p, vma); - else + if (is_zone_device_page(p)) { + /* + * Since page->mapping is not used for fsdax, we need to + * calculate the address based on the vma. + */ + if (p->pgmap->type == MEMORY_DEVICE_FS_DAX) + tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma); + tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); + } else tk->size_shift = page_shift(compound_head(p)); /* @@ -505,7 +514,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill); + add_to_kill(t, page, 0, vma, to_kill); } } read_unlock(&tasklist_lock); @@ -541,13 +550,41 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * to be informed of all such data corruptions. */ if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill); + add_to_kill(t, page, 0, vma, to_kill); } } read_unlock(&tasklist_lock); i_mmap_unlock_read(mapping); } +#ifdef CONFIG_FS_DAX +/* + * Collect processes when the error hit a fsdax page. 
+ */ +static void collect_procs_fsdax(struct page *page, + struct address_space *mapping, pgoff_t pgoff, + struct list_head *to_kill) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + + i_mmap_lock_read(mapping); + read_lock(&tasklist_lock); + for_each_process(tsk) { + struct task_struct *t = task_early_kill(tsk, true); + + if (!t) + continue; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + if (vma->vm_mm == t->mm) + add_to_kill(t, page, pgoff, vma, to_kill); + } + } + read_unlock(&tasklist_lock); + i_mmap_unlock_read(mapping); +} +#endif /* CONFIG_FS_DAX */ + /* * Collect the processes who have the corrupted page mapped to kill. */ @@ -1588,6 +1625,45 @@ unlock: return rc; } +#ifdef CONFIG_FS_DAX +/** + * mf_dax_kill_procs - Collect and kill processes who are using this file range + * @mapping: address_space of the file in use + * @index: start pgoff of the range within the file + * @count: length of the range, in unit of PAGE_SIZE + * @mf_flags: memory failure flags + */ +int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, + unsigned long count, int mf_flags) +{ + LIST_HEAD(to_kill); + dax_entry_t cookie; + struct page *page; + size_t end = index + count; + + mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; + + for (; index < end; index++) { + page = NULL; + cookie = dax_lock_mapping_entry(mapping, index, &page); + if (!cookie) + return -EBUSY; + if (!page) + goto unlock; + + SetPageHWPoison(page); + + collect_procs_fsdax(page, mapping, index, &to_kill); + unmap_and_kill(&to_kill, page_to_pfn(page), mapping, + index, mf_flags); +unlock: + dax_unlock_mapping_entry(mapping, index, cookie); + } + return 0; +} +EXPORT_SYMBOL_GPL(mf_dax_kill_procs); +#endif /* CONFIG_FS_DAX */ + /* * Called from hugetlb code with hugetlb_lock held. 
* From 6f643c57d57c56d4677bc05f1fca2ef3f249797c Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:30 +0800 Subject: [PATCH 139/282] xfs: implement ->notify_failure() for XFS Introduce xfs_notify_failure.c to handle failure related works, such as implement ->notify_failure(), register/unregister dax holder in xfs, and so on. If the rmap feature of XFS enabled, we can query it to find files and metadata which are associated with the corrupt data. For now all we do is kill processes with that file mapped into their address spaces, but future patches could actually do something about corrupt metadata. After that, the memory failure needs to notify the processes who are using those files. Link: https://lkml.kernel.org/r/20220603053738.1218681-7-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/xfs/Makefile | 5 + fs/xfs/xfs_buf.c | 11 +- fs/xfs/xfs_fsops.c | 3 + fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_notify_failure.c | 220 ++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_super.h | 1 + 6 files changed, 238 insertions(+), 3 deletions(-) create mode 100644 fs/xfs/xfs_notify_failure.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index b056cfc6398e..805a0d0a88c1 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -129,6 +129,11 @@ xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o +# notify failure +ifeq ($(CONFIG_MEMORY_FAILURE),y) +xfs-$(CONFIG_FS_DAX) += xfs_notify_failure.o +endif + # online scrub/repair ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1ec2a7b6d44e..59c6b62fde57 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -5,6 +5,7 @@ */ 
#include "xfs.h" #include +#include #include "xfs_shared.h" #include "xfs_format.h" @@ -1911,7 +1912,7 @@ xfs_free_buftarg( list_lru_destroy(&btp->bt_lru); blkdev_issue_flush(btp->bt_bdev); - fs_put_dax(btp->bt_daxdev, NULL); + fs_put_dax(btp->bt_daxdev, btp->bt_mount); kmem_free(btp); } @@ -1958,14 +1959,18 @@ xfs_alloc_buftarg( struct block_device *bdev) { xfs_buftarg_t *btp; + const struct dax_holder_operations *ops = NULL; +#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) + ops = &xfs_dax_holder_operations; +#endif btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, NULL, - NULL); + btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, + mp, ops); /* * Buffer IO error rate limiting. Limit it to no more than 10 messages diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index d4a77c53f94b..0bc74a71a095 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -528,6 +528,9 @@ xfs_do_force_shutdown( } else if (flags & SHUTDOWN_CORRUPT_INCORE) { tag = XFS_PTAG_SHUTDOWN_CORRUPT; why = "Corruption of in-memory data"; + } else if (flags & SHUTDOWN_CORRUPT_ONDISK) { + tag = XFS_PTAG_SHUTDOWN_CORRUPT; + why = "Corruption of on-disk metadata"; } else { tag = XFS_PTAG_SHUTDOWN_IOERROR; why = "Metadata I/O Error"; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index ba5d42abf66e..32a67f78fdd1 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -454,6 +454,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname, #define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */ #define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */ +#define SHUTDOWN_CORRUPT_ONDISK (1u << 4) /* corrupt metadata on device */ #define XFS_SHUTDOWN_STRINGS \ { SHUTDOWN_META_IO_ERROR, "metadata_io" 
}, \ diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c new file mode 100644 index 000000000000..aa8dc27c599c --- /dev/null +++ b/fs/xfs/xfs_notify_failure.c @@ -0,0 +1,220 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Fujitsu. All Rights Reserved. + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_alloc.h" +#include "xfs_bit.h" +#include "xfs_btree.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_trans.h" + +#include +#include + +struct failure_info { + xfs_agblock_t startblock; + xfs_extlen_t blockcount; + int mf_flags; +}; + +static pgoff_t +xfs_failure_pgoff( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rec, + const struct failure_info *notify) +{ + loff_t pos = XFS_FSB_TO_B(mp, rec->rm_offset); + + if (notify->startblock > rec->rm_startblock) + pos += XFS_FSB_TO_B(mp, + notify->startblock - rec->rm_startblock); + return pos >> PAGE_SHIFT; +} + +static unsigned long +xfs_failure_pgcnt( + struct xfs_mount *mp, + const struct xfs_rmap_irec *rec, + const struct failure_info *notify) +{ + xfs_agblock_t end_rec; + xfs_agblock_t end_notify; + xfs_agblock_t start_cross; + xfs_agblock_t end_cross; + + start_cross = max(rec->rm_startblock, notify->startblock); + + end_rec = rec->rm_startblock + rec->rm_blockcount; + end_notify = notify->startblock + notify->blockcount; + end_cross = min(end_rec, end_notify); + + return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT; +} + +static int +xfs_dax_failure_fn( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *data) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip; + struct failure_info *notify = data; + int error = 0; + + if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) || + (rec->rm_flags & (XFS_RMAP_ATTR_FORK | 
XFS_RMAP_BMBT_BLOCK))) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + + /* Get files that incore, filter out others that are not in use. */ + error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE, + 0, &ip); + /* Continue the rmap query if the inode isn't incore */ + if (error == -ENODATA) + return 0; + if (error) + return error; + + error = mf_dax_kill_procs(VFS_I(ip)->i_mapping, + xfs_failure_pgoff(mp, rec, notify), + xfs_failure_pgcnt(mp, rec, notify), + notify->mf_flags); + xfs_irele(ip); + return error; +} + +static int +xfs_dax_notify_ddev_failure( + struct xfs_mount *mp, + xfs_daddr_t daddr, + xfs_daddr_t bblen, + int mf_flags) +{ + struct xfs_trans *tp = NULL; + struct xfs_btree_cur *cur = NULL; + struct xfs_buf *agf_bp = NULL; + int error = 0; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); + xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen); + xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + for (; agno <= end_agno; agno++) { + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct failure_info notify; + struct xfs_agf *agf; + xfs_agblock_t agend; + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp); + if (error) + break; + + cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agf_bp->b_pag); + + /* + * Set the rmap range from ri_low to ri_high, which represents + * a [start, end] where we looking for the files or metadata. 
+ */ + memset(&ri_high, 0xFF, sizeof(ri_high)); + ri_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno); + if (agno == end_agno) + ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); + + agf = agf_bp->b_addr; + agend = min(be32_to_cpu(agf->agf_length), + ri_high.rm_startblock); + notify.startblock = ri_low.rm_startblock; + notify.blockcount = agend - ri_low.rm_startblock; + + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_dax_failure_fn, &notify); + xfs_btree_del_cursor(cur, error); + xfs_trans_brelse(tp, agf_bp); + if (error) + break; + + fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0); + } + + xfs_trans_cancel(tp); + return error; +} + +static int +xfs_dax_notify_failure( + struct dax_device *dax_dev, + u64 offset, + u64 len, + int mf_flags) +{ + struct xfs_mount *mp = dax_holder(dax_dev); + u64 ddev_start; + u64 ddev_end; + + if (!(mp->m_sb.sb_flags & SB_BORN)) { + xfs_warn(mp, "filesystem is not ready for notify_failure()!"); + return -EIO; + } + + if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) { + xfs_warn(mp, + "notify_failure() not supported on realtime device!"); + return -EOPNOTSUPP; + } + + if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev && + mp->m_logdev_targp != mp->m_ddev_targp) { + xfs_err(mp, "ondisk log corrupt, shutting down fs!"); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK); + return -EFSCORRUPTED; + } + + if (!xfs_has_rmapbt(mp)) { + xfs_warn(mp, "notify_failure() needs rmapbt enabled!"); + return -EOPNOTSUPP; + } + + ddev_start = mp->m_ddev_targp->bt_dax_part_off; + ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; + + /* Ignore the range out of filesystem area */ + if (offset + len < ddev_start) + return -ENXIO; + if (offset > ddev_end) + return -ENXIO; + + /* Calculate the real range when it touches the boundary */ + if (offset > ddev_start) + offset -= ddev_start; + else { + len -= ddev_start - offset; + offset = 0; + } + if (offset + len > ddev_end) + len -= ddev_end - 
offset; + + return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), + mf_flags); +} + +const struct dax_holder_operations xfs_dax_holder_operations = { + .notify_failure = xfs_dax_notify_failure, +}; diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 3cd5a51bace1..364e2c2648a8 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -92,6 +92,7 @@ extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, extern const struct export_operations xfs_export_operations; extern const struct quotactl_ops xfs_quotactl_operations; +extern const struct dax_holder_operations xfs_dax_holder_operations; extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); From 6061b69b9a550a2ab84e805d0d2315ba6215f112 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:31 +0800 Subject: [PATCH 140/282] fsdax: set a CoW flag when associate reflink mappings Introduce a PAGE_MAPPING_DAX_COW flag to support association with CoW file mappings. In this case, since the dax-rmap has already took the responsibility to look up for shared files by given dax page, the page->mapping is no longer to used for rmap but for marking that this dax page is shared. And to make sure disassociation works fine, we use page->index as refcount, and clear page->mapping to the initial state when page->index is decreased to 0. With the help of this new flag, it is able to distinguish normal case and CoW case, and keep the warning in normal case. Link: https://lkml.kernel.org/r/20220603053738.1218681-8-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 50 +++++++++++++++++++++++++++++++------- include/linux/page-flags.h | 6 +++++ 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 65e44d78b3bb..b59b864017ad 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -334,13 +334,35 @@ static unsigned long dax_end_pfn(void *entry) for (pfn = dax_to_pfn(entry); \ pfn < dax_end_pfn(entry); pfn++) +static inline bool dax_mapping_is_cow(struct address_space *mapping) +{ + return (unsigned long)mapping == PAGE_MAPPING_DAX_COW; +} + /* - * TODO: for reflink+dax we need a way to associate a single page with - * multiple address_space instances at different linear_page_index() - * offsets. + * Set the page->mapping with PAGE_MAPPING_DAX_COW flag, increase the refcount. + */ +static inline void dax_mapping_set_cow(struct page *page) +{ + if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) { + /* + * Reset the index if the page was already mapped + * regularly before. + */ + if (page->mapping) + page->index = 1; + page->mapping = (void *)PAGE_MAPPING_DAX_COW; + } + page->index++; +} + +/* + * When it is called in dax_insert_entry(), the cow flag will indicate + * whether this entry is shared by multiple files. If so, set the page->mapping + * to PAGE_MAPPING_DAX_COW, and use page->index as refcount. 
*/ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool cow) { unsigned long size = dax_entry_size(entry), pfn, index; int i = 0; @@ -352,9 +374,13 @@ static void dax_associate_entry(void *entry, struct address_space *mapping, for_each_mapped_pfn(entry, pfn) { struct page *page = pfn_to_page(pfn); - WARN_ON_ONCE(page->mapping); - page->mapping = mapping; - page->index = index + i++; + if (cow) { + dax_mapping_set_cow(page); + } else { + WARN_ON_ONCE(page->mapping); + page->mapping = mapping; + page->index = index + i++; + } } } @@ -370,7 +396,12 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, struct page *page = pfn_to_page(pfn); WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - WARN_ON_ONCE(page->mapping && page->mapping != mapping); + if (dax_mapping_is_cow(page->mapping)) { + /* keep the CoW flag if this page is still shared */ + if (page->index-- > 0) + continue; + } else + WARN_ON_ONCE(page->mapping && page->mapping != mapping); page->mapping = NULL; page->index = 0; } @@ -830,7 +861,8 @@ static void *dax_insert_entry(struct xa_state *xas, void *old; dax_disassociate_entry(entry, mapping, false); - dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); + dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, + false); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. 
If a normal PTE or diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 82719d33c0f1..f2ff65f1bf83 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -661,6 +661,12 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) +/* + * Different with flags above, this flag is used only for fsdax mode. It + * indicates that this page->mapping is now under reflink case. + */ +#define PAGE_MAPPING_DAX_COW 0x1 + static __always_inline bool folio_mapping_flags(struct folio *folio) { return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0; From e28cd3e50f3041186ba7fe74a9c7443cd8afc2da Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:32 +0800 Subject: [PATCH 141/282] fsdax: output address in dax_iomap_pfn() and rename it Add address output in dax_iomap_pfn() in order to perform a memcpy() in CoW case. Since this function both output address and pfn, rename it to dax_iomap_direct_access(). [ruansy.fnst@fujitsu.com: initialize `rc', per Dan] Link: https://lore.kernel.org/linux-fsdevel/Yp8FUZnO64Qvyx5G@kili/ Link: https://lkml.kernel.org/r/20220607143837.161174-1-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/20220603053738.1218681-9-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Dan Williams Reviewed-by: Darrick J. 
Wong Cc: Al Viro Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- fs/dax.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index b59b864017ad..7a8eb1e30a1b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1026,20 +1026,22 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, - pfn_t *pfnp) +static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, + size_t size, void **kaddr, pfn_t *pfnp) { pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); - int id, rc; + int id, rc = 0; long length; id = dax_read_lock(); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), - DAX_ACCESS, NULL, pfnp); + DAX_ACCESS, kaddr, pfnp); if (length < 0) { rc = length; goto out; } + if (!pfnp) + goto out_check_addr; rc = -EINVAL; if (PFN_PHYS(length) < size) goto out; @@ -1049,6 +1051,12 @@ static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, if (length > 1 && !pfn_t_devmap(*pfnp)) goto out; rc = 0; + +out_check_addr: + if (!kaddr) + goto out; + if (!*kaddr) + rc = -EFAULT; out: dax_read_unlock(id); return rc; @@ -1456,7 +1464,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; } - err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn); + err = dax_iomap_direct_access(&iter->iomap, pos, size, NULL, &pfn); if (err) return pmd ? 
VM_FAULT_FALLBACK : dax_fault_return(err); From ff17b8df224b98e282ec39a9949a3672fa3dbe93 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:33 +0800 Subject: [PATCH 142/282] fsdax: introduce dax_iomap_cow_copy() In the case where the iomap is a write operation and iomap is not equal to srcmap after iomap_begin, we consider it is a CoW operation. In this case, the destination (iomap->addr) points to a newly allocated extent. It is needed to copy the data from srcmap to the extent. In theory, it is better to copy the head and tail ranges which is outside of the non-aligned area instead of copying the whole aligned range. But in dax page fault, it will always be an aligned range. So copy the whole range in this case. Link: https://lkml.kernel.org/r/20220603053738.1218681-10-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 5 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 7a8eb1e30a1b..6a353838070d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1062,6 +1062,60 @@ out: return rc; } +/** + * dax_iomap_cow_copy - Copy the data from source to destination before write + * @pos: address to do copy from. + * @length: size of copy operation. + * @align_size: aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE) + * @srcmap: iomap srcmap + * @daddr: destination address to copy to. + * + * This can be called from two places. Either during DAX write fault (page + * aligned), to copy the length size data to daddr. Or, while doing normal DAX + * write operation, dax_iomap_actor() might call this to do the copy of either + * start or end unaligned address. 
In the latter case the rest of the copy of + * aligned ranges is taken care by dax_iomap_actor() itself. + */ +static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, + const struct iomap *srcmap, void *daddr) +{ + loff_t head_off = pos & (align_size - 1); + size_t size = ALIGN(head_off + length, align_size); + loff_t end = pos + length; + loff_t pg_end = round_up(end, align_size); + bool copy_all = head_off == 0 && end == pg_end; + void *saddr = 0; + int ret = 0; + + ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL); + if (ret) + return ret; + + if (copy_all) { + ret = copy_mc_to_kernel(daddr, saddr, length); + return ret ? -EIO : 0; + } + + /* Copy the head part of the range */ + if (head_off) { + ret = copy_mc_to_kernel(daddr, saddr, head_off); + if (ret) + return -EIO; + } + + /* Copy the tail part of the range */ + if (end < pg_end) { + loff_t tail_off = head_off + length; + loff_t tail_len = pg_end - end; + + ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off, + tail_len); + if (ret) + return -EIO; + } + return 0; +} + /* * The user has performed a load from a hole in the file. 
Allocating a new * page in the file would cause excessive storage usage for workloads with @@ -1232,15 +1286,17 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; + const struct iomap *srcmap = &iomi->srcmap; loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; struct dax_device *dax_dev = iomap->dax_dev; loff_t end = pos + length, done = 0; + bool write = iov_iter_rw(iter) == WRITE; ssize_t ret = 0; size_t xfer; int id; - if (iov_iter_rw(iter) == READ) { + if (!write) { end = min(end, i_size_read(iomi->inode)); if (pos >= end) return 0; @@ -1249,7 +1305,12 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, return iov_iter_zero(min(length, end - pos), iter); } - if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) + /* + * In DAX mode, enforce either pure overwrites of written extents, or + * writes to unwritten extents as part of a copy-on-write operation. + */ + if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED && + !(iomap->flags & IOMAP_F_SHARED))) return -EIO; /* @@ -1291,6 +1352,14 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, break; } + if (write && + srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { + ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap, + kaddr); + if (ret) + break; + } + map_len = PFN_PHYS(map_len); kaddr += offset; map_len -= offset; @@ -1300,7 +1369,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (recovery) xfer = dax_recovery_write(dax_dev, pgoff, kaddr, map_len, iter); - else if (iov_iter_rw(iter) == WRITE) + else if (write) xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else @@ -1440,6 +1509,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, { struct address_space *mapping = vmf->vma->vm_file->f_mapping; const struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = &iter->srcmap; size_t size = pmd ? 
PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -1447,6 +1517,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, unsigned long entry_flags = pmd ? DAX_PMD : 0; int err = 0; pfn_t pfn; + void *kaddr; if (!pmd && vmf->cow_page) return dax_fault_cow_page(vmf, iter); @@ -1459,18 +1530,25 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, return dax_pmd_load_hole(xas, vmf, iomap, entry); } - if (iomap->type != IOMAP_MAPPED) { + if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) { WARN_ON_ONCE(1); return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; } - err = dax_iomap_direct_access(&iter->iomap, pos, size, NULL, &pfn); + err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn); if (err) return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err); *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags, write && !sync); + if (write && + srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { + err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr); + if (err) + return dax_fault_return(err); + } + if (sync) return dax_fault_synchronous_pfnp(pfnp, pfn); From e5d6df73302c8d1e7ab2d3555f0faafd0d4b0027 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:34 +0800 Subject: [PATCH 143/282] fsdax: replace mmap entry in case of CoW Replace the existing entry to the newly allocated one in case of CoW. Also, we mark the entry as PAGECACHE_TAG_TOWRITE so writeback marks this entry as writeprotected. This helps us snapshots so new write pagefaults after snapshots trigger a CoW. Link: https://lkml.kernel.org/r/20220603053738.1218681-11-ruansy.fnst@fujitsu.com Signed-off-by: Goldwyn Rodrigues Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J. 
Wong Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- fs/dax.c | 77 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 35 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 6a353838070d..04fee1569328 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -829,6 +829,23 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter return 0; } +/* + * MAP_SYNC on a dax mapping guarantees dirty metadata is + * flushed on write-faults (non-cow), but not read-faults. + */ +static bool dax_fault_is_synchronous(const struct iomap_iter *iter, + struct vm_area_struct *vma) +{ + return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) && + (iter->iomap.flags & IOMAP_F_DIRTY); +} + +static bool dax_fault_is_cow(const struct iomap_iter *iter) +{ + return (iter->flags & IOMAP_WRITE) && + (iter->iomap.flags & IOMAP_F_SHARED); +} + /* * By this point grab_mapping_entry() has ensured that we have a locked entry * of the appropriate size so we don't have to worry about downgrading PMDs to @@ -836,16 +853,19 @@ static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter * already in the tree, we will skip the insertion and just dirty the PMD as * appropriate. 
*/ -static void *dax_insert_entry(struct xa_state *xas, - struct address_space *mapping, struct vm_fault *vmf, - void *entry, pfn_t pfn, unsigned long flags, bool dirty) +static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, + const struct iomap_iter *iter, void *entry, pfn_t pfn, + unsigned long flags) { + struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *new_entry = dax_make_entry(pfn, flags); + bool dirty = !dax_fault_is_synchronous(iter, vmf->vma); + bool cow = dax_fault_is_cow(iter); if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { + if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) { unsigned long index = xas->xa_index; /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) @@ -857,12 +877,12 @@ static void *dax_insert_entry(struct xa_state *xas, xas_reset(xas); xas_lock_irq(xas); - if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { void *old; dax_disassociate_entry(entry, mapping, false); dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, - false); + cow); /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -882,6 +902,9 @@ static void *dax_insert_entry(struct xa_state *xas, if (dirty) xas_set_mark(xas, PAGECACHE_TAG_DIRTY); + if (cow) + xas_set_mark(xas, PAGECACHE_TAG_TOWRITE); + xas_unlock_irq(xas); return entry; } @@ -1123,17 +1146,15 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size, * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. 
*/ -static vm_fault_t dax_load_hole(struct xa_state *xas, - struct address_space *mapping, void **entry, - struct vm_fault *vmf) +static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, + const struct iomap_iter *iter, void **entry) { - struct inode *inode = mapping->host; + struct inode *inode = iter->inode; unsigned long vaddr = vmf->address; pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); vm_fault_t ret; - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, - DAX_ZERO_PAGE, false); + *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); trace_dax_load_hole(inode, vmf, ret); @@ -1142,7 +1163,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, #ifdef CONFIG_FS_DAX_PMD static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, - const struct iomap *iomap, void **entry) + const struct iomap_iter *iter, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1160,8 +1181,8 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, goto fallback; pfn = page_to_pfn_t(zero_page); - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, - DAX_PMD | DAX_ZERO_PAGE, false); + *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, + DAX_PMD | DAX_ZERO_PAGE); if (arch_needs_pgtable_deposit()) { pgtable = pte_alloc_one(vma->vm_mm); @@ -1194,7 +1215,7 @@ fallback: } #else static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, - const struct iomap *iomap, void **entry) + const struct iomap_iter *iter, void **entry) { return VM_FAULT_FALLBACK; } @@ -1439,17 +1460,6 @@ static vm_fault_t dax_fault_return(int error) return vmf_error(error); } -/* - * MAP_SYNC on a dax mapping guarantees dirty metadata is - * flushed on write-faults (non-cow), but not read-faults. 
- */ -static bool dax_fault_is_synchronous(unsigned long flags, - struct vm_area_struct *vma, const struct iomap *iomap) -{ - return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) - && (iomap->flags & IOMAP_F_DIRTY); -} - /* * When handling a synchronous page fault and the inode need a fsync, we can * insert the PTE/PMD into page tables only after that fsync happened. Skip @@ -1507,13 +1517,11 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, const struct iomap_iter *iter, pfn_t *pfnp, struct xa_state *xas, void **entry, bool pmd) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = &iter->srcmap; size_t size = pmd ? PMD_SIZE : PAGE_SIZE; loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; - bool write = vmf->flags & FAULT_FLAG_WRITE; - bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap); + bool write = iter->flags & IOMAP_WRITE; unsigned long entry_flags = pmd ? DAX_PMD : 0; int err = 0; pfn_t pfn; @@ -1526,8 +1534,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, if (!write && (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { if (!pmd) - return dax_load_hole(xas, mapping, entry, vmf); - return dax_pmd_load_hole(xas, vmf, iomap, entry); + return dax_load_hole(xas, vmf, iter, entry); + return dax_pmd_load_hole(xas, vmf, iter, entry); } if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) { @@ -1539,8 +1547,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, if (err) return pmd ? 
VM_FAULT_FALLBACK : dax_fault_return(err); - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags, - write && !sync); + *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags); if (write && srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) { @@ -1549,7 +1556,7 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, return dax_fault_return(err); } - if (sync) + if (dax_fault_is_synchronous(iter, vmf->vma)) return dax_fault_synchronous_pfnp(pfnp, pfn); /* insert PMD pfn */ From 8dbfc76da30472cfa07218a27eaaa538f0a49551 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:35 +0800 Subject: [PATCH 144/282] fsdax: add dax_iomap_cow_copy() for dax zero Punching a hole in a reflinked file needs dax_iomap_cow_copy() too. Otherwise, the data in the unaligned area will be incorrect. So, add the CoW operation for the unaligned case in dax_memzero(). Link: https://lkml.kernel.org/r/20220603053738.1218681-12-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Ritesh Harjani Reviewed-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- fs/dax.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 04fee1569328..0aab32300531 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1221,17 +1221,28 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ -static int dax_memzero(struct dax_device *dax_dev, pgoff_t pgoff, - unsigned int offset, size_t size) +static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) { + const struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + unsigned offset = offset_in_page(pos); + pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); void *kaddr; long ret; - ret = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL); - if (ret > 0) { - memset(kaddr + offset, 0, size); - dax_flush(dax_dev, kaddr + offset, size); - } + ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, + NULL); + if (ret < 0) + return ret; + memset(kaddr + offset, 0, size); + if (srcmap->addr != iomap->addr) { + ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap, + kaddr); + if (ret < 0) + return ret; + dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE); + } else + dax_flush(iomap->dax_dev, kaddr + offset, size); return ret; } @@ -1258,7 +1269,7 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else - rc = dax_memzero(iomap->dax_dev, pgoff, offset, size); + rc = dax_memzero(iter, pos, size); dax_read_unlock(id); if (rc < 0) From 6f7db3894ae23eb5d40af4efb404aa0c072a68d2 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:36 +0800 Subject: 
[PATCH 145/282] fsdax: dedup file range to use a compare function With dax we cannot deal with readpage() etc. So, we create a dax comparison function which is similar with vfs_dedupe_file_range_compare(). And introduce dax_remap_file_range_prep() for filesystem use. Link: https://lkml.kernel.org/r/20220603053738.1218681-13-ruansy.fnst@fujitsu.com Signed-off-by: Goldwyn Rodrigues Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/dax.c | 82 ++++++++++++++++++++++++++++++++++++++++++++ fs/remap_range.c | 31 ++++++++++++++--- fs/xfs/xfs_reflink.c | 8 +++-- include/linux/dax.h | 8 +++++ include/linux/fs.h | 12 ++++--- 5 files changed, 130 insertions(+), 11 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 0aab32300531..e0f9c4a0a0c1 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1873,3 +1873,85 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, return dax_insert_pfn_mkwrite(vmf, pfn, order); } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); + +static loff_t dax_range_compare_iter(struct iomap_iter *it_src, + struct iomap_iter *it_dest, u64 len, bool *same) +{ + const struct iomap *smap = &it_src->iomap; + const struct iomap *dmap = &it_dest->iomap; + loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + void *saddr, *daddr; + int id, ret; + + len = min(len, min(smap->length, dmap->length)); + + if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { + *same = true; + return len; + } + + if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { + *same = false; + return 0; + } + + id = dax_read_lock(); + ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE), + &saddr, NULL); + if (ret < 0) + goto out_unlock; + + ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE), + &daddr, NULL); + if (ret < 0) 
+ goto out_unlock; + + *same = !memcmp(saddr, daddr, len); + if (!*same) + len = 0; + dax_read_unlock(id); + return len; + +out_unlock: + dax_read_unlock(id); + return -EIO; +} + +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dst, loff_t dstoff, loff_t len, bool *same, + const struct iomap_ops *ops) +{ + struct iomap_iter src_iter = { + .inode = src, + .pos = srcoff, + .len = len, + .flags = IOMAP_DAX, + }; + struct iomap_iter dst_iter = { + .inode = dst, + .pos = dstoff, + .len = len, + .flags = IOMAP_DAX, + }; + int ret; + + while ((ret = iomap_iter(&src_iter, ops)) > 0) { + while ((ret = iomap_iter(&dst_iter, ops)) > 0) { + dst_iter.processed = dax_range_compare_iter(&src_iter, + &dst_iter, len, same); + } + if (ret <= 0) + src_iter.processed = ret; + } + return ret; +} + +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, ops); +} +EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); diff --git a/fs/remap_range.c b/fs/remap_range.c index e112b5424cdb..231de627c1b9 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" #include @@ -271,9 +272,11 @@ out_error: * If there's an error, then the usual negative error code is returned. * Otherwise returns 0 with *len set to the request length. 
*/ -int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *len, unsigned int remap_flags) +int +__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); @@ -333,8 +336,18 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; - ret = vfs_dedupe_file_range_compare(file_in, pos_in, - file_out, pos_out, *len, &is_same); + if (*len == 0) + return 0; + + if (!IS_DAX(inode_in)) + ret = vfs_dedupe_file_range_compare(file_in, pos_in, + file_out, pos_out, *len, &is_same); + else if (dax_read_ops) + ret = dax_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same, + dax_read_ops); + else + return -EINVAL; if (ret) return ret; if (!is_same) @@ -352,6 +365,14 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, return ret; } + +int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags) +{ + return __generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, NULL); +} EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index e7a7c00d93be..cbaf36d21020 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1367,8 +1367,12 @@ xfs_reflink_remap_prep( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, - len, remap_flags); + if (!IS_DAX(inode_in)) + ret = generic_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags); + 
else + ret = dax_remap_file_range_prep(file_in, pos_in, file_out, + pos_out, len, remap_flags, &xfs_read_iomap_ops); if (ret || *len == 0) goto out_unlock; diff --git a/include/linux/dax.h b/include/linux/dax.h index 7116681b48c0..ba985333e26b 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -246,6 +246,14 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); +int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same, + const struct iomap_ops *ops); +int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *ops); static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..134e9d7ad5d6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -74,6 +74,7 @@ struct fsverity_operations; struct fs_context; struct fs_parameter_spec; struct fileattr; +struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2070,10 +2071,13 @@ extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t len, unsigned int flags); -extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - loff_t *count, - unsigned int remap_flags); +int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *len, unsigned int remap_flags, + const struct iomap_ops *dax_read_ops); +int 
generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + loff_t *count, unsigned int remap_flags); extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); From ea6c49b784f0998297fb206af81c28dfaf8bb343 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:37 +0800 Subject: [PATCH 146/282] xfs: support CoW in fsdax mode In fsdax mode, WRITE and ZERO on a shared extent need CoW performed. After that, new allocated extents needs to be remapped to the file. So, add a CoW identification in ->iomap_begin(), and implement ->iomap_end() to do the remapping work. [akpm@linux-foundation.org: make xfs_dax_fault() static] Link: https://lkml.kernel.org/r/20220603053738.1218681-14-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. Wong Cc: Al Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/xfs/xfs_file.c | 33 ++++++++++++++++++++++++++++----- fs/xfs/xfs_iomap.c | 30 +++++++++++++++++++++++++++++- fs/xfs/xfs_iomap.h | 1 + 3 files changed, 58 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5a171c0b244b..45d8e64188f3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -25,6 +25,7 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" +#include #include #include #include @@ -669,7 +670,7 @@ xfs_file_dax_write( pos = iocb->ki_pos; trace_xfs_file_dax_write(iocb, from); - ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); @@ -1254,6 +1255,31 @@ xfs_file_llseek( return 
vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } +#ifdef CONFIG_FS_DAX +static int +xfs_dax_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault, + pfn_t *pfn) +{ + return dax_iomap_fault(vmf, pe_size, pfn, NULL, + (write_fault && !vmf->cow_page) ? + &xfs_dax_write_iomap_ops : + &xfs_read_iomap_ops); +} +#else +static int +xfs_dax_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault, + pfn_t *pfn) +{ + return 0; +} +#endif + /* * Locking for serialisation of IO during page faults. This results in a lock * ordering of: @@ -1285,10 +1311,7 @@ __xfs_filemap_fault( pfn_t pfn; xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, - (write_fault && !vmf->cow_page) ? - &xfs_direct_write_iomap_ops : - &xfs_read_iomap_ops); + ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a393259a3a3..4c07f5e718fb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -773,7 +773,8 @@ xfs_direct_write_iomap_begin( /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared, - &lockmode, flags & IOMAP_DIRECT); + &lockmode, + (flags & IOMAP_DIRECT) || IS_DAX(inode)); if (error) goto out_unlock; if (shared) @@ -867,6 +868,33 @@ const struct iomap_ops xfs_direct_write_iomap_ops = { .iomap_begin = xfs_direct_write_iomap_begin, }; +static int +xfs_dax_write_iomap_end( + struct inode *inode, + loff_t pos, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + + if (!xfs_is_cow_inode(ip)) + return 0; + + if (!written) { + xfs_reflink_cancel_cow_range(ip, pos, length, true); + return 0; + } + + return xfs_reflink_end_cow(ip, pos, written); +} + +const struct iomap_ops xfs_dax_write_iomap_ops = { + 
.iomap_begin = xfs_direct_write_iomap_begin, + .iomap_end = xfs_dax_write_iomap_end, +}; + static int xfs_buffered_write_iomap_begin( struct inode *inode, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index e88dc162c785..c782e8c0479c 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -51,5 +51,6 @@ extern const struct iomap_ops xfs_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; +extern const struct iomap_ops xfs_dax_write_iomap_ops; #endif /* __XFS_IOMAP_H__*/ From 13f9e267fdbba30820ce3999338b7d8fe7d6bf77 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Fri, 3 Jun 2022 13:37:38 +0800 Subject: [PATCH 147/282] xfs: add dax dedupe support Introduce xfs_mmaplock_two_inodes_and_break_dax_layout() for dax files who are going to be deduped. After that, call compare range function only when files are both DAX or not. Link: https://lkml.kernel.org/r/20220603053738.1218681-15-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Darrick J. 
Wong Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Dan Williams Cc: Dan Williams Cc: Dave Chinner Cc: Goldwyn Rodrigues Cc: Goldwyn Rodrigues Cc: Jane Chu Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Ritesh Harjani Signed-off-by: Andrew Morton --- fs/xfs/xfs_file.c | 2 +- fs/xfs/xfs_inode.c | 69 +++++++++++++++++++++++++++++++++++++++++--- fs/xfs/xfs_inode.h | 1 + fs/xfs/xfs_reflink.c | 4 +-- 4 files changed, 69 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 45d8e64188f3..92a3af1e4414 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -808,7 +808,7 @@ xfs_wait_dax_page( xfs_ilock(ip, XFS_MMAPLOCK_EXCL); } -static int +int xfs_break_dax_layouts( struct inode *inode, bool *retry) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 52d6f2c7d58b..6f251781ebd0 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3767,6 +3767,50 @@ retry: return 0; } +static int +xfs_mmaplock_two_inodes_and_break_dax_layout( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + int error; + bool retry; + struct page *page; + + if (ip1->i_ino > ip2->i_ino) + swap(ip1, ip2); + +again: + retry = false; + /* Lock the first inode */ + xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); + error = xfs_break_dax_layouts(VFS_I(ip1), &retry); + if (error || retry) { + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + if (error == 0 && retry) + goto again; + return error; + } + + if (ip1 == ip2) + return 0; + + /* Nested lock the second inode */ + xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1)); + /* + * We cannot use xfs_break_dax_layouts() directly here because it may + * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable + * for this nested lock case. 
+ */ + page = dax_layout_busy_page(VFS_I(ip2)->i_mapping); + if (page && page_ref_count(page) != 1) { + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + goto again; + } + + return 0; +} + /* * Lock two inodes so that userspace cannot initiate I/O via file syscalls or * mmap activity. @@ -3781,8 +3825,19 @@ xfs_ilock2_io_mmap( ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); if (ret) return ret; - filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, - VFS_I(ip2)->i_mapping); + + if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { + ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2); + if (ret) { + inode_unlock(VFS_I(ip2)); + if (ip1 != ip2) + inode_unlock(VFS_I(ip1)); + return ret; + } + } else + filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); + return 0; } @@ -3792,8 +3847,14 @@ xfs_iunlock2_io_mmap( struct xfs_inode *ip1, struct xfs_inode *ip2) { - filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, - VFS_I(ip2)->i_mapping); + if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + if (ip1 != ip2) + xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + } else + filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); + inode_unlock(VFS_I(ip2)); if (ip1 != ip2) inode_unlock(VFS_I(ip1)); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7be6f8e705ab..8313cc83b6ee 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -467,6 +467,7 @@ xfs_itruncate_extents( } /* from xfs_file.c */ +int xfs_break_dax_layouts(struct inode *inode, bool *retry); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cbaf36d21020..d07f06ff0f13 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1363,8 +1363,8 @@ xfs_reflink_remap_prep( if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) goto out_unlock; - /* Don't share DAX file 
data for now. */ - if (IS_DAX(inode_in) || IS_DAX(inode_out)) + /* Don't share DAX file data with non-DAX file. */ + if (IS_DAX(inode_in) != IS_DAX(inode_out)) goto out_unlock; if (!IS_DAX(inode_in)) From 66137fb34a4b5e519d4e4679ae5aca9989688a94 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:34 -0700 Subject: [PATCH 148/282] mm: khugepaged: check THP flag in hugepage_vma_check() Patch series "Cleanup transhuge_xxx helpers", v5. This series is the follow-up of the discussion about cleaning up transhuge_xxx helpers at https://lore.kernel.org/linux-mm/627a71f8-e879-69a5-ceb3-fc8d29d2f7f1@suse.cz/. THP has a bunch of helpers that do VMA sanity checks for different paths; they do similar checks at most call sites and contain a lot of duplicated code. It is also confusing which helpers should be used under which conditions. This series reorganizes and cleans up the code so that all the checks can be consolidated into hugepage_vma_check(). The transhuge_vma_enabled(), transparent_hugepage_active() and __transparent_hugepage_enabled() are killed by this series. This patch (of 7): Currently the THP flag check in hugepage_vma_check() will fall through if the flag is NEVER and VM_HUGEPAGE is set. This is not a problem for now since all the callers have the flag checked before or can't be invoked if the flag is NEVER. However, the following patch will call hugepage_vma_check() in more places, for example, page fault, so this flag must be checked in hugepage_vma_check(). Link: https://lkml.kernel.org/r/20220616174840.1202070-1-shy828301@gmail.com Link: https://lkml.kernel.org/r/20220616174840.1202070-2-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Reviewed-by: Miaohe Lin Cc: Vlastimil Babka Cc: Kirill A.
Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/khugepaged.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index dea102170ab3..3eec970a884d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -451,6 +451,9 @@ bool hugepage_vma_check(struct vm_area_struct *vma, if (shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); + if (!khugepaged_enabled()) + return false; + /* THP settings require madvise. */ if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) return false; From 4fa6893faeaaea4fe4440512d2a708527ef47051 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:35 -0700 Subject: [PATCH 149/282] mm: thp: consolidate vma size check to transhuge_vma_suitable There are a couple of places that check whether the vma size is ok for THP or whether the address fits; they are open-coded and duplicated. Use transhuge_vma_suitable() to do the job by passing in (vma->vm_end - HPAGE_PMD_SIZE). Move vma size check into hugepage_vma_check(). This makes khugepaged_enter() the same as khugepaged_enter_vma(). There is just one caller for khugepaged_enter(), replace it with khugepaged_enter_vma() and remove khugepaged_enter(). Link: https://lkml.kernel.org/r/20220616174840.1202070-3-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A.
Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 11 +++++++++++ include/linux/khugepaged.h | 14 -------------- mm/huge_memory.c | 2 +- mm/khugepaged.c | 19 ++++++------------- 4 files changed, 18 insertions(+), 28 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 648cb3ce7099..8a5a8bfce0f5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -116,6 +116,17 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; +/* + * Do the below checks: + * - For file vma, check if the linear page offset of vma is + * HPAGE_PMD_NR aligned within the file. The hugepage is + * guaranteed to be hugepage-aligned within the file, but we must + * check that the PMD-aligned addresses in the VMA map to + * PMD-aligned offsets within the file, else the hugepage will + * not be PMD-mappable. + * - For all vmas, check if the haddr is in an aligned HPAGE_PMD_SIZE + * area. 
+ */ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 392d34c3c59a..31ca8a7f78f4 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -51,16 +51,6 @@ static inline void khugepaged_exit(struct mm_struct *mm) if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) __khugepaged_exit(mm); } - -static inline void khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - khugepaged_enabled()) { - if (hugepage_vma_check(vma, vm_flags)) - __khugepaged_enter(vma->vm_mm); - } -} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { @@ -68,10 +58,6 @@ static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm static inline void khugepaged_exit(struct mm_struct *mm) { } -static inline void khugepaged_enter(struct vm_area_struct *vma, - unsigned long vm_flags) -{ -} static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a563de8234c1..2751649aaf33 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -726,7 +726,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - khugepaged_enter(vma, vma->vm_flags); + khugepaged_enter_vma(vma, vma->vm_flags); if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3eec970a884d..c7e22135f1b5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -443,8 +443,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma, if (vma_is_dax(vma)) return false; - if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - - vma->vm_pgoff, HPAGE_PMD_NR)) + /* Check alignment for file vma and size for both 
file and anon vma */ + if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) return false; /* Enabled via shmem mount options or sysfs settings. */ @@ -505,9 +505,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - khugepaged_enabled() && - (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < - (vma->vm_end & HPAGE_PMD_MASK))) { + khugepaged_enabled()) { if (hugepage_vma_check(vma, vm_flags)) __khugepaged_enter(vma->vm_mm); } @@ -948,7 +946,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct vm_area_struct **vmap) { struct vm_area_struct *vma; - unsigned long hstart, hend; if (unlikely(khugepaged_test_exit(mm))) return SCAN_ANY_PROCESS; @@ -957,9 +954,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!vma) return SCAN_VMA_NULL; - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (address < hstart || address + HPAGE_PMD_SIZE > hend) + if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; if (!hugepage_vma_check(vma, vma->vm_flags)) return SCAN_VMA_CHECK; @@ -2135,10 +2130,8 @@ skip: progress++; continue; } - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) - goto skip; + hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); + hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) goto skip; if (khugepaged_scan.address < hstart) From f707fa493784b515ced01d4e261afe16fc784b5d Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:36 -0700 Subject: [PATCH 150/282] mm: khugepaged: better comments for anon vma check in hugepage_vma_revalidate The hugepage_vma_revalidate() needs to check if the vma is still anonymous vma or not since the address may be unmapped then remapped to file before khugepaged reaquired the 
mmap_lock. The old comment is not quite helpful, elaborate this with better comment. Link: https://lkml.kernel.org/r/20220616174840.1202070-4-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/khugepaged.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c7e22135f1b5..67e144e64b7f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -958,7 +958,13 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_ADDRESS_RANGE; if (!hugepage_vma_check(vma, vma->vm_flags)) return SCAN_VMA_CHECK; - /* Anon VMA expected */ + /* + * Anon VMA expected, the address may be unmapped then + * remapped to file after khugepaged reaquired the mmap_lock. + * + * hugepage_vma_check may return true for qualified file + * vmas. + */ if (!vma->anon_vma || !vma_is_anonymous(vma)) return SCAN_VMA_CHECK; return 0; From 9fec51689ff60d9766b38051a0b1692f93d95364 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:37 -0700 Subject: [PATCH 151/282] mm: thp: kill transparent_hugepage_active() The transparent_hugepage_active() was introduced to show THP eligibility bit in smaps in proc, smaps is the only user. But it actually does the similar check as hugepage_vma_check() which is used by khugepaged. We definitely don't have to maintain two similar checks, so kill transparent_hugepage_active(). This patch also fixed the wrong behavior for VM_NO_KHUGEPAGED vmas. Also move hugepage_vma_check() to huge_memory.c and huge_mm.h since it is not only for khugepaged anymore. [akpm@linux-foundation.org: check vma->vm_mm, per Zach] [akpm@linux-foundation.org: add comment to vdso check] Link: https://lkml.kernel.org/r/20220616174840.1202070-5-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- include/linux/huge_mm.h | 16 ++++++----- include/linux/khugepaged.h | 2 -- mm/huge_memory.c | 55 +++++++++++++++++++++++++++++++------- mm/khugepaged.c | 48 +++------------------------------ 5 files changed, 60 insertions(+), 63 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1d7fd832123b..072cf770b5d0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -863,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - transparent_hugepage_active(vma)); + hugepage_vma_check(vma, vma->vm_flags, true)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8a5a8bfce0f5..64487bcd0c7b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -202,7 +202,9 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool transparent_hugepage_active(struct vm_area_struct *vma); +bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -351,11 +353,6 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) return false; } -static inline bool transparent_hugepage_active(struct vm_area_struct *vma) -{ - return false; -} - static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { @@ -368,6 +365,13 @@ static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, return false; } +static inline bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps) +{ + return false; +} + static inline void prep_transhuge_page(struct page *page) {} #define transparent_hugepage_flags 0UL diff 
--git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 31ca8a7f78f4..ea5fd4c398f7 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -10,8 +10,6 @@ extern struct attribute_group khugepaged_attr_group; extern int khugepaged_init(void); extern void khugepaged_destroy(void); extern int start_stop_khugepaged(void); -extern bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags); extern void __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern void khugepaged_enter_vma(struct vm_area_struct *vma, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2751649aaf33..8cbd21aaf03e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -69,21 +69,56 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool transparent_hugepage_active(struct vm_area_struct *vma) +bool hugepage_vma_check(struct vm_area_struct *vma, + unsigned long vm_flags, + bool smaps) { - /* The addr is used to check if the vma size fits */ - unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; - - if (!transhuge_vma_suitable(vma, addr)) + if (!vma->vm_mm) /* vdso */ return false; - if (vma_is_anonymous(vma)) - return __transparent_hugepage_enabled(vma); - if (vma_is_shmem(vma)) + + if (!transhuge_vma_enabled(vma, vm_flags)) + return false; + + if (vm_flags & VM_NO_KHUGEPAGED) + return false; + + /* Don't run khugepaged against DAX vma */ + if (vma_is_dax(vma)) + return false; + + /* Check alignment for file vma and size for both file and anon vma */ + if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) + return false; + + /* Enabled via shmem mount options or sysfs settings. 
*/ + if (shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma)) + + if (!khugepaged_enabled()) + return false; + + /* THP settings require madvise. */ + if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + return false; + + /* Only regular file is valid */ + if (file_thp_enabled(vma)) return true; - return false; + if (!vma_is_anonymous(vma)) + return false; + + if (vma_is_temporary_stack(vma)) + return false; + + /* + * THPeligible bit of smaps should show 1 for proper VMAs even + * though anon_vma is not initialized yet. + */ + if (!vma->anon_vma) + return smaps; + + return true; } static bool get_huge_zero_page(void) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 67e144e64b7f..6bbf3adac534 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -430,46 +430,6 @@ static inline int khugepaged_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - if (!transhuge_vma_enabled(vma, vm_flags)) - return false; - - if (vm_flags & VM_NO_KHUGEPAGED) - return false; - - /* Don't run khugepaged against DAX vma */ - if (vma_is_dax(vma)) - return false; - - /* Check alignment for file vma and size for both file and anon vma */ - if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) - return false; - - /* Enabled via shmem mount options or sysfs settings. */ - if (shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma); - - if (!khugepaged_enabled()) - return false; - - /* THP settings require madvise. 
*/ - if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) - return false; - - /* Only regular file is valid */ - if (file_thp_enabled(vma)) - return true; - - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return false; - if (vma_is_temporary_stack(vma)) - return false; - - return true; -} - void __khugepaged_enter(struct mm_struct *mm) { struct mm_slot *mm_slot; @@ -506,7 +466,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && khugepaged_enabled()) { - if (hugepage_vma_check(vma, vm_flags)) + if (hugepage_vma_check(vma, vm_flags, false)) __khugepaged_enter(vma->vm_mm); } } @@ -956,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags)) + if (!hugepage_vma_check(vma, vma->vm_flags, false)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1441,7 +1401,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) * the valid THP. 
Add extra VM_HUGEPAGE so hugepage_vma_check() * will not fail the vma for missing VM_HUGEPAGE */ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) + if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false)) return; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2131,7 +2091,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false)) { skip: progress++; continue; From 7da4e2cb8b1ff8221759bfc7512d651ee69516dc Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:38 -0700 Subject: [PATCH 152/282] mm: thp: kill __transhuge_page_enabled() The page fault path checks THP eligibility with __transhuge_page_enabled() which does the similar thing as hugepage_vma_check(), so use hugepage_vma_check() instead. However page fault allows DAX and !anon_vma cases, so added a new flag, in_pf, to hugepage_vma_check() to make page fault work correctly. The in_pf flag is also used to skip shmem and file THP for page fault since shmem handles THP in its own shmem_fault() and file THP allocation on fault is not supported yet. Also remove hugepage_vma_enabled() since hugepage_vma_check() is the only caller now, it is not necessary to have a helper function. Link: https://lkml.kernel.org/r/20220616174840.1202070-6-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- include/linux/huge_mm.h | 57 ++--------------------------------------- mm/huge_memory.c | 53 +++++++++++++++++++++++++++++--------- mm/khugepaged.c | 8 +++--- mm/memory.c | 7 +++-- 5 files changed, 53 insertions(+), 74 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 072cf770b5d0..a3398d0f1927 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -863,7 +863,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - hugepage_vma_check(vma, vma->vm_flags, true)); + hugepage_vma_check(vma, vma->vm_flags, true, false)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 64487bcd0c7b..cd8a6c5d9fe5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -146,48 +146,6 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - /* Explicitly disabled through madvise. */ - if ((vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - return true; -} - -/* - * to be used on vmas which are known to support THP. - * Use transparent_hugepage_active otherwise - */ -static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) -{ - - /* - * If the hardware/firmware marked hugepage support disabled. 
- */ - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) - return false; - - if (!transhuge_vma_enabled(vma, vma->vm_flags)) - return false; - - if (vma_is_temporary_stack(vma)) - return false; - - if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) - return true; - - if (vma_is_dax(vma)) - return true; - - if (transparent_hugepage_flags & - (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) - return !!(vma->vm_flags & VM_HUGEPAGE); - - return false; -} - static inline bool file_thp_enabled(struct vm_area_struct *vma) { struct inode *inode; @@ -204,7 +162,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps); + bool smaps, bool in_pf); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -348,26 +306,15 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) return false; } -static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) -{ - return false; -} - static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long addr) { return false; } -static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, - unsigned long vm_flags) -{ - return false; -} - static inline bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps) + bool smaps, bool in_pf) { return false; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8cbd21aaf03e..4b90c7021e52 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -71,27 +71,53 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL; bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, - bool smaps) + bool smaps, bool in_pf) { if (!vma->vm_mm) /* vdso */ return false; - if (!transhuge_vma_enabled(vma, vm_flags)) + /* + * Explicitly disabled through madvise or prctl, or some + * architectures may disable THP for some mappings, for + * example, s390 kvm. 
+ * */ + if ((vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; + /* + * If the hardware/firmware marked hugepage support disabled. + */ + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) return false; + /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ + if (vma_is_dax(vma)) + return in_pf; + + /* + * Special VMA and hugetlb VMA. + * Must be checked after dax since some dax mappings may have + * VM_MIXEDMAP set. + */ if (vm_flags & VM_NO_KHUGEPAGED) return false; - /* Don't run khugepaged against DAX vma */ - if (vma_is_dax(vma)) + /* + * Check alignment for file vma and size for both file and anon vma. + * + * Skip the check for page fault. Huge fault does the check in fault + * handlers. And this check is not suitable for huge PUD fault. + */ + if (!in_pf && + !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) return false; - /* Check alignment for file vma and size for both file and anon vma */ - if (!transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE))) - return false; - - /* Enabled via shmem mount options or sysfs settings. */ - if (shmem_file(vma->vm_file)) + /* + * Enabled via shmem mount options or sysfs settings. + * Must be done before hugepage flags check since shmem has its + * own flags. + */ + if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); if (!khugepaged_enabled()) @@ -102,7 +128,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, return false; /* Only regular file is valid */ - if (file_thp_enabled(vma)) + if (!in_pf && file_thp_enabled(vma)) return true; if (!vma_is_anonymous(vma)) @@ -114,9 +140,12 @@ bool hugepage_vma_check(struct vm_area_struct *vma, /* * THPeligible bit of smaps should show 1 for proper VMAs even * though anon_vma is not initialized yet. + * + * Allow page fault since anon_vma may be not initialized until + * the first page fault. 
*/ if (!vma->anon_vma) - return smaps; + return (smaps || in_pf); return true; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6bbf3adac534..d683ef1edeb5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -466,7 +466,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && khugepaged_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false)) + if (hugepage_vma_check(vma, vm_flags, false, false)) __khugepaged_enter(vma->vm_mm); } } @@ -916,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1401,7 +1401,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() * will not fail the vma for missing VM_HUGEPAGE */ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false)) + if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false)) return; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2091,7 +2091,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) { skip: progress++; continue; diff --git a/mm/memory.c b/mm/memory.c index dce0b2e686eb..2392d5db473a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4970,6 +4970,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; + unsigned long vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; @@ -4983,7 +4984,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct 
*vma, if (!vmf.pud) return VM_FAULT_OOM; retry_pud: - if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { + if (pud_none(*vmf.pud) && + hugepage_vma_check(vma, vm_flags, false, true)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5016,7 +5018,8 @@ retry_pud: if (pud_trans_unstable(vmf.pud)) goto retry_pud; - if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { + if (pmd_none(*vmf.pmd) && + hugepage_vma_check(vma, vm_flags, false, true)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; From 1064026bab9f011bdea1251d44d66bbbcee04f6e Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 16 Jun 2022 10:48:39 -0700 Subject: [PATCH 153/282] mm: khugepaged: reorg some khugepaged helpers The khugepaged_{enabled|always|req_madv} are not khugepaged only anymore, move them to huge_mm.h and rename to hugepage_flags_xxx, and remove khugepaged_req_madv due to no users. Also move khugepaged_defrag to khugepaged.c since its only caller is in that file, it doesn't have to be in a header file. Link: https://lkml.kernel.org/r/20220616174840.1202070-7-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. 
Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 ++++++++ include/linux/khugepaged.h | 14 -------------- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 18 +++++++++++------- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cd8a6c5d9fe5..ae3d8e2fd9e2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -116,6 +116,14 @@ extern struct kobj_attribute shmem_enabled_attr; extern unsigned long transparent_hugepage_flags; +#define hugepage_flags_enabled() \ + (transparent_hugepage_flags & \ + ((1<flags)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4b90c7021e52..8e1b3d9f7ebf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -120,11 +120,11 @@ bool hugepage_vma_check(struct vm_area_struct *vma, if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (!khugepaged_enabled()) + if (!hugepage_flags_enabled()) return false; /* THP settings require madvise. 
*/ - if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) return false; /* Only regular file is valid */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d683ef1edeb5..01f71786d530 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -465,7 +465,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && - khugepaged_enabled()) { + hugepage_flags_enabled()) { if (hugepage_vma_check(vma, vm_flags, false, false)) __khugepaged_enter(vma->vm_mm); } @@ -761,6 +761,10 @@ static bool khugepaged_scan_abort(int nid) return false; } +#define khugepaged_defrag() \ + (transparent_hugepage_flags & \ + (1< Date: Thu, 16 Jun 2022 10:48:40 -0700 Subject: [PATCH 154/282] doc: proc: fix the description to THPeligible The THPeligible bit shows 1 if and only if the VMA is eligible for allocating THP and the THP is also PMD mappable. Some misaligned file VMAs may be eligible for allocating THP but the THP can't be mapped by PMD. Make this more explicit to avoid ambiguity. Link: https://lkml.kernel.org/r/20220616174840.1202070-8-shy828301@gmail.com Signed-off-by: Yang Shi Reviewed-by: Zach O'Keefe Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 0b5120ff506c..e7aafc82be99 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -517,8 +517,10 @@ replaced by copy-on-write) part of the underlying shmem object out on swap. "SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this does not take into account swapped out page of underlying shmem objects. "Locked" indicates whether the mapping is locked in memory or not.
+ "THPeligible" indicates whether the mapping is eligible for allocating THP -pages - 1 if true, 0 otherwise. It just shows the current status. +pages as well as the THP is PMD mappable or not - 1 if true, 0 otherwise. +It just shows the current status. "VmFlags" field deserves a separate description. This member represents the kernel flags associated with the particular virtual memory area in two letter From 3de0de758029a0beb1d47facd3d390d2804a3e94 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Wed, 15 Jun 2022 14:22:18 +0800 Subject: [PATCH 155/282] kasan: separate double free case from invalid free Currently, KASAN describes all invalid-free/double-free bugs as "double-free or invalid-free". This is ambiguous. KASAN should report "double-free" when a double-free is a more likely cause (the address points to the start of an object) and report "invalid-free" otherwise [1]. [1] https://bugzilla.kernel.org/show_bug.cgi?id=212193 Link: https://lkml.kernel.org/r/20220615062219.22618-1-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Vincenzo Frascino Cc: Matthias Brugger Cc: Chinwen Chang Cc: Yee Lee Cc: Andrew Yang Signed-off-by: Andrew Morton --- mm/kasan/common.c | 8 ++++---- mm/kasan/kasan.h | 3 ++- mm/kasan/report.c | 12 ++++++++---- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index c40c0e7b3b5f..707c3a527fcb 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -343,7 +343,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, if (unlikely(nearest_obj(cache, virt_to_slab(object), object) != object)) { - kasan_report_invalid_free(tagged_object, ip); + kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_INVALID_FREE); return true; } @@ -352,7 +352,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if 
(!kasan_byte_accessible(tagged_object)) { - kasan_report_invalid_free(tagged_object, ip); + kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE); return true; } @@ -377,12 +377,12 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) { - kasan_report_invalid_free(ptr, ip); + kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE); return true; } if (!kasan_byte_accessible(ptr)) { - kasan_report_invalid_free(ptr, ip); + kasan_report_invalid_free(ptr, ip, KASAN_REPORT_DOUBLE_FREE); return true; } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 610d60d6e5b8..01c03e45acd4 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -125,6 +125,7 @@ static inline bool kasan_sync_fault_possible(void) enum kasan_report_type { KASAN_REPORT_ACCESS, KASAN_REPORT_INVALID_FREE, + KASAN_REPORT_DOUBLE_FREE, }; struct kasan_report_info { @@ -277,7 +278,7 @@ static inline void kasan_print_address_stack_frame(const void *addr) { } bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); -void kasan_report_invalid_free(void *object, unsigned long ip); +void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b341a191651d..fe3f606b3a98 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -176,8 +176,12 @@ static void end_report(unsigned long *flags, void *addr) static void print_error_description(struct kasan_report_info *info) { if (info->type == KASAN_REPORT_INVALID_FREE) { - pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", - (void *)info->ip); + pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip); + return; + } + + if (info->type == KASAN_REPORT_DOUBLE_FREE) { + pr_err("BUG: KASAN: 
double-free in %pS\n", (void *)info->ip); return; } @@ -433,7 +437,7 @@ static void print_report(struct kasan_report_info *info) } } -void kasan_report_invalid_free(void *ptr, unsigned long ip) +void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) { unsigned long flags; struct kasan_report_info info; @@ -448,7 +452,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip) start_report(&flags, true); - info.type = KASAN_REPORT_INVALID_FREE; + info.type = type; info.access_addr = ptr; info.first_bad_addr = kasan_reset_tag(ptr); info.access_size = 0; From e95a9851787bbb3cd4deb40fe8bab03f731852d1 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 21 Jun 2022 16:56:17 -0700 Subject: [PATCH 156/282] hugetlb: skip to end of PT page mapping when pte not present Patch series "hugetlb: speed up linear address scanning", v2. At unmap, fork and remap time hugetlb address ranges are linearly scanned. We can optimize these scans if the ranges are sparsely populated. Also, enable page table "Lazy copy" for hugetlb at fork. NOTE: Architectures not defining CONFIG_ARCH_WANT_GENERAL_HUGETLB need to add an arch specific version of hugetlb_mask_last_page() to take advantage of sparse address scanning improvements. Baolin Wang added the routine for arm64. Other architectures which could be optimized are: ia64, mips, parisc, powerpc, s390, sh and sparc. This patch (of 4): HugeTLB address ranges are linearly scanned during fork, unmap and remap operations. If a non-present entry is encountered, the code currently continues to the next huge page aligned address. However, a non-present entry implies that the page table page for that entry is not present. Therefore, the linear scan can skip to the end of range mapped by the page table page. This can speed operations on large sparsely populated hugetlb mappings. Create a new routine hugetlb_mask_last_page() that will return an address mask.
When the mask is ORed with an address, the result will be the address of the last huge page mapped by the associated page table page. Use this mask to update addresses in routines which linearly scan hugetlb address ranges when a non-present pte is encountered. hugetlb_mask_last_page is related to the implementation of huge_pte_offset as hugetlb_mask_last_page is called when huge_pte_offset returns NULL. This patch only provides a complete hugetlb_mask_last_page implementation when CONFIG_ARCH_WANT_GENERAL_HUGETLB is defined. Architectures which provide their own versions of huge_pte_offset can also provide their own version of hugetlb_mask_last_page. Link: https://lkml.kernel.org/r/20220621235620.291305-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220621235620.291305-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Tested-by: Baolin Wang Reviewed-by: Baolin Wang Acked-by: Muchun Song Reported-by: kernel test robot Cc: Michal Hocko Cc: Peter Xu Cc: Naoya Horiguchi Cc: James Houghton Cc: Mina Almasry Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Paul Walmsley Cc: Christian Borntraeger Cc: Catalin Marinas Cc: Will Deacon Cc: Rolf Eike Beer Cc: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 56 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c6cccfaf8708..ce30fad5fd13 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -194,6 +194,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); +unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long *addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, diff --git 
a/mm/hugetlb.c b/mm/hugetlb.c index ffdf3fc4a83f..95fd1c36c17f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4727,6 +4727,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long npages = pages_per_huge_page(h); struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; + unsigned long last_addr_mask; int ret = 0; if (cow) { @@ -4746,11 +4747,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, i_mmap_lock_read(mapping); } + last_addr_mask = hugetlb_mask_last_page(h); for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr, sz); - if (!src_pte) + if (!src_pte) { + addr |= last_addr_mask; continue; + } dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; @@ -4767,8 +4771,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * after taking the lock below. */ dst_entry = huge_ptep_get(dst_pte); - if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) + if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) { + addr |= last_addr_mask; continue; + } dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); @@ -4928,6 +4934,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; + unsigned long last_addr_mask; unsigned long old_addr_copy; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; @@ -4943,12 +4950,16 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = huge_pte_offset(mm, old_addr, sz); - if 
(!src_pte) + if (!src_pte) { + old_addr |= last_addr_mask; + new_addr |= last_addr_mask; continue; + } if (huge_pte_none(huge_ptep_get(src_pte))) continue; @@ -4993,6 +5004,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); struct mmu_notifier_range range; + unsigned long last_addr_mask; bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); @@ -5013,11 +5025,14 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { ptep = huge_pte_offset(mm, address, sz); - if (!ptep) + if (!ptep) { + address |= last_addr_mask; continue; + } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, &address, ptep)) { @@ -6285,6 +6300,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long pages = 0, psize = huge_page_size(h); bool shared_pmd = false; struct mmu_notifier_range range; + unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; @@ -6301,12 +6317,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); + last_addr_mask = hugetlb_mask_last_page(h); i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += psize) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address, psize); - if (!ptep) + if (!ptep) { + address |= last_addr_mask; continue; + } ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, vma, &address, ptep)) { /* @@ -6856,6 +6875,33 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return (pte_t *)pmd; } +/* + * Return a mask that can be used to update an address to 
the last huge + * page in a page table page mapping size. Used to skip non-present + * page table entries when linearly scanning address ranges. Architectures + * with unique huge page to page table relationships can define their own + * version of this routine. + */ +unsigned long hugetlb_mask_last_page(struct hstate *h) +{ + unsigned long hp_size = huge_page_size(h); + + if (hp_size == PUD_SIZE) + return P4D_SIZE - PUD_SIZE; + else if (hp_size == PMD_SIZE) + return PUD_SIZE - PMD_SIZE; + else + return 0UL; +} + +#else + +/* See description above. Architectures can provide their own version. */ +__weak unsigned long hugetlb_mask_last_page(struct hstate *h) +{ + return 0UL; +} + #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ /* From 1bcdb769f9e0ad7a17891772e6b5414a2945714f Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 21 Jun 2022 16:56:18 -0700 Subject: [PATCH 157/282] arm64/hugetlb: implement arm64 specific hugetlb_mask_last_page The HugeTLB address ranges are linearly scanned during fork, unmap and remap operations, and the linear scan can skip to the end of range mapped by the page table page if hitting a non-present entry, which can help to speed linear scanning of the HugeTLB address ranges. So hugetlb_mask_last_page() is introduced to help to update the address in the loop of HugeTLB linear scanning with getting the last huge page mapped by the associated page table page[1], when a non-present entry is encountered. Considering ARM64 specific cont-pte/pmd size HugeTLB, this patch implemented an ARM64 specific hugetlb_mask_last_page() to help this case. 
[1] https://lore.kernel.org/linux-mm/20220527225849.284839-1-mike.kravetz@oracle.com/ [baolin.wang@linux.alibaba.com: fix build] Link: https://lkml.kernel.org/r/a14e7b39-6a8a-4609-b4a1-84ac574f5c96@linux.alibaba.com Link: https://lkml.kernel.org/r/20220621235620.291305-3-mike.kravetz@oracle.com Signed-off-by: Baolin Wang Signed-off-by: Mike Kravetz Acked-by: Muchun Song Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David Hildenbrand Cc: James Houghton Cc: kernel test robot Cc: Michal Hocko Cc: Mina Almasry Cc: Naoya Horiguchi Cc: Paul Walmsley Cc: Peter Xu Cc: Rolf Eike Beer Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/hugetlbpage.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 3be8f25aa5be..7430060cb0d6 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -368,6 +368,28 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; } +unsigned long hugetlb_mask_last_page(struct hstate *h) +{ + unsigned long hp_size = huge_page_size(h); + + switch (hp_size) { +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + return PGDIR_SIZE - PUD_SIZE; +#endif + case CONT_PMD_SIZE: + return PUD_SIZE - CONT_PMD_SIZE; + case PMD_SIZE: + return PUD_SIZE - PMD_SIZE; + case CONT_PTE_SIZE: + return PMD_SIZE - CONT_PTE_SIZE; + default: + break; + } + + return 0UL; +} + pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { size_t pagesize = 1UL << shift; From 4ddb4d91b82f4b64458fe35bc8e395c7c082ea2b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 21 Jun 2022 16:56:19 -0700 Subject: [PATCH 158/282] hugetlb: do not update address in huge_pmd_unshare As an optimization for loops sequentially processing hugetlb address ranges, huge_pmd_unshare would update a passed address if it unshared a pmd. Updating a loop control variable outside the loop like this is generally a bad idea. 
These loops are now using hugetlb_mask_last_page to optimize scanning when non-present ptes are discovered. The same can be done when huge_pmd_unshare returns 1 indicating a pmd was unshared. Remove address update from huge_pmd_unshare. Change the passed argument type and update all callers. In loops sequentially processing addresses use hugetlb_mask_last_page to update address if pmd is unshared. [sfr@canb.auug.org.au: fix an unused variable warning/error] Link: https://lkml.kernel.org/r/20220622171117.70850960@canb.auug.org.au Link: https://lkml.kernel.org/r/20220621235620.291305-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Signed-off-by: Stephen Rothwell Acked-by: Muchun Song Reviewed-by: Baolin Wang Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David Hildenbrand Cc: James Houghton Cc: kernel test robot Cc: Michal Hocko Cc: Mina Almasry Cc: Naoya Horiguchi Cc: Paul Walmsley Cc: Peter Xu Cc: Rolf Eike Beer Cc: Will Deacon Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 44 ++++++++++++++++------------------------- mm/rmap.c | 4 ++-- 3 files changed, 21 insertions(+), 31 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ce30fad5fd13..75ee739d815b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -196,7 +196,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep); + unsigned long addr, pte_t *ptep); void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, @@ -243,7 +243,7 @@ static inline struct address_space *hugetlb_page_mapping_lock_write( static inline int huge_pmd_unshare(struct 
mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep) + unsigned long addr, pte_t *ptep) { return 0; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 95fd1c36c17f..96635a2874e3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4935,7 +4935,6 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; unsigned long old_end = old_addr + len; unsigned long last_addr_mask; - unsigned long old_addr_copy; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; bool shared_pmd = false; @@ -4963,14 +4962,10 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (huge_pte_none(huge_ptep_get(src_pte))) continue; - /* old_addr arg to huge_pmd_unshare() is a pointer and so the - * arg may be modified. Pass a copy instead to preserve the - * value in old_addr. - */ - old_addr_copy = old_addr; - - if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) { + if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { shared_pmd = true; + old_addr |= last_addr_mask; + new_addr |= last_addr_mask; continue; } @@ -5035,10 +5030,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, &address, ptep)) { + if (huge_pmd_unshare(mm, vma, address, ptep)) { spin_unlock(ptl); tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); force_flush = true; + address |= last_addr_mask; continue; } @@ -6327,7 +6323,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, continue; } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, &address, ptep)) { + if (huge_pmd_unshare(mm, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. 
Warn about it if it @@ -6337,6 +6333,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, pages++; spin_unlock(ptl); shared_pmd = true; + address |= last_addr_mask; continue; } pte = huge_ptep_get(ptep); @@ -6759,11 +6756,11 @@ out: * 0 the underlying pte page is not shared, or it is the last user */ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep) + unsigned long addr, pte_t *ptep) { - pgd_t *pgd = pgd_offset(mm, *addr); - p4d_t *p4d = p4d_offset(pgd, *addr); - pud_t *pud = pud_offset(p4d, *addr); + pgd_t *pgd = pgd_offset(mm, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); BUG_ON(page_count(virt_to_page(ptep)) == 0); @@ -6773,14 +6770,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, pud_clear(pud); put_page(virt_to_page(ptep)); mm_dec_nr_pmds(mm); - /* - * This update of passed address optimizes loops sequentially - * processing addresses in increments of huge page size (PMD_SIZE - * in this case). By clearing the pud, a PUD_SIZE area is unmapped. - * Update address to the 'last page' in the cleared area so that - * calling loop can move to first page past this area. - */ - *addr |= PUD_SIZE - PMD_SIZE; return 1; } @@ -6792,7 +6781,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, } int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long *addr, pte_t *ptep) + unsigned long addr, pte_t *ptep) { return 0; } @@ -6899,6 +6888,10 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) /* See description above. Architectures can provide their own version. 
*/ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { +#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE + if (huge_page_size(h) == PMD_SIZE) + return PUD_SIZE - PMD_SIZE; +#endif return 0UL; } @@ -7125,14 +7118,11 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) mmu_notifier_invalidate_range_start(&range); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { - unsigned long tmp = address; - ptep = huge_pte_offset(mm, address, sz); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - /* We don't want 'address' to be changed */ - huge_pmd_unshare(mm, vma, &tmp, ptep); + huge_pmd_unshare(mm, vma, address, ptep); spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); diff --git a/mm/rmap.c b/mm/rmap.c index 0532fd92ecb3..fb6b3b47f3e4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1559,7 +1559,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * do this outside rmap routines. */ VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { flush_tlb_range(vma, range.start, range.end); mmu_notifier_invalidate_range(mm, range.start, range.end); @@ -1920,7 +1920,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * do this outside rmap routines. 
*/ VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, &address, pvmw.pte)) { + if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { flush_tlb_range(vma, range.start, range.end); mmu_notifier_invalidate_range(mm, range.start, range.end); From bcd51a3c679d179cf526414f859c57d081fd37e7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 21 Jun 2022 16:56:20 -0700 Subject: [PATCH 159/282] hugetlb: lazy page table copies in fork() Lazy page table copying at fork time was introduced with commit d992895ba2b2 ("[PATCH] Lazy page table copies in fork()"). At the time, hugetlb was very new and did not support page faulting. As a result, it was excluded. When full page fault support was added for hugetlb, the exclusion was not removed. Simply remove the check that prevents lazy copying of hugetlb page tables at fork. Of course, like other mappings this only applies to shared mappings. Lazy page table copying at fork will be less advantageous for hugetlb mappings because: - There are fewer page table entries with hugetlb - hugetlb pmds can be shared instead of copied In any case, completely eliminating the copy at fork time should speed things up. 
Link: https://lkml.kernel.org/r/20220621235620.291305-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Muchun Song Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Anshuman Khandual Cc: Baolin Wang Cc: Catalin Marinas Cc: Christian Borntraeger Cc: James Houghton Cc: kernel test robot Cc: Michal Hocko Cc: Mina Almasry Cc: Naoya Horiguchi Cc: Paul Walmsley Cc: Peter Xu Cc: Rolf Eike Beer Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 2392d5db473a..7cd0d4c086db 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1253,7 +1253,7 @@ vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) if (userfaultfd_wp(dst_vma)) return true; - if (src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) + if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return true; if (src_vma->anon_vma) From bf75f200569dd05ac2112797f44548beb6b4be26 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:17 +0100 Subject: [PATCH 160/282] mm/page_alloc: add page->buddy_list and page->pcp_list Patch series "Drain remote per-cpu directly", v5. Some setups, notably NOHZ_FULL CPUs, may be running realtime or latency-sensitive applications that cannot tolerate interference due to per-cpu drain work queued by __drain_all_pages(). Introduce a new mechanism to remotely drain the per-cpu lists. It is made possible by remotely locking 'struct per_cpu_pages' new per-cpu spinlocks. This has two advantages, the time to drain is more predictable and other unrelated tasks are not interrupted. This series has the same intent as Nicolas' series "mm/page_alloc: Remote per-cpu lists drain support" -- avoid interference of a high priority task due to a workqueue item draining per-cpu page lists. 
While many workloads can tolerate a brief interruption, it may cause a real-time task running on a NOHZ_FULL CPU to miss a deadline and at minimum, the draining is non-deterministic. Currently an IRQ-safe local_lock protects the page allocator per-cpu lists. The local_lock on its own prevents migration and the IRQ disabling protects from corruption due to an interrupt arriving while a page allocation is in progress. This series adjusts the locking. A spinlock is added to struct per_cpu_pages to protect the list contents while local_lock_irq is ultimately replaced by just the spinlock in the final patch. This allows a remote CPU to safely drain another CPU's per-cpu lists. Follow-on work should allow the spin_lock_irqsave to be converted to spin_lock to avoid IRQs being disabled/enabled in most cases. The follow-on patch will be one kernel release later as it is relatively high risk and it'll make bisections more clear if there are any problems. Patch 1 is a cosmetic patch to clarify when page->lru is storing buddy pages and when it is storing per-cpu pages. Patch 2 shrinks per_cpu_pages to make room for a spin lock. Strictly speaking this is not necessary but it avoids per_cpu_pages consuming another cache line. Patch 3 is a preparation patch to avoid code duplication. Patch 4 is a minor correction. Patch 5 uses a spin_lock to protect the per_cpu_pages contents while still relying on local_lock to prevent migration, stabilise the pcp lookup and prevent IRQ reentrancy. Patch 6 remote drains per-cpu pages directly instead of using a workqueue. Patch 7 uses a normal spinlock instead of local_lock for remote draining. This patch (of 7): The page allocator uses page->lru for storing pages on either buddy or PCP lists. Create page->buddy_list and page->pcp_list as a union with page->lru. This is simply to clarify what type of list a page is on in the page allocator. No functional change intended.
[minchan@kernel.org: fix page lru fields in macros] Link: https://lkml.kernel.org/r/20220624125423.6126-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Minchan Kim Acked-by: Minchan Kim Reviewed-by: Nicolas Saenz Julienne Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Marcelo Tosatti Cc: Michal Hocko Cc: Hugh Dickins Cc: Marek Szyprowski Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++++ mm/page_alloc.c | 24 ++++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b961a29bf26..cf97f3884fda 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -87,6 +87,7 @@ struct page { */ union { struct list_head lru; + /* Or, for the Unevictable "LRU list" slot */ struct { /* Always even, to negate PageTail */ @@ -94,6 +95,10 @@ struct page { /* Count page's or folio's mlocks */ unsigned int mlock_count; }; + + /* Or, free page */ + struct list_head buddy_list; + struct list_head pcp_list; }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c9c02b23f02f..78ba5ba66586 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -793,7 +793,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, return false; __SetPageGuard(page); - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); /* Guard pages are not available for any usage */ __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -936,7 +936,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_add(&page->lru, &area->free_list[migratetype]); + list_add(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -946,7 +946,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone, { struct free_area *area = 
&zone->free_area[order]; - list_add_tail(&page->lru, &area->free_list[migratetype]); + list_add_tail(&page->buddy_list, &area->free_list[migratetype]); area->nr_free++; } @@ -960,7 +960,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone, { struct free_area *area = &zone->free_area[order]; - list_move_tail(&page->lru, &area->free_list[migratetype]); + list_move_tail(&page->buddy_list, &area->free_list[migratetype]); } static inline void del_page_from_free_list(struct page *page, struct zone *zone, @@ -970,7 +970,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone, if (page_reported(page)) __ClearPageReported(page); - list_del(&page->lru); + list_del(&page->buddy_list); __ClearPageBuddy(page); set_page_private(page, 0); zone->free_area[order].nr_free--; @@ -1508,11 +1508,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; - page = list_last_entry(list, struct page, lru); + page = list_last_entry(list, struct page, pcp_list); mt = get_pcppage_migratetype(page); /* must delete to avoid corrupting pcp list */ - list_del(&page->lru); + list_del(&page->pcp_list); count -= nr_pages; pcp->count -= nr_pages; @@ -3072,7 +3072,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * for IO devices that can merge IO requests if the physical * pages are ordered properly. 
*/ - list_add_tail(&page->lru, list); + list_add_tail(&page->pcp_list, list); allocated++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, @@ -3322,7 +3322,7 @@ void mark_free_pages(struct zone *zone) for_each_migratetype_order(order, t) { list_for_each_entry(page, - &zone->free_area[order].free_list[t], lru) { + &zone->free_area[order].free_list[t], buddy_list) { unsigned long i; pfn = page_to_pfn(page); @@ -3411,7 +3411,7 @@ static void free_unref_page_commit(struct page *page, int migratetype, __count_vm_event(PGFREE); pcp = this_cpu_ptr(zone->per_cpu_pageset); pindex = order_to_pindex(migratetype, order); - list_add(&page->lru, &pcp->lists[pindex]); + list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; /* @@ -3674,8 +3674,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, return NULL; } - page = list_first_entry(list, struct page, lru); - list_del(&page->lru); + page = list_first_entry(list, struct page, pcp_list); + list_del(&page->pcp_list); pcp->count -= 1 << order; } while (check_new_pcp(page, order)); From 5d0a661d808fc8ddc26940b1a12b82ae356f3ae2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:18 +0100 Subject: [PATCH 161/282] mm/page_alloc: use only one PCP list for THP-sized allocations The per_cpu_pages is cache-aligned on a standard x86-64 distribution configuration but a later patch will add a new field which would push the structure into the next cache line. Use only one list to store THP-sized pages on the per-cpu list. This assumes that the vast majority of THP-sized allocations are GFP_MOVABLE but even if it was another type, it would not contribute to serious fragmentation that potentially causes a later THP allocation failure. Align per_cpu_pages on the cacheline boundary to ensure there is no false cache sharing. 
After this patch, the structure sizing is; struct per_cpu_pages { int count; /* 0 4 */ int high; /* 4 4 */ int batch; /* 8 4 */ short int free_factor; /* 12 2 */ short int expire; /* 14 2 */ struct list_head lists[13]; /* 16 208 */ /* size: 256, cachelines: 4, members: 6 */ /* padding: 32 */ } __attribute__((__aligned__(64))); Link: https://lkml.kernel.org/r/20220624125423.6126-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Minchan Kim Acked-by: Minchan Kim Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Nicolas Saenz Julienne Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 11 +++++++---- mm/page_alloc.c | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5da1135e6755..041136b5628a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -355,15 +355,18 @@ enum zone_watermarks { }; /* - * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional - * for pageblock size for THP if configured. + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list + * for THP which will usually be GFP_MOVABLE. Even if it is another type, + * it should not contribute to serious fragmentation causing THP allocation + * failures. 
*/ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define NR_PCP_THP 1 #else #define NR_PCP_THP 0 #endif -#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP)) +#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) +#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) /* * Shift to encode migratetype and order in the same integer, with order @@ -389,7 +392,7 @@ struct per_cpu_pages { /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; -}; +} ____cacheline_aligned_in_smp; struct per_cpu_zonestat { #ifdef CONFIG_SMP diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 78ba5ba66586..b5c340d2cb43 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -653,7 +653,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != pageblock_order); - base = PAGE_ALLOC_COSTLY_ORDER + 1; + return NR_LOWORDER_PCP_LISTS; } #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); @@ -667,7 +667,7 @@ static inline int pindex_to_order(unsigned int pindex) int order = pindex / MIGRATE_PCPTYPES; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (pindex == NR_LOWORDER_PCP_LISTS) order = pageblock_order; #else VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); From 589d9973c1d2c3344a94a57441071340b0c71097 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:19 +0100 Subject: [PATCH 162/282] mm/page_alloc: split out buddy removal code from rmqueue into separate helper This is a preparation patch to allow the buddy removal code to be reused in a later patch. No functional change.
Link: https://lkml.kernel.org/r/20220624125423.6126-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Minchan Kim Acked-by: Minchan Kim Reviewed-by: Nicolas Saenz Julienne Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/page_alloc.c | 81 ++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5c340d2cb43..a384c3887ee7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3641,6 +3641,43 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, #endif } +static __always_inline +struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + unsigned int order, unsigned int alloc_flags, + int migratetype) +{ + struct page *page; + unsigned long flags; + + do { + page = NULL; + spin_lock_irqsave(&zone->lock, flags); + /* + * order-0 request can reach here when the pcplist is skipped + * due to non-CMA allocation context. HIGHATOMIC area is + * reserved for high-order atomic allocation, so order-0 + * request should skip it. 
+ */ + if (order > 0 && alloc_flags & ALLOC_HARDER) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { + page = __rmqueue(zone, order, migratetype, alloc_flags); + if (!page) { + spin_unlock_irqrestore(&zone->lock, flags); + return NULL; + } + } + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); + spin_unlock_irqrestore(&zone->lock, flags); + } while (check_new_pages(page, order)); + + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone, 1); + + return page; +} + /* Remove page from the per-cpu list, caller must protect the list */ static inline struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, @@ -3721,9 +3758,14 @@ struct page *rmqueue(struct zone *preferred_zone, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { - unsigned long flags; struct page *page; + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + if (likely(pcp_allowed_order(order))) { /* * MIGRATE_MOVABLE pcplist could have the pages on CMA area and @@ -3737,35 +3779,10 @@ struct page *rmqueue(struct zone *preferred_zone, } } - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - - do { - page = NULL; - spin_lock_irqsave(&zone->lock, flags); - /* - * order-0 request can reach here when the pcplist is skipped - * due to non-CMA allocation context. HIGHATOMIC area is - * reserved for high-order atomic allocation, so order-0 - * request should skip it. 
- */ - if (order > 0 && alloc_flags & ALLOC_HARDER) - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (!page) { - page = __rmqueue(zone, order, migratetype, alloc_flags); - if (!page) - goto failed; - } - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - spin_unlock_irqrestore(&zone->lock, flags); - } while (check_new_pages(page, order)); - - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone, 1); + page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, + migratetype); + if (unlikely(!page)) + return NULL; out: /* Separate test+clear to avoid unnecessary atomics */ @@ -3776,10 +3793,6 @@ out: VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; - -failed: - spin_unlock_irqrestore(&zone->lock, flags); - return NULL; } #ifdef CONFIG_FAIL_PAGE_ALLOC From e2a66c21b774a4e8d0079089fafdc30a31414d40 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:20 +0100 Subject: [PATCH 163/282] mm/page_alloc: remove mistaken page == NULL check in rmqueue If a page allocation fails, the ZONE_BOOSTED_WATERMARK should be tested, cleared and kswapd woken whether the allocation attempt was via the PCP or directly via the buddy list. Remove the page == NULL check so the ZONE_BOOSTED_WATERMARK bit is checked unconditionally. As it is unlikely that ZONE_BOOSTED_WATERMARK is set, mark the branch accordingly.
Link: https://lkml.kernel.org/r/20220624125423.6126-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Minchan Kim Cc: Nicolas Saenz Julienne Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a384c3887ee7..026c9437456c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3781,12 +3781,10 @@ struct page *rmqueue(struct zone *preferred_zone, page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags, migratetype); - if (unlikely(!page)) - return NULL; out: /* Separate test+clear to avoid unnecessary atomics */ - if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { + if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) { clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); wakeup_kswapd(zone, 0, 0, zone_idx(zone)); } From 4b23a68f953628eb4e4b7fe1294ebf93d4b8ceee Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:21 +0100 Subject: [PATCH 164/282] mm/page_alloc: protect PCP lists with a spinlock Currently the PCP lists are protected by using local_lock_irqsave to prevent migration and IRQ reentrancy but this is inconvenient. Remote draining of the lists is impossible and a workqueue is required and every task allocation/free must disable then enable interrupts which is expensive. As preparation for dealing with both of those problems, protect the lists with a spinlock. The IRQ-unsafe version of the lock is used because IRQs are already disabled by local_lock_irqsave. spin_trylock is used in combination with local_lock_irqsave() but later will be replaced with a spin_trylock_irqsave when the local_lock is removed. The per_cpu_pages still fits within the same number of cache lines after this patch relative to before the series. 
struct per_cpu_pages { spinlock_t lock; /* 0 4 */ int count; /* 4 4 */ int high; /* 8 4 */ int batch; /* 12 4 */ short int free_factor; /* 16 2 */ short int expire; /* 18 2 */ /* XXX 4 bytes hole, try to pack */ struct list_head lists[13]; /* 24 208 */ /* size: 256, cachelines: 4, members: 7 */ /* sum members: 228, holes: 1, sum holes: 4 */ /* padding: 24 */ } __attribute__((__aligned__(64))); There is overhead in the fast path due to acquiring the spinlock even though the spinlock is per-cpu and uncontended in the common case. Page Fault Test (PFT) reported the following results on a 1-socket machine. 5.19.0-rc3 5.19.0-rc3 vanilla mm-pcpspinirq-v5r16 Hmean faults/sec-1 869275.7381 ( 0.00%) 874597.5167 * 0.61%* Hmean faults/sec-3 2370266.6681 ( 0.00%) 2379802.0362 * 0.40%* Hmean faults/sec-5 2701099.7019 ( 0.00%) 2664889.7003 * -1.34%* Hmean faults/sec-7 3517170.9157 ( 0.00%) 3491122.8242 * -0.74%* Hmean faults/sec-8 3965729.6187 ( 0.00%) 3939727.0243 * -0.66%* There is a small hit in the number of faults per second but given that the results are more stable, it's borderline noise.
[akpm@linux-foundation.org: add missing local_unlock_irqrestore() on contention path] Link: https://lkml.kernel.org/r/20220624125423.6126-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Yu Zhao Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Minchan Kim Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 119 +++++++++++++++++++++++++++++++++-------- 2 files changed, 99 insertions(+), 21 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 041136b5628a..578247a341b2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -382,6 +382,7 @@ enum zone_watermarks { /* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { + spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 026c9437456c..a08ec4ac7ef2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -133,6 +133,20 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) = { .lock = INIT_LOCAL_LOCK(lock), }; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +/* + * On SMP, spin_trylock is sufficient protection. + * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP. + */ +#define pcp_trylock_prepare(flags) do { } while (0) +#define pcp_trylock_finish(flag) do { } while (0) +#else + +/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. 
*/ +#define pcp_trylock_prepare(flags) local_irq_save(flags) +#define pcp_trylock_finish(flags) local_irq_restore(flags) +#endif + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -3101,15 +3115,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { - unsigned long flags; int to_drain, batch; - local_lock_irqsave(&pagesets.lock, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) + if (to_drain > 0) { + unsigned long flags; + + /* + * free_pcppages_bulk expects IRQs disabled for zone->lock + * so even though pcp->lock is not intended to be IRQ-safe, + * it's needed in this context. + */ + spin_lock_irqsave(&pcp->lock, flags); free_pcppages_bulk(zone, to_drain, pcp, 0); - local_unlock_irqrestore(&pagesets.lock, flags); + spin_unlock_irqrestore(&pcp->lock, flags); + } } #endif @@ -3122,16 +3143,17 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { - unsigned long flags; struct per_cpu_pages *pcp; - local_lock_irqsave(&pagesets.lock, flags); - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - if (pcp->count) - free_pcppages_bulk(zone, pcp->count, pcp, 0); + if (pcp->count) { + unsigned long flags; - local_unlock_irqrestore(&pagesets.lock, flags); + /* See drain_zone_pages on why this is disabling IRQs */ + spin_lock_irqsave(&pcp->lock, flags); + free_pcppages_bulk(zone, pcp->count, pcp, 0); + spin_unlock_irqrestore(&pcp->lock, flags); + } } /* @@ -3399,17 +3421,15 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return min(READ_ONCE(pcp->batch) << 2, high); } -static void free_unref_page_commit(struct page *page, int migratetype, +static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, + struct page *page, int migratetype, unsigned int order) { - 
struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; int high; int pindex; bool free_high; __count_vm_event(PGFREE); - pcp = this_cpu_ptr(zone->per_cpu_pageset); pindex = order_to_pindex(migratetype, order); list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; @@ -3436,6 +3456,9 @@ static void free_unref_page_commit(struct page *page, int migratetype, void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; + unsigned long __maybe_unused UP_flags; + struct per_cpu_pages *pcp; + struct zone *zone; unsigned long pfn = page_to_pfn(page); int migratetype; @@ -3459,7 +3482,16 @@ void free_unref_page(struct page *page, unsigned int order) } local_lock_irqsave(&pagesets.lock, flags); - free_unref_page_commit(page, migratetype, order); + zone = page_zone(page); + pcp_trylock_prepare(UP_flags); + pcp = this_cpu_ptr(zone->per_cpu_pageset); + if (spin_trylock(&pcp->lock)) { + free_unref_page_commit(zone, pcp, page, migratetype, order); + spin_unlock(&pcp->lock); + } else { + free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); + } + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3469,6 +3501,8 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { struct page *page, *next; + struct per_cpu_pages *pcp = NULL; + struct zone *locked_zone = NULL; unsigned long flags; int batch_count = 0; int migratetype; @@ -3495,6 +3529,17 @@ void free_unref_page_list(struct list_head *list) local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { + struct zone *zone = page_zone(page); + + /* Different zone, different pcp lock. */ + if (zone != locked_zone) { + if (pcp) + spin_unlock(&pcp->lock); + locked_zone = zone; + pcp = this_cpu_ptr(zone->per_cpu_pageset); + spin_lock(&pcp->lock); + } + /* * Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. 
@@ -3504,18 +3549,24 @@ void free_unref_page_list(struct list_head *list) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); - free_unref_page_commit(page, migratetype, 0); + free_unref_page_commit(zone, pcp, page, migratetype, 0); /* * Guard against excessive IRQ disabled times when we get * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { + spin_unlock(&pcp->lock); local_unlock_irqrestore(&pagesets.lock, flags); batch_count = 0; local_lock_irqsave(&pagesets.lock, flags); + pcp = this_cpu_ptr(locked_zone->per_cpu_pageset); + spin_lock(&pcp->lock); } } + + if (pcp) + spin_unlock(&pcp->lock); local_unlock_irqrestore(&pagesets.lock, flags); } @@ -3729,18 +3780,32 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct list_head *list; struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; local_lock_irqsave(&pagesets.lock, flags); + /* + * spin_trylock may fail due to a parallel drain. In the future, the + * trylock will also protect against IRQ reentrancy. + */ + pcp = this_cpu_ptr(zone->per_cpu_pageset); + pcp_trylock_prepare(UP_flags); + if (!spin_trylock(&pcp->lock)) { + pcp_trylock_finish(UP_flags); + local_unlock_irqrestore(&pagesets.lock, flags); + return NULL; + } + /* * On allocation, reduce the number of pages that are batch freed. * See nr_pcp_free() where free_factor is increased for subsequent * frees. 
*/ - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp->free_factor >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); + spin_unlock(&pcp->lock); + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); @@ -3775,7 +3840,8 @@ struct page *rmqueue(struct zone *preferred_zone, migratetype != MIGRATE_MOVABLE) { page = rmqueue_pcplist(preferred_zone, zone, order, gfp_flags, migratetype, alloc_flags); - goto out; + if (likely(page)) + goto out; } } @@ -5260,6 +5326,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, { struct page *page; unsigned long flags; + unsigned long __maybe_unused UP_flags; struct zone *zone; struct zoneref *z; struct per_cpu_pages *pcp; @@ -5340,11 +5407,15 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!zone)) goto failed; - /* Attempt the batch allocation */ + /* Is a parallel drain in progress? 
*/ local_lock_irqsave(&pagesets.lock, flags); + pcp_trylock_prepare(UP_flags); pcp = this_cpu_ptr(zone->per_cpu_pageset); - pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; + if (!spin_trylock(&pcp->lock)) + goto failed_irq; + /* Attempt the batch allocation */ + pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)]; while (nr_populated < nr_pages) { /* Skip existing pages */ @@ -5357,8 +5428,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, pcp, pcp_list); if (unlikely(!page)) { /* Try and allocate at least one page */ - if (!nr_account) + if (!nr_account) { + spin_unlock(&pcp->lock); goto failed_irq; + } break; } nr_account++; @@ -5371,6 +5444,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } + spin_unlock(&pcp->lock); + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); @@ -5380,6 +5455,7 @@ out: return nr_populated; failed_irq: + pcp_trylock_finish(UP_flags); local_unlock_irqrestore(&pagesets.lock, flags); failed: @@ -7020,6 +7096,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta memset(pcp, 0, sizeof(*pcp)); memset(pzstats, 0, sizeof(*pzstats)); + spin_lock_init(&pcp->lock); for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) INIT_LIST_HEAD(&pcp->lists[pindex]); From 443c2accd1b6679a1320167f8f56eed6536b806e Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 24 Jun 2022 13:54:22 +0100 Subject: [PATCH 165/282] mm/page_alloc: remotely drain per-cpu lists Some setups, notably NOHZ_FULL CPUs, are too busy to handle the per-cpu drain work queued by __drain_all_pages(). So introduce a new mechanism to remotely drain the per-cpu lists. It is made possible by remotely locking 'struct per_cpu_pages' new per-cpu spinlocks. A benefit of this new scheme is that drain operations are now migration safe. There was no observed performance degradation vs. the previous scheme. 
Both netperf and hackbench were run in parallel to triggering the __drain_all_pages(NULL, true) code path around ~100 times per second. The new scheme performs a bit better (~5%), although the important point here is there are no performance regressions vs. the previous mechanism. Per-cpu lists draining happens only in slow paths. Minchan Kim tested an earlier version and reported; My workload is not NOHZ CPUs but run apps under heavy memory pressure so they goes to direct reclaim and be stuck on drain_all_pages until work on workqueue run. unit: nanosecond max(dur) avg(dur) count(dur) 166713013 487511.77786438033 1283 From traces, system encountered the drain_all_pages 1283 times and worst case was 166ms and avg was 487us. The other problem was alloc_contig_range in CMA. The PCP draining takes several hundred millisecond sometimes though there is no memory pressure or a few of pages to be migrated out but CPU were fully booked. Your patch perfectly removed those wasted time. Link: https://lkml.kernel.org/r/20220624125423.6126-7-mgorman@techsingularity.net Signed-off-by: Nicolas Saenz Julienne Signed-off-by: Mel Gorman Tested-by: Yu Zhao Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/page_alloc.c | 58 ++++--------------------------------------------- 1 file changed, 4 insertions(+), 54 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a08ec4ac7ef2..6baed6ffeec6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -165,13 +165,7 @@ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); #endif -/* work_structs for global per-cpu drains */ -struct pcpu_drain { - struct zone *zone; - struct work_struct work; -}; static DEFINE_MUTEX(pcpu_drain_mutex); -static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; @@ 
-3109,9 +3103,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * Called from the vmstat counter updater to drain pagesets of this * currently executing processor on remote nodes after they have * expired. - * - * Note that this function must be called with the thread pinned to - * a single processor. */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { @@ -3136,10 +3127,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) /* * Drain pcplists of the indicated processor and zone. - * - * The processor must either be the current processor and the - * thread pinned to the current processor or a processor that - * is not online. */ static void drain_pages_zone(unsigned int cpu, struct zone *zone) { @@ -3158,10 +3145,6 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) /* * Drain pcplists of all zones on the indicated processor. - * - * The processor must either be the current processor and the - * thread pinned to the current processor or a processor that - * is not online. */ static void drain_pages(unsigned int cpu) { @@ -3174,9 +3157,6 @@ static void drain_pages(unsigned int cpu) /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. - * - * The CPU has to be pinned. When zone parameter is non-NULL, spill just - * the single zone's pages. */ void drain_local_pages(struct zone *zone) { @@ -3188,24 +3168,6 @@ void drain_local_pages(struct zone *zone) drain_pages(cpu); } -static void drain_local_pages_wq(struct work_struct *work) -{ - struct pcpu_drain *drain; - - drain = container_of(work, struct pcpu_drain, work); - - /* - * drain_all_pages doesn't use proper cpu hotplug protection so - * we can race with cpu offline when the WQ can move this from - * a cpu pinned worker to an unbound one. We can operate on a different - * cpu which is alright but we also have to make sure to not move to - * a different one. 
- */ - migrate_disable(); - drain_local_pages(drain->zone); - migrate_enable(); -} - /* * The implementation of drain_all_pages(), exposing an extra parameter to * drain on all cpus. @@ -3226,13 +3188,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) */ static cpumask_t cpus_with_pcps; - /* - * Make sure nobody triggers this path before mm_percpu_wq is fully - * initialized. - */ - if (WARN_ON_ONCE(!mm_percpu_wq)) - return; - /* * Do not drain if one is already in progress unless it's specific to * a zone. Such callers are primarily CMA and memory hotplug and need @@ -3282,14 +3237,11 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) } for_each_cpu(cpu, &cpus_with_pcps) { - struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu); - - drain->zone = zone; - INIT_WORK(&drain->work, drain_local_pages_wq); - queue_work_on(cpu, mm_percpu_wq, &drain->work); + if (zone) + drain_pages_zone(cpu, zone); + else + drain_pages(cpu); } - for_each_cpu(cpu, &cpus_with_pcps) - flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work); mutex_unlock(&pcpu_drain_mutex); } @@ -3298,8 +3250,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus) * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. - * - * Note that this can be extremely slow as the draining happens in a workqueue. */ void drain_all_pages(struct zone *zone) { From 01b44456a7aa7c3b24fa9db7d1714b208b8ef3d8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 24 Jun 2022 13:54:23 +0100 Subject: [PATCH 166/282] mm/page_alloc: replace local_lock with normal spinlock struct per_cpu_pages is no longer strictly local as PCP lists can be drained remotely using a lock for protection. 
While the use of local_lock works, it goes against the intent of local_lock which is for "pure CPU local concurrency control mechanisms and not suited for inter-CPU concurrency control" (Documentation/locking/locktypes.rst). local_lock protects against migration between the point where the percpu pointer is accessed and the point where pcp->lock is acquired. The lock acquisition is a preemption point so in the worst case, a task could migrate to another NUMA node and accidentally allocate remote memory. The main requirement is to pin the task to a CPU in a way that is suitable for both PREEMPT_RT and !PREEMPT_RT. Replace local_lock with helpers that pin a task to a CPU, look up the per-cpu structure and acquire the embedded lock. It's similar to local_lock without breaking the intent behind the API. It is not a complete API as only the parts needed for PCP-alloc are implemented but in theory, the generic helpers could be promoted to a general API if there was demand from users that need an embedded lock within a per-cpu struct, with a guarantee that the locked per-cpu structure matches the running CPU, and that cannot use get_cpu_var due to RT concerns. PCP requires these semantics to avoid accidentally allocating remote memory. 
[mgorman@techsingularity.net: use pcp_spin_trylock_irqsave instead of pcpu_spin_trylock_irqsave] Link: https://lkml.kernel.org/r/20220627084645.GA27531@techsingularity.net Link: https://lkml.kernel.org/r/20220624125423.6126-8-mgorman@techsingularity.net Signed-off-by: Mel Gorman Tested-by: Yu Zhao Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Marek Szyprowski Cc: Michal Hocko Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/page_alloc.c | 140 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 95 insertions(+), 45 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6baed6ffeec6..215b26664ad7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -126,13 +126,6 @@ typedef int __bitwise fpi_t; static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) -struct pagesets { - local_lock_t lock; -}; -static DEFINE_PER_CPU(struct pagesets, pagesets) = { - .lock = INIT_LOCAL_LOCK(lock), -}; - #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) /* * On SMP, spin_trylock is sufficient protection. @@ -147,6 +140,83 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) = { #define pcp_trylock_finish(flags) local_irq_restore(flags) #endif +/* + * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid + * a migration causing the wrong PCP to be locked and remote memory being + * potentially allocated, pin the task to the CPU for the lookup+lock. + * preempt_disable is used on !RT because it is faster than migrate_disable. + * migrate_disable is used on RT because otherwise RT spinlock usage is + * interfered with and a high priority task cannot preempt the allocator. 
+ */ +#ifndef CONFIG_PREEMPT_RT +#define pcpu_task_pin() preempt_disable() +#define pcpu_task_unpin() preempt_enable() +#else +#define pcpu_task_pin() migrate_disable() +#define pcpu_task_unpin() migrate_enable() +#endif + +/* + * Generic helper to look up and lock a per-cpu variable with an embedded spinlock. + * Return value should be used with equivalent unlock helper. + */ +#define pcpu_spin_lock(type, member, ptr) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + spin_lock(&_ret->member); \ + _ret; \ +}) + +#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + spin_lock_irqsave(&_ret->member, flags); \ + _ret; \ +}) + +#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \ +({ \ + type *_ret; \ + pcpu_task_pin(); \ + _ret = this_cpu_ptr(ptr); \ + if (!spin_trylock_irqsave(&_ret->member, flags)) { \ + pcpu_task_unpin(); \ + _ret = NULL; \ + } \ + _ret; \ +}) + +#define pcpu_spin_unlock(member, ptr) \ +({ \ + spin_unlock(&ptr->member); \ + pcpu_task_unpin(); \ +}) + +#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \ +({ \ + spin_unlock_irqrestore(&ptr->member, flags); \ + pcpu_task_unpin(); \ +}) + +/* struct per_cpu_pages specific helpers. */ +#define pcp_spin_lock(ptr) \ + pcpu_spin_lock(struct per_cpu_pages, lock, ptr) + +#define pcp_spin_lock_irqsave(ptr, flags) \ + pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags) + +#define pcp_spin_trylock_irqsave(ptr, flags) \ + pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags) + +#define pcp_spin_unlock(ptr) \ + pcpu_spin_unlock(lock, ptr) + +#define pcp_spin_unlock_irqrestore(ptr, flags) \ + pcpu_spin_unlock_irqrestore(lock, ptr, flags) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -1485,10 +1555,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Ensure requested pindex is drained first. 
*/ pindex = pindex - 1; - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ + /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); @@ -3056,10 +3123,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, { int i, allocated = 0; - /* - * local_lock_irq held so equivalent to spin_lock_irqsave for - * both PREEMPT_RT and non-PREEMPT_RT configurations. - */ + /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, @@ -3431,18 +3495,16 @@ void free_unref_page(struct page *page, unsigned int order) migratetype = MIGRATE_MOVABLE; } - local_lock_irqsave(&pagesets.lock, flags); zone = page_zone(page); pcp_trylock_prepare(UP_flags); - pcp = this_cpu_ptr(zone->per_cpu_pageset); - if (spin_trylock(&pcp->lock)) { + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (pcp) { free_unref_page_commit(zone, pcp, page, migratetype, order); - spin_unlock(&pcp->lock); + pcp_spin_unlock_irqrestore(pcp, flags); } else { free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); } pcp_trylock_finish(UP_flags); - local_unlock_irqrestore(&pagesets.lock, flags); } /* @@ -3477,17 +3539,16 @@ void free_unref_page_list(struct list_head *list) } } - local_lock_irqsave(&pagesets.lock, flags); list_for_each_entry_safe(page, next, list, lru) { struct zone *zone = page_zone(page); /* Different zone, different pcp lock. */ if (zone != locked_zone) { if (pcp) - spin_unlock(&pcp->lock); + pcp_spin_unlock_irqrestore(pcp, flags); + locked_zone = zone; - pcp = this_cpu_ptr(zone->per_cpu_pageset); - spin_lock(&pcp->lock); + pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); } /* @@ -3506,18 +3567,14 @@ void free_unref_page_list(struct list_head *list) * a large list of pages to free. 
*/ if (++batch_count == SWAP_CLUSTER_MAX) { - spin_unlock(&pcp->lock); - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); batch_count = 0; - local_lock_irqsave(&pagesets.lock, flags); - pcp = this_cpu_ptr(locked_zone->per_cpu_pageset); - spin_lock(&pcp->lock); + pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); } } if (pcp) - spin_unlock(&pcp->lock); - local_unlock_irqrestore(&pagesets.lock, flags); + pcp_spin_unlock_irqrestore(pcp, flags); } /* @@ -3732,17 +3789,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, unsigned long flags; unsigned long __maybe_unused UP_flags; - local_lock_irqsave(&pagesets.lock, flags); - /* * spin_trylock may fail due to a parallel drain. In the future, the * trylock will also protect against IRQ reentrancy. */ - pcp = this_cpu_ptr(zone->per_cpu_pageset); pcp_trylock_prepare(UP_flags); - if (!spin_trylock(&pcp->lock)) { + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (!pcp) { pcp_trylock_finish(UP_flags); - local_unlock_irqrestore(&pagesets.lock, flags); return NULL; } @@ -3754,9 +3808,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp->free_factor >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - spin_unlock(&pcp->lock); + pcp_spin_unlock_irqrestore(pcp, flags); pcp_trylock_finish(UP_flags); - local_unlock_irqrestore(&pagesets.lock, flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone, 1); @@ -5358,10 +5411,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto failed; /* Is a parallel drain in progress? 
*/ - local_lock_irqsave(&pagesets.lock, flags); pcp_trylock_prepare(UP_flags); - pcp = this_cpu_ptr(zone->per_cpu_pageset); - if (!spin_trylock(&pcp->lock)) + pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + if (!pcp) goto failed_irq; /* Attempt the batch allocation */ @@ -5379,7 +5431,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) { - spin_unlock(&pcp->lock); + pcp_spin_unlock_irqrestore(pcp, flags); goto failed_irq; } break; @@ -5394,9 +5446,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - spin_unlock(&pcp->lock); + pcp_spin_unlock_irqrestore(pcp, flags); pcp_trylock_finish(UP_flags); - local_unlock_irqrestore(&pagesets.lock, flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); zone_statistics(ac.preferred_zoneref->zone, zone, nr_account); @@ -5406,7 +5457,6 @@ out: failed_irq: pcp_trylock_finish(UP_flags); - local_unlock_irqrestore(&pagesets.lock, flags); failed: page = __alloc_pages(gfp, 0, preferred_nid, nodemask); From 9c94bef9c91288cc51e861a7acaa52ebb48c0121 Mon Sep 17 00:00:00 2001 From: Xiang Yang Date: Sat, 25 Jun 2022 14:18:44 +0800 Subject: [PATCH 167/282] mm/memcontrol.c: replace cgroup_memory_nokmem with mem_cgroup_kmem_disabled() mem_cgroup_kmem_disabled() checks whether the kmem accounting is off. Therefore, replace cgroup_memory_nokmem with mem_cgroup_kmem_disabled(), which is the same work in percpu.c and slab_common.c. 
Link: https://lkml.kernel.org/r/20220625061844.226764-1-xiangyang3@huawei.com Signed-off-by: Xiang Yang Reviewed-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Souptick Joarder (HPE) Cc: Johannes Weiner Cc: Michal Hocko Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b1868784f895..c5bfb3eacd08 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3653,7 +3653,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) { struct obj_cgroup *objcg; - if (cgroup_memory_nokmem) + if (mem_cgroup_kmem_disabled()) return 0; if (unlikely(mem_cgroup_is_root(memcg))) @@ -3677,7 +3677,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct mem_cgroup *parent; - if (cgroup_memory_nokmem) + if (mem_cgroup_kmem_disabled()) return; if (unlikely(mem_cgroup_is_root(memcg))) From b3c56f8f2064ae1b73e099d23a82ae1642e435a1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 30 Jun 2022 10:41:20 +0200 Subject: [PATCH 168/282] lib/test_free_pages.c: pass a pointer to virt_to_page() In a recent change to the Arm architecture with the end goal of removing highmem we need to convert virt_to_phys() and virt_to_pfn() to static inline functions. This will make them strongly typed. However since virt_to_* is always implemented as macros they have become polymorphic and accept both (void *) and e.g. unsigned long as arguments. Other functions such as virt_to_page() simply wrap virt_to_pfn() and get affected indirectly. To be able to proceed, patch mm to use (void *) as argument to affected functions in all instances. This patch (of 5): A pointer into virtual memory is represented by a (void *) not an u32, so the compiler warns: lib/test_free_pages.c:20:50: warning: passing argument 1 of 'virt_to_pfn' makes pointer from integer without a cast [-Wint-conversion] Fix this with an explicit cast. 
Link: https://lkml.kernel.org/r/20220630084124.691207-1-linus.walleij@linaro.org Link: https://lkml.kernel.org/r/20220630084124.691207-2-linus.walleij@linaro.org Signed-off-by: Linus Walleij Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jason Gunthorpe Cc: Marco Elver Signed-off-by: Andrew Morton --- lib/test_free_pages.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_free_pages.c b/lib/test_free_pages.c index 25ae1ac2624a..9ebf6f5549f3 100644 --- a/lib/test_free_pages.c +++ b/lib/test_free_pages.c @@ -17,7 +17,7 @@ static void test_free_pages(gfp_t gfp) for (i = 0; i < 1000 * 1000; i++) { unsigned long addr = __get_free_pages(gfp, 3); - struct page *page = virt_to_page(addr); + struct page *page = virt_to_page((void *)addr); /* Simulate page cache getting a speculative reference */ get_page(page); From 259ecb34e2cd73811e250fc9c8d1f07df7bb2d14 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 30 Jun 2022 10:41:21 +0200 Subject: [PATCH 169/282] mm/highmem: pass a pointer to virt_to_page() Functions that work on a pointer to virtual memory such as virt_to_pfn() and users of that function such as virt_to_page() are supposed to pass a pointer to virtual memory, ideally a (void *) or other pointer. However since many architectures implement virt_to_pfn() as a macro, this function becomes polymorphic and accepts both a (unsigned long) and a (void *). If we instead implement a proper virt_to_pfn(void *addr) function the following happens (occurred on arch/arm): mm/highmem.c:153:29: warning: passing argument 1 of 'virt_to_pfn' makes pointer from integer without a cast [-Wint-conversion] We already have a proper void * pointer in the scope of this function named "vaddr" so pass that instead. 
Link: https://lkml.kernel.org/r/20220630084124.691207-3-linus.walleij@linaro.org Signed-off-by: Linus Walleij Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jason Gunthorpe Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/highmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/highmem.c b/mm/highmem.c index 1a692997fac4..e92a7ceb30e8 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -150,7 +150,7 @@ struct page *__kmap_to_page(void *vaddr) return pte_page(pkmap_page_table[i]); } - return virt_to_page(addr); + return virt_to_page(vaddr); } EXPORT_SYMBOL(__kmap_to_page); From 9e7ee421ac1f8d7fe350d2dee87e31919e9cba84 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 30 Jun 2022 10:41:22 +0200 Subject: [PATCH 170/282] mm: kfence: pass a pointer to virt_to_page() Functions that work on a pointer to virtual memory such as virt_to_pfn() and users of that function such as virt_to_page() are supposed to pass a pointer to virtual memory, ideally a (void *) or other pointer. However since many architectures implement virt_to_pfn() as a macro, this function becomes polymorphic and accepts both a (unsigned long) and a (void *). If we instead implement a proper virt_to_pfn(void *addr) function the following happens (occurred on arch/arm): mm/kfence/core.c:558:30: warning: passing argument 1 of 'virt_to_pfn' makes pointer from integer without a cast [-Wint-conversion] In one case we can refer to __kfence_pool directly (and that is a proper (char *) pointer) and in the other call site we use an explicit cast. 
Link: https://lkml.kernel.org/r/20220630084124.691207-4-linus.walleij@linaro.org Signed-off-by: Linus Walleij Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- mm/kfence/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 4b5e5a3d3a63..d39ffb058354 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -546,7 +546,7 @@ static unsigned long kfence_init_pool(void) if (!arch_kfence_init_pool()) return addr; - pages = virt_to_page(addr); + pages = virt_to_page(__kfence_pool); /* * Set up object pages: they must have PG_slab set, to avoid freeing @@ -660,7 +660,7 @@ static bool kfence_init_pool_late(void) /* Same as above. */ free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); #ifdef CONFIG_CONTIG_ALLOC - free_contig_range(page_to_pfn(virt_to_page(addr)), free_size / PAGE_SIZE); + free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE); #else free_pages_exact((void *)addr, free_size); #endif From 396a400bc1d3c3e8e4ab836f834d2da7c070d395 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 30 Jun 2022 10:41:23 +0200 Subject: [PATCH 171/282] mm: gup: pass a pointer to virt_to_page() Functions that work on a pointer to virtual memory such as virt_to_pfn() and users of that function such as virt_to_page() are supposed to pass a pointer to virtual memory, ideally a (void *) or other pointer. However since many architectures implement virt_to_pfn() as a macro, this function becomes polymorphic and accepts both a (unsigned long) and a (void *). If we instead implement a proper virt_to_pfn(void *addr) function the following happens (occurred on arch/arm): mm/gup.c: In function '__get_user_pages_locked': mm/gup.c:1599:49: warning: passing argument 1 of 'virt_to_pfn' makes pointer from integer without a cast [-Wint-conversion] pages[i] = virt_to_page(start); Fix this with an explicit cast. 
Link: https://lkml.kernel.org/r/20220630084124.691207-5-linus.walleij@linaro.org Signed-off-by: Linus Walleij Reviewed-by: Jason Gunthorpe Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index ecf362688268..364b274a10c2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1704,7 +1704,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, goto finish_or_fault; if (pages) { - pages[i] = virt_to_page(start); + pages[i] = virt_to_page((void *)start); if (pages[i]) get_page(pages[i]); } From 9330723c26ca22c95065d2e41741cfeef00e4fd7 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 30 Jun 2022 10:41:24 +0200 Subject: [PATCH 172/282] mm: nommu: pass a pointer to virt_to_page() Functions that work on a pointer to virtual memory such as virt_to_pfn() and users of that function such as virt_to_page() are supposed to pass a pointer to virtual memory, ideally a (void *) or other pointer. However since many architectures implement virt_to_pfn() as a macro, this function becomes polymorphic and accepts both a (unsigned long) and a (void *). If we instead implement a proper virt_to_pfn(void *addr) function the following happens (occurred on arch/arm): mm/nommu.c: In function 'free_page_series': mm/nommu.c:501:50: warning: passing argument 1 of 'virt_to_pfn' makes pointer from integer without a cast [-Wint-conversion] struct page *page = virt_to_page(from); Fix this with an explicit cast. 
Link: https://lkml.kernel.org/r/20220630084124.691207-6-linus.walleij@linaro.org Signed-off-by: Linus Walleij Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Jason Gunthorpe Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/nommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/nommu.c b/mm/nommu.c index 9d7afc2d959e..e819cbc21b39 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -500,7 +500,7 @@ static void delete_nommu_region(struct vm_region *region) static void free_page_series(unsigned long from, unsigned long to) { for (; from < to; from += PAGE_SIZE) { - struct page *page = virt_to_page(from); + struct page *page = virt_to_page((void *)from); atomic_long_dec(&mmap_pages_allocated); put_page(page); From 840532711d7299d7e937952482ec899d4622c452 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:35 +0530 Subject: [PATCH 173/282] mm/mmap: build protect protection_map[] with __P000 Patch series "mm/mmap: Drop __SXXX/__PXXX macros from across platforms", v7. __SXXX/__PXXX macros are unnecessary abstraction layer in creating the generic protection_map[] array which is used for vm_get_page_prot(). This abstraction layer can be avoided, if the platforms just define the array protection_map[] for all possible vm_flags access permission combinations and also export vm_get_page_prot() implementation. This series drops __SXXX/__PXXX macros from across platforms in the tree. First it build protects generic protection_map[] array with '#ifdef __P000' and moves it inside platforms which enable ARCH_HAS_VM_GET_PAGE_PROT. Later this build protects same array with '#ifdef ARCH_HAS_VM_GET_PAGE_PROT' and moves inside remaining platforms while enabling ARCH_HAS_VM_GET_PAGE_PROT. This adds a new macro DECLARE_VM_GET_PAGE_PROT defining the current generic vm_get_page_prot(), in order for it to be reused on platforms that do not require custom implementation. 
Finally, ARCH_HAS_VM_GET_PAGE_PROT can just be dropped, as all platforms now define and export vm_get_page_prot(), via looking up a private and static protection_map[] array. protection_map[] data type has been changed as 'static const' on all platforms that do not change it during boot. This patch (of 26): Build protect generic protection_map[] array with __P000, so that it can be moved inside all the platforms one after the other. Otherwise there will be build failures during this process. CONFIG_ARCH_HAS_VM_GET_PAGE_PROT cannot be used for this purpose as only certain platforms enable this config now. Link: https://lkml.kernel.org/r/20220711070600.2378316-1-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/20220711070600.2378316-2-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christoph Hellwig Reviewed-by: Christophe Leroy Suggested-by: Christophe Leroy Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/mmap.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index d4ebfc206e2b..1a435ce146a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,7 +425,9 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. 
*/ +#ifdef __P000 extern pgprot_t protection_map[16]; +#endif /* * The default fault flags that should be used by most of the diff --git a/mm/mmap.c b/mm/mmap.c index c14d7286a379..def0e03cf25c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -101,6 +101,7 @@ static void unmap_region(struct mm_struct *mm, * w: (no) no * x: (yes) yes */ +#ifdef __P000 pgprot_t protection_map[16] __ro_after_init = { [VM_NONE] = __P000, [VM_READ] = __P001, @@ -119,6 +120,7 @@ pgprot_t protection_map[16] __ro_after_init = { [VM_SHARED | VM_EXEC | VM_WRITE] = __S110, [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111 }; +#endif #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT pgprot_t vm_get_page_prot(unsigned long vm_flags) From 43957b5d11037a651d162f65c682ec3c76777fc8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:36 +0530 Subject: [PATCH 174/282] mm/mmap: define DECLARE_VM_GET_PAGE_PROT This just converts the generic vm_get_page_prot() implementation into a new macro i.e DECLARE_VM_GET_PAGE_PROT which later can be used across platforms when enabling them with ARCH_HAS_VM_GET_PAGE_PROT. This does not create any functional change. Link: https://lkml.kernel.org/r/20220711070600.2378316-3-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Suggested-by: Christoph Hellwig Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 28 ++++++++++++++++++++++++++++ mm/mmap.c | 26 +------------------------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 3cdc16cfd867..014ee8f0fbaa 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1689,4 +1689,32 @@ typedef unsigned int pgtbl_mod_mask; #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif +/* description of effects of mapping type and prot in current implementation. + * this is due to the limited x86 page protection hardware. The expected + * behavior is in parens: + * + * map_type prot + * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC + * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (yes) yes w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes + * w: (no) no w: (no) no w: (copy) copy w: (no) no + * x: (no) no x: (no) yes x: (no) yes x: (yes) yes + * + * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and + * MAP_PRIVATE (with Enhanced PAN supported): + * r: (no) no + * w: (no) no + * x: (yes) yes + */ +#define DECLARE_VM_GET_PAGE_PROT \ +pgprot_t vm_get_page_prot(unsigned long vm_flags) \ +{ \ + return protection_map[vm_flags & \ + (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED)]; \ +} \ +EXPORT_SYMBOL(vm_get_page_prot); + #endif /* _LINUX_PGTABLE_H */ diff --git a/mm/mmap.c b/mm/mmap.c index def0e03cf25c..3c0d65743bc4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -81,26 +81,6 @@ static void unmap_region(struct 
mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -/* description of effects of mapping type and prot in current implementation. - * this is due to the limited x86 page protection hardware. The expected - * behavior is in parens: - * - * map_type prot - * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC - * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (yes) yes w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes - * w: (no) no w: (no) no w: (copy) copy w: (no) no - * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and - * MAP_PRIVATE (with Enhanced PAN supported): - * r: (no) no - * w: (no) no - * x: (yes) yes - */ #ifdef __P000 pgprot_t protection_map[16] __ro_after_init = { [VM_NONE] = __P000, @@ -123,11 +103,7 @@ pgprot_t protection_map[16] __ro_after_init = { #endif #ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT -pgprot_t vm_get_page_prot(unsigned long vm_flags) -{ - return protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; -} -EXPORT_SYMBOL(vm_get_page_prot); +DECLARE_VM_GET_PAGE_PROT #endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */ static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) From 6eac1eaf2105dd8a2daf3b47634c24fc956bc77a Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:37 +0530 Subject: [PATCH 175/282] powerpc/mm: move protection_map[] inside the platform This moves protection_map[] inside the platform and while here, also enable ARCH_HAS_VM_GET_PAGE_PROT on 32 bit and nohash 64 (aka book3e/64) platforms via DECLARE_VM_GET_PAGE_PROT. 
Link: https://lkml.kernel.org/r/20220711070600.2378316-4-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Cc: Michael Ellerman Cc: Paul Mackerras Cc: Nicholas Piggin Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michal Simek Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/pgtable.h | 20 +------------------- arch/powerpc/mm/pgtable.c | 24 ++++++++++++++++++++++++ 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c2ce2e60c8f0..1035d172c7dd 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -140,7 +140,7 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT if PPC_BOOK3S_64 + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index d564d0ecd4cd..33f4bf8d22b0 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -20,25 +20,6 @@ struct mm_struct; #include #endif /* !CONFIG_PPC_BOOK3S */ -/* Note due to the way vm flags are laid out, the bits are XWR */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_X 
-#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY_X -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_X -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED_X -#define __S111 PAGE_SHARED_X - #ifndef __ASSEMBLY__ #ifndef MAX_PTRS_PER_PGD @@ -79,6 +60,7 @@ extern void paging_init(void); void poking_init(void); extern unsigned long ioremap_bot; +extern const pgprot_t protection_map[16]; /* * kern_addr_valid is intended to indicate whether an address is a valid diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index e6166b71d36d..cb2dcdb18f8e 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -472,3 +472,27 @@ out: return ret_pte; } EXPORT_SYMBOL_GPL(__find_linux_pte); + +/* Note due to the way vm flags are laid out, the bits are XWR */ +const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY_X, + [VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_EXEC | VM_WRITE] = PAGE_COPY_X, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X +}; + +#ifndef CONFIG_PPC_BOOK3S_64 +DECLARE_VM_GET_PAGE_PROT +#endif From 25740d31ee5b6e99ab674eaa3ecbbe3f8a6d3b8c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:38 +0530 Subject: [PATCH 176/282] sparc/mm: move protection_map[] inside the platform This moves protection_map[] inside the platform and while here, also enable ARCH_HAS_VM_GET_PAGE_PROT on 32 bit 
platforms via DECLARE_VM_GET_PAGE_PROT. Link: https://lkml.kernel.org/r/20220711070600.2378316-5-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Sam Ravnborg Cc: "David S. Miller" Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sparc/Kconfig | 2 +- arch/sparc/include/asm/pgtable_32.h | 19 ------------------- arch/sparc/include/asm/pgtable_64.h | 19 ------------------- arch/sparc/mm/init_32.c | 20 ++++++++++++++++++++ arch/sparc/mm/init_64.c | 3 +++ 5 files changed, 24 insertions(+), 39 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index ba449c47effd..09f868613a4d 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -13,6 +13,7 @@ config 64BIT config SPARC bool default y + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI select ARCH_MIGHT_HAVE_PC_SERIO select DMA_OPS @@ -84,7 +85,6 @@ config SPARC64 select PERF_USE_VMALLOC select ARCH_HAVE_NMI_SAFE_CMPXCHG select HAVE_C_RECORDMCOUNT - select ARCH_HAS_VM_GET_PAGE_PROT select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 4866625da314..8ff549004fac 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -64,25 +64,6 @@ void paging_init(void); extern unsigned long 
ptr_in_current_pgd; -/* xwr */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY -#define __P101 PAGE_READONLY -#define __P110 PAGE_COPY -#define __P111 PAGE_COPY - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY -#define __S101 PAGE_READONLY -#define __S110 PAGE_SHARED -#define __S111 PAGE_SHARED - /* First physical page can be anywhere, the following is needed so that * va-->pa and vice versa conversions work properly without performance * hit for all __pa()/__va() operations. diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 4679e45c8348..a779418ceba9 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -187,25 +187,6 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_SZHUGE_4U _PAGE_SZ4MB_4U #define _PAGE_SZHUGE_4V _PAGE_SZ4MB_4V -/* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */ -#define __P000 __pgprot(0) -#define __P001 __pgprot(0) -#define __P010 __pgprot(0) -#define __P011 __pgprot(0) -#define __P100 __pgprot(0) -#define __P101 __pgprot(0) -#define __P110 __pgprot(0) -#define __P111 __pgprot(0) - -#define __S000 __pgprot(0) -#define __S001 __pgprot(0) -#define __S010 __pgprot(0) -#define __S011 __pgprot(0) -#define __S100 __pgprot(0) -#define __S101 __pgprot(0) -#define __S110 __pgprot(0) -#define __S111 __pgprot(0) - #ifndef __ASSEMBLY__ pte_t mk_pte_io(unsigned long, pgprot_t, int, unsigned long); diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 1e9f577f084d..d88e774c8eb4 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -302,3 +302,23 @@ void sparc_flush_page_to_ram(struct page *page) __flush_page_to_ram(vaddr); } EXPORT_SYMBOL(sparc_flush_page_to_ram); + +static const pgprot_t protection_map[16] = { + [VM_NONE] = 
PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY, + [VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index f6174df2d5af..d6faee23c77d 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2634,6 +2634,9 @@ void vmemmap_free(unsigned long start, unsigned long end, } #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +/* These are actually filled in at boot time by sun4{u,v}_pgprot_init() */ +static pgprot_t protection_map[16] __ro_after_init; + static void prot_init_common(unsigned long page_none, unsigned long page_shared, unsigned long page_copy, From 42251045cc741adae5ffc645c8b9ee906c14d013 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:39 +0530 Subject: [PATCH 177/282] arm64/mm: move protection_map[] inside the platform This moves protection_map[] inside the platform and makes it a static. Link: https://lkml.kernel.org/r/20220711070600.2378316-6-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Arnd Bergmann Cc: Brian Cain Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable-prot.h | 18 ------------------ arch/arm64/mm/mmap.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h index 62e0ebeed720..9b165117a454 100644 --- a/arch/arm64/include/asm/pgtable-prot.h +++ b/arch/arm64/include/asm/pgtable-prot.h @@ -89,24 +89,6 @@ extern bool arm64_use_ng_mappings; #define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) #define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_READONLY -#define __P011 PAGE_READONLY -#define __P100 PAGE_READONLY_EXEC /* PAGE_EXECONLY if Enhanced PAN */ -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_READONLY_EXEC -#define __P111 PAGE_READONLY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_EXEC /* PAGE_EXECONLY if Enhanced PAN */ -#define __S101 PAGE_READONLY_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC - #endif /* __ASSEMBLY__ */ #endif /* __ASM_PGTABLE_PROT_H */ diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index 78e9490f748d..8f5b7ce857ed 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -13,6 +13,27 @@ #include #include +static pgprot_t protection_map[16] __ro_after_init = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_READONLY, + [VM_WRITE | VM_READ] = 
PAGE_READONLY, + /* PAGE_EXECONLY if Enhanced PAN */ + [VM_EXEC] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_WRITE] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_READONLY_EXEC, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + /* PAGE_EXECONLY if Enhanced PAN */ + [VM_SHARED | VM_EXEC] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC +}; + /* * You really shouldn't be using read() or write() on /dev/mem. This might go * away in the future. From 4867fbbdd6b362400d154417e08ce76b14200ba1 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:40 +0530 Subject: [PATCH 178/282] x86/mm: move protection_map[] inside the platform This moves protection_map[] inside the platform and makes it a static. This also defines a helper function add_encrypt_protection_map() that can update the protection_map[] array with pgprot_encrypted(). Link: https://lkml.kernel.org/r/20220711070600.2378316-7-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christoph Hellwig Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/x86/include/asm/mem_encrypt.h | 2 ++ arch/x86/include/asm/pgtable_types.h | 19 ------------------- arch/x86/mm/mem_encrypt_amd.c | 6 ++---- arch/x86/mm/pgprot.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 88ceaf3648b3..72ca90552b6a 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -89,6 +89,8 @@ static inline void mem_encrypt_free_decrypted_mem(void) { } /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void add_encrypt_protection_map(void); + /* * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when * writing to or comparing values from the cr3 register. 
Having the diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index bdaf8391e2e0..aa174fed3a71 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -230,25 +230,6 @@ enum page_cache_mode { #endif /* __ASSEMBLY__ */ -/* xwr */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY_EXEC -#define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY_EXEC -#define __S101 PAGE_READONLY_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC - /* * early identity mapping pte attrib macros. */ diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index f6d038e2cd8e..5c3c3ed46f5a 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -486,8 +487,6 @@ void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, boo void __init sme_early_init(void) { - unsigned int i; - if (!sme_me_mask) return; @@ -496,8 +495,7 @@ void __init sme_early_init(void) __supported_pte_mask = __sme_set(__supported_pte_mask); /* Update the protection map with memory encryption mask */ - for (i = 0; i < ARRAY_SIZE(protection_map); i++) - protection_map[i] = pgprot_encrypted(protection_map[i]); + add_encrypt_protection_map(); x86_platform.guest.enc_status_change_prepare = amd_enc_status_change_prepare; x86_platform.guest.enc_status_change_finish = amd_enc_status_change_finish; diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c index 763742782286..c84bd9540b16 100644 --- a/arch/x86/mm/pgprot.c +++ b/arch/x86/mm/pgprot.c @@ -3,6 +3,34 @@ #include #include #include +#include + +static pgprot_t 
protection_map[16] __ro_after_init = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC +}; + +void add_encrypt_protection_map(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(protection_map); i++) + protection_map[i] = pgprot_encrypted(protection_map[i]); +} pgprot_t vm_get_page_prot(unsigned long vm_flags) { From 09095f74130dfb2110ef2bcdd9ad0d42addaa1d5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:41 +0530 Subject: [PATCH 179/282] mm/mmap: build protect protection_map[] with ARCH_HAS_VM_GET_PAGE_PROT Now that protection_map[] has been moved inside those platforms that enable ARCH_HAS_VM_GET_PAGE_PROT, the generic protection_map[] array can be protected with CONFIG_ARCH_HAS_VM_GET_PAGE_PROT instead of __P000. Link: https://lkml.kernel.org/r/20220711070600.2378316-8-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J.
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/mmap.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1a435ce146a2..4b4dc93f9bc3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,7 +425,7 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ -#ifdef __P000 +#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT extern pgprot_t protection_map[16]; #endif diff --git a/mm/mmap.c b/mm/mmap.c index 3c0d65743bc4..2a58a9cd0752 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -81,7 +81,7 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -#ifdef __P000 +#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT pgprot_t protection_map[16] __ro_after_init = { [VM_NONE] = __P000, [VM_READ] = __P001, @@ -100,9 +100,6 @@ pgprot_t protection_map[16] __ro_after_init = { [VM_SHARED | VM_EXEC | VM_WRITE] = __S110, [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111 }; -#endif - -#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT DECLARE_VM_GET_PAGE_PROT #endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */ From fa3f9f4a912c22aadf0510bf7f4bd113da442a10 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:42 +0530 Subject: [PATCH 180/282] microblaze/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a 
private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-9-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Michal Simek Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/microblaze/Kconfig | 1 + arch/microblaze/include/asm/pgtable.h | 17 ----------------- arch/microblaze/mm/init.c | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 8cf429ad1c84..15f91ba8a0c4 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -7,6 +7,7 @@ config MICROBLAZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_TABLE_SORT diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index 0c72646370e1..ba348e997dbb 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -204,23 +204,6 @@ extern pte_t *va_to_pte(unsigned long address); * We consider execute permission the same as read. * Also, write permissions imply read permissions. 
*/ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY_X -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY_X -#define __P100 PAGE_READONLY -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY_X -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED_X -#define __S100 PAGE_READONLY -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED -#define __S111 PAGE_SHARED_X #ifndef __ASSEMBLY__ /* diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index f4e503461d24..353fabdfcbc5 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -285,3 +285,23 @@ void * __ref zalloc_maybe_bootmem(size_t size, gfp_t mask) return p; } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY_X, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_EXEC] = PAGE_READONLY, + [VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED_X, + [VM_SHARED | VM_EXEC] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X +}; +DECLARE_VM_GET_PAGE_PROT From f6d1e19c20a44cd4d2bb5451cb472648707d9b97 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:43 +0530 Subject: [PATCH 181/282] loongarch/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. 
Link: https://lkml.kernel.org/r/20220711070600.2378316-10-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Huacai Chen Cc: WANG Xuerui Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/pgtable-bits.h | 19 ---------- arch/loongarch/mm/cache.c | 46 +++++++++++++++++++++++ 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index db2838cf8c02..adf8cf6ec5d5 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -9,6 +9,7 @@ config LOONGARCH select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h index 3badd112d9ab..9ca147a29bab 100644 --- a/arch/loongarch/include/asm/pgtable-bits.h +++ b/arch/loongarch/include/asm/pgtable-bits.h @@ -83,25 +83,6 @@ _PAGE_GLOBAL | _PAGE_KERN | _CACHE_SUC) #define PAGE_KERNEL_WUC __pgprot(_PAGE_PRESENT | __READABLE | __WRITEABLE | \ _PAGE_GLOBAL | _PAGE_KERN | _CACHE_WUC) - -#define __P000 __pgprot(_CACHE_CC | _PAGE_USER | _PAGE_PROTNONE | _PAGE_NO_EXEC | _PAGE_NO_READ) -#define 
__P001 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT | _PAGE_NO_EXEC) -#define __P010 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT | _PAGE_NO_EXEC) -#define __P011 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT | _PAGE_NO_EXEC) -#define __P100 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT) -#define __P101 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT) -#define __P110 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT) -#define __P111 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT) - -#define __S000 __pgprot(_CACHE_CC | _PAGE_USER | _PAGE_PROTNONE | _PAGE_NO_EXEC | _PAGE_NO_READ) -#define __S001 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT | _PAGE_NO_EXEC) -#define __S010 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE) -#define __S011 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_WRITE) -#define __S100 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT) -#define __S101 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT) -#define __S110 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT | _PAGE_WRITE) -#define __S111 __pgprot(_CACHE_CC | _PAGE_VALID | _PAGE_USER | _PAGE_PRESENT | _PAGE_WRITE) - #ifndef __ASSEMBLY__ #define pgprot_noncached pgprot_noncached diff --git a/arch/loongarch/mm/cache.c b/arch/loongarch/mm/cache.c index 9e5ce5aa73f7..e8c68dcf6ab2 100644 --- a/arch/loongarch/mm/cache.c +++ b/arch/loongarch/mm/cache.c @@ -139,3 +139,49 @@ void cpu_cache_init(void) shm_align_mask = PAGE_SIZE - 1; } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = __pgprot(_CACHE_CC | _PAGE_USER | + _PAGE_PROTNONE | _PAGE_NO_EXEC | + _PAGE_NO_READ), + [VM_READ] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT | + _PAGE_NO_EXEC), + [VM_WRITE] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT | + 
_PAGE_NO_EXEC), + [VM_WRITE | VM_READ] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT | + _PAGE_NO_EXEC), + [VM_EXEC] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT), + [VM_EXEC | VM_READ] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT), + [VM_EXEC | VM_WRITE] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT), + [VM_EXEC | VM_WRITE | VM_READ] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT), + [VM_SHARED] = __pgprot(_CACHE_CC | _PAGE_USER | + _PAGE_PROTNONE | _PAGE_NO_EXEC | + _PAGE_NO_READ), + [VM_SHARED | VM_READ] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT | + _PAGE_NO_EXEC), + [VM_SHARED | VM_WRITE] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT | + _PAGE_NO_EXEC | _PAGE_WRITE), + [VM_SHARED | VM_WRITE | VM_READ] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT | + _PAGE_NO_EXEC | _PAGE_WRITE), + [VM_SHARED | VM_EXEC] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT), + [VM_SHARED | VM_EXEC | VM_READ] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT), + [VM_SHARED | VM_EXEC | VM_WRITE] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT | + _PAGE_WRITE), + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __pgprot(_CACHE_CC | _PAGE_VALID | + _PAGE_USER | _PAGE_PRESENT | + _PAGE_WRITE) +}; +DECLARE_VM_GET_PAGE_PROT From e4e4b99b80aae3aeb6827cf0b7e3ca31b1cd892e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:44 +0530 Subject: [PATCH 182/282] openrisc/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. 
Link: https://lkml.kernel.org/r/20220711070600.2378316-11-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Stafford Horne Cc: Jonas Bonn Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/openrisc/Kconfig | 1 + arch/openrisc/include/asm/pgtable.h | 18 ------------------ arch/openrisc/mm/init.c | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index e814df4c483c..fe0dfb50eb86 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -10,6 +10,7 @@ config OPENRISC select ARCH_HAS_DMA_SET_UNCACHED select ARCH_HAS_DMA_CLEAR_UNCACHED select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_VM_GET_PAGE_PROT select COMMON_CLK select OF select OF_EARLY_FLATTREE diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index c3abbf71e09f..dcae8aea132f 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -176,24 +176,6 @@ extern void paging_init(void); __pgprot(_PAGE_ALL | _PAGE_SRE | _PAGE_SWE \ | _PAGE_SHARED | _PAGE_DIRTY | _PAGE_EXEC | _PAGE_CI) -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY_X -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY_X -#define __P100 PAGE_READONLY -#define __P101 PAGE_READONLY_X -#define __P110 PAGE_COPY -#define __P111 PAGE_COPY_X - -#define __S000 PAGE_NONE 
-#define __S001 PAGE_READONLY_X -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED_X -#define __S100 PAGE_READONLY -#define __S101 PAGE_READONLY_X -#define __S110 PAGE_SHARED -#define __S111 PAGE_SHARED_X - /* zero page used for uninitialized stuff */ extern unsigned long empty_zero_page[2048]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 3a021ab6f1ae..d531ab82be12 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -208,3 +208,23 @@ void __init mem_init(void) mem_init_done = 1; return; } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY_X, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_EXEC] = PAGE_READONLY, + [VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED_X, + [VM_SHARED | VM_EXEC] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X +}; +DECLARE_VM_GET_PAGE_PROT From 2de9eae10d11906a989d0e9241854dc8d2729d9e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:45 +0530 Subject: [PATCH 183/282] xtensa/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. 
Link: https://lkml.kernel.org/r/20220711070600.2378316-12-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Chris Zankel Cc: Guo Ren Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/xtensa/Kconfig | 1 + arch/xtensa/include/asm/pgtable.h | 18 ------------------ arch/xtensa/mm/init.c | 22 ++++++++++++++++++++++ 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 0b0f0172cced..4c0d83520ff1 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -11,6 +11,7 @@ config XTENSA select ARCH_HAS_DMA_SET_UNCACHED if MMU select ARCH_HAS_STRNCPY_FROM_USER if !KASAN select ARCH_HAS_STRNLEN_USER + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index 0a91376131c5..e0d5531ae00d 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -200,24 +200,6 @@ * What follows is the closest we can get by reasonable means.. * See linux/mm/mmap.c for protection_map[] array that uses these definitions. 
*/ -#define __P000 PAGE_NONE /* private --- */ -#define __P001 PAGE_READONLY /* private --r */ -#define __P010 PAGE_COPY /* private -w- */ -#define __P011 PAGE_COPY /* private -wr */ -#define __P100 PAGE_READONLY_EXEC /* private x-- */ -#define __P101 PAGE_READONLY_EXEC /* private x-r */ -#define __P110 PAGE_COPY_EXEC /* private xw- */ -#define __P111 PAGE_COPY_EXEC /* private xwr */ - -#define __S000 PAGE_NONE /* shared --- */ -#define __S001 PAGE_READONLY /* shared --r */ -#define __S010 PAGE_SHARED /* shared -w- */ -#define __S011 PAGE_SHARED /* shared -wr */ -#define __S100 PAGE_READONLY_EXEC /* shared x-- */ -#define __S101 PAGE_READONLY_EXEC /* shared x-r */ -#define __S110 PAGE_SHARED_EXEC /* shared xw- */ -#define __S111 PAGE_SHARED_EXEC /* shared xwr */ - #ifndef __ASSEMBLY__ #define pte_ERROR(e) \ diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 6a32b2cf2718..b2587a1a7c46 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -216,3 +216,25 @@ static int __init parse_memmap_opt(char *str) return 0; } early_param("memmap", parse_memmap_opt); + +#ifdef CONFIG_MMU +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC +}; +DECLARE_VM_GET_PAGE_PROT +#endif From b2022dcf45540b1f08ce2ef761d9e9110d22e929 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:46 +0530 
Subject: [PATCH 184/282] hexagon/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-13-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Brian Cain Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/hexagon/Kconfig | 1 + arch/hexagon/include/asm/pgtable.h | 27 ------------------- arch/hexagon/mm/init.c | 42 ++++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 54eadf265178..bc4ceecd0588 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -6,6 +6,7 @@ config HEXAGON def_bool y select ARCH_32BIT_OFF_T select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_NO_PREEMPT select DMA_GLOBAL_POOL # Other pending projects/to-do items. 
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 0610724d6a28..f7048c18b6f9 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -126,33 +126,6 @@ extern unsigned long _dflt_cache_att; */ #define CACHEDEF (CACHE_DEFAULT << 6) -/* Private (copy-on-write) page protections. */ -#define __P000 __pgprot(_PAGE_PRESENT | _PAGE_USER | CACHEDEF) -#define __P001 __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | CACHEDEF) -#define __P010 __P000 /* Write-only copy-on-write */ -#define __P011 __P001 /* Read/Write copy-on-write */ -#define __P100 __pgprot(_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_EXECUTE | CACHEDEF) -#define __P101 __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_EXECUTE | \ - _PAGE_READ | CACHEDEF) -#define __P110 __P100 /* Write/execute copy-on-write */ -#define __P111 __P101 /* Read/Write/Execute, copy-on-write */ - -/* Shared page protections. */ -#define __S000 __P000 -#define __S001 __P001 -#define __S010 __pgprot(_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_WRITE | CACHEDEF) -#define __S011 __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | \ - _PAGE_WRITE | CACHEDEF) -#define __S100 __pgprot(_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_EXECUTE | CACHEDEF) -#define __S101 __P101 -#define __S110 __pgprot(_PAGE_PRESENT | _PAGE_USER | \ - _PAGE_EXECUTE | _PAGE_WRITE | CACHEDEF) -#define __S111 __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_READ | \ - _PAGE_EXECUTE | _PAGE_WRITE | CACHEDEF) - extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; /* located in head.S */ /* HUGETLB not working currently */ diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 3167a3b5c97b..146115c9de61 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -234,3 +234,45 @@ void __init setup_arch_memory(void) * which is called by start_kernel() later on in the process */ } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + CACHEDEF), + [VM_READ] = 
__pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_READ | CACHEDEF), + [VM_WRITE] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + CACHEDEF), + [VM_WRITE | VM_READ] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_READ | CACHEDEF), + [VM_EXEC] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_EXECUTE | CACHEDEF), + [VM_EXEC | VM_READ] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_EXECUTE | _PAGE_READ | + CACHEDEF), + [VM_EXEC | VM_WRITE] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_EXECUTE | CACHEDEF), + [VM_EXEC | VM_WRITE | VM_READ] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_EXECUTE | _PAGE_READ | + CACHEDEF), + [VM_SHARED] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + CACHEDEF), + [VM_SHARED | VM_READ] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_READ | CACHEDEF), + [VM_SHARED | VM_WRITE] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_WRITE | CACHEDEF), + [VM_SHARED | VM_WRITE | VM_READ] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_READ | _PAGE_WRITE | + CACHEDEF), + [VM_SHARED | VM_EXEC] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_EXECUTE | CACHEDEF), + [VM_SHARED | VM_EXEC | VM_READ] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_EXECUTE | _PAGE_READ | + CACHEDEF), + [VM_SHARED | VM_EXEC | VM_WRITE] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_EXECUTE | _PAGE_WRITE | + CACHEDEF), + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __pgprot(_PAGE_PRESENT | _PAGE_USER | + _PAGE_READ | _PAGE_EXECUTE | + _PAGE_WRITE | CACHEDEF) +}; +DECLARE_VM_GET_PAGE_PROT From 252358f1a118d8e9b0dd4f2f67bb3cd2b742e854 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:47 +0530 Subject: [PATCH 185/282] parisc/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. 
Link: https://lkml.kernel.org/r/20220711070600.2378316-14-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: "James E.J. Bottomley" Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/parisc/Kconfig | 1 + arch/parisc/include/asm/pgtable.h | 18 ------------------ arch/parisc/mm/init.c | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index fa400055b2d5..891d82393957 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -12,6 +12,7 @@ config PARISC select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_PTE_SPECIAL select ARCH_NO_SG_CHAIN select ARCH_SUPPORTS_HUGETLBFS if PA20 diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 69765a6dbe89..6a1899a9b420 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -271,24 +271,6 @@ extern void __update_cache(pte_t pte); */ /*xwr*/ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 __P000 /* copy on write */ -#define __P011 __P001 /* copy on write */ -#define __P100 PAGE_EXECREAD -#define __P101 PAGE_EXECREAD -#define __P110 __P100 /* copy on write */ -#define __P111 __P101 /* copy on write */ - -#define __S000 PAGE_NONE -#define __S001 
PAGE_READONLY -#define __S010 PAGE_WRITEONLY -#define __S011 PAGE_SHARED -#define __S100 PAGE_EXECREAD -#define __S101 PAGE_EXECREAD -#define __S110 PAGE_RWX -#define __S111 PAGE_RWX - extern pgd_t swapper_pg_dir[]; /* declared in init_task.c */ diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 0a81499dd35e..f03e0961fa25 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -871,3 +871,23 @@ void flush_tlb_all(void) spin_unlock(&sid_lock); } #endif + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_NONE, + [VM_WRITE | VM_READ] = PAGE_READONLY, + [VM_EXEC] = PAGE_EXECREAD, + [VM_EXEC | VM_READ] = PAGE_EXECREAD, + [VM_EXEC | VM_WRITE] = PAGE_EXECREAD, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_EXECREAD, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_WRITEONLY, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_EXECREAD, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_EXECREAD, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX +}; +DECLARE_VM_GET_PAGE_PROT From 4975f604feb63c7b6b6ec56cb6013026d6aaa9f8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:48 +0530 Subject: [PATCH 186/282] alpha/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-15-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Richard Henderson Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/Kconfig | 1 + arch/alpha/include/asm/pgtable.h | 17 ----------------- arch/alpha/mm/init.c | 22 ++++++++++++++++++++++ 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 7d0d26b5b3f5..db1c8b329461 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -2,6 +2,7 @@ config ALPHA bool default y + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_32BIT_USTAT_F_TINODE select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 170451fde043..3ea9661c09ff 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -116,23 +116,6 @@ struct vm_area_struct; * arch/alpha/mm/fault.c) */ /* xwr */ -#define __P000 _PAGE_P(_PAGE_FOE | _PAGE_FOW | _PAGE_FOR) -#define __P001 _PAGE_P(_PAGE_FOE | _PAGE_FOW) -#define __P010 _PAGE_P(_PAGE_FOE) -#define __P011 _PAGE_P(_PAGE_FOE) -#define __P100 _PAGE_P(_PAGE_FOW | _PAGE_FOR) -#define __P101 _PAGE_P(_PAGE_FOW) -#define __P110 _PAGE_P(0) -#define __P111 _PAGE_P(0) - -#define __S000 _PAGE_S(_PAGE_FOE | _PAGE_FOW | _PAGE_FOR) -#define __S001 _PAGE_S(_PAGE_FOE | _PAGE_FOW) -#define __S010 _PAGE_S(_PAGE_FOE) -#define __S011 _PAGE_S(_PAGE_FOE) -#define __S100 _PAGE_S(_PAGE_FOW | _PAGE_FOR) -#define __S101 _PAGE_S(_PAGE_FOW) -#define __S110 _PAGE_S(0) -#define __S111 _PAGE_S(0) /* * pgprot_noncached() is only for infiniband pci support, and a real diff 
--git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 7511723b7669..a155180d7a83 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -280,3 +280,25 @@ mem_init(void) high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); memblock_free_all(); } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = _PAGE_P(_PAGE_FOE | _PAGE_FOW | + _PAGE_FOR), + [VM_READ] = _PAGE_P(_PAGE_FOE | _PAGE_FOW), + [VM_WRITE] = _PAGE_P(_PAGE_FOE), + [VM_WRITE | VM_READ] = _PAGE_P(_PAGE_FOE), + [VM_EXEC] = _PAGE_P(_PAGE_FOW | _PAGE_FOR), + [VM_EXEC | VM_READ] = _PAGE_P(_PAGE_FOW), + [VM_EXEC | VM_WRITE] = _PAGE_P(0), + [VM_EXEC | VM_WRITE | VM_READ] = _PAGE_P(0), + [VM_SHARED] = _PAGE_S(_PAGE_FOE | _PAGE_FOW | + _PAGE_FOR), + [VM_SHARED | VM_READ] = _PAGE_S(_PAGE_FOE | _PAGE_FOW), + [VM_SHARED | VM_WRITE] = _PAGE_S(_PAGE_FOE), + [VM_SHARED | VM_WRITE | VM_READ] = _PAGE_S(_PAGE_FOE), + [VM_SHARED | VM_EXEC] = _PAGE_S(_PAGE_FOW | _PAGE_FOR), + [VM_SHARED | VM_EXEC | VM_READ] = _PAGE_S(_PAGE_FOW), + [VM_SHARED | VM_EXEC | VM_WRITE] = _PAGE_S(0), + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = _PAGE_S(0) +}; +DECLARE_VM_GET_PAGE_PROT From 53e2fdee5f0efdf5bd099a29bfaae0b0ec90e70e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:49 +0530 Subject: [PATCH 187/282] nios2/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-16-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Dinh Nguyen Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. 
Miller" Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/nios2/Kconfig | 1 + arch/nios2/include/asm/pgtable.h | 16 ---------------- arch/nios2/mm/init.c | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 4167f1eb4cd8..e0459dffd218 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -6,6 +6,7 @@ config NIOS2 select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_DMA_SET_UNCACHED + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_NO_SWAP select COMMON_CLK select TIMER_OF diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 262d0609268c..470516d4555e 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -40,24 +40,8 @@ struct mm_struct; */ /* Remove W bit on private pages for COW support */ -#define __P000 MKP(0, 0, 0) -#define __P001 MKP(0, 0, 1) -#define __P010 MKP(0, 0, 0) /* COW */ -#define __P011 MKP(0, 0, 1) /* COW */ -#define __P100 MKP(1, 0, 0) -#define __P101 MKP(1, 0, 1) -#define __P110 MKP(1, 0, 0) /* COW */ -#define __P111 MKP(1, 0, 1) /* COW */ /* Shared pages can have exact HW mapping */ -#define __S000 MKP(0, 0, 0) -#define __S001 MKP(0, 0, 1) -#define __S010 MKP(0, 1, 0) -#define __S011 MKP(0, 1, 1) -#define __S100 MKP(1, 0, 0) -#define __S101 MKP(1, 0, 1) -#define __S110 MKP(1, 1, 0) -#define __S111 MKP(1, 1, 1) /* Used all over the kernel */ #define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_CACHED | 
_PAGE_READ | \ diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 613fcaa5988a..ae24687d12ad 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -124,3 +124,23 @@ const char *arch_vma_name(struct vm_area_struct *vma) { return (vma->vm_start == KUSER_BASE) ? "[kuser]" : NULL; } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = MKP(0, 0, 0), + [VM_READ] = MKP(0, 0, 1), + [VM_WRITE] = MKP(0, 0, 0), + [VM_WRITE | VM_READ] = MKP(0, 0, 1), + [VM_EXEC] = MKP(1, 0, 0), + [VM_EXEC | VM_READ] = MKP(1, 0, 1), + [VM_EXEC | VM_WRITE] = MKP(1, 0, 0), + [VM_EXEC | VM_WRITE | VM_READ] = MKP(1, 0, 1), + [VM_SHARED] = MKP(0, 0, 0), + [VM_SHARED | VM_READ] = MKP(0, 0, 1), + [VM_SHARED | VM_WRITE] = MKP(0, 1, 0), + [VM_SHARED | VM_WRITE | VM_READ] = MKP(0, 1, 1), + [VM_SHARED | VM_EXEC] = MKP(1, 0, 0), + [VM_SHARED | VM_EXEC | VM_READ] = MKP(1, 0, 1), + [VM_SHARED | VM_EXEC | VM_WRITE] = MKP(1, 1, 0), + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = MKP(1, 1, 1) +}; +DECLARE_VM_GET_PAGE_PROT From 4147b5e2d5691078c9f8eaa3644ef11f5351a9db Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:50 +0530 Subject: [PATCH 188/282] riscv/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-17-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Paul Mackerras Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable.h | 20 -------------------- arch/riscv/mm/init.c | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 32ffef9f6e5b..583389d4e43a 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -32,6 +32,7 @@ config RISCV select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT select ARCH_STACKWALK diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 1d1be9d9419c..23e643db6575 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -186,26 +186,6 @@ extern struct pt_alloc_ops pt_ops __initdata; extern pgd_t swapper_pg_dir[]; -/* MAP_PRIVATE permissions: xwr (copy-on-write) */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READ -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_EXEC -#define __P101 PAGE_READ_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_READ_EXEC - -/* MAP_SHARED permissions: xwr */ -#define __S000 PAGE_NONE -#define __S001 PAGE_READ -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_EXEC -#define __S101 PAGE_READ_EXEC -#define __S110 PAGE_SHARED_EXEC -#define __S111 PAGE_SHARED_EXEC - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_present(pmd_t pmd) { diff --git a/arch/riscv/mm/init.c 
b/arch/riscv/mm/init.c index d466ec670e1f..a88b7dc31a68 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -288,6 +288,26 @@ static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAG #define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir)) #endif /* CONFIG_XIP_KERNEL */ +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READ, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_EXEC, + [VM_EXEC | VM_READ] = PAGE_READ_EXEC, + [VM_EXEC | VM_WRITE] = PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_READ_EXEC, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READ, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READ_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_EXEC +}; +DECLARE_VM_GET_PAGE_PROT + void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) { unsigned long addr = __fix_to_virt(idx); From 0d70836013f2da08fa190e99a1d50ebf28a09ef5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:51 +0530 Subject: [PATCH 189/282] csky/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-18-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Guo Ren Cc: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/csky/Kconfig | 1 + arch/csky/include/asm/pgtable.h | 18 ------------------ arch/csky/mm/init.c | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 21d72b078eef..588b8a9c68ed 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -6,6 +6,7 @@ config CSKY select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 && $(cc-option,-mbacktrace) diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index bbe245117777..229a5f4ad7fc 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -77,24 +77,6 @@ #define MAX_SWAPFILES_CHECK() \ BUILD_BUG_ON(MAX_SWAPFILES_SHIFT != 5) -#define __P000 PAGE_NONE -#define __P001 PAGE_READ -#define __P010 PAGE_READ -#define __P011 PAGE_READ -#define __P100 PAGE_READ -#define __P101 PAGE_READ -#define __P110 PAGE_READ -#define __P111 PAGE_READ - -#define __S000 PAGE_NONE -#define __S001 PAGE_READ -#define __S010 PAGE_WRITE -#define __S011 PAGE_WRITE -#define __S100 PAGE_READ -#define __S101 PAGE_READ -#define __S110 PAGE_WRITE -#define __S111 PAGE_WRITE - extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) diff --git a/arch/csky/mm/init.c 
b/arch/csky/mm/init.c index bf2004aa811a..bde7cabd23df 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -197,3 +197,23 @@ void __init fixaddr_init(void) vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; fixrange_init(vaddr, vaddr + PMD_SIZE, swapper_pg_dir); } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READ, + [VM_WRITE] = PAGE_READ, + [VM_WRITE | VM_READ] = PAGE_READ, + [VM_EXEC] = PAGE_READ, + [VM_EXEC | VM_READ] = PAGE_READ, + [VM_EXEC | VM_WRITE] = PAGE_READ, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_READ, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READ, + [VM_SHARED | VM_WRITE] = PAGE_WRITE, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_WRITE, + [VM_SHARED | VM_EXEC] = PAGE_READ, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READ, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_WRITE, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_WRITE +}; +DECLARE_VM_GET_PAGE_PROT From fd5d210fa66beeae6ee296eb06d55c1c6faea912 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:52 +0530 Subject: [PATCH 190/282] s390/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-19-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/s390/Kconfig | 1 + arch/s390/include/asm/pgtable.h | 17 ----------------- arch/s390/mm/mmap.c | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 91c0b80a8bf0..c4481377ca83 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -81,6 +81,7 @@ config S390 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_VDSO_DATA + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK select ARCH_INLINE_READ_LOCK_BH diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index a397b072a580..c63a05b5368a 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -424,23 +424,6 @@ static inline int is_module_addr(void *addr) * implies read permission. */ /*xwr*/ -#define __P000 PAGE_NONE -#define __P001 PAGE_RO -#define __P010 PAGE_RO -#define __P011 PAGE_RO -#define __P100 PAGE_RX -#define __P101 PAGE_RX -#define __P110 PAGE_RX -#define __P111 PAGE_RX - -#define __S000 PAGE_NONE -#define __S001 PAGE_RO -#define __S010 PAGE_RW -#define __S011 PAGE_RW -#define __S100 PAGE_RX -#define __S101 PAGE_RX -#define __S110 PAGE_RWX -#define __S111 PAGE_RWX /* * Segment entry (large page) protection definitions. 
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index d545f5c39f7e..5980ce348832 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -188,3 +188,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_RO, + [VM_WRITE] = PAGE_RO, + [VM_WRITE | VM_READ] = PAGE_RO, + [VM_EXEC] = PAGE_RX, + [VM_EXEC | VM_READ] = PAGE_RX, + [VM_EXEC | VM_WRITE] = PAGE_RX, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_RO, + [VM_SHARED | VM_WRITE] = PAGE_RW, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW, + [VM_SHARED | VM_EXEC] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX +}; +DECLARE_VM_GET_PAGE_PROT From c619b82c55b730191e19f13e119603c702e11f0f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:53 +0530 Subject: [PATCH 191/282] ia64/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-20-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/ia64/Kconfig | 1 + arch/ia64/include/asm/pgtable.h | 18 ------------------ arch/ia64/mm/init.c | 28 +++++++++++++++++++++++++++- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index cb93769a9f2a..0510a5737711 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -12,6 +12,7 @@ config IA64 select ARCH_HAS_DMA_MARK_CLEAN select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ACPI diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 7aa8f2330fb1..6925e28ae61d 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -161,24 +161,6 @@ * attempts to write to the page. 
*/ /* xwr */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_READONLY /* write to priv pg -> copy & make writable */ -#define __P011 PAGE_READONLY /* ditto */ -#define __P100 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_X_RX) -#define __P101 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX) -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED /* we don't have (and don't need) write-only */ -#define __S011 PAGE_SHARED -#define __S100 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_X_RX) -#define __S101 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX) -#define __S110 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RWX) -#define __S111 __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RWX) - #define pgd_ERROR(e) printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) #if CONFIG_PGTABLE_LEVELS == 4 #define pud_ERROR(e) printk("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e)) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 855d949d81df..fc4e4217e87f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -273,7 +273,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; + gate_vma.vm_page_prot = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX); return 0; } @@ -490,3 +490,29 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) __remove_pages(start_pfn, nr_pages, altmap); } #endif + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_READONLY, + [VM_WRITE | VM_READ] = PAGE_READONLY, + [VM_EXEC] = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | + _PAGE_AR_X_RX), + [VM_EXEC | VM_READ] = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | + _PAGE_AR_RX), + [VM_EXEC | VM_WRITE] = 
PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_EXEC, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | + _PAGE_AR_X_RX), + [VM_SHARED | VM_EXEC | VM_READ] = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | + _PAGE_AR_RX), + [VM_SHARED | VM_EXEC | VM_WRITE] = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | + _PAGE_AR_RWX), + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __pgprot(__ACCESS_BITS | _PAGE_PL_3 | + _PAGE_AR_RWX) +}; +DECLARE_VM_GET_PAGE_PROT From 499c1dd92ea1087b98924ebe88cbf48ac0198775 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:54 +0530 Subject: [PATCH 192/282] mips/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-21-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Thomas Bogendoerfer Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/mips/Kconfig | 1 + arch/mips/include/asm/pgtable.h | 22 ---------------------- arch/mips/mm/cache.c | 3 +++ 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index db09d45d59ec..d0b7eb11ec81 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -14,6 +14,7 @@ config MIPS select ARCH_HAS_STRNLEN_USER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_KEEP_MEMBLOCK select ARCH_SUPPORTS_UPROBES diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 374c6322775d..6caec386ad2f 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -41,28 +41,6 @@ struct vm_area_struct; * by reasonable means.. 
*/ -/* - * Dummy values to fill the table in mmap.c - * The real values will be generated at runtime - */ -#define __P000 __pgprot(0) -#define __P001 __pgprot(0) -#define __P010 __pgprot(0) -#define __P011 __pgprot(0) -#define __P100 __pgprot(0) -#define __P101 __pgprot(0) -#define __P110 __pgprot(0) -#define __P111 __pgprot(0) - -#define __S000 __pgprot(0) -#define __S001 __pgprot(0) -#define __S010 __pgprot(0) -#define __S011 __pgprot(0) -#define __S100 __pgprot(0) -#define __S101 __pgprot(0) -#define __S110 __pgprot(0) -#define __S111 __pgprot(0) - extern unsigned long _page_cachable_default; extern void __update_cache(unsigned long address, pte_t pte); diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 7be7240f7703..11b3e7ddafd5 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -159,6 +159,9 @@ EXPORT_SYMBOL(_page_cachable_default); #define PM(p) __pgprot(_page_cachable_default | (p)) +static pgprot_t protection_map[16] __ro_after_init; +DECLARE_VM_GET_PAGE_PROT + static inline void setup_protection_map(void) { protection_map[0] = PM(_PAGE_PRESENT | _PAGE_NO_EXEC | _PAGE_NO_READ); From 6d0b92254510cd7e12358028e85f99f0453006fa Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:55 +0530 Subject: [PATCH 193/282] m68k/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-22-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Cc: Thomas Bogendoerfer Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/m68k/Kconfig | 1 + arch/m68k/include/asm/mcf_pgtable.h | 59 ------------------------ arch/m68k/include/asm/motorola_pgtable.h | 29 ------------ arch/m68k/include/asm/sun3_pgtable.h | 23 --------- arch/m68k/mm/mcfmmu.c | 55 ++++++++++++++++++++++ arch/m68k/mm/motorola.c | 29 ++++++++++++ arch/m68k/mm/sun3mmu.c | 20 ++++++++ 7 files changed, 105 insertions(+), 111 deletions(-) diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 936cce42ae9a..49aa0cf13e96 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -7,6 +7,7 @@ config M68K select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS select ARCH_MIGHT_HAVE_PC_PARPORT if ISA select ARCH_NO_PREEMPT if !COLDFIRE diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index 94f38d76e278..b619b22823f8 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -86,65 +86,6 @@ | CF_PAGE_READABLE \ | CF_PAGE_DIRTY) -/* - * Page protections for initialising protection_map. See mm/mmap.c - * for use. In general, the bit positions are xwr, and P-items are - * private, the S-items are shared. 
- */ -#define __P000 PAGE_NONE -#define __P001 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_READABLE) -#define __P010 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_WRITABLE) -#define __P011 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_READABLE \ - | CF_PAGE_WRITABLE) -#define __P100 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_EXEC) -#define __P101 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_READABLE \ - | CF_PAGE_EXEC) -#define __P110 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_WRITABLE \ - | CF_PAGE_EXEC) -#define __P111 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_READABLE \ - | CF_PAGE_WRITABLE \ - | CF_PAGE_EXEC) - -#define __S000 PAGE_NONE -#define __S001 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_READABLE) -#define __S010 PAGE_SHARED -#define __S011 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_SHARED \ - | CF_PAGE_READABLE) -#define __S100 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_EXEC) -#define __S101 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_READABLE \ - | CF_PAGE_EXEC) -#define __S110 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_SHARED \ - | CF_PAGE_EXEC) -#define __S111 __pgprot(CF_PAGE_VALID \ - | CF_PAGE_ACCESSED \ - | CF_PAGE_SHARED \ - | CF_PAGE_READABLE \ - | CF_PAGE_EXEC) - #define PTE_MASK PAGE_MASK #define CF_PAGE_CHG_MASK (PTE_MASK | CF_PAGE_ACCESSED | CF_PAGE_DIRTY) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 7c9b56e2a750..7ac3d64c6b33 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -76,35 +76,6 @@ extern unsigned long mm_cachebits; #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_RONLY | _PAGE_ACCESSED | mm_cachebits) #define PAGE_KERNEL __pgprot(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | mm_cachebits) -/* Alternate definitions that are 
compile time constants, for - initializing protection_map. The cachebits are fixed later. */ -#define PAGE_NONE_C __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) -#define PAGE_SHARED_C __pgprot(_PAGE_PRESENT | _PAGE_ACCESSED) -#define PAGE_COPY_C __pgprot(_PAGE_PRESENT | _PAGE_RONLY | _PAGE_ACCESSED) -#define PAGE_READONLY_C __pgprot(_PAGE_PRESENT | _PAGE_RONLY | _PAGE_ACCESSED) - -/* - * The m68k can't do page protection for execute, and considers that the same are read. - * Also, write permissions imply read permissions. This is the closest we can get.. - */ -#define __P000 PAGE_NONE_C -#define __P001 PAGE_READONLY_C -#define __P010 PAGE_COPY_C -#define __P011 PAGE_COPY_C -#define __P100 PAGE_READONLY_C -#define __P101 PAGE_READONLY_C -#define __P110 PAGE_COPY_C -#define __P111 PAGE_COPY_C - -#define __S000 PAGE_NONE_C -#define __S001 PAGE_READONLY_C -#define __S010 PAGE_SHARED_C -#define __S011 PAGE_SHARED_C -#define __S100 PAGE_READONLY_C -#define __S101 PAGE_READONLY_C -#define __S110 PAGE_SHARED_C -#define __S111 PAGE_SHARED_C - #define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd)) /* diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index 5e4e753f0d24..90d57e537eb1 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -66,29 +66,6 @@ | SUN3_PAGE_SYSTEM \ | SUN3_PAGE_NOCACHE) -/* - * Page protections for initialising protection_map. The sun3 has only two - * protection settings, valid (implying read and execute) and writeable. These - * are as close as we can get... 
- */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY -#define __P101 PAGE_READONLY -#define __P110 PAGE_COPY -#define __P111 PAGE_COPY - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY -#define __S101 PAGE_READONLY -#define __S110 PAGE_SHARED -#define __S111 PAGE_SHARED - /* Use these fake page-protections on PMDs. */ #define SUN3_PMD_VALID (0x00000001) #define SUN3_PMD_MASK (0x0000003F) diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 6f1f25125294..70aa0979e027 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -234,3 +234,58 @@ void steal_context(void) destroy_context(mm); } +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_READABLE), + [VM_WRITE] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_WRITABLE), + [VM_WRITE | VM_READ] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_READABLE | + CF_PAGE_WRITABLE), + [VM_EXEC] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_EXEC), + [VM_EXEC | VM_READ] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_READABLE | + CF_PAGE_EXEC), + [VM_EXEC | VM_WRITE] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_WRITABLE | + CF_PAGE_EXEC), + [VM_EXEC | VM_WRITE | VM_READ] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_READABLE | + CF_PAGE_WRITABLE | + CF_PAGE_EXEC), + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_READABLE), + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_READABLE | + CF_PAGE_SHARED), + [VM_SHARED | VM_EXEC] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_EXEC), + [VM_SHARED | VM_EXEC | VM_READ] = 
__pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_READABLE | + CF_PAGE_EXEC), + [VM_SHARED | VM_EXEC | VM_WRITE] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_SHARED | + CF_PAGE_EXEC), + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __pgprot(CF_PAGE_VALID | + CF_PAGE_ACCESSED | + CF_PAGE_READABLE | + CF_PAGE_SHARED | + CF_PAGE_EXEC) +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index df7f797c908a..2a375637e007 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -382,6 +382,35 @@ static void __init map_node(int node) #endif } +/* + * Alternate definitions that are compile time constants, for + * initializing protection_map. The cachebits are fixed later. + */ +#define PAGE_NONE_C __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) +#define PAGE_SHARED_C __pgprot(_PAGE_PRESENT | _PAGE_ACCESSED) +#define PAGE_COPY_C __pgprot(_PAGE_PRESENT | _PAGE_RONLY | _PAGE_ACCESSED) +#define PAGE_READONLY_C __pgprot(_PAGE_PRESENT | _PAGE_RONLY | _PAGE_ACCESSED) + +static pgprot_t protection_map[16] __ro_after_init = { + [VM_NONE] = PAGE_NONE_C, + [VM_READ] = PAGE_READONLY_C, + [VM_WRITE] = PAGE_COPY_C, + [VM_WRITE | VM_READ] = PAGE_COPY_C, + [VM_EXEC] = PAGE_READONLY_C, + [VM_EXEC | VM_READ] = PAGE_READONLY_C, + [VM_EXEC | VM_WRITE] = PAGE_COPY_C, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_C, + [VM_SHARED] = PAGE_NONE_C, + [VM_SHARED | VM_READ] = PAGE_READONLY_C, + [VM_SHARED | VM_WRITE] = PAGE_SHARED_C, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED_C, + [VM_SHARED | VM_EXEC] = PAGE_READONLY_C, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_C, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_C, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_C +}; +DECLARE_VM_GET_PAGE_PROT + /* * paging_init() continues the virtual memory environment setup which * was begun by the code in arch/head.S. 
diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index dad494224497..b619d0d4319c 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -95,3 +95,23 @@ void __init paging_init(void) } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY, + [VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED +}; +DECLARE_VM_GET_PAGE_PROT From 5d260625b1f2affc00c596a2c8d6314865060b11 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:56 +0530 Subject: [PATCH 194/282] arc/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-23-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Vineet Gupta Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arc/Kconfig | 1 + arch/arc/include/asm/pgtable-bits-arcv2.h | 18 ------------------ arch/arc/mm/mmap.c | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 9e3653253ef2..8be56a5d8a9b 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -13,6 +13,7 @@ config ARC select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC select ARCH_32BIT_OFF_T select BUILDTIME_TABLE_SORT diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 183d23bc1e00..b23be557403e 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -72,24 +72,6 @@ * This is to enable COW mechanism */ /* xwr */ -#define __P000 PAGE_U_NONE -#define __P001 PAGE_U_R -#define __P010 PAGE_U_R /* Pvt-W => !W */ -#define __P011 PAGE_U_R /* Pvt-W => !W */ -#define __P100 PAGE_U_X_R /* X => R */ -#define __P101 PAGE_U_X_R -#define __P110 PAGE_U_X_R /* Pvt-W => !W and X => R */ -#define __P111 PAGE_U_X_R /* Pvt-W => !W */ - -#define __S000 PAGE_U_NONE -#define __S001 PAGE_U_R -#define __S010 PAGE_U_W_R /* W => R */ -#define __S011 PAGE_U_W_R -#define __S100 PAGE_U_X_R /* X => R */ -#define __S101 PAGE_U_X_R -#define __S110 PAGE_U_X_W_R /* X => R */ -#define __S111 PAGE_U_X_W_R - #ifndef __ASSEMBLY__ #define pte_write(pte) (pte_val(pte) & _PAGE_WRITE) diff --git a/arch/arc/mm/mmap.c b/arch/arc/mm/mmap.c index 
722d26b94307..fce5fa2b4f52 100644 --- a/arch/arc/mm/mmap.c +++ b/arch/arc/mm/mmap.c @@ -74,3 +74,23 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.align_offset = pgoff << PAGE_SHIFT; return vm_unmapped_area(&info); } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_U_NONE, + [VM_READ] = PAGE_U_R, + [VM_WRITE] = PAGE_U_R, + [VM_WRITE | VM_READ] = PAGE_U_R, + [VM_EXEC] = PAGE_U_X_R, + [VM_EXEC | VM_READ] = PAGE_U_X_R, + [VM_EXEC | VM_WRITE] = PAGE_U_X_R, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_U_X_R, + [VM_SHARED] = PAGE_U_NONE, + [VM_SHARED | VM_READ] = PAGE_U_R, + [VM_SHARED | VM_WRITE] = PAGE_U_W_R, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_U_W_R, + [VM_SHARED | VM_EXEC] = PAGE_U_X_R, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_U_X_R, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_U_X_W_R, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_U_X_W_R +}; +DECLARE_VM_GET_PAGE_PROT From ca26f936f51b8c9219ede32b1a1f76c4924897aa Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:57 +0530 Subject: [PATCH 195/282] arm/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-24-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Russell King Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm/Kconfig | 1 + arch/arm/include/asm/pgtable.h | 17 ----------------- arch/arm/lib/uaccess_with_memcpy.c | 2 +- arch/arm/mm/mmu.c | 20 ++++++++++++++++++++ 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 7630ba9cb6cc..e153b6d4fc5b 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -24,6 +24,7 @@ config ARM select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB || !MMU select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if CPU_V7 || CPU_V7M || CPU_V6K select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index cd1f84bb40ae..78a532068fec 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -137,23 +137,6 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, * 2) If we could do execute protection, then read is implied * 3) write implies read permissions */ -#define __P000 __PAGE_NONE -#define __P001 __PAGE_READONLY -#define __P010 __PAGE_COPY -#define __P011 __PAGE_COPY -#define __P100 __PAGE_READONLY_EXEC -#define __P101 __PAGE_READONLY_EXEC -#define __P110 __PAGE_COPY_EXEC -#define __P111 __PAGE_COPY_EXEC - -#define __S000 __PAGE_NONE -#define __S001 __PAGE_READONLY -#define __S010 __PAGE_SHARED -#define __S011 __PAGE_SHARED -#define __S100 __PAGE_READONLY_EXEC -#define __S101 __PAGE_READONLY_EXEC -#define __S110 __PAGE_SHARED_EXEC -#define __S111 __PAGE_SHARED_EXEC 
#ifndef __ASSEMBLY__ /* diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c index c30b689bec2e..14eecaaf295f 100644 --- a/arch/arm/lib/uaccess_with_memcpy.c +++ b/arch/arm/lib/uaccess_with_memcpy.c @@ -237,7 +237,7 @@ static int __init test_size_treshold(void) if (!dst_page) goto no_dst; kernel_ptr = page_address(src_page); - user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__P010)); + user_ptr = vmap(&dst_page, 1, VM_IOREMAP, __pgprot(__PAGE_COPY)); if (!user_ptr) goto no_vmap; diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 5e2be37a198e..2722abddd725 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -405,6 +405,26 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE); } +static pgprot_t protection_map[16] __ro_after_init = { + [VM_NONE] = __PAGE_NONE, + [VM_READ] = __PAGE_READONLY, + [VM_WRITE] = __PAGE_COPY, + [VM_WRITE | VM_READ] = __PAGE_COPY, + [VM_EXEC] = __PAGE_READONLY_EXEC, + [VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC, + [VM_EXEC | VM_WRITE] = __PAGE_COPY_EXEC, + [VM_EXEC | VM_WRITE | VM_READ] = __PAGE_COPY_EXEC, + [VM_SHARED] = __PAGE_NONE, + [VM_SHARED | VM_READ] = __PAGE_READONLY, + [VM_SHARED | VM_WRITE] = __PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = __PAGE_SHARED, + [VM_SHARED | VM_EXEC] = __PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_READ] = __PAGE_READONLY_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE] = __PAGE_SHARED_EXEC, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __PAGE_SHARED_EXEC +}; +DECLARE_VM_GET_PAGE_PROT + /* * Adjust the PMD section entries according to the CPU in use. 
*/ From 91a8da021c4d767704086386b08bf276f926ebfd Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:58 +0530 Subject: [PATCH 196/282] um/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-25-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Jeff Dike Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/um/Kconfig | 1 + arch/um/include/asm/pgtable.h | 17 ----------------- arch/um/kernel/mem.c | 20 ++++++++++++++++++++ arch/x86/um/mem_32.c | 2 +- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 4ec22e156a2e..7fb43654e5b5 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -10,6 +10,7 @@ config UML select ARCH_HAS_KCOV select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_NO_PREEMPT select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 167e236d9bb8..66bc3f99d9be 100644 --- 
a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -68,23 +68,6 @@ extern unsigned long end_iomem; * Also, write permissions imply read permissions. This is the closest we can * get.. */ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_READONLY -#define __P101 PAGE_READONLY -#define __P110 PAGE_COPY -#define __P111 PAGE_COPY - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_SHARED -#define __S011 PAGE_SHARED -#define __S100 PAGE_READONLY -#define __S101 PAGE_READONLY -#define __S110 PAGE_SHARED -#define __S111 PAGE_SHARED /* * ZERO_PAGE is a global shared page that is always zero: used diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 15295c3237a0..5b259f0a1f94 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -197,3 +197,23 @@ void *uml_kmalloc(int size, int flags) { return kmalloc(size, flags); } + +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_READONLY, + [VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED +}; +DECLARE_VM_GET_PAGE_PROT diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index 19c5dbd46770..cafd01f730da 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -17,7 +17,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | 
VM_EXEC | VM_MAYEXEC; - gate_vma.vm_page_prot = __P101; + gate_vma.vm_page_prot = PAGE_READONLY; return 0; } From 34516fd83fa122e54a261b74e419c16aad5317f0 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:35:59 +0530 Subject: [PATCH 197/282] sh/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This enables ARCH_HAS_VM_GET_PAGE_PROT on the platform and exports standard vm_get_page_prot() implementation via DECLARE_VM_GET_PAGE_PROT, which looks up a private and static protection_map[] array. Subsequently all __SXXX and __PXXX macros can be dropped which are no longer needed. Link: https://lkml.kernel.org/r/20220711070600.2378316-26-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Yoshinori Sato Cc: Rich Felker Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sh/Kconfig | 1 + arch/sh/include/asm/pgtable.h | 17 ----------------- arch/sh/mm/mmap.c | 20 ++++++++++++++++++++ 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 5f220e903e5a..91f3ea325388 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -12,6 +12,7 @@ config SUPERH select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HIBERNATION_POSSIBLE if MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index d7ddb1ec86a0..6fb9ec54cf9b 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -89,23 +89,6 @@ static inline unsigned long phys_addr_mask(void) * completely separate permission bits for user and kernel space. 
*/ /*xwr*/ -#define __P000 PAGE_NONE -#define __P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY -#define __P100 PAGE_EXECREAD -#define __P101 PAGE_EXECREAD -#define __P110 PAGE_COPY -#define __P111 PAGE_COPY - -#define __S000 PAGE_NONE -#define __S001 PAGE_READONLY -#define __S010 PAGE_WRITEONLY -#define __S011 PAGE_SHARED -#define __S100 PAGE_EXECREAD -#define __S101 PAGE_EXECREAD -#define __S110 PAGE_RWX -#define __S111 PAGE_RWX typedef pte_t *pte_addr_t; diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c index 6a1a1297baae..b82199878b45 100644 --- a/arch/sh/mm/mmap.c +++ b/arch/sh/mm/mmap.c @@ -19,6 +19,26 @@ unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ EXPORT_SYMBOL(shm_align_mask); #ifdef CONFIG_MMU +static const pgprot_t protection_map[16] = { + [VM_NONE] = PAGE_NONE, + [VM_READ] = PAGE_READONLY, + [VM_WRITE] = PAGE_COPY, + [VM_WRITE | VM_READ] = PAGE_COPY, + [VM_EXEC] = PAGE_EXECREAD, + [VM_EXEC | VM_READ] = PAGE_EXECREAD, + [VM_EXEC | VM_WRITE] = PAGE_COPY, + [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY, + [VM_SHARED] = PAGE_NONE, + [VM_SHARED | VM_READ] = PAGE_READONLY, + [VM_SHARED | VM_WRITE] = PAGE_WRITEONLY, + [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, + [VM_SHARED | VM_EXEC] = PAGE_EXECREAD, + [VM_SHARED | VM_EXEC | VM_READ] = PAGE_EXECREAD, + [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX, + [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX +}; +DECLARE_VM_GET_PAGE_PROT + /* * To avoid cache aliases, we map the shared page with same color. */ From 3d923c5f1e21ad491acd4c0d62bf2481ce94016c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 11 Jul 2022 12:36:00 +0530 Subject: [PATCH 198/282] mm/mmap: drop ARCH_HAS_VM_GET_PAGE_PROT Now all the platforms enable ARCH_HAS_GET_PAGE_PROT. They define and export own vm_get_page_prot() whether custom or standard DECLARE_VM_GET_PAGE_PROT. Hence there is no need for default generic fallback for vm_get_page_prot(). 
Just drop this fallback and also ARCH_HAS_GET_PAGE_PROT mechanism. Link: https://lkml.kernel.org/r/20220711070600.2378316-27-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Geert Uytterhoeven Reviewed-by: Christoph Hellwig Reviewed-by: Christophe Leroy Acked-by: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Chris Zankel Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Guo Ren Cc: Heiko Carstens Cc: Huacai Chen Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Jeff Dike Cc: Jonas Bonn Cc: Michael Ellerman Cc: Michal Simek Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Richard Henderson Cc: Rich Felker Cc: Russell King Cc: Sam Ravnborg Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/csky/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/loongarch/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/um/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - include/linux/mm.h | 3 --- mm/Kconfig | 3 --- mm/mmap.c | 22 ---------------------- 25 files changed, 50 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index db1c8b329461..7d0d26b5b3f5 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -2,7 +2,6 @@ config ALPHA bool default y - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_32BIT_USTAT_F_TINODE select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 8be56a5d8a9b..9e3653253ef2 
100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -13,7 +13,6 @@ config ARC select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC select ARCH_32BIT_OFF_T select BUILDTIME_TABLE_SORT diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e153b6d4fc5b..7630ba9cb6cc 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -24,7 +24,6 @@ config ARM select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB || !MMU select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if CPU_V7 || CPU_V7M || CPU_V6K select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1652a9800ebe..7030bf3f8d6f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -45,7 +45,6 @@ config ARM64 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 588b8a9c68ed..21d72b078eef 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -6,7 +6,6 @@ config CSKY select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_WANT_FRAME_POINTERS if !CPU_CK610 && $(cc-option,-mbacktrace) diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index bc4ceecd0588..54eadf265178 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -6,7 +6,6 @@ config HEXAGON def_bool y select ARCH_32BIT_OFF_T select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select 
ARCH_NO_PREEMPT select DMA_GLOBAL_POOL # Other pending projects/to-do items. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 0510a5737711..cb93769a9f2a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -12,7 +12,6 @@ config IA64 select ARCH_HAS_DMA_MARK_CLEAN select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ACPI diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index adf8cf6ec5d5..db2838cf8c02 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -9,7 +9,6 @@ config LOONGARCH select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PTE_SPECIAL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_INLINE_READ_LOCK if !PREEMPTION select ARCH_INLINE_READ_LOCK_BH if !PREEMPTION diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 49aa0cf13e96..936cce42ae9a 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -7,7 +7,6 @@ config M68K select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS select ARCH_MIGHT_HAVE_PC_PARPORT if ISA select ARCH_NO_PREEMPT if !COLDFIRE diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 15f91ba8a0c4..8cf429ad1c84 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -7,7 +7,6 @@ config MICROBLAZE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_TABLE_SORT diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index d0b7eb11ec81..db09d45d59ec 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -14,7 +14,6 @@ 
config MIPS select ARCH_HAS_STRNLEN_USER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_KEEP_MEMBLOCK select ARCH_SUPPORTS_UPROBES diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index e0459dffd218..4167f1eb4cd8 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -6,7 +6,6 @@ config NIOS2 select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE select ARCH_HAS_DMA_SET_UNCACHED - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_NO_SWAP select COMMON_CLK select TIMER_OF diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index fe0dfb50eb86..e814df4c483c 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -10,7 +10,6 @@ config OPENRISC select ARCH_HAS_DMA_SET_UNCACHED select ARCH_HAS_DMA_CLEAR_UNCACHED select ARCH_HAS_SYNC_DMA_FOR_DEVICE - select ARCH_HAS_VM_GET_PAGE_PROT select COMMON_CLK select OF select OF_EARLY_FLATTREE diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 891d82393957..fa400055b2d5 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -12,7 +12,6 @@ config PARISC select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_PTE_SPECIAL select ARCH_NO_SG_CHAIN select ARCH_SUPPORTS_HUGETLBFS if PA20 diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1035d172c7dd..250b8658b2d4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -140,7 +140,6 @@ config PPC select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UACCESS_FLUSHCACHE select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 583389d4e43a..32ffef9f6e5b 100644 --- a/arch/riscv/Kconfig +++ 
b/arch/riscv/Kconfig @@ -32,7 +32,6 @@ config RISCV select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT select ARCH_STACKWALK diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c4481377ca83..91c0b80a8bf0 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -81,7 +81,6 @@ config S390 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_VDSO_DATA - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK select ARCH_INLINE_READ_LOCK_BH diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 91f3ea325388..5f220e903e5a 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -12,7 +12,6 @@ config SUPERH select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HIBERNATION_POSSIBLE if MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 09f868613a4d..9c1cce74953a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -13,7 +13,6 @@ config 64BIT config SPARC bool default y - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_MIGHT_HAVE_PC_PARPORT if SPARC64 && PCI select ARCH_MIGHT_HAVE_PC_SERIO select DMA_OPS diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 7fb43654e5b5..4ec22e156a2e 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -10,7 +10,6 @@ config UML select ARCH_HAS_KCOV select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_NO_PREEMPT select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index be0b95e51df6..841e4843d0c4 100644 --- 
a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -94,7 +94,6 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 4c0d83520ff1..0b0f0172cced 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -11,7 +11,6 @@ config XTENSA select ARCH_HAS_DMA_SET_UNCACHED if MMU select ARCH_HAS_STRNCPY_FROM_USER if !KASAN select ARCH_HAS_STRNLEN_USER - select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/include/linux/mm.h b/include/linux/mm.h index 4b4dc93f9bc3..61e3101c44ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -425,9 +425,6 @@ extern unsigned int kobjsize(const void *objp); * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ -#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT -extern pgprot_t protection_map[16]; -#endif /* * The default fault flags that should be used by most of the diff --git a/mm/Kconfig b/mm/Kconfig index c1fa4993a56f..56ca0e7c6f9a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -951,9 +951,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER register alias named "current_stack_pointer", this config can be selected. 
-config ARCH_HAS_VM_GET_PAGE_PROT - bool - config ARCH_HAS_PTE_DEVMAP bool diff --git a/mm/mmap.c b/mm/mmap.c index 2a58a9cd0752..edf27a2789a2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -81,28 +81,6 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); -#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT -pgprot_t protection_map[16] __ro_after_init = { - [VM_NONE] = __P000, - [VM_READ] = __P001, - [VM_WRITE] = __P010, - [VM_WRITE | VM_READ] = __P011, - [VM_EXEC] = __P100, - [VM_EXEC | VM_READ] = __P101, - [VM_EXEC | VM_WRITE] = __P110, - [VM_EXEC | VM_WRITE | VM_READ] = __P111, - [VM_SHARED] = __S000, - [VM_SHARED | VM_READ] = __S001, - [VM_SHARED | VM_WRITE] = __S010, - [VM_SHARED | VM_WRITE | VM_READ] = __S011, - [VM_SHARED | VM_EXEC] = __S100, - [VM_SHARED | VM_EXEC | VM_READ] = __S101, - [VM_SHARED | VM_EXEC | VM_WRITE] = __S110, - [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111 -}; -DECLARE_VM_GET_PAGE_PROT -#endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */ - static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); From a008a3004340887370aea38b5cd441b1db110041 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:11:50 +0300 Subject: [PATCH 199/282] csky: drop definition of PTE_ORDER Patch series "arch: make PxD_ORDER generically available", v2. The question what does PxD_ORDER define raises from time to time and there is still a conflict between MIPS and DAX definitions. Some time ago Matthew Wilcox suggested to use PMD_TABLE_ORDER to define the order of page table allocation: [1] https://lore.kernel.org/linux-arch/YPCJftSTUBEnq2lI@casper.infradead.org/ The parisc patch made it in, but mips didn't. Now mips defines from asm/include/pgtable.h were copied to loongarch which made it worse. 
Let's deal with it once and for all and rename PxD_ORDER defines to PxD_TABLE_ORDER or just drop them when the only possible order of page table is 0. This patch (of 15): This is the order of the page table allocation, not the order of a PTE. Since it's always hardwired to 0, simply drop it. Link: https://lkml.kernel.org/r/20220705154708.181258-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20220703141203.147893-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20220703141203.147893-2-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: Matthew Wilcox Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/csky/include/asm/pgtable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 229a5f4ad7fc..349e03dfb9ba 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -19,11 +19,10 @@ * C-SKY is two-level paging structure: */ #define PGD_ORDER 0 -#define PTE_ORDER 0 #define PTRS_PER_PGD ((PAGE_SIZE << PGD_ORDER) / sizeof(pgd_t)) #define PTRS_PER_PMD 1 -#define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) +#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) #define pte_ERROR(e) \ pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) From f7536442353d1b6bb0c7c8ad1ec549d9fa215106 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:11:51 +0300 Subject: [PATCH 200/282] csky: drop definition of PGD_ORDER This is the order of the page table allocation, not the order of a PGD. Since it's always hardwired to 0, simply drop it. 
Link: https://lkml.kernel.org/r/20220703141203.147893-3-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/csky/include/asm/pgalloc.h | 2 +- arch/csky/include/asm/pgtable.h | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index bbbd0698b397..7d57e5da0914 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -44,7 +44,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *ret; pgd_t *init; - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); + ret = (pgd_t *) __get_free_page(GFP_KERNEL); if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init((unsigned long *)ret); diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index 349e03dfb9ba..c3d9b92cbe61 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -18,9 +18,8 @@ /* * C-SKY is two-level paging structure: */ -#define PGD_ORDER 0 -#define PTRS_PER_PGD ((PAGE_SIZE << PGD_ORDER) / sizeof(pgd_t)) +#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) #define PTRS_PER_PMD 1 #define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) From c94b14bd1cff085df21125a3591378c5081f143a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 3 Jul 2022 17:11:52 +0300 Subject: [PATCH 201/282] mips: rename PMD_ORDER to PMD_TABLE_ORDER This is the order of the page table allocation, not the order of a PMD. While at it remove unused definition of _PMD_ORDER in asm-offsets. 
Link: https://lkml.kernel.org/r/20220703141203.147893-4-rppt@kernel.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgalloc.h | 4 ++-- arch/mips/include/asm/pgtable-32.h | 2 +- arch/mips/include/asm/pgtable-64.h | 18 +++++++++--------- arch/mips/kernel/asm-offsets.c | 3 --- 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 867e9c3db76e..0ef245cfcae9 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -67,12 +67,12 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) pmd_t *pmd; struct page *pg; - pg = alloc_pages(GFP_KERNEL_ACCOUNT, PMD_ORDER); + pg = alloc_pages(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER); if (!pg) return NULL; if (!pgtable_pmd_page_ctor(pg)) { - __free_pages(pg, PMD_ORDER); + __free_pages(pg, PMD_TABLE_ORDER); return NULL; } diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index 95df9c293d8d..8d57bd5b0b94 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -82,7 +82,7 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, #define PGD_ORDER (__PGD_ORDER >= 0 ? 
__PGD_ORDER : 0) #define PUD_ORDER aieeee_attempt_to_allocate_pud -#define PMD_ORDER aieeee_attempt_to_allocate_pmd +#define PMD_TABLE_ORDER aieeee_attempt_to_allocate_pmd #define PTE_ORDER 0 #define PTRS_PER_PGD (USER_PTRS_PER_PGD * 2) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 41921acdc9d8..ae0d5a09064d 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -51,12 +51,12 @@ #define PMD_MASK (~(PMD_SIZE-1)) # ifdef __PAGETABLE_PUD_FOLDED -# define PGDIR_SHIFT (PMD_SHIFT + (PAGE_SHIFT + PMD_ORDER - 3)) +# define PGDIR_SHIFT (PMD_SHIFT + (PAGE_SHIFT + PMD_TABLE_ORDER - 3)) # endif #endif #ifndef __PAGETABLE_PUD_FOLDED -#define PUD_SHIFT (PMD_SHIFT + (PAGE_SHIFT + PMD_ORDER - 3)) +#define PUD_SHIFT (PMD_SHIFT + (PAGE_SHIFT + PMD_TABLE_ORDER - 3)) #define PUD_SIZE (1UL << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) #define PGDIR_SHIFT (PUD_SHIFT + (PAGE_SHIFT + PUD_ORDER - 3)) @@ -91,13 +91,13 @@ # define PGD_ORDER 1 # define PUD_ORDER aieeee_attempt_to_allocate_pud # endif -#define PMD_ORDER 0 +#define PMD_TABLE_ORDER 0 #define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_8KB #define PGD_ORDER 0 #define PUD_ORDER aieeee_attempt_to_allocate_pud -#define PMD_ORDER 0 +#define PMD_TABLE_ORDER 0 #define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_16KB @@ -107,22 +107,22 @@ #define PGD_ORDER 0 #endif #define PUD_ORDER aieeee_attempt_to_allocate_pud -#define PMD_ORDER 0 +#define PMD_TABLE_ORDER 0 #define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_32KB #define PGD_ORDER 0 #define PUD_ORDER aieeee_attempt_to_allocate_pud -#define PMD_ORDER 0 +#define PMD_TABLE_ORDER 0 #define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_64KB #define PGD_ORDER 0 #define PUD_ORDER aieeee_attempt_to_allocate_pud #ifdef CONFIG_MIPS_VA_BITS_48 -#define PMD_ORDER 0 +#define PMD_TABLE_ORDER 0 #else -#define PMD_ORDER aieeee_attempt_to_allocate_pmd +#define PMD_TABLE_ORDER aieeee_attempt_to_allocate_pmd #endif #define 
PTE_ORDER 0 #endif @@ -132,7 +132,7 @@ #define PTRS_PER_PUD ((PAGE_SIZE << PUD_ORDER) / sizeof(pud_t)) #endif #ifndef __PAGETABLE_PMD_FOLDED -#define PTRS_PER_PMD ((PAGE_SIZE << PMD_ORDER) / sizeof(pmd_t)) +#define PTRS_PER_PMD ((PAGE_SIZE << PMD_TABLE_ORDER) / sizeof(pmd_t)) #endif #define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 04ca75278f02..ca7c5af7697d 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -197,9 +197,6 @@ void output_mm_defines(void) DEFINE(_PTE_T_LOG2, PTE_T_LOG2); BLANK(); DEFINE(_PGD_ORDER, PGD_ORDER); -#ifndef __PAGETABLE_PMD_FOLDED - DEFINE(_PMD_ORDER, PMD_ORDER); -#endif DEFINE(_PTE_ORDER, PTE_ORDER); BLANK(); DEFINE(_PMD_SHIFT, PMD_SHIFT); From 8e20a4decd43ddfa98c146673c3981b2260bc176 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:11:53 +0300 Subject: [PATCH 202/282] mips: rename PUD_ORDER to PUD_TABLE_ORDER This is the order of the page table allocation, not the order of a PUD. 
Link: https://lkml.kernel.org/r/20220703141203.147893-5-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgalloc.h | 2 +- arch/mips/include/asm/pgtable-32.h | 2 +- arch/mips/include/asm/pgtable-64.h | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 0ef245cfcae9..1ef8e86ae565 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -91,7 +91,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { pud_t *pud; - pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_ORDER); + pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER); if (pud) pud_init((unsigned long)pud, (unsigned long)invalid_pmd_table); return pud; diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index 8d57bd5b0b94..d9ae244a4fce 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -81,7 +81,7 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, #endif #define PGD_ORDER (__PGD_ORDER >= 0 ? 
__PGD_ORDER : 0) -#define PUD_ORDER aieeee_attempt_to_allocate_pud +#define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER aieeee_attempt_to_allocate_pmd #define PTE_ORDER 0 diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index ae0d5a09064d..7daf9a6509d8 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -59,7 +59,7 @@ #define PUD_SHIFT (PMD_SHIFT + (PAGE_SHIFT + PMD_TABLE_ORDER - 3)) #define PUD_SIZE (1UL << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) -#define PGDIR_SHIFT (PUD_SHIFT + (PAGE_SHIFT + PUD_ORDER - 3)) +#define PGDIR_SHIFT (PUD_SHIFT + (PAGE_SHIFT + PUD_TABLE_ORDER - 3)) #endif #define PGDIR_SIZE (1UL << PGDIR_SHIFT) @@ -86,17 +86,17 @@ #ifdef CONFIG_PAGE_SIZE_4KB # ifdef CONFIG_MIPS_VA_BITS_48 # define PGD_ORDER 0 -# define PUD_ORDER 0 +# define PUD_TABLE_ORDER 0 # else # define PGD_ORDER 1 -# define PUD_ORDER aieeee_attempt_to_allocate_pud +# define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud # endif #define PMD_TABLE_ORDER 0 #define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_8KB #define PGD_ORDER 0 -#define PUD_ORDER aieeee_attempt_to_allocate_pud +#define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER 0 #define PTE_ORDER 0 #endif @@ -106,19 +106,19 @@ #else #define PGD_ORDER 0 #endif -#define PUD_ORDER aieeee_attempt_to_allocate_pud +#define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER 0 #define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_32KB #define PGD_ORDER 0 -#define PUD_ORDER aieeee_attempt_to_allocate_pud +#define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER 0 #define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_64KB #define PGD_ORDER 0 -#define PUD_ORDER aieeee_attempt_to_allocate_pud +#define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #ifdef CONFIG_MIPS_VA_BITS_48 #define PMD_TABLE_ORDER 0 #else @@ -129,7 +129,7 @@ #define PTRS_PER_PGD ((PAGE_SIZE << PGD_ORDER) 
/ sizeof(pgd_t)) #ifndef __PAGETABLE_PUD_FOLDED -#define PTRS_PER_PUD ((PAGE_SIZE << PUD_ORDER) / sizeof(pud_t)) +#define PTRS_PER_PUD ((PAGE_SIZE << PUD_TABLE_ORDER) / sizeof(pud_t)) #endif #ifndef __PAGETABLE_PMD_FOLDED #define PTRS_PER_PMD ((PAGE_SIZE << PMD_TABLE_ORDER) / sizeof(pmd_t)) From 6963c72d9046eab0f023e1d91960550e764ad7b9 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:11:54 +0300 Subject: [PATCH 203/282] mips: drop definitions of PTE_ORDER This is the order of the page table allocation, not the order of a PTE. Since it's always hardwired to 0, simply drop it. Link: https://lkml.kernel.org/r/20220703141203.147893-6-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgtable-32.h | 9 ++++----- arch/mips/include/asm/pgtable-64.h | 15 +++++---------- arch/mips/kernel/asm-offsets.c | 1 - arch/mips/mm/tlbex.c | 2 +- 4 files changed, 10 insertions(+), 17 deletions(-) diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index d9ae244a4fce..35bd519a1078 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -62,9 +62,9 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, /* PGDIR_SHIFT determines what a third-level page table entry can map */ #if defined(CONFIG_MIPS_HUGE_TLB_SUPPORT) && !defined(CONFIG_PHYS_ADDR_T_64BIT) -# define PGDIR_SHIFT (2 * PAGE_SHIFT + PTE_ORDER - PTE_T_LOG2 - 1) +# define PGDIR_SHIFT (2 * PAGE_SHIFT - PTE_T_LOG2 - 1) #else -# define PGDIR_SHIFT (2 * PAGE_SHIFT + PTE_ORDER - PTE_T_LOG2) +# define PGDIR_SHIFT (2 * PAGE_SHIFT - PTE_T_LOG2) #endif #define PGDIR_SIZE (1UL << PGDIR_SHIFT) @@ -83,13 +83,12 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, #define 
PGD_ORDER (__PGD_ORDER >= 0 ? __PGD_ORDER : 0) #define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER aieeee_attempt_to_allocate_pmd -#define PTE_ORDER 0 #define PTRS_PER_PGD (USER_PTRS_PER_PGD * 2) #if defined(CONFIG_MIPS_HUGE_TLB_SUPPORT) && !defined(CONFIG_PHYS_ADDR_T_64BIT) -# define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t) / 2) +# define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t) / 2) #else -# define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) +# define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) #endif #define USER_PTRS_PER_PGD (0x80000000UL/PGDIR_SIZE) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 7daf9a6509d8..dbf7e461d360 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -42,11 +42,11 @@ /* PGDIR_SHIFT determines what a third-level page table entry can map */ #ifdef __PAGETABLE_PMD_FOLDED -#define PGDIR_SHIFT (PAGE_SHIFT + PAGE_SHIFT + PTE_ORDER - 3) +#define PGDIR_SHIFT (PAGE_SHIFT + PAGE_SHIFT - 3) #else /* PMD_SHIFT determines the size of the area a second-level page table can map */ -#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT + PTE_ORDER - 3)) +#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3)) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) @@ -86,19 +86,17 @@ #ifdef CONFIG_PAGE_SIZE_4KB # ifdef CONFIG_MIPS_VA_BITS_48 # define PGD_ORDER 0 -# define PUD_TABLE_ORDER 0 +# define PUD_TABLE_ORDER 0 # else # define PGD_ORDER 1 -# define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud +# define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud # endif #define PMD_TABLE_ORDER 0 -#define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_8KB #define PGD_ORDER 0 #define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER 0 -#define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_16KB #ifdef CONFIG_MIPS_VA_BITS_48 @@ -108,13 +106,11 @@ #endif #define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define 
PMD_TABLE_ORDER 0 -#define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_32KB #define PGD_ORDER 0 #define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER 0 -#define PTE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_64KB #define PGD_ORDER 0 @@ -124,7 +120,6 @@ #else #define PMD_TABLE_ORDER aieeee_attempt_to_allocate_pmd #endif -#define PTE_ORDER 0 #endif #define PTRS_PER_PGD ((PAGE_SIZE << PGD_ORDER) / sizeof(pgd_t)) @@ -134,7 +129,7 @@ #ifndef __PAGETABLE_PMD_FOLDED #define PTRS_PER_PMD ((PAGE_SIZE << PMD_TABLE_ORDER) / sizeof(pmd_t)) #endif -#define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) +#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) #define USER_PTRS_PER_PGD ((TASK_SIZE64 / PGDIR_SIZE)?(TASK_SIZE64 / PGDIR_SIZE):1) diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index ca7c5af7697d..0c97f755e256 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -197,7 +197,6 @@ void output_mm_defines(void) DEFINE(_PTE_T_LOG2, PTE_T_LOG2); BLANK(); DEFINE(_PGD_ORDER, PGD_ORDER); - DEFINE(_PTE_ORDER, PTE_ORDER); BLANK(); DEFINE(_PMD_SHIFT, PMD_SHIFT); DEFINE(_PGDIR_SHIFT, PGDIR_SHIFT); diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 8dbbd99fc7e8..6e8e71f12fab 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -2065,7 +2065,7 @@ build_r4000_tlbchange_handler_head(u32 **p, struct uasm_label **l, UASM_i_MFC0(p, wr.r1, C0_BADVADDR); UASM_i_LW(p, wr.r2, 0, wr.r2); - UASM_i_SRL(p, wr.r1, wr.r1, PAGE_SHIFT + PTE_ORDER - PTE_T_LOG2); + UASM_i_SRL(p, wr.r1, wr.r1, PAGE_SHIFT - PTE_T_LOG2); uasm_i_andi(p, wr.r1, wr.r1, (PTRS_PER_PTE - 1) << PTE_T_LOG2); UASM_i_ADDU(p, wr.r2, wr.r2, wr.r1); From bb5af4f67a567e723ac83956167efb57687b0793 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:11:55 +0300 Subject: [PATCH 204/282] mips: rename PGD_ORDER to PGD_TABLE_ORDER This is the order of the page table allocation, not the order of a PGD. 
While at it remove unused definition of _PGD_ORDER in asm-offsets. Link: https://lkml.kernel.org/r/20220703141203.147893-7-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgalloc.h | 2 +- arch/mips/include/asm/pgtable-32.h | 6 +++--- arch/mips/include/asm/pgtable-64.h | 16 ++++++++-------- arch/mips/kernel/asm-offsets.c | 1 - arch/mips/kvm/mmu.c | 2 +- arch/mips/mm/pgtable.c | 2 +- arch/mips/mm/tlbex.c | 12 ++++++------ 7 files changed, 20 insertions(+), 21 deletions(-) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 1ef8e86ae565..796035784c73 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -51,7 +51,7 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_pages((unsigned long)pgd, PGD_ORDER); + free_pages((unsigned long)pgd, PGD_TABLE_ORDER); } #define __pte_free_tlb(tlb,pte,address) \ diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index 35bd519a1078..495c603c1a30 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -75,12 +75,12 @@ extern int add_temporary_entry(unsigned long entrylo0, unsigned long entrylo1, * we don't really have any PUD/PMD directory physically. */ #if defined(CONFIG_MIPS_HUGE_TLB_SUPPORT) && !defined(CONFIG_PHYS_ADDR_T_64BIT) -# define __PGD_ORDER (32 - 3 * PAGE_SHIFT + PGD_T_LOG2 + PTE_T_LOG2 + 1) +# define __PGD_TABLE_ORDER (32 - 3 * PAGE_SHIFT + PGD_T_LOG2 + PTE_T_LOG2 + 1) #else -# define __PGD_ORDER (32 - 3 * PAGE_SHIFT + PGD_T_LOG2 + PTE_T_LOG2) +# define __PGD_TABLE_ORDER (32 - 3 * PAGE_SHIFT + PGD_T_LOG2 + PTE_T_LOG2) #endif -#define PGD_ORDER (__PGD_ORDER >= 0 ? 
__PGD_ORDER : 0) +#define PGD_TABLE_ORDER (__PGD_TABLE_ORDER >= 0 ? __PGD_TABLE_ORDER : 0) #define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER aieeee_attempt_to_allocate_pmd diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index dbf7e461d360..a259ca4d1272 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -85,35 +85,35 @@ */ #ifdef CONFIG_PAGE_SIZE_4KB # ifdef CONFIG_MIPS_VA_BITS_48 -# define PGD_ORDER 0 +# define PGD_TABLE_ORDER 0 # define PUD_TABLE_ORDER 0 # else -# define PGD_ORDER 1 +# define PGD_TABLE_ORDER 1 # define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud # endif #define PMD_TABLE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_8KB -#define PGD_ORDER 0 +#define PGD_TABLE_ORDER 0 #define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_16KB #ifdef CONFIG_MIPS_VA_BITS_48 -#define PGD_ORDER 1 +#define PGD_TABLE_ORDER 1 #else -#define PGD_ORDER 0 +#define PGD_TABLE_ORDER 0 #endif #define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_32KB -#define PGD_ORDER 0 +#define PGD_TABLE_ORDER 0 #define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #define PMD_TABLE_ORDER 0 #endif #ifdef CONFIG_PAGE_SIZE_64KB -#define PGD_ORDER 0 +#define PGD_TABLE_ORDER 0 #define PUD_TABLE_ORDER aieeee_attempt_to_allocate_pud #ifdef CONFIG_MIPS_VA_BITS_48 #define PMD_TABLE_ORDER 0 @@ -122,7 +122,7 @@ #endif #endif -#define PTRS_PER_PGD ((PAGE_SIZE << PGD_ORDER) / sizeof(pgd_t)) +#define PTRS_PER_PGD ((PAGE_SIZE << PGD_TABLE_ORDER) / sizeof(pgd_t)) #ifndef __PAGETABLE_PUD_FOLDED #define PTRS_PER_PUD ((PAGE_SIZE << PUD_TABLE_ORDER) / sizeof(pud_t)) #endif diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index 0c97f755e256..c4501897b870 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -196,7 +196,6 @@ void 
output_mm_defines(void) #endif DEFINE(_PTE_T_LOG2, PTE_T_LOG2); BLANK(); - DEFINE(_PGD_ORDER, PGD_ORDER); BLANK(); DEFINE(_PMD_SHIFT, PMD_SHIFT); DEFINE(_PGDIR_SHIFT, PGDIR_SHIFT); diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 1bfd1b501d82..db17e870bdff 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -80,7 +80,7 @@ pgd_t *kvm_pgd_alloc(void) { pgd_t *ret; - ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD_ORDER); + ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER); if (ret) kvm_pgd_init(ret); diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c index 05560b042d82..3b7590660a04 100644 --- a/arch/mips/mm/pgtable.c +++ b/arch/mips/mm/pgtable.c @@ -12,7 +12,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret, *init; - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); + ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER); if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init((unsigned long)ret); diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 6e8e71f12fab..a57519ae96b1 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -818,7 +818,7 @@ void build_get_pmde64(u32 **p, struct uasm_label **l, struct uasm_reloc **r, * everything but the lower xuseg addresses goes down * the module_alloc/vmalloc path. 
*/ - uasm_i_dsrl_safe(p, ptr, tmp, PGDIR_SHIFT + PGD_ORDER + PAGE_SHIFT - 3); + uasm_i_dsrl_safe(p, ptr, tmp, PGDIR_SHIFT + PGD_TABLE_ORDER + PAGE_SHIFT - 3); uasm_il_bnez(p, r, ptr, label_vmalloc); } else { uasm_il_bltz(p, r, tmp, label_vmalloc); @@ -1127,7 +1127,7 @@ build_fast_tlb_refill_handler (u32 **p, struct uasm_label **l, UASM_i_SW(p, scratch, scratchpad_offset(0), 0); uasm_i_dsrl_safe(p, scratch, tmp, - PGDIR_SHIFT + PGD_ORDER + PAGE_SHIFT - 3); + PGDIR_SHIFT + PGD_TABLE_ORDER + PAGE_SHIFT - 3); uasm_il_bnez(p, r, scratch, label_vmalloc); if (pgd_reg == -1) { @@ -1493,12 +1493,12 @@ static void setup_pw(void) #endif pgd_i = PGDIR_SHIFT; /* 1st level PGD */ #ifndef __PAGETABLE_PMD_FOLDED - pgd_w = PGDIR_SHIFT - PMD_SHIFT + PGD_ORDER; + pgd_w = PGDIR_SHIFT - PMD_SHIFT + PGD_TABLE_ORDER; pmd_i = PMD_SHIFT; /* 2nd level PMD */ pmd_w = PMD_SHIFT - PAGE_SHIFT; #else - pgd_w = PGDIR_SHIFT - PAGE_SHIFT + PGD_ORDER; + pgd_w = PGDIR_SHIFT - PAGE_SHIFT + PGD_TABLE_ORDER; #endif pt_i = PAGE_SHIFT; /* 3rd level PTE */ @@ -1536,7 +1536,7 @@ static void build_loongson3_tlb_refill_handler(void) if (check_for_high_segbits) { uasm_i_dmfc0(&p, K0, C0_BADVADDR); - uasm_i_dsrl_safe(&p, K1, K0, PGDIR_SHIFT + PGD_ORDER + PAGE_SHIFT - 3); + uasm_i_dsrl_safe(&p, K1, K0, PGDIR_SHIFT + PGD_TABLE_ORDER + PAGE_SHIFT - 3); uasm_il_beqz(&p, &r, K1, label_vmalloc); uasm_i_nop(&p); @@ -2611,7 +2611,7 @@ void build_tlb_refill_handler(void) check_pabits(); #ifdef CONFIG_64BIT - check_for_high_segbits = current_cpu_data.vmbits > (PGDIR_SHIFT + PGD_ORDER + PAGE_SHIFT - 3); + check_for_high_segbits = current_cpu_data.vmbits > (PGDIR_SHIFT + PGD_TABLE_ORDER + PAGE_SHIFT - 3); #endif if (cpu_has_3kex) { From bf0dc119c51ffdcfa6db0786ea877eec85985b7e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:11:56 +0300 Subject: [PATCH 205/282] nios2: drop definition of PTE_ORDER This is the order of the page table allocation, not the order of a PTE. 
Since it's always hardwired to 0, simply drop it. Link: https://lkml.kernel.org/r/20220703141203.147893-8-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Dinh Nguyen Cc: Arnd Bergmann Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgtable.h | 3 +-- arch/nios2/mm/init.c | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 470516d4555e..1af8dbfe1793 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -53,10 +53,9 @@ struct mm_struct; #define PAGE_COPY MKP(0, 0, 1) #define PGD_ORDER 0 -#define PTE_ORDER 0 #define PTRS_PER_PGD ((PAGE_SIZE << PGD_ORDER) / sizeof(pgd_t)) -#define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) / sizeof(pte_t)) +#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) #define USER_PTRS_PER_PGD \ (CONFIG_NIOS2_KERNEL_MMU_REGION_BASE / PGDIR_SIZE) diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index ae24687d12ad..7eaba31cea98 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -80,7 +80,7 @@ void __init mmu_init(void) #define __page_aligned(order) __aligned(PAGE_SIZE << (order)) pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER); -pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned(PTE_ORDER); +pte_t invalid_pte_table[PTRS_PER_PTE] __aligned(PAGE_SIZE); static struct page *kuser_page[1]; static int alloc_kuser_page(void) From a6714e720b5e8438ca7f779342ad91b39094f548 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:11:57 +0300 Subject: [PATCH 206/282] nios2: drop definition of PGD_ORDER This is the order of the page table allocation, not the order of a PGD. Since it's always hardwired to 0, simply drop it. 
Link: https://lkml.kernel.org/r/20220703141203.147893-9-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Dinh Nguyen Cc: Arnd Bergmann Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgtable.h | 4 +--- arch/nios2/mm/init.c | 3 +-- arch/nios2/mm/pgtable.c | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 1af8dbfe1793..b3d45e815295 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -52,9 +52,7 @@ struct mm_struct; #define PAGE_COPY MKP(0, 0, 1) -#define PGD_ORDER 0 - -#define PTRS_PER_PGD ((PAGE_SIZE << PGD_ORDER) / sizeof(pgd_t)) +#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t)) #define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t)) #define USER_PTRS_PER_PGD \ diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 7eaba31cea98..7bc82ee889c9 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -78,8 +78,7 @@ void __init mmu_init(void) flush_tlb_all(); } -#define __page_aligned(order) __aligned(PAGE_SIZE << (order)) -pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER); +pgd_t swapper_pg_dir[PTRS_PER_PGD] __aligned(PAGE_SIZE); pte_t invalid_pte_table[PTRS_PER_PTE] __aligned(PAGE_SIZE); static struct page *kuser_page[1]; diff --git a/arch/nios2/mm/pgtable.c b/arch/nios2/mm/pgtable.c index 9b587fd592dd..7c76e8a7447a 100644 --- a/arch/nios2/mm/pgtable.c +++ b/arch/nios2/mm/pgtable.c @@ -54,7 +54,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret, *init; - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); + ret = (pgd_t *) __get_free_page(GFP_KERNEL); if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init(ret); From 1721b412fc3391646e9cba35e74987516f6d0fce Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:11:58 +0300 Subject: 
[PATCH 207/282] loongarch: drop definition of PTE_ORDER This is the order of the page table allocation, not the order of a PTE. Since it's always hardwired to 0, simply drop it. Link: https://lkml.kernel.org/r/20220703141203.147893-10-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgtable.h | 9 ++++----- arch/loongarch/kernel/asm-offsets.c | 1 - arch/loongarch/mm/tlbex.S | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index d9e86cfa53e2..e0bbfc31fe72 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -24,17 +24,16 @@ #define PGD_ORDER 0 #define PUD_ORDER 0 #define PMD_ORDER 0 -#define PTE_ORDER 0 #if CONFIG_PGTABLE_LEVELS == 2 -#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT + PTE_ORDER - 3)) +#define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3)) #elif CONFIG_PGTABLE_LEVELS == 3 -#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT + PTE_ORDER - 3)) +#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3)) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PGDIR_SHIFT (PMD_SHIFT + (PAGE_SHIFT + PMD_ORDER - 3)) #elif CONFIG_PGTABLE_LEVELS == 4 -#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT + PTE_ORDER - 3)) +#define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3)) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) #define PUD_SHIFT (PMD_SHIFT + (PAGE_SHIFT + PMD_ORDER - 3)) @@ -55,7 +54,7 @@ #if CONFIG_PGTABLE_LEVELS > 2 #define PTRS_PER_PMD ((PAGE_SIZE << PMD_ORDER) >> 3) #endif -#define PTRS_PER_PTE ((PAGE_SIZE << PTE_ORDER) >> 3) +#define PTRS_PER_PTE (PAGE_SIZE >> 3) #define USER_PTRS_PER_PGD ((TASK_SIZE64 / PGDIR_SIZE)?(TASK_SIZE64 / PGDIR_SIZE):1) diff --git 
a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index bfb65eb2844f..1a1166a7e61c 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -194,7 +194,6 @@ void output_mm_defines(void) #ifndef __PAGETABLE_PMD_FOLDED DEFINE(_PMD_ORDER, PMD_ORDER); #endif - DEFINE(_PTE_ORDER, PTE_ORDER); BLANK(); DEFINE(_PMD_SHIFT, PMD_SHIFT); DEFINE(_PGDIR_SHIFT, PGDIR_SHIFT); diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S index 7eee40271577..e36c2c07dee3 100644 --- a/arch/loongarch/mm/tlbex.S +++ b/arch/loongarch/mm/tlbex.S @@ -83,7 +83,7 @@ vmalloc_done_load: bne t0, $r0, tlb_huge_update_load csrrd t0, LOONGARCH_CSR_BADV - srli.d t0, t0, (PAGE_SHIFT + PTE_ORDER) + srli.d t0, t0, PAGE_SHIFT andi t0, t0, (PTRS_PER_PTE - 1) slli.d t0, t0, _PTE_T_LOG2 add.d t1, ra, t0 @@ -247,7 +247,7 @@ vmalloc_done_store: bne t0, $r0, tlb_huge_update_store csrrd t0, LOONGARCH_CSR_BADV - srli.d t0, t0, (PAGE_SHIFT + PTE_ORDER) + srli.d t0, t0, PAGE_SHIFT andi t0, t0, (PTRS_PER_PTE - 1) slli.d t0, t0, _PTE_T_LOG2 add.d t1, ra, t0 @@ -414,7 +414,7 @@ vmalloc_done_modify: bne t0, $r0, tlb_huge_update_modify csrrd t0, LOONGARCH_CSR_BADV - srli.d t0, t0, (PAGE_SHIFT + PTE_ORDER) + srli.d t0, t0, PAGE_SHIFT andi t0, t0, (PTRS_PER_PTE - 1) slli.d t0, t0, _PTE_T_LOG2 add.d t1, ra, t0 From b7c0f2d454afbeaa9e6fcf5d62be2bd7708c73b0 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:11:59 +0300 Subject: [PATCH 208/282] loongarch: drop definition of PMD_ORDER This is the order of the page table allocation, not the order of a PMD. Since it's always hardwired to 0, simply drop it. 
Link: https://lkml.kernel.org/r/20220703141203.147893-11-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgalloc.h | 4 ++-- arch/loongarch/include/asm/pgtable.h | 7 +++---- arch/loongarch/kernel/asm-offsets.c | 3 --- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index b0a57b25c131..93e785f46639 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -66,12 +66,12 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) pmd_t *pmd; struct page *pg; - pg = alloc_pages(GFP_KERNEL_ACCOUNT, PMD_ORDER); + pg = alloc_page(GFP_KERNEL_ACCOUNT); if (!pg) return NULL; if (!pgtable_pmd_page_ctor(pg)) { - __free_pages(pg, PMD_ORDER); + __free_page(pg); return NULL; } diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index e0bbfc31fe72..f926537d2233 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -23,7 +23,6 @@ #define PGD_ORDER 0 #define PUD_ORDER 0 -#define PMD_ORDER 0 #if CONFIG_PGTABLE_LEVELS == 2 #define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3)) @@ -31,12 +30,12 @@ #define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3)) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PGDIR_SHIFT (PMD_SHIFT + (PAGE_SHIFT + PMD_ORDER - 3)) +#define PGDIR_SHIFT (PMD_SHIFT + (PAGE_SHIFT - 3)) #elif CONFIG_PGTABLE_LEVELS == 4 #define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3)) #define PMD_SIZE (1UL << PMD_SHIFT) #define PMD_MASK (~(PMD_SIZE-1)) -#define PUD_SHIFT (PMD_SHIFT + (PAGE_SHIFT + PMD_ORDER - 3)) +#define PUD_SHIFT (PMD_SHIFT + (PAGE_SHIFT - 3)) #define PUD_SIZE (1UL << PUD_SHIFT) 
#define PUD_MASK (~(PUD_SIZE-1)) #define PGDIR_SHIFT (PUD_SHIFT + (PAGE_SHIFT + PUD_ORDER - 3)) @@ -52,7 +51,7 @@ #define PTRS_PER_PUD ((PAGE_SIZE << PUD_ORDER) >> 3) #endif #if CONFIG_PGTABLE_LEVELS > 2 -#define PTRS_PER_PMD ((PAGE_SIZE << PMD_ORDER) >> 3) +#define PTRS_PER_PMD (PAGE_SIZE >> 3) #endif #define PTRS_PER_PTE (PAGE_SIZE >> 3) diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index 1a1166a7e61c..aa4ef42d759f 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -191,9 +191,6 @@ void output_mm_defines(void) DEFINE(_PTE_T_LOG2, PTE_T_LOG2); BLANK(); DEFINE(_PGD_ORDER, PGD_ORDER); -#ifndef __PAGETABLE_PMD_FOLDED - DEFINE(_PMD_ORDER, PMD_ORDER); -#endif BLANK(); DEFINE(_PMD_SHIFT, PMD_SHIFT); DEFINE(_PGDIR_SHIFT, PGDIR_SHIFT); From f05ecc68690fcfc8724b3a3830e9157754273e27 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:12:00 +0300 Subject: [PATCH 209/282] loongarch: drop definition of PUD_ORDER This is the order of the page table allocation, not the order of a PUD. Since it's always hardwired to 0, simply drop it.
Link: https://lkml.kernel.org/r/20220703141203.147893-12-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgalloc.h | 2 +- arch/loongarch/include/asm/pgtable.h | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index 93e785f46639..4bfeb3c9c9ac 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -90,7 +90,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { pud_t *pud; - pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_ORDER); + pud = (pud_t *) __get_free_page(GFP_KERNEL); if (pud) pud_init((unsigned long)pud, (unsigned long)invalid_pmd_table); return pud; diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index f926537d2233..a97996fefaed 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -22,7 +22,6 @@ #endif #define PGD_ORDER 0 -#define PUD_ORDER 0 #if CONFIG_PGTABLE_LEVELS == 2 #define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3)) @@ -38,7 +37,7 @@ #define PUD_SHIFT (PMD_SHIFT + (PAGE_SHIFT - 3)) #define PUD_SIZE (1UL << PUD_SHIFT) #define PUD_MASK (~(PUD_SIZE-1)) -#define PGDIR_SHIFT (PUD_SHIFT + (PAGE_SHIFT + PUD_ORDER - 3)) +#define PGDIR_SHIFT (PUD_SHIFT + (PAGE_SHIFT - 3)) #endif #define PGDIR_SIZE (1UL << PGDIR_SHIFT) @@ -48,7 +47,7 @@ #define PTRS_PER_PGD ((PAGE_SIZE << PGD_ORDER) >> 3) #if CONFIG_PGTABLE_LEVELS > 3 -#define PTRS_PER_PUD ((PAGE_SIZE << PUD_ORDER) >> 3) +#define PTRS_PER_PUD (PAGE_SIZE >> 3) #endif #if CONFIG_PGTABLE_LEVELS > 2 #define PTRS_PER_PMD (PAGE_SIZE >> 3) From 418d5dadaf9de9d230bfd3cdcb6263fc6de1a165 Mon Sep 17 00:00:00 2001 From: Mike 
Rapoport Date: Sun, 3 Jul 2022 17:12:01 +0300 Subject: [PATCH 210/282] loongarch: drop definition of PGD_ORDER This is the order of the page table allocation, not the order of a PGD. Since it's always hardwired to 0, simply drop it. [rppt@linux.ibm.com: drop extra BLANK() line in arch/loongarch/kernel/asm-offsets.c] Link: https://lkml.kernel.org/r/20220705154708.181258-13-rppt@kernel.org Link: https://lkml.kernel.org/r/20220703141203.147893-13-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Huacai Chen Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgtable.h | 6 ++---- arch/loongarch/kernel/asm-offsets.c | 2 -- arch/loongarch/mm/pgtable.c | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index a97996fefaed..e03443abaf7d 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -21,8 +21,6 @@ #include #endif -#define PGD_ORDER 0 - #if CONFIG_PGTABLE_LEVELS == 2 #define PGDIR_SHIFT (PAGE_SHIFT + (PAGE_SHIFT - 3)) #elif CONFIG_PGTABLE_LEVELS == 3 @@ -43,9 +41,9 @@ #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define VA_BITS (PGDIR_SHIFT + (PAGE_SHIFT + PGD_ORDER - 3)) +#define VA_BITS (PGDIR_SHIFT + (PAGE_SHIFT - 3)) -#define PTRS_PER_PGD ((PAGE_SIZE << PGD_ORDER) >> 3) +#define PTRS_PER_PGD (PAGE_SIZE >> 3) #if CONFIG_PGTABLE_LEVELS > 3 #define PTRS_PER_PUD (PAGE_SIZE >> 3) #endif diff --git a/arch/loongarch/kernel/asm-offsets.c b/arch/loongarch/kernel/asm-offsets.c index aa4ef42d759f..4a3bb1b9aef3 100644 --- a/arch/loongarch/kernel/asm-offsets.c +++ b/arch/loongarch/kernel/asm-offsets.c @@ -190,8 +190,6 @@ void output_mm_defines(void) #endif DEFINE(_PTE_T_LOG2, PTE_T_LOG2); BLANK(); - DEFINE(_PGD_ORDER, PGD_ORDER); -
BLANK(); DEFINE(_PMD_SHIFT, PMD_SHIFT); DEFINE(_PGDIR_SHIFT, PGDIR_SHIFT); BLANK(); diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 0569647152e9..ee179ccd3e3f 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -13,7 +13,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *ret, *init; - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); + ret = (pgd_t *) __get_free_page(GFP_KERNEL); if (ret) { init = pgd_offset(&init_mm, 0UL); pgd_init((unsigned long)ret); From 4501a7a03994f3d8bcf830cc42888868e88c8191 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:12:02 +0300 Subject: [PATCH 211/282] parisc: rename PGD_ORDER to PGD_TABLE_ORDER This is the order of the page table allocation, not the order of a PGD. Link: https://lkml.kernel.org/r/20220703141203.147893-14-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Max Filippov Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/parisc/include/asm/pgalloc.h | 6 +++--- arch/parisc/include/asm/pgtable.h | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index 54b63374579b..e3e142b1c5c5 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -20,18 +20,18 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; - pgd = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_ORDER); + pgd = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER); if (unlikely(pgd == NULL)) return NULL; - memset(pgd, 0, PAGE_SIZE << PGD_ORDER); + memset(pgd, 0, PAGE_SIZE << PGD_TABLE_ORDER); return pgd; } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_pages((unsigned long)pgd, PGD_ORDER); + free_pages((unsigned long)pgd, PGD_TABLE_ORDER); } #if CONFIG_PGTABLE_LEVELS 
== 3 diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 6a1899a9b420..df7b931865d2 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -118,9 +118,9 @@ extern void __update_cache(pte_t pte); #if CONFIG_PGTABLE_LEVELS == 3 #define PMD_TABLE_ORDER 1 -#define PGD_ORDER 0 +#define PGD_TABLE_ORDER 0 #else -#define PGD_ORDER 1 +#define PGD_TABLE_ORDER 1 #endif /* Definitions for 3rd level (we use PLD here for Page Lower directory @@ -144,10 +144,10 @@ extern void __update_cache(pte_t pte); /* Definitions for 1st level */ #define PGDIR_SHIFT (PLD_SHIFT + BITS_PER_PTE + BITS_PER_PMD) -#if (PGDIR_SHIFT + PAGE_SHIFT + PGD_ORDER - BITS_PER_PGD_ENTRY) > BITS_PER_LONG +#if (PGDIR_SHIFT + PAGE_SHIFT + PGD_TABLE_ORDER - BITS_PER_PGD_ENTRY) > BITS_PER_LONG #define BITS_PER_PGD (BITS_PER_LONG - PGDIR_SHIFT) #else -#define BITS_PER_PGD (PAGE_SHIFT + PGD_ORDER - BITS_PER_PGD_ENTRY) +#define BITS_PER_PGD (PAGE_SHIFT + PGD_TABLE_ORDER - BITS_PER_PGD_ENTRY) #endif #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) From 64c5ed22d6080f8ad07ed782087582a5f023f788 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 3 Jul 2022 17:12:03 +0300 Subject: [PATCH 212/282] xtensa: drop definition of PGD_ORDER This is the order of the page table allocation, not the order of a PGD. Since it's always hardwired to 0, simply drop it.
Link: https://lkml.kernel.org/r/20220703141203.147893-15-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Helge Deller Acked-by: Max Filippov Cc: Arnd Bergmann Cc: Dinh Nguyen Cc: Guo Ren Cc: Huacai Chen Cc: James Bottomley Cc: "Matthew Wilcox (Oracle)" Cc: Thomas Bogendoerfer Cc: Xuerui Wang Signed-off-by: Andrew Morton --- arch/xtensa/include/asm/pgalloc.h | 2 +- arch/xtensa/include/asm/pgtable.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index eeb2de3a89e5..7fc0f9126dd3 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -29,7 +29,7 @@ static inline pgd_t* pgd_alloc(struct mm_struct *mm) { - return (pgd_t*) __get_free_pages(GFP_KERNEL | __GFP_ZERO, PGD_ORDER); + return (pgd_t*) __get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline void ptes_clear(pte_t *ptep) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index e0d5531ae00d..54f577c13afa 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -57,7 +57,6 @@ #define PTRS_PER_PTE 1024 #define PTRS_PER_PTE_SHIFT 10 #define PTRS_PER_PGD 1024 -#define PGD_ORDER 0 #define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) #define FIRST_USER_PGD_NR (FIRST_USER_ADDRESS >> PGDIR_SHIFT) From 391145380f4b432403d8bdaf53ad7109104cb0df Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 5 Jul 2022 18:47:08 +0300 Subject: [PATCH 213/282] ARM: head.S: rename PMD_ORDER to PMD_ENTRY_ORDER PMD_ORDER denotes order of magnitude for a PMD entry, i.e PMD entry size is 2 ^ PMD_ORDER. Rename PMD_ORDER to PMD_ENTRY_ORDER to allow a generic definition of PMD_ORDER as order of a PMD allocation: (PMD_SHIFT - PAGE_SHIFT). 
Link: https://lkml.kernel.org/r/20220705154708.181258-16-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Russell King (Oracle) Signed-off-by: Andrew Morton --- arch/arm/kernel/head.S | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 500612d3da2e..29e2900178a1 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -38,10 +38,10 @@ #ifdef CONFIG_ARM_LPAE /* LPAE requires an additional page for the PGD */ #define PG_DIR_SIZE 0x5000 -#define PMD_ORDER 3 +#define PMD_ENTRY_ORDER 3 /* PMD entry size is 2^PMD_ENTRY_ORDER */ #else #define PG_DIR_SIZE 0x4000 -#define PMD_ORDER 2 +#define PMD_ENTRY_ORDER 2 #endif .globl swapper_pg_dir @@ -240,7 +240,7 @@ __create_page_tables: mov r6, r6, lsr #SECTION_SHIFT 1: orr r3, r7, r5, lsl #SECTION_SHIFT @ flags + kernel base - str r3, [r4, r5, lsl #PMD_ORDER] @ identity mapping + str r3, [r4, r5, lsl #PMD_ENTRY_ORDER] @ identity mapping cmp r5, r6 addlo r5, r5, #1 @ next section blo 1b @@ -250,7 +250,7 @@ __create_page_tables: * set two variables to indicate the physical start and end of the * kernel. 
*/ - add r0, r4, #KERNEL_OFFSET >> (SECTION_SHIFT - PMD_ORDER) + add r0, r4, #KERNEL_OFFSET >> (SECTION_SHIFT - PMD_ENTRY_ORDER) ldr r6, =(_end - 1) adr_l r5, kernel_sec_start @ _pa(kernel_sec_start) #if defined CONFIG_CPU_ENDIAN_BE8 || defined CONFIG_CPU_ENDIAN_BE32 @@ -259,8 +259,8 @@ __create_page_tables: str r8, [r5] @ Save physical start of kernel (LE) #endif orr r3, r8, r7 @ Add the MMU flags - add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ORDER) -1: str r3, [r0], #1 << PMD_ORDER + add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ENTRY_ORDER) +1: str r3, [r0], #1 << PMD_ENTRY_ORDER add r3, r3, #1 << SECTION_SHIFT cmp r0, r6 bls 1b @@ -280,14 +280,14 @@ __create_page_tables: mov r3, pc mov r3, r3, lsr #SECTION_SHIFT orr r3, r7, r3, lsl #SECTION_SHIFT - add r0, r4, #(XIP_START & 0xff000000) >> (SECTION_SHIFT - PMD_ORDER) - str r3, [r0, #((XIP_START & 0x00f00000) >> SECTION_SHIFT) << PMD_ORDER]! + add r0, r4, #(XIP_START & 0xff000000) >> (SECTION_SHIFT - PMD_ENTRY_ORDER) + str r3, [r0, #((XIP_START & 0x00f00000) >> SECTION_SHIFT) << PMD_ENTRY_ORDER]! 
ldr r6, =(_edata_loc - 1) - add r0, r0, #1 << PMD_ORDER - add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ORDER) + add r0, r0, #1 << PMD_ENTRY_ORDER + add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ENTRY_ORDER) 1: cmp r0, r6 add r3, r3, #1 << SECTION_SHIFT - strls r3, [r0], #1 << PMD_ORDER + strls r3, [r0], #1 << PMD_ENTRY_ORDER bls 1b #endif @@ -297,10 +297,10 @@ __create_page_tables: */ mov r0, r2, lsr #SECTION_SHIFT cmp r2, #0 - ldrne r3, =FDT_FIXED_BASE >> (SECTION_SHIFT - PMD_ORDER) + ldrne r3, =FDT_FIXED_BASE >> (SECTION_SHIFT - PMD_ENTRY_ORDER) addne r3, r3, r4 orrne r6, r7, r0, lsl #SECTION_SHIFT - strne r6, [r3], #1 << PMD_ORDER + strne r6, [r3], #1 << PMD_ENTRY_ORDER addne r6, r6, #1 << SECTION_SHIFT strne r6, [r3] @@ -319,7 +319,7 @@ __create_page_tables: addruart r7, r3, r0 mov r3, r3, lsr #SECTION_SHIFT - mov r3, r3, lsl #PMD_ORDER + mov r3, r3, lsl #PMD_ENTRY_ORDER add r0, r4, r3 mov r3, r7, lsr #SECTION_SHIFT @@ -349,7 +349,7 @@ __create_page_tables: * If we're using the NetWinder or CATS, we also need to map * in the 16550-type serial port for the debug messages */ - add r0, r4, #0xff000000 >> (SECTION_SHIFT - PMD_ORDER) + add r0, r4, #0xff000000 >> (SECTION_SHIFT - PMD_ENTRY_ORDER) orr r3, r7, #0x7c000000 str r3, [r0] #endif @@ -359,10 +359,10 @@ __create_page_tables: * Similar reasons here - for debug. This is * only for Acorn RiscPC architectures. */ - add r0, r4, #0x02000000 >> (SECTION_SHIFT - PMD_ORDER) + add r0, r4, #0x02000000 >> (SECTION_SHIFT - PMD_ENTRY_ORDER) orr r3, r7, #0x02000000 str r3, [r0] - add r0, r4, #0xd8000000 >> (SECTION_SHIFT - PMD_ORDER) + add r0, r4, #0xd8000000 >> (SECTION_SHIFT - PMD_ENTRY_ORDER) str r3, [r0] #endif #endif From 7c38f1812d5bc118e29cb898e7104387a6cc0b76 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:46 +0800 Subject: [PATCH 214/282] mm/huge_memory: use flush_pmd_tlb_range in move_huge_pmd Patch series "A few cleanup patches for huge_memory", v3. 
This series contains a few cleanup patches to remove duplicated code, add/use helper functions, fix some obsolete comments and so on. More details can be found in the respective changelogs. This patch (of 16): Arches with special requirements for evicting THP backing TLB entries can implement flush_pmd_tlb_range. Otherwise also, it can help optimize TLB flush in THP regime. Use flush_pmd_tlb_range to take advantage of this in move_huge_pmd. Link: https://lkml.kernel.org/r/20220704132201.14611-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220704132201.14611-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: Zach O'Keefe Cc: Yang Shi Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8e1b3d9f7ebf..627b98dfd51e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1749,7 +1749,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pmd = move_soft_dirty_pmd(pmd); set_pmd_at(mm, new_addr, new_pmd, pmd); if (force_flush) - flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); + flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); From 4286f14748c13de42c1c4ab77a92fdfb37e6e5ef Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:47 +0800 Subject: [PATCH 215/282] mm/huge_memory: access vm_page_prot with READ_ONCE in remove_migration_pmd vma->vm_page_prot is read locklessly from the rmap_walk and may be updated concurrently. Use READ_ONCE to prevent the risk of reading intermediate values.
Link: https://lkml.kernel.org/r/20220704132201.14611-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Matthew Wilcox Cc: Muchun Song Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 627b98dfd51e..fb0011fe9128 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3205,7 +3205,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) entry = pmd_to_swp_entry(*pvmw->pmd); get_page(new); - pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot)); + pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot))); if (pmd_swp_soft_dirty(*pvmw->pmd)) pmde = pmd_mksoft_dirty(pmde); if (is_writable_migration_entry(entry)) From d965e3907540e829df27a22738536d3b9d989820 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:48 +0800 Subject: [PATCH 216/282] mm/huge_memory: fix comment of __pud_trans_huge_lock __pud_trans_huge_lock returns page table lock pointer if a given pud maps a thp instead of 'true' since introduced. Fix corresponding comments. Link: https://lkml.kernel.org/r/20220704132201.14611-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fb0011fe9128..f0adc9db0d60 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1906,10 +1906,10 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) } /* - * Returns true if a given pud maps a thp, false otherwise. + * Returns page table lock pointer if a given pud maps a thp, NULL otherwise. * - * Note that if it returns true, this routine returns without unlocking page - * table lock. So callers must unlock it. 
+ * Note that if it returns page table lock pointer, this routine returns without + * unlocking page table lock. So callers must unlock it. */ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { From 5fe653e9000da9ce06e3696508c44b45315b9887 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:49 +0800 Subject: [PATCH 217/282] mm/huge_memory: use helper touch_pud in huge_pud_set_accessed Use helper touch_pud to set pud accessed to simplify the code and improve the readability. No functional change intended. Link: https://lkml.kernel.org/r/20220704132201.14611-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0adc9db0d60..cda418064823 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1184,15 +1184,15 @@ out: #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void touch_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, int flags) + pud_t *pud, bool write) { pud_t _pud; _pud = pud_mkyoung(*pud); - if (flags & FOLL_WRITE) + if (write) _pud = pud_mkdirty(_pud); if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK, - pud, _pud, flags & FOLL_WRITE)) + pud, _pud, write)) update_mmu_cache_pud(vma, addr, pud); } @@ -1219,7 +1219,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, return NULL; if (flags & FOLL_TOUCH) - touch_pud(vma, addr, pud, flags); + touch_pud(vma, addr, pud, flags & FOLL_WRITE); /* * device mapped pages can only be returned if the @@ -1284,21 +1284,13 @@ out_unlock: void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { - pud_t entry; - unsigned long haddr; bool write = vmf->flags & FAULT_FLAG_WRITE; vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud); if (unlikely(!pud_same(*vmf->pud, orig_pud))) goto 
unlock; - entry = pud_mkyoung(orig_pud); - if (write) - entry = pud_mkdirty(entry); - haddr = vmf->address & HPAGE_PUD_MASK; - if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write)) - update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud); - + touch_pud(vmf->vma, vmf->address, vmf->pud, write); unlock: spin_unlock(vmf->ptl); } From a69e4717c62508ad5335e9bd56b1b6984a6b2b98 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:50 +0800 Subject: [PATCH 218/282] mm/huge_memory: use helper touch_pmd in huge_pmd_set_accessed Use helper touch_pmd to set pmd accessed to simplify the code and improve the readability. No functional change intended. Link: https://lkml.kernel.org/r/20220704132201.14611-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cda418064823..ebf2a71931c1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1020,15 +1020,15 @@ EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, int flags) + pmd_t *pmd, bool write) { pmd_t _pmd; _pmd = pmd_mkyoung(*pmd); - if (flags & FOLL_WRITE) + if (write) _pmd = pmd_mkdirty(_pmd); if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, flags & FOLL_WRITE)) + pmd, _pmd, write)) update_mmu_cache_pmd(vma, addr, pmd); } @@ -1061,7 +1061,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, return NULL; if (flags & FOLL_TOUCH) - touch_pmd(vma, addr, pmd, flags); + touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); /* * device mapped pages can only be returned if the @@ -1298,21 +1298,13 @@ unlock: void huge_pmd_set_accessed(struct vm_fault *vmf) { - pmd_t entry; - unsigned 
long haddr; bool write = vmf->flags & FAULT_FLAG_WRITE; - pmd_t orig_pmd = vmf->orig_pmd; vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) + if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd))) goto unlock; - entry = pmd_mkyoung(orig_pmd); - if (write) - entry = pmd_mkdirty(entry); - haddr = vmf->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write)) - update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); + touch_pmd(vmf->vma, vmf->address, vmf->pmd, write); unlock: spin_unlock(vmf->ptl); @@ -1448,7 +1440,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-ENOMEM); if (flags & FOLL_TOUCH) - touch_pmd(vma, addr, pmd, flags); + touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); From 4fba8f2a306038da617cd29503a1841b7b44ee23 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:51 +0800 Subject: [PATCH 219/282] mm/huge_memory: rename mmun_start to haddr in remove_migration_pmd mmun_start indicates mmu_notifier start address but there's no mmu_notifier stuff in remove_migration_pmd. This will make it hard to get the meaning of mmun_start. Rename it to haddr to avoid confusing readers and also improve readability.
Link: https://lkml.kernel.org/r/20220704132201.14611-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ebf2a71931c1..5816ef8241af 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3180,7 +3180,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) struct vm_area_struct *vma = pvmw->vma; struct mm_struct *mm = vma->vm_mm; unsigned long address = pvmw->address; - unsigned long mmun_start = address & HPAGE_PMD_MASK; + unsigned long haddr = address & HPAGE_PMD_MASK; pmd_t pmde; swp_entry_t entry; @@ -3203,12 +3203,12 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) if (!is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE; - page_add_anon_rmap(new, vma, mmun_start, rmap_flags); + page_add_anon_rmap(new, vma, haddr, rmap_flags); } else { page_add_file_rmap(new, vma, true); } VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new)); - set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); + set_pmd_at(mm, haddr, pvmw->pmd, pmde); /* No need to invalidate - it was non-present before */ update_mmu_cache_pmd(vma, address, pvmw->pmd); From 74ba2b38ba990c563393e5cb540cad2939f49d95 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:52 +0800 Subject: [PATCH 220/282] mm/huge_memory: use helper function vma_lookup in split_huge_pages_pid Use helper function vma_lookup to lookup the needed vma to simplify the code. Minor readability improvement. 
Link: https://lkml.kernel.org/r/20220704132201.14611-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5816ef8241af..a8f8af85ee17 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2941,10 +2941,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, * table filled with PTE-mapped THPs, each of which is distinct. */ for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { - struct vm_area_struct *vma = find_vma(mm, addr); + struct vm_area_struct *vma = vma_lookup(mm, addr); struct page *page; - if (!vma || addr < vma->vm_start) + if (!vma) break; /* skip special VMA and hugetlb VMA */ From 37139bb02c35bedb0c82bd21e7d1e695f5485465 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:53 +0800 Subject: [PATCH 221/282] mm/huge_memory: use helper macro __ATTR_RW Use helper macro __ATTR_RW to define use_zero_page_attr, defrag_attr and enabled_attr to make code more clear. Minor readability improvement. 
Link: https://lkml.kernel.org/r/20220704132201.14611-9-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a8f8af85ee17..253ab79e8d9b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -276,8 +276,8 @@ static ssize_t enabled_store(struct kobject *kobj, } return ret; } -static struct kobj_attribute enabled_attr = - __ATTR(enabled, 0644, enabled_show, enabled_store); + +static struct kobj_attribute enabled_attr = __ATTR_RW(enabled); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, @@ -366,8 +366,7 @@ static ssize_t defrag_store(struct kobject *kobj, return count; } -static struct kobj_attribute defrag_attr = - __ATTR(defrag, 0644, defrag_show, defrag_store); +static struct kobj_attribute defrag_attr = __ATTR_RW(defrag); static ssize_t use_zero_page_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -381,8 +380,7 @@ static ssize_t use_zero_page_store(struct kobject *kobj, return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } -static struct kobj_attribute use_zero_page_attr = - __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); +static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page); static ssize_t hpage_pmd_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) From 749290799e72f05f7311ec8e85a47664dd7be37e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:54 +0800 Subject: [PATCH 222/282] mm/huge_memory: fix comment in zap_huge_pud The comment about deposited pgtable is borrowed from zap_huge_pmd but there's no deposited pgtable stuff for huge pud in zap_huge_pud. Remove it to avoid confusion. 
Link: https://lkml.kernel.org/r/20220704132201.14611-10-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Matthew Wilcox Cc: Muchun Song Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 253ab79e8d9b..266afca39058 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1913,12 +1913,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, ptl = __pud_trans_huge_lock(pud, vma); if (!ptl) return 0; - /* - * For architectures like ppc64 we look at deposited pgtable - * when calling pudp_huge_get_and_clear. So do the - * pgtable_trans_huge_withdraw after finishing pudp related - * operations. - */ + pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_special_huge(vma)) { From 3ce4fee4401206cf5a2c476ec0ee6c90191dfade Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:55 +0800 Subject: [PATCH 223/282] mm/huge_memory: check pmd_present first in is_huge_zero_pmd When pmd is non-present, pmd_pfn returns an insane value. So we should check pmd_present first to avoid acquiring such insane value and also avoid touching possible cold huge_zero_pfn cache line when pmd isn't present. 
Link: https://lkml.kernel.org/r/20220704132201.14611-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ae3d8e2fd9e2..12b297f9951d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -273,7 +273,7 @@ static inline bool is_huge_zero_page(struct page *page) static inline bool is_huge_zero_pmd(pmd_t pmd) { - return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); + return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd); } static inline bool is_huge_zero_pud(pud_t pud) From 0b175468a02d9ae8b97919b4de62ab4da578b520 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:56 +0800 Subject: [PATCH 224/282] mm/huge_memory: try to free subpage in swapcache when possible Subpages in swapcache won't be freed even if it is the last user of the page until next time reclaim. It shouldn't hurt indeed, but we could try to free these pages to save more memory for system. Link: https://lkml.kernel.org/r/20220704132201.14611-12-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Matthew Wilcox Cc: Muchun Song Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 266afca39058..de3feb97f0e9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2538,7 +2538,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, * requires taking the lru_lock so we do the put_page * of the tail pages after the split is complete. 
*/ - put_page(subpage); + free_page_and_swap_cache(subpage); } } From a17206dac7b262e7abed5a05e34a6bd6bd0a9b06 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:57 +0800 Subject: [PATCH 225/282] mm/huge_memory: minor cleanup for split_huge_pages_all There is nothing to do if a zone doesn't have any pages managed by the buddy allocator. So we should check managed_zone instead. Also if a thp is found, there's no need to traverse the subpages again. Link: https://lkml.kernel.org/r/20220704132201.14611-13-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Matthew Wilcox Cc: Muchun Song Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de3feb97f0e9..b6d915f24909 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2857,9 +2857,12 @@ static void split_huge_pages_all(void) unsigned long total = 0, split = 0; pr_debug("Split all THPs\n"); - for_each_populated_zone(zone) { + for_each_zone(zone) { + if (!managed_zone(zone)) + continue; max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + int nr_pages; if (!pfn_valid(pfn)) continue; @@ -2875,8 +2878,10 @@ static void split_huge_pages_all(void) total++; lock_page(page); + nr_pages = thp_nr_pages(page); if (!split_huge_page(page)) split++; + pfn += nr_pages - 1; unlock_page(page); next: put_page(page); From 121c1781aeb00475d163246d9ae7d8746e377040 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:58 +0800 Subject: [PATCH 226/282] mm/huge_memory: fix comment of page_deferred_list The current comment is confusing because if global or memcg deferred list in the second tail page is occupied by compound_head, why we still use page[2].deferred_list here? 
I think it wants to say that Global or memcg deferred list in the first tail page is occupied by compound_mapcount and compound_pincount so we use the second tail page's deferred_list instead. Link: https://lkml.kernel.org/r/20220704132201.14611-14-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 12b297f9951d..37f2f11a6d7e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -294,8 +294,8 @@ static inline bool thp_migration_supported(void) static inline struct list_head *page_deferred_list(struct page *page) { /* - * Global or memcg deferred list in the second tail pages is - * occupied by compound_head. + * See organization of tail pages of compound page in + * "struct page" definition. */ return &page[2].deferred_list; } From d764afedfb04e4eaf04b175c5ac54ffa4a423070 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:21:59 +0800 Subject: [PATCH 227/282] mm/huge_memory: correct comment of prep_transhuge_page We use page->mapping and page->index, instead of page->indexlru in second tail page as list_head. Correct it. 
Link: https://lkml.kernel.org/r/20220704132201.14611-15-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b6d915f24909..f3f3e4b5a3ab 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -581,7 +581,7 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page) void prep_transhuge_page(struct page *page) { /* - * we use page->mapping and page->indexlru in second tail page + * we use page->mapping and page->index in second tail page * as list_head: assuming THP order >= 2 */ From cea3332808f92c0120fb0b157c56d48639e0c713 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:22:00 +0800 Subject: [PATCH 228/282] mm/huge_memory: comment the subtle logic in __split_huge_pmd It's dangerous and wrong to call page_folio(pmd_page(*pmd)) when pmd isn't present. But the caller guarantees pmd is present when folio is set. So we should be safe here. Add a comment to make it clear. Link: https://lkml.kernel.org/r/20220704132201.14611-16-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Matthew Wilcox Cc: Muchun Song Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f3f3e4b5a3ab..17e392ec9eb3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2235,6 +2235,10 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)) { + /* + * It's safe to call pmd_page when folio is set because it's + * guaranteed that pmd is present.
+ */ if (folio && folio != page_folio(pmd_page(*pmd))) goto out; __split_huge_pmd_locked(vma, pmd, range.start, freeze); From e75858b904b44510b7f4a0f8f1c08dfc0bf9009b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 4 Jul 2022 21:22:01 +0800 Subject: [PATCH 229/282] mm/huge_memory: use helper macro IS_ERR_OR_NULL in split_huge_pages_pid Use helper macro IS_ERR_OR_NULL to check the validity of page to simplify the code. Minor readability improvement. Link: https://lkml.kernel.org/r/20220704132201.14611-17-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Cc: Yang Shi Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 17e392ec9eb3..814020689d3e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2958,9 +2958,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, /* FOLL_DUMP to ignore special (like zero) pages */ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - if (IS_ERR(page)) - continue; - if (!page || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) continue; if (!is_transparent_hugepage(page)) From 48725bbc0c3828bb9e36b632c6bf0326ed292ffb Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Mon, 4 Jul 2022 19:41:12 +0800 Subject: [PATCH 230/282] mm/mprotect: remove the redundant initialization for error The variable error will be assigned correctly before it is used, the initialization is redundant, so remove it. 
Link: https://lkml.kernel.org/r/20220704114112.163112-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Andrew Morton --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 5ef478b06a7d..8250c1315d9c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -663,7 +663,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, { unsigned long nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; - int error = -EINVAL; + int error; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); From a317ebccaa3609917a2c021af870cf3fa607ab0c Mon Sep 17 00:00:00 2001 From: Patrick Wang Date: Tue, 5 Jul 2022 19:31:58 +0800 Subject: [PATCH 231/282] mm: percpu: use kmemleak_ignore_phys() instead of kmemleak_free() Kmemleak recently added a rbtree to store the objects allocated with physical address. Those objects can't be freed with kmemleak_free(). According to the comments, percpu allocations are tracked by kmemleak separately. Kmemleak_free() was used to avoid the unnecessary tracking. If kmemleak_free() fails, those objects would be scanned by kmemleak, which is unnecessary but shouldn't lead to other effects. Use kmemleak_ignore_phys() instead of kmemleak_free() for those objects.
Link: https://lkml.kernel.org/r/20220705113158.127600-1-patrick.wang.shcn@gmail.com Fixes: 0c24e061196c ("mm: kmemleak: add rbtree and store physical address for objects allocated with PA") Signed-off-by: Patrick Wang Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Catalin Marinas Signed-off-by: Andrew Morton --- mm/percpu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 3633eeefaa0d..27697b2429c2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3104,7 +3104,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, goto out_free_areas; } /* kmemleak tracks the percpu allocations separately */ - kmemleak_free(ptr); + kmemleak_ignore_phys(__pa(ptr)); areas[group] = ptr; base = min(ptr, base); @@ -3304,7 +3304,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t goto enomem; } /* kmemleak tracks the percpu allocations separately */ - kmemleak_free(ptr); + kmemleak_ignore_phys(__pa(ptr)); pages[j++] = virt_to_page(ptr); } } @@ -3417,7 +3417,7 @@ void __init setup_per_cpu_areas(void) if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ - kmemleak_free(fc); + kmemleak_ignore_phys(__pa(fc)); ai->dyn_size = unit_size; ai->unit_size = unit_size; From dcadcf1c30619ead2f3280bfb7f74de8304be2bb Mon Sep 17 00:00:00 2001 From: Gang Li Date: Wed, 6 Jul 2022 11:46:54 +0800 Subject: [PATCH 232/282] mm, hugetlb: skip irrelevant nodes in show_free_areas() show_free_areas() allows to filter out node specific data which is irrelevant to the allocation request. But hugetlb_show_meminfo() still shows hugetlb on all nodes, which is redundant and unnecessary. Use show_mem_node_skip() to skip irrelevant nodes. And replace hugetlb_show_meminfo() with hugetlb_show_meminfo_node(nid). 
before-and-after sample output of OOM: before: ``` [ 214.362453] Node 1 active_anon:148kB inactive_anon:4050920kB active_file:112kB inactive_file:100kB [ 214.375429] Node 1 Normal free:45100kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig [ 214.388334] lowmem_reserve[]: 0 0 0 0 0 [ 214.390251] Node 1 Normal: 423*4kB (UE) 320*8kB (UME) 187*16kB (UE) 117*32kB (UE) 57*64kB (UME) 20 [ 214.397626] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB [ 214.401518] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB ``` after: ``` [ 145.069705] Node 1 active_anon:128kB inactive_anon:4049412kB active_file:56kB inactive_file:84kB u [ 145.110319] Node 1 Normal free:45424kB boost:0kB min:45576kB low:56968kB high:68360kB reserved_hig [ 145.152315] lowmem_reserve[]: 0 0 0 0 0 [ 145.155244] Node 1 Normal: 470*4kB (UME) 373*8kB (UME) 247*16kB (UME) 168*32kB (UE) 86*64kB (UME) [ 145.164119] Node 1 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB ``` Link: https://lkml.kernel.org/r/20220706034655.1834-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Reviewed-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 18 ++++++++---------- mm/page_alloc.c | 8 ++++++-- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 75ee739d815b..4cdfce976644 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -152,7 +152,7 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct page *ref_page, zap_flags_t zap_flags); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(char *buf, int len, int nid); -void hugetlb_show_meminfo(void); +void hugetlb_show_meminfo_node(int nid); unsigned long hugetlb_total_pages(void); vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int 
flags); @@ -298,7 +298,7 @@ static inline int hugetlb_report_node_meminfo(char *buf, int len, int nid) return 0; } -static inline void hugetlb_show_meminfo(void) +static inline void hugetlb_show_meminfo_node(int nid) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 96635a2874e3..bb763f5d30b9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4477,22 +4477,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid) nid, h->surplus_huge_pages_node[nid]); } -void hugetlb_show_meminfo(void) +void hugetlb_show_meminfo_node(int nid) { struct hstate *h; - int nid; if (!hugepages_supported()) return; - for_each_node_state(nid, N_MEMORY) - for_each_hstate(h) - pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", - nid, - h->nr_huge_pages_node[nid], - h->free_huge_pages_node[nid], - h->surplus_huge_pages_node[nid], - huge_page_size(h) / SZ_1K); + for_each_hstate(h) + printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n", + nid, + h->nr_huge_pages_node[nid], + h->free_huge_pages_node[nid], + h->surplus_huge_pages_node[nid], + huge_page_size(h) / SZ_1K); } void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 215b26664ad7..4fa96d3510fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6022,7 +6022,7 @@ static void show_migration_types(unsigned char type) void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; - int cpu; + int cpu, nid; struct zone *zone; pg_data_t *pgdat; @@ -6210,7 +6210,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk(KERN_CONT "= %lukB\n", K(total)); } - hugetlb_show_meminfo(); + for_each_online_node(nid) { + if (show_mem_node_skip(filter, nid, nodemask)) + continue; + hugetlb_show_meminfo_node(nid); + } printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES)); From 04ec006171badc73f749dc1cd9d66e05f8575a81 Mon 
Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 8 Jul 2022 16:07:36 +0200 Subject: [PATCH 233/282] mm/page_alloc: use try_cmpxchg in set_pfnblock_flags_mask Use try_cmpxchg instead of cmpxchg in set_pfnblock_flags_mask. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). The main loop improves from: 1c5d: 48 89 c2 mov %rax,%rdx 1c60: 48 89 c1 mov %rax,%rcx 1c63: 48 21 fa and %rdi,%rdx 1c66: 4c 09 c2 or %r8,%rdx 1c69: f0 48 0f b1 16 lock cmpxchg %rdx,(%rsi) 1c6e: 48 39 c1 cmp %rax,%rcx 1c71: 75 ea jne 1c5d <...> to: 1c60: 48 89 ca mov %rcx,%rdx 1c63: 48 21 c2 and %rax,%rdx 1c66: 4c 09 c2 or %r8,%rdx 1c69: f0 48 0f b1 16 lock cmpxchg %rdx,(%rsi) 1c6e: 75 f0 jne 1c60 <...> Link: https://lkml.kernel.org/r/20220708140736.8737-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Andrew Morton Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4fa96d3510fe..50d96fff8855 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -602,7 +602,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, { unsigned long *bitmap; unsigned long bitidx, word_bitidx; - unsigned long old_word, word; + unsigned long word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); @@ -618,12 +618,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, flags <<= bitidx; word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } + do { + } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags)); } void set_pageblock_migratetype(struct page *page, int migratetype) From 8f0b747d7dde47e9f6ff64d176ad5fcf0a23d524 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 9 Jul 2022 
17:24:40 +0800 Subject: [PATCH 234/282] mm/page_vma_mapped.c: use helper function huge_pte_lock Use helper function huge_pte_lock() to lock the huge pte to simplify the code a bit. No functional change intended. Link: https://lkml.kernel.org/r/20220709092440.43018-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/page_vma_mapped.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e971a467fcdf..8e9e574d535a 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -174,8 +174,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (!pvmw->pte) return false; - pvmw->ptl = huge_pte_lockptr(hstate, mm, pvmw->pte); - spin_lock(pvmw->ptl); + pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte); if (!check_pte(pvmw)) return not_found(pvmw); return true; From cdb5c9e53f2e7166409dbf7248364f592d11bd1c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 9 Jul 2022 17:25:27 +0800 Subject: [PATCH 235/282] mm/mmap: fix obsolete comment of find_extend_vma mmget_still_valid() has already been removed via commit 4d45e75a9955 ("mm: remove the now-unnecessary mmget_still_valid() hack"). Update the corresponding comment. 
Link: https://lkml.kernel.org/r/20220709092527.47778-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/mmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index edf27a2789a2..d529837bc8c3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2541,7 +2541,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - /* don't alter vm_end if the coredump is running */ if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) From 0d8bc0b10aeab543bdccb86180f58db1f79f7cee Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 13 Jul 2022 20:53:14 +0800 Subject: [PATCH 236/282] writeback: cleanup bdi_sched_wait() bdi_sched_wait() is no longer used since commit 839a8e8660b6 ("writeback: replace custom worker pool implementation with unbound workqueue"), so remove it. Link: https://lkml.kernel.org/r/20220713125314.171345-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Jan Kara Reviewed-by: Johannes Thumshirn Acked-by: Jens Axboe Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index d452071db572..e84b745a6811 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -140,12 +140,6 @@ static inline bool mapping_can_writeback(struct address_space *mapping) return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } -static inline int bdi_sched_wait(void *word) -{ - schedule(); - return 0; -} - #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, From 13c1c74af7643f27273cb31a412811b8cd971b78 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 8 Jun 2022 15:25:34 +0800 Subject: [PATCH 237/282] zram: fix unused 'zram_wb_devops' warning drivers/block/zram/zram_drv.c:55:45: warning: 'zram_wb_devops' 
defined but not used [-Wunused-const-variable=] Fix the above warning when CONFIG_ZRAM_WRITEBACK is not enabled. Link: https://lkml.kernel.org/r/20220608072534.68850-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Cc: Jens Axboe Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b8549c61ff2c..3e281a193feb 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -52,7 +52,9 @@ static unsigned int num_devices = 1; static size_t huge_class_size; static const struct block_device_operations zram_devops; +#ifdef CONFIG_ZRAM_WRITEBACK static const struct block_device_operations zram_wb_devops; +#endif static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, From 187e7c41445a0f202bb551f08ca7f8158fea1cd7 Mon Sep 17 00:00:00 2001 From: Adam Sindelar Date: Fri, 8 Jul 2022 11:06:46 +0200 Subject: [PATCH 238/282] selftests/vm: fix va_128TBswitch.sh permissions Restore the +x bit to va_128TBswitch.sh, which got dropped from the previous patch, somehow.
Link: https://lkml.kernel.org/r/20220708090646.34927-1-adam@wowsignal.io Fixes: 1afd01d43efc3 ("selftests/vm: Only run 128TBswitch with 5-level paging") Signed-off-by: Adam Sindelar Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/va_128TBswitch.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/selftests/vm/va_128TBswitch.sh diff --git a/tools/testing/selftests/vm/va_128TBswitch.sh b/tools/testing/selftests/vm/va_128TBswitch.sh old mode 100644 new mode 100755 From 14773bfa70e67f4d4ebd60e60cb6e25e8c84d4c0 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 20 Jul 2022 23:47:55 +0900 Subject: [PATCH 239/282] mm: shrinkers: fix double kfree on shrinker name syzbot is reporting double kfree() at free_prealloced_shrinker() [1], for destroy_unused_super() calls free_prealloced_shrinker() even if prealloc_shrinker() returned an error. Explicitly clear shrinker name when prealloc_shrinker() called kfree(). [roman.gushchin@linux.dev: zero shrinker->name in all cases where shrinker->name is freed] Link: https://lkml.kernel.org/r/YtgteTnQTgyuKUSY@castle Link: https://syzkaller.appspot.com/bug?extid=8b481578352d4637f510 [1] Link: https://lkml.kernel.org/r/ffa62ece-6a42-2644-16cf-0d33ef32c676@I-love.SAKURA.ne.jp Fixes: e33c267ab70de424 ("mm: shrinkers: provide shrinkers with names") Reported-by: syzbot Signed-off-by: Tetsuo Handa Acked-by: Roman Gushchin Signed-off-by: Andrew Morton --- mm/shrinker_debug.c | 1 + mm/vmscan.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index e5b40c43221d..b05295bab322 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -251,6 +251,7 @@ void shrinker_debugfs_remove(struct shrinker *shrinker) lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); + shrinker->name = NULL; if (!shrinker->debugfs_entry) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index f58761cea0a0..fbb4108250ee 100644 --- 
a/mm/vmscan.c +++ b/mm/vmscan.c @@ -644,8 +644,10 @@ int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) return -ENOMEM; err = __prealloc_shrinker(shrinker); - if (err) + if (err) { kfree_const(shrinker->name); + shrinker->name = NULL; + } return err; } @@ -660,6 +662,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker) { #ifdef CONFIG_SHRINKER_DEBUG kfree_const(shrinker->name); + shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); @@ -704,8 +707,10 @@ int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) return -ENOMEM; err = __register_shrinker(shrinker); - if (err) + if (err) { kfree_const(shrinker->name); + shrinker->name = NULL; + } return err; } #else From 15d2ce7129f25c51d8a840a8a002c7ba0bb1509d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 29 Jul 2022 18:07:13 -0700 Subject: [PATCH 240/282] mips: rename mt_init to mips_mt_init Move mt_init out of the way for the maple tree. Use mips_mt prefix to match the rest of the functions in the file. Link: https://lkml.kernel.org/r/20220504002554.654642-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/mips/kernel/mips-mt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c index d5f7362e8c24..dc023a979803 100644 --- a/arch/mips/kernel/mips-mt.c +++ b/arch/mips/kernel/mips-mt.c @@ -230,7 +230,7 @@ void mips_mt_set_cpuoptions(void) struct class *mt_class; -static int __init mt_init(void) +static int __init mips_mt_init(void) { struct class *mtc; @@ -243,4 +243,4 @@ static int __init mt_init(void) return 0; } -subsys_initcall(mt_init); +subsys_initcall(mips_mt_init); From a43cfc87caaf46710c8027a8c23b8a55f1078f19 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Mon, 20 Jun 2022 21:09:09 -0400 Subject: [PATCH 241/282] android: binder: stop saving a pointer to the VMA Do not record a pointer to a VMA outside of the mmap_lock for later use. This is unsafe and there are a number of failure paths *after* the recorded VMA pointer may be freed during setup. There is no callback to the driver to clear the saved pointer from generic mm code. Furthermore, the VMA pointer may become stale if any number of VMA operations end up freeing the VMA so saving it was fragile to begin with. Instead, change the binder_alloc struct to record the start address of the VMA and use vma_lookup() to get the vma when needed. Add lockdep mmap_lock checks on updates to the vma pointer to ensure the lock is held and depend on that lock for synchronization of readers and writers - which was already the case anyways, so the smp_wmb()/smp_rmb() was not necessary. [akpm@linux-foundation.org: fix drivers/android/binder_alloc_selftest.c] Link: https://lkml.kernel.org/r/20220621140212.vpkio64idahetbyf@revolver Fixes: da1b9564e85b ("android: binder: fix the race mmap and alloc_new_buf_locked") Reported-by: syzbot+58b51ac2b04e388ab7b0@syzkaller.appspotmail.com Signed-off-by: Liam R.
Howlett Cc: Minchan Kim Cc: Christian Brauner (Microsoft) Cc: Greg Kroah-Hartman Cc: Hridya Valsaraju Cc: Joel Fernandes Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 30 ++++++++++++------------- drivers/android/binder_alloc.h | 2 +- drivers/android/binder_alloc_selftest.c | 2 +- 3 files changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 51b502217d00..f555eebceef6 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -213,7 +213,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, if (mm) { mmap_read_lock(mm); - vma = alloc->vma; + vma = vma_lookup(mm, alloc->vma_addr); } if (!vma && need_mm) { @@ -313,16 +313,15 @@ err_no_vma: static inline void binder_alloc_set_vma(struct binder_alloc *alloc, struct vm_area_struct *vma) { - if (vma) + unsigned long vm_start = 0; + + if (vma) { + vm_start = vma->vm_start; alloc->vma_vm_mm = vma->vm_mm; - /* - * If we see alloc->vma is not NULL, buffer data structures set up - * completely. Look at smp_rmb side binder_alloc_get_vma. - * We also want to guarantee new alloc->vma_vm_mm is always visible - * if alloc->vma is set. 
- */ - smp_wmb(); - alloc->vma = vma; + } + + mmap_assert_write_locked(alloc->vma_vm_mm); + alloc->vma_addr = vm_start; } static inline struct vm_area_struct *binder_alloc_get_vma( @@ -330,11 +329,9 @@ static inline struct vm_area_struct *binder_alloc_get_vma( { struct vm_area_struct *vma = NULL; - if (alloc->vma) { - /* Look at description in binder_alloc_set_vma */ - smp_rmb(); - vma = alloc->vma; - } + if (alloc->vma_addr) + vma = vma_lookup(alloc->vma_vm_mm, alloc->vma_addr); + return vma; } @@ -817,7 +814,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) buffers = 0; mutex_lock(&alloc->mutex); - BUG_ON(alloc->vma); + BUG_ON(alloc->vma_addr && + vma_lookup(alloc->vma_vm_mm, alloc->vma_addr)); while ((n = rb_first(&alloc->allocated_buffers))) { buffer = rb_entry(n, struct binder_buffer, rb_node); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 7dea57a84c79..1e4fd37af5e0 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,7 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct vm_area_struct *vma; + unsigned long vma_addr; struct mm_struct *vma_vm_mm; void __user *buffer; struct list_head buffers; diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index c2b323bc3b3a..43a881073a42 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -287,7 +287,7 @@ void binder_selftest_alloc(struct binder_alloc *alloc) if (!binder_selftest_run) return; mutex_lock(&binder_selftest_lock); - if (!binder_selftest_run || !alloc->vma) + if (!binder_selftest_run || !alloc->vma_addr) goto done; pr_info("STARTED\n"); binder_selftest_alloc_offset(alloc, end_offset, 0); From b0cab80ecd54ae3b2356bb081af0bffd538c8265 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Jun 2022 15:18:59 +0000 Subject: [PATCH 242/282] android: binder: fix lockdep check on clearing vma 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When munmapping a vma, the mmap_lock can be downgraded to a read lock before calling close() on the file handle. The binder close() function calls binder_alloc_set_vma() to clear the vma address, which now has a lockdep check for writing on the mmap_lock. Change the lockdep check to ensure the reading lock is held while clearing and keep the write check while writing. Link: https://lkml.kernel.org/r/20220627151857.2316964-1-Liam.Howlett@oracle.com Fixes: 472a68df605b ("android: binder: stop saving a pointer to the VMA") Signed-off-by: Liam R. Howlett Reported-by: syzbot+da54fa8d793ca89c741f@syzkaller.appspotmail.com Acked-by: Todd Kjos Cc: "Arve Hjønnevåg" Cc: Christian Brauner (Microsoft) Cc: Greg Kroah-Hartman Cc: Hridya Valsaraju Cc: Joel Fernandes Cc: Martijn Coenen Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index f555eebceef6..1014beb12802 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -315,12 +315,19 @@ static inline void binder_alloc_set_vma(struct binder_alloc *alloc, { unsigned long vm_start = 0; + /* + * Allow clearing the vma with holding just the read lock to allow + * munmapping downgrade of the write lock before freeing and closing the + * file using binder_alloc_vma_close().
+ */ if (vma) { vm_start = vma->vm_start; alloc->vma_vm_mm = vma->vm_mm; + mmap_assert_write_locked(alloc->vma_vm_mm); + } else { + mmap_assert_locked(alloc->vma_vm_mm); } - mmap_assert_write_locked(alloc->vma_vm_mm); alloc->vma_addr = vm_start; } From 7f82f922319ede486540e8746769865b9508d2c2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 18 Jun 2022 16:20:27 +0800 Subject: [PATCH 243/282] mm/mmap.c: fix missing call to vm_unacct_memory in mmap_region Since the beginning, charged is set to 0 to avoid calling vm_unacct_memory twice because vm_unacct_memory will be called by above unmap_region. But since commit 4f74d2c8e827 ("vm: remove 'nr_accounted' calculations from the unmap_vmas() interfaces"), unmap_region doesn't call vm_unacct_memory anymore. So charged shouldn't be set to 0 now otherwise the calling to paired vm_unacct_memory will be missed and leads to imbalanced account. Link: https://lkml.kernel.org/r/20220618082027.43391-1-linmiaohe@huawei.com Fixes: 4f74d2c8e827 ("vm: remove 'nr_accounted' calculations from the unmap_vmas() interfaces") Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/mmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index d529837bc8c3..ec4e0d53a388 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1847,7 +1847,6 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. */ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - charged = 0; if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); free_vma: From ccac11da679bc283a5fe3db694d9f4f40245a07e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 27 Jun 2022 21:23:51 +0800 Subject: [PATCH 244/282] filemap: minor cleanup for filemap_write_and_wait_range Restructure the logic in filemap_write_and_wait_range to simplify the code and make it more consistent with file_write_and_wait_range. No functional change intended. 
Link: https://lkml.kernel.org/r/20220627132351.55680-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/filemap.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ffdfbc8b0e3c..cd59f055e29d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -667,7 +667,7 @@ EXPORT_SYMBOL_GPL(filemap_range_has_writeback); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - int err = 0; + int err = 0, err2; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, @@ -678,18 +678,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ - if (err != -EIO) { - int err2 = filemap_fdatawait_range(mapping, - lstart, lend); - if (!err) - err = err2; - } else { - /* Clear any previously stored errors */ - filemap_check_errors(mapping); - } - } else { - err = filemap_check_errors(mapping); + if (err != -EIO) + __filemap_fdatawait_range(mapping, lstart, lend); } + err2 = filemap_check_errors(mapping); + if (!err) + err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); From d6e103a757fa7876e7ded76128d5dffe12402ab9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 1 Jul 2022 20:35:21 -0700 Subject: [PATCH 245/282] mm: memcontrol: do not miss MEMCG_MAX events for enforced allocations Yafang Shao reported an issue related to the accounting of bpf memory: if a bpf map is charged indirectly for memory consumed from an interrupt context and allocations are enforced, MEMCG_MAX events are not raised. It's not/less of an issue in a generic case because consequent allocations from a process context will trigger the direct reclaim and MEMCG_MAX events will be raised. 
However a bpf map can belong to a dying/abandoned memory cgroup, so there will be no allocations from a process context and no MEMCG_MAX events will be triggered. Link: https://lkml.kernel.org/r/20220702033521.64630-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reported-by: Yafang Shao Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c5bfb3eacd08..767f49a6b987 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2577,6 +2577,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, bool passed_oom = false; bool may_swap = true; bool drained = false; + bool raised_max_event = false; unsigned long pflags; retry: @@ -2616,6 +2617,7 @@ retry: goto nomem; memcg_memory_event(mem_over_limit, MEMCG_MAX); + raised_max_event = true; psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, @@ -2682,6 +2684,13 @@ nomem: if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) return -ENOMEM; force: + /* + * If the allocation has to be enforced, don't forget to raise + * a MEMCG_MAX event. + */ + if (!raised_max_event) + memcg_memory_event(mem_over_limit, MEMCG_MAX); + /* * The allocation either can't fail or will lead to more memory * being freed very soon. Allow memory usage go over the limit From 3b8e7f5c42d1aa44f71fd219717c80e34101361e Mon Sep 17 00:00:00 2001 From: Adam Sindelar Date: Mon, 4 Jul 2022 19:33:51 +0200 Subject: [PATCH 246/282] selftests/vm: fix errno handling in mrelease_test mrelease_test should return KSFT_SKIP when process_mrelease is not defined, but due to a perror call consuming the errno, it returns KSFT_FAIL. This patch decides the exit code before calling perror. 
[adam@wowsignal.io: fix remaining instances of errno mishandling] Link: https://lkml.kernel.org/r/20220706141602.10159-1-adam@wowsignal.io Link: https://lkml.kernel.org/r/20220704173351.19595-1-adam@wowsignal.io Fixes: 33776141b812 ("selftests: vm: add process_mrelease tests") Signed-off-by: Adam Sindelar Reviewed-by: David Vernet Reviewed-by: Suren Baghdasaryan Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/mrelease_test.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c index 96671c2f7d48..6c62966ab5db 100644 --- a/tools/testing/selftests/vm/mrelease_test.c +++ b/tools/testing/selftests/vm/mrelease_test.c @@ -62,19 +62,22 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) /* The process_mrelease calls in this test are expected to fail */ static void run_negative_tests(int pidfd) { + int res; /* Test invalid flags. Expect to fail with EINVAL error code. */ if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease with wrong flags"); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } /* * Test reaping while process is alive with no pending SIGKILL. * Expect to fail with EINVAL error code. */ if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease on a live process"); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } } @@ -100,8 +103,9 @@ int main(void) /* Test a wrong pidfd */ if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease with wrong pidfd"); - exit(errno == ENOSYS ? 
KSFT_SKIP : KSFT_FAIL); + exit(res); } /* Start the test with 1MB child memory allocation */ @@ -156,8 +160,9 @@ retry: run_negative_tests(pidfd); if (kill(pid, SIGKILL)) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("kill"); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); @@ -172,9 +177,10 @@ retry: if (errno == ESRCH) { retry = (size <= MAX_SIZE_MB); } else { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease"); waitpid(pid, NULL, 0); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } } From ac3ced5fc12fb3d7268054485cbc36441c05cf24 Mon Sep 17 00:00:00 2001 From: Adam Sindelar Date: Mon, 4 Jul 2022 14:38:13 +0200 Subject: [PATCH 247/282] selftests/vm: skip 128TBswitch on unsupported arch The test va_128TBswitch.c exercises a feature only supported on PPC and x86_64, but it's run on other 64-bit archs as well. Before this patch, the test did nothing and returned 0 for KSFT_PASS. This patch makes it return the KSFT codes from kselftest.h, including KSFT_SKIP when appropriate. Verified on arm64 and x86_64. 
Link: https://lkml.kernel.org/r/20220704123813.427625-1-adam@wowsignal.io Signed-off-by: Adam Sindelar Cc: David Vernet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/va_128TBswitch.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c index da6ec3b53ea8..1d2068989883 100644 --- a/tools/testing/selftests/vm/va_128TBswitch.c +++ b/tools/testing/selftests/vm/va_128TBswitch.c @@ -231,7 +231,7 @@ static struct testcase hugetlb_testcases[] = { static int run_test(struct testcase *test, int count) { void *p; - int i, ret = 0; + int i, ret = KSFT_PASS; for (i = 0; i < count; i++) { struct testcase *t = test + i; @@ -242,13 +242,13 @@ static int run_test(struct testcase *test, int count) if (p == MAP_FAILED) { printf("FAILED\n"); - ret = 1; + ret = KSFT_FAIL; continue; } if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { printf("FAILED\n"); - ret = 1; + ret = KSFT_FAIL; } else { /* * Do a dereference of the address returned so that we catch @@ -280,7 +280,7 @@ int main(int argc, char **argv) int ret; if (!supported_arch()) - return 0; + return KSFT_SKIP; ret = run_test(testcases, ARRAY_SIZE(testcases)); if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) From b717d6b93b54ec2a2a7d3bb7268d3ff847cc54c5 Mon Sep 17 00:00:00 2001 From: William Lam Date: Mon, 11 Jul 2022 21:28:06 +0100 Subject: [PATCH 248/282] mm: compaction: include compound page count for scanning in pageblock isolation The number of scanned pages can be lower than the number of isolated pages when isolating migratable or free pageblock. The metric is being reported in trace event and also used in vmstat. Some example output from trace where it shows nr_taken can be greater than nr_scanned: Produced by kernel v5.19-rc6 kcompactd0-42 [001] ..... 1210.268022: mm_compaction_isolate_migratepages: range=(0x107ae4 ~ 0x107c00) nr_scanned=265 nr_taken=255 [...]
kcompactd0-42 [001] ..... 1210.268382: mm_compaction_isolate_freepages: range=(0x215800 ~ 0x215a00) nr_scanned=13 nr_taken=128 kcompactd0-42 [001] ..... 1210.268383: mm_compaction_isolate_freepages: range=(0x215600 ~ 0x215680) nr_scanned=1 nr_taken=128 mm_compaction_isolate_migratepages does not seem to have this behaviour, but for the reason of consistency, nr_scanned should also be taken care of in that side. This behaviour is confusing since currently the count for isolated pages takes account of compound page but not for the case of scanned pages. And given that the number of isolated pages(nr_taken) reported in mm_compaction_isolate_template trace event is on a single-page basis, the ambiguity when reporting the number of scanned pages can be removed by also including compound page count. Link: https://lkml.kernel.org/r/20220711202806.22296-1-william.lam@bytedance.com Signed-off-by: William Lam Reviewed-by: Punit Agrawal Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index cd029ab03d0e..d024d18e0b5c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -616,6 +616,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; set_page_private(page, order); + nr_scanned += isolated - 1; total_isolated += isolated; cc->nr_freepages += isolated; list_add_tail(&page->lru, freelist); @@ -1101,6 +1102,7 @@ isolate_success: isolate_success_no_list: cc->nr_migratepages += compound_nr(page); nr_isolated += compound_nr(page); + nr_scanned += compound_nr(page) - 1; /* * Avoid isolating too much unless this block is being @@ -1504,6 +1506,7 @@ fast_isolate_freepages(struct compact_control *cc) if (__isolate_free_page(page, order)) { set_page_private(page, order); nr_isolated = 1 << order; + nr_scanned += nr_isolated - 1; cc->nr_freepages += nr_isolated; list_add_tail(&page->lru, &cc->freepages); count_compact_events(COMPACTISOLATED, nr_isolated); 
From 0f0b6931ff0d8de344392f5d470f88af64130709 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 16 Jul 2022 16:03:59 +0800 Subject: [PATCH 249/282] mm: remove obsolete comment in do_fault_around() Since commit 7267ec008b5c ("mm: postpone page table allocation until we have page to map"), do_fault_around is not called with page table lock held. Cleanup the corresponding comments. Link: https://lkml.kernel.org/r/20220716080359.38791-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7cd0d4c086db..8c193b3a32aa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4439,10 +4439,6 @@ late_initcall(fault_around_debugfs); * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. * - * This function is called with the page table lock taken. In the split ptlock - * case the page table lock only protects only those entries which belong to - * the page table corresponding to the fault address. - * * This function doesn't cross the VMA boundaries, in order to call map_pages() * only once. * From 450d0e74d886c172ac2f72518b797a18ee8d1327 Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Wed, 15 Jun 2022 10:27:42 +0000 Subject: [PATCH 250/282] memblock,arm64: expand the static memblock memory table In a system(Huawei Ascend ARM64 SoC) using HBM, a multi-bit ECC error occurs, and the BIOS will mark the corresponding area (for example, 2 MB) as unusable. When the system restarts next time, these areas are not reported or reported as EFI_UNUSABLE_MEMORY. Both cases lead to an increase in the number of memblocks, whereas EFI_UNUSABLE_MEMORY leads to a larger number of memblocks. For example, if the EFI_UNUSABLE_MEMORY type is reported: ... 
memory[0x92] [0x0000200834a00000-0x0000200835bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0 memory[0x93] [0x0000200835c00000-0x0000200835dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4 memory[0x94] [0x0000200835e00000-0x00002008367fffff], 0x0000000000a00000 bytes on node 7 flags: 0x0 memory[0x95] [0x0000200836800000-0x00002008369fffff], 0x0000000000200000 bytes on node 7 flags: 0x4 memory[0x96] [0x0000200836a00000-0x0000200837bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0 memory[0x97] [0x0000200837c00000-0x0000200837dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4 memory[0x98] [0x0000200837e00000-0x000020087fffffff], 0x0000000048200000 bytes on node 7 flags: 0x0 memory[0x99] [0x0000200880000000-0x0000200bcfffffff], 0x0000000350000000 bytes on node 6 flags: 0x0 memory[0x9a] [0x0000200bd0000000-0x0000200bd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4 memory[0x9b] [0x0000200bd0200000-0x0000200bd07fffff], 0x0000000000600000 bytes on node 6 flags: 0x0 memory[0x9c] [0x0000200bd0800000-0x0000200bd09fffff], 0x0000000000200000 bytes on node 6 flags: 0x4 memory[0x9d] [0x0000200bd0a00000-0x0000200fcfffffff], 0x00000003ff600000 bytes on node 6 flags: 0x0 memory[0x9e] [0x0000200fd0000000-0x0000200fd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4 memory[0x9f] [0x0000200fd0200000-0x0000200fffffffff], 0x000000002fe00000 bytes on node 6 flags: 0x0 ... The EFI memory map is parsed to construct the memblock arrays before the memblock arrays can be resized. As a result, memory regions beyond INIT_MEMBLOCK_REGIONS are lost. Add a new macro INIT_MEMBLOCK_MEMORY_REGIONS to replace INIT_MEMBLOCK_REGIONS to define the size of the static memblock.memory array. Allow overriding memblock.memory array size with architecture defined INIT_MEMBLOCK_MEMORY_REGIONS and make arm64 set INIT_MEMBLOCK_MEMORY_REGIONS to 1024 when CONFIG_EFI is enabled.
Link: https://lkml.kernel.org/r/20220615102742.96450-1-zhouguanghui1@huawei.com Signed-off-by: Zhou Guanghui Acked-by: Mike Rapoport Tested-by: Darren Hart Acked-by: Will Deacon [arm64] Reviewed-by: Anshuman Khandual Cc: Xu Qiang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/memory.h | 9 +++++++++ mm/memblock.c | 14 +++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 0af70d9abede..ce8614fa376a 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -364,6 +364,15 @@ void dump_mem_limit(void); # define INIT_MEMBLOCK_RESERVED_REGIONS (INIT_MEMBLOCK_REGIONS + NR_CPUS + 1) #endif +/* + * memory regions which marked with flag MEMBLOCK_NOMAP(for example, the memory + * of the EFI_UNUSABLE_MEMORY type) may divide a continuous memory block into + * multiple parts. As a result, the number of memory regions is large. + */ +#ifdef CONFIG_EFI +#define INIT_MEMBLOCK_MEMORY_REGIONS (INIT_MEMBLOCK_REGIONS * 8) +#endif + #include #endif /* __ASM_MEMORY_H */ diff --git a/mm/memblock.c b/mm/memblock.c index 749abd2685c4..b7ebf4b7e9d9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -29,6 +29,10 @@ # define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS #endif +#ifndef INIT_MEMBLOCK_MEMORY_REGIONS +#define INIT_MEMBLOCK_MEMORY_REGIONS INIT_MEMBLOCK_REGIONS +#endif + /** * DOC: memblock overview * @@ -55,9 +59,9 @@ * the allocator metadata. The "memory" and "reserved" types are nicely * wrapped with struct memblock. This structure is statically * initialized at build time. The region arrays are initially sized to - * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS - * for "reserved". The region array for "physmem" is initially sized to - * %INIT_PHYSMEM_REGIONS. + * %INIT_MEMBLOCK_MEMORY_REGIONS for "memory" and + * %INIT_MEMBLOCK_RESERVED_REGIONS for "reserved". 
The region array + * for "physmem" is initially sized to %INIT_PHYSMEM_REGIONS. * The memblock_allow_resize() enables automatic resizing of the region * arrays during addition of new regions. This feature should be used * with care so that memory allocated for the region array will not @@ -102,7 +106,7 @@ unsigned long min_low_pfn; unsigned long max_pfn; unsigned long long max_possible_pfn; -static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS]; @@ -111,7 +115,7 @@ static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, .memory.cnt = 1, /* empty dummy entry */ - .memory.max = INIT_MEMBLOCK_REGIONS, + .memory.max = INIT_MEMBLOCK_MEMORY_REGIONS, .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, From fef3e9066d19230f661048ca86937d954c12cd50 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 14 Jul 2022 16:41:47 +0800 Subject: [PATCH 251/282] writeback: remove inode_to_wb_is_valid() inode_to_wb_is_valid() is no longer used since commit fe55d563d417 ("remove inode_congested()"), remove it. 
Link: https://lkml.kernel.org/r/20220714084147.140324-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Johannes Thumshirn Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e84b745a6811..439815cc1ab9 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -229,18 +229,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** - * inode_to_wb_is_valid - test whether an inode has a wb associated - * @inode: inode of interest - * - * Returns %true if @inode has a wb associated. May be called without any - * locking. - */ -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return inode->i_wb; -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -339,11 +327,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return true; -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; From c7e6f17b52e9486a9d997368819dfec032b550e2 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 14 Jul 2022 16:07:57 +0800 Subject: [PATCH 252/282] zsmalloc: zs_malloc: return ERR_PTR on failure zs_malloc returns 0 if it fails. zs_zpool_malloc will return -1 when zs_malloc returns 0. But -1 makes the return value unclear. For example, when zswap_frontswap_store calls zs_malloc through zs_zpool_malloc, it will return -1 to its caller. The other return value is -EINVAL, -ENODEV or something else. This commit changes zs_malloc to return ERR_PTR on failure. It didn't just let zs_zpool_malloc return -ENOMEM because zs_malloc has two types of failure: - size is not OK return -EINVAL - memory alloc fail return -ENOMEM.
Link: https://lkml.kernel.org/r/20220714080757.12161-1-teawater@gmail.com Signed-off-by: Hui Zhu Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jens Axboe Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 4 ++-- mm/zsmalloc.c | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3e281a193feb..9d3b06d5dc56 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1389,9 +1389,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, __GFP_HIGHMEM | __GFP_MOVABLE); - if (unlikely(!handle)) { + if (IS_ERR((void *)handle)) { zcomp_stream_put(zram->comp); - return -ENOMEM; + return PTR_ERR((void *)handle); } alloced_pages = zs_get_total_pages(zram->mem_pool); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f24b71568e83..9e13fd7ee635 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -399,7 +399,10 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { *handle = zs_malloc(pool, size, gfp); - return *handle ? 0 : -1; + + if (IS_ERR((void *)(*handle))) + return PTR_ERR((void *)*handle); + return 0; } static void zs_zpool_free(void *pool, unsigned long handle) { @@ -1400,7 +1403,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, * @gfp: gfp flags when allocating object * * On success, handle to the allocated object is returned, - * otherwise 0. + * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
*/ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) @@ -1411,11 +1414,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) struct zspage *zspage; if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) - return 0; + return (unsigned long)ERR_PTR(-EINVAL); handle = cache_alloc_handle(pool, gfp); if (!handle) - return 0; + return (unsigned long)ERR_PTR(-ENOMEM); /* extra space in chunk to keep the handle */ size += ZS_HANDLE_SIZE; @@ -1440,7 +1443,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) zspage = alloc_zspage(pool, class, gfp); if (!zspage) { cache_free_handle(pool, handle); - return 0; + return (unsigned long)ERR_PTR(-ENOMEM); } spin_lock(&class->lock); From 73b73bac90d97400e29e585c678c4d0ebfd2680d Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 14 Jul 2022 06:49:18 +0000 Subject: [PATCH 253/282] mm: vmpressure: don't count proactive reclaim in vmpressure memory.reclaim is a cgroup v2 interface that allows users to proactively reclaim memory from a memcg, without real memory pressure. Reclaim operations invoke vmpressure, which is used: (a) To notify userspace of reclaim efficiency in cgroup v1, and (b) As a signal for a memcg being under memory pressure for networking (see mem_cgroup_under_socket_pressure()). For (a), vmpressure notifications in v1 are not affected by this change since memory.reclaim is a v2 feature. For (b), the effects of the vmpressure signal (according to Shakeel [1]) are as follows: 1. Reducing send and receive buffers of the current socket. 2. May drop packets on the rx path. 3. May throttle current thread on the tx path. Since proactive reclaim is invoked directly by userspace, not by memory pressure, it makes sense not to throttle networking. Hence, this change makes sure that proactive reclaim caused by memory.reclaim does not trigger vmpressure. 
[1] https://lore.kernel.org/lkml/CALvZod68WdrXEmBpOkadhB5GPYmCXaDZzXH=yyGOCAjFRn4NDQ@mail.gmail.com/ [yosryahmed@google.com: update documentation] Link: https://lkml.kernel.org/r/20220721173015.2643248-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20220714064918.2576464-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: David Rientjes Cc: Johannes Weiner Cc: Roman Gushchin Cc: Muchun Song Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Miaohe Lin Cc: NeilBrown Cc: Alistair Popple Cc: Suren Baghdasaryan Cc: Peter Xu Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 7 +++++++ include/linux/swap.h | 5 ++++- mm/memcontrol.c | 24 +++++++++++++--------- mm/vmscan.c | 27 ++++++++++++++++--------- 4 files changed, 42 insertions(+), 21 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index ad9ba3ec90a5..376d0207d1f7 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1229,6 +1229,13 @@ PAGE_SIZE multiple when read back. the target cgroup. If less bytes are reclaimed than the specified amount, -EAGAIN is returned. + Please note that the proactive reclaim (triggered by this + interface) is not meant to indicate memory pressure on the + memory cgroup. Therefore socket memory balancing triggered by + the memory reclaim normally is not exercised in this case. + This means that the networking layer will not adapt based on + reclaim induced by memory.reclaim. + memory.peak A read-only single value file which exists on non-root cgroups. 
diff --git a/include/linux/swap.h b/include/linux/swap.h index 6d11c51b2b62..ea895b40e6ff 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -411,10 +411,13 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page, extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); + +#define MEMCG_RECLAIM_MAY_SWAP (1 << 1) +#define MEMCG_RECLAIM_PROACTIVE (1 << 2) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - bool may_swap); + unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 767f49a6b987..2b831cc48c7d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2330,7 +2330,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, - gfp_mask, true); + gfp_mask, + MEMCG_RECLAIM_MAY_SWAP); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2575,7 +2576,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; unsigned long nr_reclaimed; bool passed_oom = false; - bool may_swap = true; + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; bool drained = false; bool raised_max_event = false; unsigned long pflags; @@ -2593,7 +2594,7 @@ retry: mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { mem_over_limit = mem_cgroup_from_counter(counter, memsw); - may_swap = false; + reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; } if (batch > nr_pages) { @@ -2621,7 +2622,7 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, may_swap); + gfp_mask, 
reclaim_options); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3439,8 +3440,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, continue; } - if (!try_to_free_mem_cgroup_pages(memcg, 1, - GFP_KERNEL, !memsw)) { + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { ret = -EBUSY; break; } @@ -3550,7 +3551,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) if (signal_pending(current)) return -EINTR; - if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true)) + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + MEMCG_RECLAIM_MAY_SWAP)) nr_retries--; } @@ -6302,7 +6304,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, true); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); if (!reclaimed && !nr_retries--) break; @@ -6351,7 +6353,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, true)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) nr_reclaims--; continue; } @@ -6480,6 +6482,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; + unsigned int reclaim_options; int err; buf = strstrip(buf); @@ -6487,6 +6490,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (err) return err; + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6503,7 +6507,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, true); + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; 
diff --git a/mm/vmscan.c b/mm/vmscan.c index fbb4108250ee..9e7d8db42918 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -101,6 +101,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Proactive reclaim invoked by userspace through memory.reclaim */ + unsigned int proactive:1; + /* * Cgroup memory below memory.low is protected as long as we * don't threaten to OOM. If any cgroup is reclaimed at @@ -3180,9 +3183,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) sc->priority); /* Record the group's reclaim efficiency */ - vmpressure(sc->gfp_mask, memcg, false, - sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); } @@ -3305,9 +3309,10 @@ again: } /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); + if (!sc->proactive) + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; @@ -3589,8 +3594,9 @@ retry: __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { - vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, - sc->priority); + if (!sc->proactive) + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); sc->nr_scanned = 0; shrink_zones(zonelist, sc); @@ -3880,7 +3886,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - bool may_swap) + unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -3893,7 +3899,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, 
.priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, - .may_swap = may_swap, + .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), + .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put From 188043c7f4f2bd662f2a55957d684fffa543e600 Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Thu, 14 Jul 2022 14:37:46 +0800 Subject: [PATCH 254/282] mm/damon/reclaim: fix potential memory leak in damon_reclaim_init() damon_reclaim_init() allocates a memory chunk for ctx with damon_new_ctx(). When damon_select_ops() fails, ctx is not released, which will lead to a memory leak. We should release the ctx with damon_destroy_ctx() when damon_select_ops() fails to fix the memory leak. Link: https://lkml.kernel.org/r/20220714063746.2343549-1-niejianglei2021@163.com Fixes: 4d69c3457821 ("mm/damon/reclaim: use damon_select_ops() instead of damon_{v,p}a_set_operations()") Signed-off-by: Jianglei Nie Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e69b807fefe4..a7faf51b4bd4 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -435,8 +435,10 @@ static int __init damon_reclaim_init(void) if (!ctx) return -ENOMEM; - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); return -EINVAL; + } ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; From e408e695f5f1f60d784913afc45ff2c387a5aeb8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Jul 2022 21:59:12 -0400 Subject: [PATCH 255/282] mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs This allows userspace to set flags like FS_APPEND_FL, FS_IMMUTABLE_FL, FS_NODUMP_FL, etc., like all other standard Linux file systems. 
[akpm@linux-foundation.org: fix CONFIG_TMPFS_XATTR=n warnings] Link: https://lkml.kernel.org/r/20220715015912.2560575-1-tytso@mit.edu Signed-off-by: Theodore Ts'o Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 11 +++++++ mm/shmem.c | 64 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 1 deletion(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a68f982f22d1..1b6c4013f691 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -25,9 +25,20 @@ struct shmem_inode_info { struct simple_xattrs xattrs; /* list of xattrs */ atomic_t stop_eviction; /* hold when working on inode */ struct timespec64 i_crtime; /* file creation time */ + unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ struct inode vfs_inode; }; +#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE +#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. 
*/ +#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ diff --git a/mm/shmem.c b/mm/shmem.c index 12ac67dc831f..06871a913b49 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1058,6 +1059,15 @@ static int shmem_getattr(struct user_namespace *mnt_userns, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); } + if (info->fsflags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (info->fsflags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); if (shmem_is_huge(NULL, inode, 0)) @@ -2272,7 +2282,18 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, +/* Mask out flags that are inappropriate for the given type of inode. */ +static unsigned shmem_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & SHMEM_REG_FLMASK; + else + return flags & SHMEM_OTHER_FLMASK; +} + +static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode; @@ -2297,6 +2318,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; info->i_crtime = inode->i_mtime; + info->fsflags = (dir == NULL) ? 
0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + info->fsflags = shmem_mask_flags(mode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -3138,6 +3162,40 @@ static const char *shmem_get_link(struct dentry *dentry, } #ifdef CONFIG_TMPFS_XATTR + +static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + + fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); + + return 0; +} + +static int shmem_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + + info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + (fa->flags & SHMEM_FL_USER_MODIFIABLE); + + inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME); + if (info->fsflags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (info->fsflags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + + inode->i_ctime = current_time(inode); + return 0; +} + /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". 
As soon as we have any other xattrs @@ -3828,6 +3886,8 @@ static const struct inode_operations shmem_inode_operations = { #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif }; @@ -3847,6 +3907,8 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, From 9b7a4039d6856f66521486da68c76838929039eb Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Mon, 18 Jul 2022 03:55:06 +0800 Subject: [PATCH 256/282] tools/vm/page_owner_sort.c: adjust the indent in is_need() I noticed one more indentation than necessary in is_need(). Link: https://lkml.kernel.org/r/20220717195506.7602-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Signed-off-by: Andrew Morton --- tools/vm/page_owner_sort.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 74c3dcecf64d..ec2e67c85b84 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -470,23 +470,23 @@ static bool match_str_list(const char *str, char **list, int list_size) static bool is_need(char *buf) { - if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0) - return false; - if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) - return false; - if ((filter & FILTER_TGID) && - !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) - return false; + if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0) + return false; + if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) + return false; + if ((filter & FILTER_TGID) && + !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) + return false; - char 
*comm = get_comm(buf); + char *comm = get_comm(buf); - if ((filter & FILTER_COMM) && - !match_str_list(comm, fc.comms, fc.comms_size)) { - free(comm); - return false; - } + if ((filter & FILTER_COMM) && + !match_str_list(comm, fc.comms, fc.comms_size)) { free(comm); - return true; + return false; + } + free(comm); + return true; } static void add_list(char *buf, int len, char *ext_buf) From 4d8ff64097092701a5e5506d0d7f643d421e0432 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 16 Jul 2022 16:18:16 +0800 Subject: [PATCH 257/282] mm: remove unneeded PageAnon check in restore_exclusive_pte() When code reaches here, the page must be !PageAnon. There's no need to check PageAnon again. Remove it. Link: https://lkml.kernel.org/r/20220716081816.10752-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 8c193b3a32aa..684c4c7cd2ff 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -744,7 +744,7 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, * Currently device exclusive access only supports anonymous * memory so the entry shouldn't point to a filebacked page. */ - WARN_ON_ONCE(!PageAnon(page)); + WARN_ON_ONCE(1); set_pte_at(vma->vm_mm, address, ptep, pte); From 189cdcfeeff31a285313c5132b81ae0b998dcad5 Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Mon, 18 Jul 2022 20:03:35 +0800 Subject: [PATCH 258/282] mm/page_alloc: correct the wrong cpuset file path in comment cpuset.c was moved to kernel/cgroup/ in below commit 201af4c0fab0 ("cgroup: move cgroup files under kernel/cgroup/") Correct the wrong path in comment. 
Link: https://lkml.kernel.org/r/20220718120336.5145-1-mark-pk.tsai@mediatek.com Signed-off-by: Mark-PK Tsai Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 50d96fff8855..4a56b0defcd1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4170,7 +4170,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, retry: /* * Scan zonelist, looking for a zone with enough free. - * See also __cpuset_node_allowed() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. */ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; From 6d97cf88ddde9c976d04b886b10b464ec8006c85 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 19 Jul 2022 19:52:33 +0800 Subject: [PATCH 259/282] mm/mempolicy: remove unneeded out label We can use unlock label to unlock ptl and return ret directly to remove the unneeded out label and reduce the size of mempolicy.o. No functional change intended. 
[Before] text data bss dec hex filename 26702 3972 6168 36842 8fea mm/mempolicy.o [After] text data bss dec hex filename 26662 3972 6168 36802 8fc2 mm/mempolicy.o Link: https://lkml.kernel.org/r/20220719115233.6706-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 88a5173c6ff0..b73d3248d976 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -465,9 +465,8 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, } page = pmd_page(*pmd); if (is_huge_zero_page(page)) { - spin_unlock(ptl); walk->action = ACTION_CONTINUE; - goto out; + goto unlock; } if (!queue_pages_required(page, qp)) goto unlock; @@ -484,7 +483,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = -EIO; unlock: spin_unlock(ptl); -out: return ret; } From 3d5367a0426da61c7cb616cc85b6239467e261dd Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Jul 2022 12:42:48 +0300 Subject: [PATCH 260/282] tools/testing/selftests/vm/hugetlb-madvise.c: silence uninitialized variable warning This code just reads from memory without caring about the data itself. However static checkers complain that "tmp" is never properly initialized. Initialize it to zero and change the name to "dummy" to show that we don't care about the value stored in it. 
Link: https://lkml.kernel.org/r/YtZ8mKJmktA2GaHB@kili Fixes: c4b6cb884011 ("selftests/vm: add hugetlb madvise MADV_DONTNEED MADV_REMOVE test") Signed-off-by: Dan Carpenter Acked-by: Souptick Joarder (HPE) Reviewed-by: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hugetlb-madvise.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c index 6c6af40f5747..3c9943131881 100644 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -89,10 +89,11 @@ void write_fault_pages(void *addr, unsigned long nr_pages) void read_fault_pages(void *addr, unsigned long nr_pages) { - unsigned long i, tmp; + unsigned long dummy = 0; + unsigned long i; for (i = 0; i < nr_pages; i++) - tmp += *((unsigned long *)(addr + (i * huge_page_size))); + dummy += *((unsigned long *)(addr + (i * huge_page_size))); } int main(int argc, char **argv) From 198729c9627a754b26aebdc8a26e559424c8f06c Mon Sep 17 00:00:00 2001 From: Kassey Li Date: Tue, 19 Jul 2022 17:15:54 +0800 Subject: [PATCH 261/282] mm/cma_debug.c: align the name buffer length as struct cma Avoids truncating the debugfs output to 16 chars. Potentially alters the userspace output, but this is a debugfs interface and there are no stability guarantees. 
Link: https://lkml.kernel.org/r/20220719091554.27864-1-quic_yingangl@quicinc.com Signed-off-by: Kassey Li Cc: Sasha Levin Cc: Joonsoo Kim Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/cma_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 2e7704955f4f..c3ffe253e055 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -163,7 +163,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { struct dentry *tmp; - char name[16]; + char name[CMA_MAX_NAME]; scnprintf(name, sizeof(name), "cma-%s", cma->name); From 360b420dbded8ad5b70a41de98e77354dd9e7d36 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Jul 2022 12:04:14 +0300 Subject: [PATCH 262/282] selftest/vm: uninitialized variable in main() Initialize "length" to zero by default. Link: https://lkml.kernel.org/r/YtZzjvHXVXMXxpXO@kili Fixes: ff712a627f72 ("selftests/vm: cleanup hugetlb file after mremap test") Signed-off-by: Dan Carpenter Reviewed-by: Mina Almasry Reviewed-by: Muchun Song Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hugepage-mremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 585978f181ed..e63a0214f639 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -107,7 +107,7 @@ static void register_region_with_uffd(char *addr, size_t len) int main(int argc, char *argv[]) { - size_t length; + size_t length = 0; if (argc != 2 && argc != 3) { printf("Usage: %s [length_in_MB] \n", argv[0]); From 2727cfe4072a35ce813e3708f74c135de7da8897 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 23 Jul 2022 15:38:04 +0800 Subject: [PATCH 263/282] hugetlb_cgroup: fix wrong hugetlb cgroup numa stat We forget to set cft->private 
for numa stat file. As a result, numa stat of hstates[0] is always shown for all hstates. Encode the hstates index into cft->private to fix this issue. Link: https://lkml.kernel.org/r/20220723073804.53035-1-linmiaohe@huawei.com Fixes: f47761999052 ("hugetlb: add hugetlb.*.numa_stat file") Signed-off-by: Miaohe Lin Acked-by: Muchun Song Cc: Kees Cook Cc: Mike Kravetz Cc: Mina Almasry Cc: Shakeel Butt Cc: Signed-off-by: Andrew Morton --- mm/hugetlb_cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f9942841df18..c86691c431fd 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -772,6 +772,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) /* Add the numa stat file */ cft = &h->cgroup_files_dfl[6]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_cgroup_read_numa_stat; cft->flags = CFTYPE_NOT_ON_ROOT; From 914eedcb9ba0ff53c3380829a024b7cef16accfb Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 22 Jul 2022 13:15:13 -0700 Subject: [PATCH 264/282] userfaultfd: don't fail on unrecognized features The basic interaction for setting up a userfaultfd is, userspace issues a UFFDIO_API ioctl, and passes in a set of zero or more feature flags, indicating the features they would prefer to use. Of course, different kernels may support different sets of features (depending on kernel version, kconfig options, architecture, etc). Userspace's expectations may also not match: perhaps it was built against newer kernel headers, which defined some features the kernel it's running on doesn't support. Currently, if userspace passes in a flag we don't recognize, the initialization fails and we return -EINVAL. This isn't great, though. Userspace doesn't have an obvious way to react to this; sure, one of the features I asked for was unavailable, but which one?
The only option it has is to turn off things "at random" and hope something works. Instead, modify UFFDIO_API to just ignore any unrecognized feature flags. The interaction is now that the initialization will succeed, and as always we return the *subset* of feature flags that can actually be used back to userspace. Now userspace has an obvious way to react: it checks if any flags it asked for are missing. If so, it can conclude this kernel doesn't support those, and it can either resign itself to not using them, or fail with an error on its own, or whatever else. Link: https://lkml.kernel.org/r/20220722201513.1624158-1-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Cc: Peter Xu Cc: Axel Rasmussen Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e943370107d0..4974da1f620c 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1923,10 +1923,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - features = uffdio_api.features; - ret = -EINVAL; - if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) - goto err_out; + /* Ignore unsupported features (userspace built against newer kernel) */ + features = uffdio_api.features & UFFD_API_FEATURES; ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; From 873f64b791a2b43c246e78b7d9fdd64ce909685b Mon Sep 17 00:00:00 2001 From: Jiebin Sun Date: Sat, 23 Jul 2022 00:49:49 +0800 Subject: [PATCH 265/282] mm/memcontrol.c: remove the redundant updating of stats_flush_threshold Remove the redundant updating of stats_flush_threshold. If the global var stats_flush_threshold has exceeded the trigger value for __mem_cgroup_flush_stats, further increment is unnecessary. Apply the patch and test the pts/hackbench-1.0.0 Count:4 (160 threads). 
Score gain: 1.95x Reduce CPU cycles in __mod_memcg_lruvec_state (44.88% -> 0.12%) CPU: ICX 8380 x 2 sockets Core number: 40 x 2 physical cores Benchmark: pts/hackbench-1.0.0 Count:4 (160 threads) Link: https://lkml.kernel.org/r/20220722164949.47760-1-jiebin.sun@intel.com Signed-off-by: Jiebin Sun Acked-by: Shakeel Butt Reviewed-by: Roman Gushchin Reviewed-by: Tim Chen Acked-by: Muchun Song Cc: Johannes Weiner Cc: Michal Hocko Cc: "Huang, Ying" Cc: Amadeusz Sawiski Signed-off-by: Andrew Morton --- mm/memcontrol.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b831cc48c7d..c65751ab4516 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -626,7 +626,14 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) x = __this_cpu_add_return(stats_updates, abs(val)); if (x > MEMCG_CHARGE_BATCH) { - atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + /* + * If stats_flush_threshold exceeds the threshold + * (>num_online_cpus()), cgroup stats update will be triggered + * in __mem_cgroup_flush_stats(). Increasing this var further + * is redundant and simply adds overhead in atomic update. + */ + if (atomic_read(&stats_flush_threshold) <= num_online_cpus()) + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); __this_cpu_write(stats_updates, 0); } } From 35fcd75af3edf035638e632bb49607cc8fc3cdf4 Mon Sep 17 00:00:00 2001 From: Shiyang Ruan Date: Thu, 9 Jun 2022 22:34:35 +0800 Subject: [PATCH 266/282] xfs: fail dax mount if reflink is enabled on a partition Failure notification is not supported on partitions. So, when we mount a reflink enabled xfs on a partition with dax option, let it fail with -EINVAL code. Link: https://lkml.kernel.org/r/20220609143435.393724-1-ruansy.fnst@fujitsu.com Signed-off-by: Shiyang Ruan Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Cc: Dave Chinner Signed-off-by: Andrew Morton --- fs/xfs/xfs_super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ed18160e6181..c440e90fecef 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -350,8 +350,10 @@ xfs_setup_dax_always( goto disable_dax; } - if (xfs_has_reflink(mp)) { - xfs_alert(mp, "DAX and reflink cannot be used together!"); + if (xfs_has_reflink(mp) && + bdev_is_partition(mp->m_ddev_targp->bt_bdev)) { + xfs_alert(mp, + "DAX and reflink cannot work with multi-partitions!"); return -EINVAL; } From 65974cb9107d9c6da18cefda22f84aeabc638b16 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 21 Jul 2022 12:05:52 +1000 Subject: [PATCH 267/282] mm/gup.c: fix formatting in check_and_migrate_movable_page() Commit b05a79d4377f ("mm/gup: migrate device coherent pages when pinning instead of failing") added a badly formatted if statement. Fix it. Link: https://lkml.kernel.org/r/20220721020552.1397598-2-apopple@nvidia.com Signed-off-by: Alistair Popple Reported-by: David Hildenbrand Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 364b274a10c2..c6d060dee9e0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1980,8 +1980,8 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, folio_nr_pages(folio)); } - if (!list_empty(&movable_page_list) || isolation_error_count - || coherent_pages) + if (!list_empty(&movable_page_list) || isolation_error_count || + coherent_pages) goto unpin_pages; /* From 68aaee147e597b495622b7c9038e5922c7c61f57 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 22 Jul 2022 19:45:39 +0900 Subject: [PATCH 268/282] mm: memcontrol: fix potential oom_lock recursion deadlock syzbot is reporting GFP_KERNEL allocation with oom_lock held when reporting memcg OOM [1]. 
If this allocation triggers the global OOM situation then the system can livelock because the GFP_KERNEL allocation with oom_lock held cannot trigger the global OOM killer because __alloc_pages_may_oom() fails to hold oom_lock. Fix this problem by removing the allocation from memory_stat_format() completely, and pass static buffer when calling from memcg OOM path. Note that the caller holding filesystem lock was the trigger for syzbot to report this locking dependency. Doing GFP_KERNEL allocation with filesystem lock held can deadlock the system even without involving OOM situation. Link: https://syzkaller.appspot.com/bug?extid=2d2aeadc6ce1e1f11d45 [1] Link: https://lkml.kernel.org/r/86afb39f-8c65-bec2-6cfc-c5e3cd600c0b@I-love.SAKURA.ne.jp Fixes: c8713d0b23123759 ("mm: memcontrol: dump memory.stat during cgroup OOM") Signed-off-by: Tetsuo Handa Reported-by: syzbot Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c65751ab4516..b69979c9ced5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1490,14 +1490,12 @@ static const unsigned int memcg_vm_event_stat[] = { #endif }; -static char *memory_stat_format(struct mem_cgroup *memcg) +static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) { struct seq_buf s; int i; - seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE); - if (!s.buffer) - return NULL; + seq_buf_init(&s, buf, bufsize); /* * Provide statistics on the state of the memory subsystem as @@ -1539,8 +1537,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg) /* The above should easily fit into one page */ WARN_ON_ONCE(seq_buf_has_overflowed(&s)); - - return s.buffer; } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -1576,7 +1572,10 @@ void mem_cgroup_print_oom_context(struct mem_cgroup 
*memcg, struct task_struct * */ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { - char *buf; + /* Use static buffer, for the caller is holding oom_lock. */ + static char buf[PAGE_SIZE]; + + lockdep_assert_held(&oom_lock); pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), @@ -1597,11 +1596,8 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) pr_info("Memory cgroup stats for "); pr_cont_cgroup_path(memcg->css.cgroup); pr_cont(":"); - buf = memory_stat_format(memcg); - if (!buf) - return; + memory_stat_format(memcg, buf, sizeof(buf)); pr_info("%s", buf); - kfree(buf); } /* @@ -6405,11 +6401,11 @@ static int memory_events_local_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - char *buf; + char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - buf = memory_stat_format(memcg); if (!buf) return -ENOMEM; + memory_stat_format(memcg, buf, PAGE_SIZE); seq_puts(m, buf); kfree(buf); return 0; From 76aefad628aae152207ee624a7981b9aa1a267d8 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 25 Jul 2022 10:20:46 -0400 Subject: [PATCH 269/282] mm/mprotect: fix soft-dirty check in can_change_pte_writable() Patch series "mm/mprotect: Fix soft-dirty checks", v4. This patch (of 3): The check was meant to make sure that, when soft-dirty tracking is enabled, we won't grant the write bit by accident, as a page fault is needed for dirty tracking. The intention is correct, but the check was wrong because VM_SOFTDIRTY being set actually means soft-dirty tracking is disabled. Fix it. Another tricky thing about soft-dirty is that we can't check the vma flag !(vma_flags & VM_SOFTDIRTY) directly; we can only check it after checking CONFIG_MEM_SOFT_DIRTY, because otherwise VM_SOFTDIRTY will be defined as zero, and !(vma_flags & VM_SOFTDIRTY) will constantly return true.
To avoid misuse, introduce a helper for checking whether vma has soft-dirty tracking enabled. We can easily verify this with any exclusive anonymous page, like program below: =======8<====== #include #include #include #include #include #include #include #include #include #include #include #include #include #define BIT_ULL(nr) (1ULL << (nr)) #define PM_SOFT_DIRTY BIT_ULL(55) unsigned int psize; char *page; uint64_t pagemap_read_vaddr(int fd, void *vaddr) { uint64_t value; int ret; ret = pread(fd, &value, sizeof(uint64_t), ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); assert(ret == sizeof(uint64_t)); return value; } void clear_refs_write(void) { int fd = open("/proc/self/clear_refs", O_RDWR); assert(fd >= 0); write(fd, "4", 2); close(fd); } #define check_soft_dirty(str, expect) do { \ bool dirty = pagemap_read_vaddr(fd, page) & PM_SOFT_DIRTY; \ if (dirty != expect) { \ printf("ERROR: %s, soft-dirty=%d (expect: %d) ", str, dirty, expect); \ exit(-1); \ } \ } while (0) int main(void) { int fd = open("/proc/self/pagemap", O_RDONLY); assert(fd >= 0); psize = getpagesize(); page = mmap(NULL, psize, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); assert(page != MAP_FAILED); *page = 1; check_soft_dirty("Just faulted in page", 1); clear_refs_write(); check_soft_dirty("Clear_refs written", 0); mprotect(page, psize, PROT_READ); check_soft_dirty("Marked RO", 0); mprotect(page, psize, PROT_READ|PROT_WRITE); check_soft_dirty("Marked RW", 0); *page = 2; check_soft_dirty("Wrote page again", 1); munmap(page, psize); close(fd); printf("Test passed. "); return 0; } =======8<====== Here we attach a Fixes to commit 64fe24a3e05e only for easy tracking, as this patch won't apply to a tree before that point. However the commit wasn't the source of problem, but instead 64e455079e1b. It's just that after 64fe24a3e05e anonymous memory will also suffer from this problem with mprotect(). 
Link: https://lkml.kernel.org/r/20220725142048.30450-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20220725142048.30450-2-peterx@redhat.com Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared") Fixes: 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection") Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Nadav Amit Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- mm/internal.h | 18 ++++++++++++++++++ mm/mmap.c | 2 +- mm/mprotect.c | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 899dab512c5a..caebaeb2e5c9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -862,4 +862,22 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) +{ + /* + * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty + * enablements, because when without soft-dirty being compiled in, + * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY) + * will be constantly true. + */ + if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) + return false; + + /* + * Soft-dirty is kind of special: its tracking is enabled when the + * vma flags not set. + */ + return !(vma->vm_flags & VM_SOFTDIRTY); +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index ec4e0d53a388..c035020d0c89 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1647,7 +1647,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) return 0; /* Do we need to track softdirty? */ - if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) + if (vma_soft_dirty_enabled(vma)) return 1; /* Specialty mapping? 
*/ diff --git a/mm/mprotect.c b/mm/mprotect.c index 8250c1315d9c..3a23dde73723 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -49,7 +49,7 @@ static inline bool can_change_pte_writable(struct vm_area_struct *vma, return false; /* Do we need write faults for softdirty tracking? */ - if ((vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte)) + if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) return false; /* Do we need write faults for uffd-wp tracking? */ From c942f5bd17b3a520710c6b80634be5c44aaa109e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 25 Jul 2022 10:20:47 -0400 Subject: [PATCH 270/282] selftests: soft-dirty: add test for mprotect Add two soft-dirty test cases for mprotect() on both anon or file. Link: https://lkml.kernel.org/r/20220725142048.30450-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Nadav Amit Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/soft-dirty.c | 67 ++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c index 08ab62a4a9d0..e3a43f5d4fa2 100644 --- a/tools/testing/selftests/vm/soft-dirty.c +++ b/tools/testing/selftests/vm/soft-dirty.c @@ -121,13 +121,76 @@ static void test_hugepage(int pagemap_fd, int pagesize) free(map); } +static void test_mprotect(int pagemap_fd, int pagesize, bool anon) +{ + const char *type[] = {"file", "anon"}; + const char *fname = "./soft-dirty-test-file"; + int test_fd; + char *map; + + if (anon) { + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (!map) + ksft_exit_fail_msg("anon mmap failed\n"); + } else { + test_fd = open(fname, O_RDWR | O_CREAT); + if (test_fd < 0) { + ksft_test_result_skip("Test %s open() file failed\n", __func__); + return; + } + unlink(fname); + ftruncate(test_fd, pagesize); + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_SHARED, test_fd, 0); + 
if (!map) + ksft_exit_fail_msg("file mmap failed\n"); + } + + *map = 1; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s dirty bit of new written page\n", + __func__, type[anon]); + clear_softdirty(); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after clear_refs\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after marking RO\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ|PROT_WRITE); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after marking RW\n", + __func__, type[anon]); + *map = 2; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s soft-dirty after rewritten\n", + __func__, type[anon]); + + munmap(map, pagesize); + + if (!anon) + close(test_fd); +} + +static void test_mprotect_anon(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, true); +} + +static void test_mprotect_file(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, false); +} + int main(int argc, char **argv) { int pagemap_fd; int pagesize; ksft_print_header(); - ksft_set_plan(5); + ksft_set_plan(15); pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); if (pagemap_fd < 0) @@ -138,6 +201,8 @@ int main(int argc, char **argv) test_simple(pagemap_fd, pagesize); test_vma_reuse(pagemap_fd, pagesize); test_hugepage(pagemap_fd, pagesize); + test_mprotect_anon(pagemap_fd, pagesize); + test_mprotect_file(pagemap_fd, pagesize); close(pagemap_fd); From 68deb82a7bfcf67c6491c2387215e038b525475f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 25 Jul 2022 10:20:48 -0400 Subject: [PATCH 271/282] selftests: add soft-dirty into run_vmtests.sh Link: https://lkml.kernel.org/r/20220725142048.30450-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Nadav Amit 
Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/run_vmtests.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index 2af563a9652e..de86983b8a0f 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -190,4 +190,6 @@ then run_test ./protection_keys_64 fi +run_test ./soft-dirty + exit $exitcode From f6c3e1ae0114cd0f5123cf38187d450c1b119e67 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Mon, 25 Jul 2022 11:36:15 -0700 Subject: [PATCH 272/282] mm/hmm: add a test for cross device private faults Add a simple test case for when hmm_range_fault() is called with the HMM_PFN_REQ_FAULT flag and a device private PTE is found for a device other than the hmm_range::dev_private_owner. This should cause the page to be faulted back to system memory from the other device and the PFN returned in the output array. Also, remove a piece of code that unnecessarily unmaps part of the buffer. Link: https://lkml.kernel.org/r/20220727000837.4128709-3-rcampbell@nvidia.com Link: https://lkml.kernel.org/r/20220725183615.4118795-3-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Alistair Popple Cc: Felix Kuehling Cc: Philip Yang Cc: Jason Gunthorpe Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hmm-tests.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 716b62c05e3d..939a33dc5dc6 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1603,9 +1603,19 @@ TEST_F(hmm2, double_map) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) ASSERT_EQ(ptr[i], i); - /* Punch a hole after the first page address. */ - ret = munmap(buffer->ptr + self->page_size, self->page_size); + /* Migrate pages to device 1 and try to read from device 0. 
*/ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages); ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what device 0 read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); hmm_buffer_free(buffer); } From bb077c3ffd5362a6d9e60574e1bcc83fe8e3fb27 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 26 Jul 2022 21:18:16 +0800 Subject: [PATCH 273/282] mm: cleanup is_highmem() It is unnecessary to add CONFIG_HIGHMEM check in is_highmem(), which has been done in is_highmem_idx(), and move is_highmem() close to is_highmem_idx(). This has no functional impact. Link: https://lkml.kernel.org/r/20220726131816.149075-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 578247a341b2..e24b40c52468 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1137,15 +1137,6 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void); -#else -static inline bool has_managed_dma(void) -{ - return false; -} -#endif - /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. 
This is an attempt to keep references @@ -1155,13 +1146,18 @@ static inline bool has_managed_dma(void) */ static inline int is_highmem(struct zone *zone) { -#ifdef CONFIG_HIGHMEM return is_highmem_idx(zone_idx(zone)); -#else - return 0; -#endif } +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void); +#else +static inline bool has_managed_dma(void) +{ + return false; +} +#endif + /* These two functions are used to setup the per zone pages min values */ struct ctl_table; From d00365175e092a3144ecbee3181f46682b5b5e97 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 26 Jul 2022 22:29:14 +0800 Subject: [PATCH 274/282] hugetlbfs: use helper macro SZ_1{K,M} Patch series "A few cleanup and fixup patches for hugetlbfs", v2. This series contains a few cleanup patches to remove unneeded forward declaration, use helper macro and so on. More details can be found in the respective changelogs. This patch (of 5): Use helper macro SZ_1K and SZ_1M to do the size conversion. Minor readability improvement. 
Link: https://lkml.kernel.org/r/20220726142918.51693-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220726142918.51693-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 02eb72351b15..3f5870769451 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1320,7 +1320,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par ps = memparse(param->string, &rest); ctx->hstate = size_to_hstate(ps); if (!ctx->hstate) { - pr_err("Unsupported page size %lu MB\n", ps >> 20); + pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); return -EINVAL; } return 0; @@ -1566,7 +1566,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) } if (IS_ERR(mnt)) pr_err("Cannot mount internal hugetlbfs for page size %luK", - huge_page_size(h) >> 10); + huge_page_size(h) / SZ_1K); return mnt; } From 7ec3c362cfc4cf7aa8d29416c34d0dc0817cf9aa Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 26 Jul 2022 22:29:15 +0800 Subject: [PATCH 275/282] hugetlbfs: remove unneeded hugetlbfs_ops forward declaration The forward declaration for hugetlbfs_ops is unnecessary. Remove it. 
Link: https://lkml.kernel.org/r/20220726142918.51693-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3f5870769451..f6247d588816 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -40,7 +40,6 @@ #include #include -static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; From 990e52b17d048970cb9629f1d546542b2c256f3a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 26 Jul 2022 22:29:16 +0800 Subject: [PATCH 276/282] hugetlbfs: remove unneeded header file The header file signal.h is unneeded now. Remove it. Link: https://lkml.kernel.org/r/20220726142918.51693-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f6247d588816..7d780d653850 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -11,7 +11,6 @@ #include #include -#include /* remove ASAP */ #include #include #include From 445c809829dc164c5b1917c72e335bd294aa4dc8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 26 Jul 2022 22:29:17 +0800 Subject: [PATCH 277/282] hugetlbfs: cleanup some comments in inode.c The function generic_file_buffered_read has been renamed to filemap_read since commit 87fa0f3eb267 ("mm/filemap: rename generic_file_buffered_read to filemap_read"). Update the corresponding comment. And duplicated taken in hugetlbfs_fill_super is removed. 
Link: https://lkml.kernel.org/r/20220726142918.51693-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7d780d653850..998672be99c5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -323,8 +323,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, /* * Support for read() - Find the page attached to f_mapping and copy out the - * data. Its *very* similar to generic_file_buffered_read(), we can't use that - * since it has PAGE_SIZE assumptions. + * data. This provides functionality similar to filemap_read(). */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -1394,7 +1393,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimum size) are taken - * taken when the subpool is created. + * when the subpool is created. */ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { sbinfo->spool = hugepage_new_subpool(ctx->hstate, From 116807634569bdb9defe1c01e442e76e4f432961 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 26 Jul 2022 22:29:18 +0800 Subject: [PATCH 278/282] hugetlbfs: fix inaccurate comment in hugetlbfs_statfs() In some cases, e.g. when size option is not specified, f_blocks, f_bavail and f_bfree will be set to -1 instead of 0. Likewise, when nr_inodes isn't specified, f_files and f_ffree will be set to -1 too. Update the comment to make this clear. 
Link: https://lkml.kernel.org/r/20220726142918.51693-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 998672be99c5..be22cffbb579 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1090,7 +1090,7 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); - /* If no limits set, just report 0 for max/free/used + /* If no limits set, just report 0 or -1 for max/free/used * blocks, like simple_statfs() */ if (sbinfo->spool) { long free_pages; From 07252dfea2c7089bca68949710268cbbb0ce509e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 26 Jul 2022 21:11:35 +0800 Subject: [PATCH 279/282] mm: use is_zone_movable_page() helper Use is_zone_movable_page() helper to simplify code. 
Link: https://lkml.kernel.org/r/20220726131135.146912-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Acked-by: Jason Wang Signed-off-by: Andrew Morton --- drivers/virtio/virtio_mem.c | 6 ++---- mm/memory_hotplug.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c index e07486f01999..0c2892ec6817 100644 --- a/drivers/virtio/virtio_mem.c +++ b/drivers/virtio/virtio_mem.c @@ -862,8 +862,7 @@ static void virtio_mem_sbm_notify_online(struct virtio_mem *vm, unsigned long mb_id, unsigned long start_pfn) { - const bool is_movable = page_zonenum(pfn_to_page(start_pfn)) == - ZONE_MOVABLE; + const bool is_movable = is_zone_movable_page(pfn_to_page(start_pfn)); int new_state; switch (virtio_mem_sbm_get_mb_state(vm, mb_id)) { @@ -1158,8 +1157,7 @@ static void virtio_mem_fake_online(unsigned long pfn, unsigned long nr_pages) */ static int virtio_mem_fake_offline(unsigned long pfn, unsigned long nr_pages) { - const bool is_movable = page_zonenum(pfn_to_page(pfn)) == - ZONE_MOVABLE; + const bool is_movable = is_zone_movable_page(pfn_to_page(pfn)); int rc, retry_count; /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 99ecb2b3ff53..fad6d1f2262a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -932,7 +932,7 @@ static struct zone *auto_movable_zone_for_pfn(int nid, if (!page) continue; /* If anything is !MOVABLE online the rest !MOVABLE. 
*/ - if (page_zonenum(page) != ZONE_MOVABLE) + if (!is_zone_movable_page(page)) goto kernel_zone; online_pages += PAGES_PER_SECTION; } From 96f96763de26d6ee333d5b2446d1b04a4e6bc75b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 26 Jul 2022 16:10:46 +0800 Subject: [PATCH 280/282] mm: memory-failure: convert to pr_fmt() Use pr_fmt to prefix all pr_ output, but unpoison_memory() and soft_offline_page() are used by error injection, which have own prefixes like "Unpoison:" and "soft offline:", meanwhile, soft_offline_page() could be used by memory hotremove, so reset pr_fmt before unpoison_pr_info definition to keep the original output for them. [wangkefeng.wang@huawei.com: v3] Link: https://lkml.kernel.org/r/20220729031919.72331-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20220726081046.10742-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-failure.c | 58 ++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c9931c676335..cffc7cd24d8d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -33,6 +33,9 @@ * are rare we hope to get away with this. This avoids impacting the core * VM. */ + +#define pr_fmt(fmt) "Memory failure: " fmt + #include #include #include @@ -252,7 +255,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) short addr_lsb = tk->size_shift; int ret = 0; - pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", + pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n", pfn, t->comm, t->pid); if ((flags & MF_ACTION_REQUIRED) && (t == current)) @@ -270,7 +273,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, addr_lsb, t); /* synchronous? 
*/ if (ret < 0) - pr_info("Memory failure: Error sending signal to %s:%d: %d\n", + pr_info("Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); return ret; } @@ -352,7 +355,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); if (!tk) { - pr_err("Memory failure: Out of memory while machine check handling\n"); + pr_err("Out of memory while machine check handling\n"); return; } @@ -379,7 +382,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * has a mapping for the page. */ if (tk->addr == -EFAULT) { - pr_info("Memory failure: Unable to find user space address %lx in %s\n", + pr_info("Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); } else if (tk->size_shift == 0) { kfree(tk); @@ -412,7 +415,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, * signal and then access the memory. Just kill it. */ if (fail || tk->addr == -EFAULT) { - pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", pfn, tk->tsk->comm, tk->tsk->pid); do_send_sig_info(SIGKILL, SEND_SIG_PRIV, tk->tsk, PIDTYPE_PID); @@ -425,7 +428,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, * process anyways. 
*/ else if (kill_proc(tk, pfn, flags) < 0) - pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n", + pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); } put_task_struct(tk->tsk); @@ -816,12 +819,10 @@ static int truncate_error_page(struct page *p, unsigned long pfn, int err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { - pr_info("Memory failure: %#lx: Failed to punch page: %d\n", - pfn, err); + pr_info("%#lx: Failed to punch page: %d\n", pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_info("Memory failure: %#lx: failed to release buffers\n", - pfn); + pr_info("%#lx: failed to release buffers\n", pfn); } else { ret = MF_RECOVERED; } @@ -833,8 +834,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, if (invalidate_inode_page(p)) ret = MF_RECOVERED; else - pr_info("Memory failure: %#lx: Failed to invalidate\n", - pfn); + pr_info("%#lx: Failed to invalidate\n", pfn); } return ret; @@ -864,7 +864,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p, count -= 1; if (count > 0) { - pr_err("Memory failure: %#lx: %s still referenced by %d users\n", + pr_err("%#lx: %s still referenced by %d users\n", page_to_pfn(p), action_page_types[ps->type], count); return true; } @@ -888,7 +888,7 @@ static int me_kernel(struct page_state *ps, struct page *p) */ static int me_unknown(struct page_state *ps, struct page *p) { - pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p)); + pr_err("%#lx: Unknown page state\n", page_to_pfn(p)); unlock_page(p); return MF_FAILED; } @@ -1173,7 +1173,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type, trace_memory_failure_event(pfn, type, result); num_poisoned_pages_inc(); - pr_err("Memory failure: %#lx: recovery action for %s: %s\n", + pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], 
action_name[result]); } @@ -1248,8 +1248,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) if (head == compound_head(page)) return 1; - pr_info("Memory failure: %#lx cannot catch tail\n", - page_to_pfn(page)); + pr_info("%#lx cannot catch tail\n", page_to_pfn(page)); put_page(head); } @@ -1312,7 +1311,7 @@ try_again: } out: if (ret == -EIO) - pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p)); + pr_err("%#lx: unhandlable page.\n", page_to_pfn(p)); return ret; } @@ -1411,13 +1410,12 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, return true; if (PageKsm(p)) { - pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); + pr_err("%#lx: can't handle KSM pages.\n", pfn); return false; } if (PageSwapCache(p)) { - pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n", - pfn); + pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); ttu |= TTU_IGNORE_HWPOISON; } @@ -1435,7 +1433,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; - pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n", + pr_info("%#lx: corrupted page was clean: dropped without side effects\n", pfn); } } @@ -1464,14 +1462,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); i_mmap_unlock_write(mapping); } else - pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn); + pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn); } else { try_to_unmap(folio, ttu); } unmap_success = !page_mapped(hpage); if (!unmap_success) - pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", + pr_err("%#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); /* @@ -1732,7 +1730,7 @@ retry: *hugetlb = 0; return 0; } else if (res == -EHWPOISON) { - pr_err("Memory failure: %#lx: already hardware 
poisoned\n", pfn); + pr_err("%#lx: already hardware poisoned\n", pfn); if (flags & MF_ACTION_REQUIRED) { head = compound_head(p); res = kill_accessing_process(current, page_to_pfn(head), flags); @@ -1901,8 +1899,7 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; } } - pr_err("Memory failure: %#lx: memory outside kernel control\n", - pfn); + pr_err("%#lx: memory outside kernel control\n", pfn); res = -ENXIO; goto unlock_mutex; } @@ -1913,8 +1910,7 @@ try_again: goto unlock_mutex; if (TestSetPageHWPoison(p)) { - pr_err("Memory failure: %#lx: already hardware poisoned\n", - pfn); + pr_err("%#lx: already hardware poisoned\n", pfn); res = -EHWPOISON; if (flags & MF_ACTION_REQUIRED) res = kill_accessing_process(current, pfn, flags); @@ -2130,7 +2126,7 @@ void memory_failure_queue(unsigned long pfn, int flags) if (kfifo_put(&mf_cpu->fifo, entry)) schedule_work_on(smp_processor_id(), &mf_cpu->work); else - pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", + pr_err("buffer overflow when queuing memory failure at %#lx\n", pfn); spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); put_cpu_var(memory_failure_cpu); @@ -2187,6 +2183,8 @@ static int __init memory_failure_init(void) } core_initcall(memory_failure_init); +#undef pr_fmt +#define pr_fmt(fmt) "" fmt #define unpoison_pr_info(fmt, pfn, rs) \ ({ \ if (__ratelimit(rs)) \ From 1a44131d4f524e18d873bbe363598e39841001bf Mon Sep 17 00:00:00 2001 From: Sophia Gabriella Date: Thu, 28 Jul 2022 16:51:39 +0000 Subject: [PATCH 281/282] mm: Kconfig: fix typo Fixes a typo in the help section for ZSWAP. 
Link: https://lkml.kernel.org/r/Message-ID: Signed-off-by: Sophia Gabriella Signed-off-by: Andrew Morton --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 56ca0e7c6f9a..f73f5b272144 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -33,7 +33,7 @@ config ZSWAP pages that are in the process of being swapped out and attempts to compress them into a dynamically allocated RAM-based memory pool. This can result in a significant I/O reduction on swap device and, - in the case where decompressing from RAM is faster that swap device + in the case where decompressing from RAM is faster than swap device reads, can also improve workload performance. This is marked experimental because it is a new feature (as of From 360614c01f81f48a89d8b13f8fa69c3ae0a1f5c7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 2 Aug 2022 18:03:03 -0700 Subject: [PATCH 282/282] tools/testing/selftests/vm/hmm-tests.c: fix build hmm-tests.c:1607:42: error: 'HMM_DMIRROR_MIGRATE' undeclared (first use in this function); did you mean 'HMM_DMIRROR_WRITE'? Fixes: f6c3e1ae0114cd0 ("mm/hmm: add a test for cross device private faults") Reported-by: kernel test robot Reviewed-by: Ralph Campbell Cc: Alistair Popple Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hmm-tests.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 939a33dc5dc6..529f53b40296 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1604,7 +1604,7 @@ TEST_F(hmm2, double_map) ASSERT_EQ(ptr[i], i); /* Migrate pages to device 1 and try to read from device 0. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages); + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); ASSERT_EQ(ret, 0); ASSERT_EQ(buffer->cpages, npages);