Folio changes for 5.18

- Rewrite how munlock works to massively reduce the contention
    on i_mmap_rwsem (Hugh Dickins):
    https://lore.kernel.org/linux-mm/8e4356d-9622-a7f0-b2c-f116b5f2efea@google.com/
  - Sort out the page refcount mess for ZONE_DEVICE pages (Christoph Hellwig):
    https://lore.kernel.org/linux-mm/20220210072828.2930359-1-hch@lst.de/
  - Convert GUP to use folios and make pincount available for order-1
    pages. (Matthew Wilcox)
  - Convert a few more truncation functions to use folios (Matthew Wilcox)
  - Convert page_vma_mapped_walk to use PFNs instead of pages (Matthew Wilcox)
  - Convert rmap_walk to use folios (Matthew Wilcox)
  - Convert most of shrink_page_list() to use a folio (Matthew Wilcox)
  - Add support for creating large folios in readahead (Matthew Wilcox)
 -----BEGIN PGP SIGNATURE-----
 
 iQEzBAABCgAdFiEEejHryeLBw/spnjHrDpNsjXcpgj4FAmI4ucgACgkQDpNsjXcp
 gj69Wgf6AwqwmO5Tmy+fLScDPqWxmXJofbocae1kyoGHf7Ui91OK4U2j6IpvAr+g
 P/vLIK+JAAcTQcrSCjymuEkf4HkGZOR03QQn7maPIEe4eLrZRQDEsmHC1L9gpeJp
 s/GMvDWiGE0Tnxu0EOzfVi/yT+qjIl/S8VvqtCoJv1HdzxitZ7+1RDuqImaMC5MM
 Qi3uHag78vLmCltLXpIOdpgZhdZexCdL2Y/1npf+b6FVkAJRRNUnA0gRbS7YpoVp
 CbxEJcmAl9cpJLuj5i5kIfS9trr+/QcvbUlzRxh4ggC58iqnmF2V09l2MJ7YU3XL
 v1O/Elq4lRhXninZFQEm9zjrri7LDQ==
 =n9Ad
 -----END PGP SIGNATURE-----

Merge tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache

Pull folio updates from Matthew Wilcox:

 - Rewrite how munlock works to massively reduce the contention on
   i_mmap_rwsem (Hugh Dickins):

     https://lore.kernel.org/linux-mm/8e4356d-9622-a7f0-b2c-f116b5f2efea@google.com/

 - Sort out the page refcount mess for ZONE_DEVICE pages (Christoph
   Hellwig):

     https://lore.kernel.org/linux-mm/20220210072828.2930359-1-hch@lst.de/

 - Convert GUP to use folios and make pincount available for order-1
   pages. (Matthew Wilcox)

 - Convert a few more truncation functions to use folios (Matthew
   Wilcox)

 - Convert page_vma_mapped_walk to use PFNs instead of pages (Matthew
   Wilcox)

 - Convert rmap_walk to use folios (Matthew Wilcox)

 - Convert most of shrink_page_list() to use a folio (Matthew Wilcox)

 - Add support for creating large folios in readahead (Matthew Wilcox)

* tag 'folio-5.18c' of git://git.infradead.org/users/willy/pagecache: (114 commits)
  mm/damon: minor cleanup for damon_pa_young
  selftests/vm/transhuge-stress: Support file-backed PMD folios
  mm/filemap: Support VM_HUGEPAGE for file mappings
  mm/readahead: Switch to page_cache_ra_order
  mm/readahead: Align file mappings for non-DAX
  mm/readahead: Add large folio readahead
  mm: Support arbitrary THP sizes
  mm: Make large folios depend on THP
  mm: Fix READ_ONLY_THP warning
  mm/filemap: Allow large folios to be added to the page cache
  mm: Turn can_split_huge_page() into can_split_folio()
  mm/vmscan: Convert pageout() to take a folio
  mm/vmscan: Turn page_check_references() into folio_check_references()
  mm/vmscan: Account large folios correctly
  mm/vmscan: Optimise shrink_page_list for non-PMD-sized folios
  mm/vmscan: Free non-shmem folios without splitting them
  mm/rmap: Constify the rmap_walk_control argument
  mm/rmap: Convert rmap_walk() to take a folio
  mm: Turn page_anon_vma() into folio_anon_vma()
  mm/rmap: Turn page_lock_anon_vma_read() into folio_lock_anon_vma_read()
  ...
This commit is contained in:
Linus Torvalds 2022-03-22 17:03:12 -07:00
commit 9030fb0bb9
100 changed files with 2924 additions and 3044 deletions

177
mm/swap.c
View file

@ -74,8 +74,8 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
};
/*
* This path almost never happens for VM activity - pages are normally
* freed via pagevecs. But it gets used by networking.
* This path almost never happens for VM activity - pages are normally freed
* via pagevecs. But it gets used by networking - and for compound pages.
*/
static void __page_cache_release(struct page *page)
{
@ -89,6 +89,14 @@ static void __page_cache_release(struct page *page)
__clear_page_lru_flags(page);
unlock_page_lruvec_irqrestore(lruvec, flags);
}
/* See comment on PageMlocked in release_pages() */
if (unlikely(PageMlocked(page))) {
int nr_pages = thp_nr_pages(page);
__ClearPageMlocked(page);
mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
}
__ClearPageWaiters(page);
}
@ -114,17 +122,9 @@ static void __put_compound_page(struct page *page)
void __put_page(struct page *page)
{
if (is_zone_device_page(page)) {
put_dev_pagemap(page->pgmap);
/*
* The page belongs to the device that created pgmap. Do
* not return it to page allocator.
*/
return;
}
if (unlikely(PageCompound(page)))
if (unlikely(is_zone_device_page(page)))
free_zone_device_page(page);
else if (unlikely(PageCompound(page)))
__put_compound_page(page);
else
__put_single_page(page);
@ -482,22 +482,12 @@ EXPORT_SYMBOL(folio_add_lru);
void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
bool unevictable;
VM_BUG_ON_PAGE(PageLRU(page), page);
unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
int nr_pages = thp_nr_pages(page);
/*
* We use the irq-unsafe __mod_zone_page_state because this
* counter is not modified from interrupt context, and the pte
* lock is held(spinlock), which implies preemption disabled.
*/
__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
lru_cache_add(page);
if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED))
mlock_new_page(page);
else
lru_cache_add(page);
}
/*
@ -636,35 +626,37 @@ void lru_add_drain_cpu(int cpu)
pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
activate_page_drain(cpu);
mlock_page_drain(cpu);
}
/**
* deactivate_file_page - forcefully deactivate a file page
* @page: page to deactivate
* deactivate_file_folio() - Forcefully deactivate a file folio.
* @folio: Folio to deactivate.
*
* This function hints the VM that @page is a good reclaim candidate,
* for example if its invalidation fails due to the page being dirty
* This function hints to the VM that @folio is a good reclaim candidate,
* for example if its invalidation fails due to the folio being dirty
* or under writeback.
*
* Context: Caller holds a reference on the page.
*/
void deactivate_file_page(struct page *page)
void deactivate_file_folio(struct folio *folio)
{
struct pagevec *pvec;
/*
* In a workload with many unevictable page such as mprotect,
* unevictable page deactivation for accelerating reclaim is pointless.
* In a workload with many unevictable pages such as mprotect,
* unevictable folio deactivation for accelerating reclaim is pointless.
*/
if (PageUnevictable(page))
if (folio_test_unevictable(folio))
return;
if (likely(get_page_unless_zero(page))) {
struct pagevec *pvec;
folio_get(folio);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
if (pagevec_add_and_need_flush(pvec, page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
local_unlock(&lru_pvecs.lock);
}
if (pagevec_add_and_need_flush(pvec, &folio->page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
local_unlock(&lru_pvecs.lock);
}
/*
@ -837,6 +829,7 @@ inline void __lru_add_drain_all(bool force_all_cpus)
pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
need_activate_page_drain(cpu) ||
need_mlock_page_drain(cpu) ||
has_bh_in_lru(cpu, NULL)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
@ -935,18 +928,10 @@ void release_pages(struct page **pages, int nr)
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
/*
* ZONE_DEVICE pages that return 'false' from
* page_is_devmap_managed() do not require special
* processing, and instead, expect a call to
* put_page_testzero().
*/
if (page_is_devmap_managed(page)) {
put_devmap_managed_page(page);
if (put_devmap_managed_page(page))
continue;
}
if (put_page_testzero(page))
put_dev_pagemap(page->pgmap);
free_zone_device_page(page);
continue;
}
@ -974,6 +959,18 @@ void release_pages(struct page **pages, int nr)
__clear_page_lru_flags(page);
}
/*
* In rare cases, when truncation or holepunching raced with
* munlock after VM_LOCKED was cleared, Mlocked may still be
* found set here. This does not indicate a problem, unless
* "unevictable_pgs_cleared" appears worryingly large.
*/
if (unlikely(PageMlocked(page))) {
__ClearPageMlocked(page);
dec_zone_page_state(page, NR_MLOCK);
count_vm_event(UNEVICTABLE_PGCLEARED);
}
__ClearPageWaiters(page);
list_add(&page->lru, &pages_to_free);
@ -1014,43 +1011,32 @@ static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
* A folio becomes evictable in two ways:
* 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
* 2) Before acquiring LRU lock to put the folio on the correct LRU
* and then
* a) do PageLRU check with lock [check_move_unevictable_pages]
* b) do PageLRU check before lock [clear_page_mlock]
*
* (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
* following strict ordering:
*
* #0: __pagevec_lru_add_fn #1: clear_page_mlock
*
* folio_set_lru() folio_test_clear_mlocked()
* smp_mb() // explicit ordering // above provides strict
* // ordering
* folio_test_mlocked() folio_test_lru()
*
*
* if '#1' does not observe setting of PG_lru by '#0' and
* fails isolation, the explicit barrier will make sure that
* folio_evictable check will put the folio on the correct
* LRU. Without smp_mb(), folio_set_lru() can be reordered
* after folio_test_mlocked() check and can make '#1' fail the
* isolation of the folio whose mlocked bit is cleared (#0 is
* also looking at the same folio) and the evictable folio will
* be stranded on an unevictable LRU.
*/
folio_set_lru(folio);
smp_mb__after_atomic();
/*
* Is an smp_mb__after_atomic() still required here, before
* folio_evictable() tests PageMlocked, to rule out the possibility
* of stranding an evictable folio on an unevictable LRU? I think
* not, because __munlock_page() only clears PageMlocked while the LRU
* lock is held.
*
* (That is not true of __page_cache_release(), and not necessarily
* true of release_pages(): but those only clear PageMlocked after
* put_page_testzero() has excluded any other users of the page.)
*/
if (folio_evictable(folio)) {
if (was_unevictable)
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
folio_clear_active(folio);
folio_set_unevictable(folio);
/*
* folio->mlock_count = !!folio_test_mlocked(folio)?
* But that leaves __mlock_page() in doubt whether another
* actor has already counted the mlock or not. Err on the
* safe side, underestimate, let page reclaim fix it, rather
* than leaving a page on the unevictable LRU indefinitely.
*/
folio->mlock_count = 0;
if (!was_unevictable)
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
@ -1158,26 +1144,3 @@ void __init swap_setup(void)
* _really_ don't want to cluster much more
*/
}
#ifdef CONFIG_DEV_PAGEMAP_OPS
void put_devmap_managed_page(struct page *page)
{
int count;
if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
return;
count = page_ref_dec_return(page);
/*
* devmap page refcounts are 1-based, rather than 0-based: if
* refcount is 1, then the page is free and the refcount is
* stable because nobody holds a reference on the page.
*/
if (count == 1)
free_devmap_managed_page(page);
else if (!count)
__put_page(page);
}
EXPORT_SYMBOL(put_devmap_managed_page);
#endif