From 29417d292bd0fa174d20360326abaf6444a23c3b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 30 Apr 2023 21:19:17 +0100 Subject: [PATCH 1/5] mm/mmap/vma_merge: always check invariants We may still have inconsistent input parameters even if we choose not to merge and the vma_merge() invariant checks are useful for checking this with no production runtime cost (these are only relevant when CONFIG_DEBUG_VM is specified). Therefore, perform these checks regardless of whether we merge. This is relevant, as a recent issue (addressed in commit "mm/mempolicy: Correctly update prev when policy is equal on mbind") in the mbind logic was only picked up in the 6.2.y stable branch where these assertions are performed prior to determining mergeability. Had this remained the same in mainline this issue may have been picked up faster, so moving forward let's always check them. Link: https://lkml.kernel.org/r/df548a6ae3fa135eec3b446eb3dae8eb4227da97.1682885809.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5522130ae606..13678edaa22c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -960,17 +960,17 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, merge_next = true; } + /* Verify some invariant that must be enforced by the caller. */ + VM_WARN_ON(prev && addr <= prev->vm_start); + VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); + VM_WARN_ON(addr >= end); + if (!merge_prev && !merge_next) return NULL; /* Not mergeable. */ res = vma = prev; remove = remove2 = adjust = NULL; - /* Verify some invariant that must be enforced by the caller. */ - VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); - VM_WARN_ON(addr >= end); - /* Can we merge both the predecessor and the successor? */ if (merge_prev && merge_next && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { From a6a491c048882e7e424d407d32cba0b52d9ef2bf Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 1 May 2023 04:30:46 +0900 Subject: [PATCH 2/5] nilfs2: fix infinite loop in nilfs_mdt_get_block() If the disk image that nilfs2 mounts is corrupted and a virtual block address obtained by block lookup for a metadata file is invalid, nilfs_bmap_lookup_at_level() may return the same internal return code as -ENOENT, meaning the block does not exist in the metadata file. This duplication of return codes confuses nilfs_mdt_get_block(), causing it to read and create a metadata block indefinitely. In particular, if this happens to the inode metadata file, ifile, semaphore i_rwsem can be left held, causing task hangs in lock_mount. Fix this issue by making nilfs_bmap_lookup_at_level() treat virtual block address translation failures with -ENOENT as metadata corruption instead of returning the error code. Link: https://lkml.kernel.org/r/20230430193046.6769-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Reported-by: syzbot+221d75710bde87fa0e97@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=221d75710bde87fa0e97 Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/bmap.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 798a2c1b38c6..7a8f166f2c8d 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -67,20 +67,28 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, down_read(&bmap->b_sem); ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp); - if (ret < 0) { - ret = nilfs_bmap_convert_error(bmap, __func__, ret); + if (ret < 0) goto out; - } + if (NILFS_BMAP_USE_VBN(bmap)) { ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp, &blocknr); if (!ret) *ptrp = blocknr; + else if (ret == -ENOENT) { + /* + * If there was no valid entry in DAT for the block + * address obtained by b_ops->bop_lookup, then pass + * internal code -EINVAL to nilfs_bmap_convert_error + * to treat it as metadata corruption. + */ + ret = -EINVAL; + } } out: up_read(&bmap->b_sem); - return ret; + return nilfs_bmap_convert_error(bmap, __func__, ret); } int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, From d824ec2a154677f63c56cc71ffe4578274f6e32e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 28 Apr 2023 14:41:40 +0200 Subject: [PATCH 3/5] mm: do not reclaim private data from pinned page If the page is pinned, there's no point in trying to reclaim it. Furthermore if the page is from the page cache we don't want to reclaim fs-private data from the page because the pinning process may be writing to the page at any time and reclaiming fs private info on a dirty page can upset the filesystem (see link below). Link: https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz Link: https://lkml.kernel.org/r/20230428124140.30166-1-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Lorenzo Stoakes Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Acked-by: David Hildenbrand Acked-by: Peter Xu Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5bde07409303..d257916f39e5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1967,6 +1967,16 @@ retry: } } + /* + * Folio is unmapped now so it cannot be newly pinned anymore. + * No point in trying to reclaim folio if it is pinned. + * Furthermore we don't want to reclaim underlying fs metadata + * if the folio is pinned and thus potentially modified by the + * pinning process as that may upset the filesystem. + */ + if (folio_maybe_dma_pinned(folio)) + goto activate_locked; + mapping = folio_mapping(folio); if (folio_test_dirty(folio)) { /* From 28a65b49eb53e172d23567005465019658bfdb4d Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 27 Apr 2023 10:15:26 +0900 Subject: [PATCH 4/5] nilfs2: do not write dirty data after degenerating to read-only According to syzbot's report, mark_buffer_dirty() called from nilfs_segctor_do_construct() outputs a warning with some patterns after nilfs2 detects metadata corruption and degrades to read-only mode. After such read-only degeneration, page cache data may be cleared through nilfs_clear_dirty_page() which may also clear the uptodate flag for their buffer heads. However, even after the degeneration, log writes are still performed by unmount processing etc., which causes mark_buffer_dirty() to be called for buffer heads without the "uptodate" flag and causes the warning. Since any writes should not be done to a read-only file system in the first place, this fixes the warning in mark_buffer_dirty() by letting nilfs_segctor_do_construct() abort early if in read-only mode. This also changes the retry check of nilfs_segctor_write_out() to avoid unnecessary log write retries if it detects -EROFS that nilfs_segctor_do_construct() returned. Link: https://lkml.kernel.org/r/20230427011526.13457-1-konishi.ryusuke@gmail.com Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Reported-by: syzbot+2af3bc9585be7f23f290@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=2af3bc9585be7f23f290 Cc: Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 228659612c0d..ac949fd7603f 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2041,6 +2041,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int err; + if (sb_rdonly(sci->sc_super)) + return -EROFS; + nilfs_sc_cstage_set(sci, NILFS_ST_INIT); sci->sc_cno = nilfs->ns_cno; @@ -2724,7 +2727,7 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci) flush_work(&sci->sc_iput_work); - } while (ret && retrycount-- > 0); + } while (ret && ret != -EROFS && retrycount-- > 0); } /** From 58f5f6698a72a8af5d7bfc5f49b6df60f378f167 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 3 May 2023 17:45:26 +0200 Subject: [PATCH 5/5] afs: fix the afs_dir_get_folio return value Keep returning NULL on failure instead of letting an ERR_PTR escape to callers that don't expect it. Link: https://lkml.kernel.org/r/20230503154526.1223095-2-hch@lst.de Fixes: 66dabbb65d67 ("mm: return an ERR_PTR from __filemap_get_folio") Signed-off-by: Christoph Hellwig Reported-by: Jan Kara Reviewed-by: Jan Kara Reviewed-by: David Howells Tested-by: David Howells Cc: Marc Dionne Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- fs/afs/dir_edit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index f0eddccbdd95..e2fa577b66fe 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -115,11 +115,12 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping->gfp_mask); - if (IS_ERR(folio)) + if (IS_ERR(folio)) { clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - else if (folio && !folio_test_private(folio)) + return NULL; + } + if (!folio_test_private(folio)) folio_attach_private(folio, (void *)1); - return folio; }