From 9ef8eb6104527bfe9ed31f7a4ffa721390adf9a8 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 20 Oct 2022 23:36:14 +0100 Subject: [PATCH 01/23] squashfs: fix read regression introduced in readahead code Patch series "squashfs: fix some regressions introduced in the readahead code". This patchset fixes 3 regressions introduced by the recent readahead code changes. The first regression is causing "snaps" to randomly fail after a couple of hours or days, which how the regression came to light. This patch (of 3): If a file isn't a whole multiple of the page size, the last page will have trailing bytes unfilled. There was a mistake in the readahead code which did this. In particular it incorrectly assumed that the last page in the readahead page array (page[nr_pages - 1]) will always contain the last page in the block, which if we're at file end, will be the page that needs to be zero filled. But the readahead code may not return the last page in the block, which means it is unmapped and will be skipped by the decompressors (a temporary buffer used). In this case the zero filling code will zero out the wrong page, leading to data corruption. Fix this by by extending the "page actor" to return the last page if present, or NULL if a temporary buffer was used. Link: https://lkml.kernel.org/r/20221020223616.7571-1-phillip@squashfs.org.uk Link: https://lkml.kernel.org/r/20221020223616.7571-2-phillip@squashfs.org.uk Fixes: 8fc78b6fe24c ("squashfs: implement readahead") Link: https://lore.kernel.org/lkml/b0c258c3-6dcf-aade-efc4-d62a8b3a1ce2@alu.unizg.hr/ Signed-off-by: Phillip Lougher Reported-by: Mirsad Goran Todorovac Tested-by: Mirsad Goran Todorovac Tested-by: Slade Watkins Tested-by: Bagas Sanjaya Reported-by: Marc Miltenberger Cc: Dimitri John Ledkov Cc: Hsin-Yi Wang Cc: Thorsten Leemhuis Cc: Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 7 ++++--- fs/squashfs/page_actor.c | 3 +++ fs/squashfs/page_actor.h | 6 +++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e56510964b22..e526eb7a1658 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -557,6 +557,7 @@ static void squashfs_readahead(struct readahead_control *ractl) int res, bsize; u64 block = 0; unsigned int expected; + struct page *last_page; nr_pages = __readahead_batch(ractl, pages, max_pages); if (!nr_pages) @@ -593,15 +594,15 @@ static void squashfs_readahead(struct readahead_control *ractl) res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - squashfs_page_actor_free(actor); + last_page = squashfs_page_actor_free(actor); if (res == expected) { int bytes; /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (pages[nr_pages - 1]->index == file_end && bytes) - memzero_page(pages[nr_pages - 1], bytes, + if (index == file_end && bytes && last_page) + memzero_page(last_page, bytes, PAGE_SIZE - bytes); for (i = 0; i < nr_pages; i++) { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 54b93bf4a25c..81af6c4ca115 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -71,11 +71,13 @@ static void *handle_next_page(struct squashfs_page_actor *actor) (actor->next_index != actor->page[actor->next_page]->index)) { actor->next_index++; actor->returned_pages++; + actor->last_page = NULL; return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM); } actor->next_index++; actor->returned_pages++; + actor->last_page = actor->page[actor->next_page]; return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); } @@ -125,6 +127,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ actor->returned_pages = 0; actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->last_page = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 95ffbb543d91..97d4983559b1 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -16,6 +16,7 @@ struct squashfs_page_actor { void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); + struct page *last_page; int pages; int length; int next_page; @@ -29,10 +30,13 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, extern struct squashfs_page_actor *squashfs_page_actor_init_special( struct squashfs_sb_info *msblk, struct page **page, int pages, int length); -static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor) +static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor) { + struct page *last_page = actor->last_page; + kfree(actor->tmp_buffer); kfree(actor); + return last_page; } static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { From c9199de82bad03bceb94ec3c5195c879d7e11911 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 20 Oct 2022 23:36:15 +0100 Subject: [PATCH 02/23] squashfs: fix extending readahead beyond end of file The readahead code will try to extend readahead to the entire size of the Squashfs data block. But, it didn't take into account that the last block at the end of the file may not be a whole block. In this case, the code would extend readahead to beyond the end of the file, leaving trailing pages. Fix this by only requesting the expected number of pages. Link: https://lkml.kernel.org/r/20221020223616.7571-3-phillip@squashfs.org.uk Fixes: 8fc78b6fe24c ("squashfs: implement readahead") Signed-off-by: Phillip Lougher Tested-by: Bagas Sanjaya Reported-by: Marc Miltenberger Cc: Dimitri John Ledkov Cc: Hsin-Yi Wang Cc: Mirsad Goran Todorovac Cc: Slade Watkins Cc: Thorsten Leemhuis Cc: Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e526eb7a1658..f0afd4d6fd30 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -559,6 +559,12 @@ static void squashfs_readahead(struct readahead_control *ractl) unsigned int expected; struct page *last_page; + expected = start >> msblk->block_log == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + max_pages = (expected + PAGE_SIZE - 1) >> PAGE_SHIFT; + nr_pages = __readahead_batch(ractl, pages, max_pages); if (!nr_pages) break; @@ -567,13 +573,10 @@ static void squashfs_readahead(struct readahead_control *ractl) goto skip_pages; index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) goto skip_pages; - expected = index == file_end ? - (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - if (index == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { res = squashfs_readahead_fragment(pages, nr_pages, From e11c4e088be4c39d17f304fcf331670891905f42 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 20 Oct 2022 23:36:16 +0100 Subject: [PATCH 03/23] squashfs: fix buffer release race condition in readahead code Fix a buffer release race condition, where the error value was used after release. Link: https://lkml.kernel.org/r/20221020223616.7571-4-phillip@squashfs.org.uk Fixes: b09a7a036d20 ("squashfs: support reading fragments in readahead call") Signed-off-by: Phillip Lougher Tested-by: Bagas Sanjaya Reported-by: Marc Miltenberger Cc: Dimitri John Ledkov Cc: Hsin-Yi Wang Cc: Mirsad Goran Todorovac Cc: Slade Watkins Cc: Thorsten Leemhuis Cc: Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index f0afd4d6fd30..8ba8c4c50770 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -506,8 +506,9 @@ static int squashfs_readahead_fragment(struct page **page, squashfs_i(inode)->fragment_size); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + int error = buffer->error; - if (buffer->error) + if (error) goto out; expected += squashfs_i(inode)->fragment_offset; @@ -529,7 +530,7 @@ static int squashfs_readahead_fragment(struct page **page, out: squashfs_cache_put(buffer); - return buffer->error; + return error; } static void squashfs_readahead(struct readahead_control *ractl) From 984a608377cb623351b8a3670b285f32ebeb2d32 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 20 Oct 2022 13:56:19 -0400 Subject: [PATCH 04/23] mm/kmemleak: prevent soft lockup in kmemleak_scan()'s object iteration loops Commit 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") adds cond_resched() in the first object iteration loop of kmemleak_scan(). However, it turns that the 2nd objection iteration loop can still cause soft lockup to happen in some cases. So add a cond_resched() call in the 2nd and 3rd loops as well to prevent that and for completeness. Link: https://lkml.kernel.org/r/20221020175619.366317-1-longman@redhat.com Fixes: 6edda04ccc7c ("mm/kmemleak: prevent soft lockup in first object iteration loop of kmemleak_scan()") Signed-off-by: Waiman Long Cc: Catalin Marinas Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/kmemleak.c | 61 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 37af2dc8dac9..646e2979641f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1460,6 +1460,27 @@ static void scan_gray_list(void) WARN_ON(!list_empty(&gray_list)); } +/* + * Conditionally call resched() in a object iteration loop while making sure + * that the given object won't go away without RCU read lock by performing a + * get_object() if !pinned. + * + * Return: false if can't do a cond_resched() due to get_object() failure + * true otherwise + */ +static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +{ + if (!pinned && !get_object(object)) + return false; + + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (!pinned) + put_object(object); + return true; +} + /* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the @@ -1471,7 +1492,7 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop1_cnt = 0; + int loop_cnt = 0; jiffies_last_scan = jiffies; @@ -1480,7 +1501,6 @@ static void kmemleak_scan(void) list_for_each_entry_rcu(object, &object_list, object_list) { bool obj_pinned = false; - loop1_cnt++; raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1514,24 +1534,11 @@ static void kmemleak_scan(void) raw_spin_unlock_irq(&object->lock); /* - * Do a cond_resched() to avoid soft lockup every 64k objects. - * Make sure a reference has been taken so that the object - * won't go away without RCU read lock. + * Do a cond_resched() every 64k objects to avoid soft lockup. */ - if (!(loop1_cnt & 0xffff)) { - if (!obj_pinned && !get_object(object)) { - /* Try the next object instead */ - loop1_cnt--; - continue; - } - - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); - - if (!obj_pinned) - put_object(object); - } + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, obj_pinned)) + loop_cnt--; /* Try again on next object */ } rcu_read_unlock(); @@ -1598,7 +1605,15 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in @@ -1632,7 +1647,15 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in From b214fadff28d20f96456b73fe93340cb581ca891 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 20 Oct 2022 11:42:55 +0900 Subject: [PATCH 05/23] MAINTAINERS: git://github.com -> https://github.com for nilfs2 Github deprecated the git:// links about a year ago, so let's move to the https:// URLs instead. Link: https://lkml.kernel.org/r/20221020024255.5000-1-konishi.ryusuke@gmail.com Link: https://github.blog/2021-09-01-improving-git-protocol-security-github/ Link: https://lkml.kernel.org/r/20221013214638.30933-1-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt Signed-off-by: Ryusuke Konishi Reported-by: Conor Dooley Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index cf0f18502372..fe447637bf03 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14520,7 +14520,7 @@ L: linux-nilfs@vger.kernel.org S: Supported W: https://nilfs.sourceforge.io/ W: https://nilfs.osdn.jp/ -T: git git://github.com/konis/nilfs2.git +T: git https://github.com/konis/nilfs2.git F: Documentation/filesystems/nilfs2.rst F: fs/nilfs2/ F: include/trace/events/nilfs2.h From 27d676a1c2010450d00d514a8a6c1c780cb8d77f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 20 Oct 2022 09:51:22 +0800 Subject: [PATCH 06/23] memory tier, sysfs: rename attribute "nodes" to "nodelist" In sysfs, we use attribute name "cpumap" or "cpus" for cpu mask and "cpulist" or "cpus_list" for cpu list. For example, in my system, $ cat /sys/devices/system/node/node0/cpumap f,ffffffff $ cat /sys/devices/system/cpu/cpu2/topology/core_cpus 0,00100004 $ cat cat /sys/devices/system/node/node0/cpulist 0-35 $ cat /sys/devices/system/cpu/cpu2/topology/core_cpus_list 2,20 It looks reasonable to use "nodemap" for node mask and "nodelist" for node list. So, rename the attribute to follow the naming convention. Link: https://lkml.kernel.org/r/20221020015122.290097-1-ying.huang@intel.com Fixes: 9832fb87834e2b ("mm/demotion: expose memory tier details via sysfs") Signed-off-by: "Huang, Ying" Acked-by: Wei Xu Reviewed-by: Aneesh Kumar K.V Reviewed-by: Yang Shi Reviewed-by: Davidlohr Bueso Cc: Alistair Popple Cc: Bharata B Rao Cc: Dan Williams Cc: Dave Hansen Cc: Hesham Almatary Cc: Jagdish Gediya Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Tim Chen Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers | 4 ++-- mm/memory-tiers.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers index 45985e411f13..721a05b90109 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers +++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers @@ -10,7 +10,7 @@ Description: A collection of all the memory tiers allocated. What: /sys/devices/virtual/memory_tiering/memory_tierN/ - /sys/devices/virtual/memory_tiering/memory_tierN/nodes + /sys/devices/virtual/memory_tiering/memory_tierN/nodelist Date: August 2022 Contact: Linux memory management mailing list Description: Directory with details of a specific memory tier @@ -21,5 +21,5 @@ Description: Directory with details of a specific memory tier A smaller value of N implies a higher (faster) memory tier in the hierarchy. - nodes: NUMA nodes that are part of this memory tier. + nodelist: NUMA nodes that are part of this memory tier. diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index f116b7b6333e..fa8c9d07f9ce 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -131,8 +131,8 @@ static void memory_tier_device_release(struct device *dev) kfree(tier); } -static ssize_t nodes_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t nodelist_show(struct device *dev, + struct device_attribute *attr, char *buf) { int ret; nodemask_t nmask; @@ -143,10 +143,10 @@ static ssize_t nodes_show(struct device *dev, mutex_unlock(&memory_tier_lock); return ret; } -static DEVICE_ATTR_RO(nodes); +static DEVICE_ATTR_RO(nodelist); static struct attribute *memtier_dev_attrs[] = { - &dev_attr_nodes.attr, + &dev_attr_nodelist.attr, NULL }; From 64b4c411a6c7a5f27555bfc2d6310b87bde3db67 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 20 Oct 2022 21:19:22 -0700 Subject: [PATCH 07/23] ipc/msg.c: fix percpu_counter use after free These percpu counters are referenced in free_ipcs->freeque, so destroy them later. Fixes: 72d1e611082e ("ipc/msg: mitigate the lock contention with percpu counter") Reported-by: syzbot+96e659d35b9d6b541152@syzkaller.appspotmail.com Tested-by: Mark Rutland Cc: Jiebin Sun Signed-off-by: Andrew Morton --- ipc/msg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index e4e0990e08f7..fd08b3cb36d7 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -1329,11 +1329,11 @@ fail_msg_bytes: #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { - percpu_counter_destroy(&ns->percpu_msg_bytes); - percpu_counter_destroy(&ns->percpu_msg_hdrs); free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); } #endif From bb2282cf01fcae7379314dc026f3b534c83c186c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 21 Oct 2022 08:05:49 -0700 Subject: [PATCH 08/23] fs/ext4/super.c: remove unused `deprecated_msg' fs/ext4/super.c:1744:19: warning: 'deprecated_msg' defined but not used [-Wunused-const-variable=] Reported-by: kernel test robot Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/super.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 989365b878a6..7950904fbf04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1741,10 +1741,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static const char deprecated_msg[] = - "Mount option \"%s\" will be removed by %s\n" - "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; - #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 From fba4eaf93164a6a6eb3cc12a3391b06f6187aa20 Mon Sep 17 00:00:00 2001 From: Maria Yu Date: Fri, 21 Oct 2022 18:15:55 +0800 Subject: [PATCH 09/23] mm/page_isolation: fix clang deadcode warning When !CONFIG_VM_BUG_ON, there is warning of clang-analyzer-deadcode.DeadStores: Value stored to 'mt' during its initialization is never read. Link: https://lkml.kernel.org/r/20221021101555.7992-2-quic_aiquny@quicinc.com Signed-off-by: Maria Yu Cc: David Hildenbrand Cc: Doug Berger Cc: Mike Kravetz Cc: Zi Yan Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 04141a9bea70..47fbc1696466 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -330,7 +330,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); if (skip_isolation) { - int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); VM_BUG_ON(!is_migrate_isolate(mt)); } else { From 8ebe0a5eaaeb099de03d09ad20f54ed962e2261e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 21 Oct 2022 19:28:05 -0400 Subject: [PATCH 10/23] mm,madvise,hugetlb: fix unexpected data loss with MADV_DONTNEED on hugetlbfs A common use case for hugetlbfs is for the application to create memory pools backed by huge pages, which then get handed over to some malloc library (eg. jemalloc) for further management. That malloc library may be doing MADV_DONTNEED calls on memory that is no longer needed, expecting those calls to happen on PAGE_SIZE boundaries. However, currently the MADV_DONTNEED code rounds up any such requests to HPAGE_PMD_SIZE boundaries. This leads to undesired outcomes when jemalloc expects a 4kB MADV_DONTNEED, but 2MB of memory get zeroed out, instead. Use of pre-built shared libraries means that user code does not always know the page size of every memory arena in use. Avoid unexpected data loss with MADV_DONTNEED by rounding up only to PAGE_SIZE (in do_madvise), and rounding down to huge page granularity. That way programs will only get as much memory zeroed out as they requested. Link: https://lkml.kernel.org/r/20221021192805.366ad573@imladris.surriel.com Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings") Signed-off-by: Rik van Riel Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/madvise.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 2baa93ca2310..c7105ec6d08c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -813,7 +813,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (start & ~huge_page_mask(hstate_vma(vma))) return false; - *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + /* + * Madvise callers expect the length to be rounded up to PAGE_SIZE + * boundaries, and may be unaware that this VMA uses huge pages. + * Avoid unexpected data loss by rounding down the number of + * huge pages freed. + */ + *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + return true; } @@ -828,6 +835,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; + if (start == end) + return 0; + if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ From 5aae9265ee1a30cf716d6caf6b29fe99b9d55130 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 22 Oct 2022 00:51:06 -0700 Subject: [PATCH 11/23] mm: prep_compound_tail() clear page->private Although page allocation always clears page->private in the first page or head page of an allocation, it has never made a point of clearing page->private in the tails (though 0 is often what is already there). But now commit 71e2d666ef85 ("mm/huge_memory: do not clobber swp_entry_t during THP split") issues a warning when page_tail->private is found to be non-0 (unless it's swapcache). Change that warning to dump page_tail (which also dumps head), instead of just the head: so far we have seen dead000000000122, dead000000000003, dead000000000001 or 0000000000000002 in the raw output for tail private. We could just delete the warning, but today's consensus appears to want page->private to be 0, unless there's a good reason for it to be set: so now clear it in prep_compound_tail() (more general than just for THP; but not for high order allocation, which makes no pass down the tails). Link: https://lkml.kernel.org/r/1c4233bb-4e4d-5969-fbd4-96604268a285@google.com Fixes: 71e2d666ef85 ("mm/huge_memory: do not clobber swp_entry_t during THP split") Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Matthew Wilcox (Oracle) Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/page_alloc.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03fc7e5edf07..561a42567477 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2462,7 +2462,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * Fix up and warn once if private is unexpectedly set. */ if (!folio_test_swapcache(page_folio(head))) { - VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, head); + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); page_tail->private = 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5a6c815ae28..218b28ee49ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -807,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx) p->mapping = TAIL_MAPPING; set_compound_head(p, head); + set_page_private(p, 0); } void prep_compound_page(struct page *page, unsigned int order) From 67eae54bc227b30dedcce9db68b063ba1adb7838 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 24 Oct 2022 15:33:35 -0400 Subject: [PATCH 12/23] mm/uffd: fix vma check on userfault for wp We used to have a report that pte-marker code can be reached even when uffd-wp is not compiled in for file memories, here: https://lore.kernel.org/all/YzeR+R6b4bwBlBHh@x1n/T/#u I just got time to revisit this and found that the root cause is we simply messed up with the vma check, so that for !PTE_MARKER_UFFD_WP system, we will allow UFFDIO_REGISTER of MINOR & WP upon shmem as the check was wrong: if (vm_flags & VM_UFFD_MINOR) return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); Where we'll allow anything to pass on shmem as long as minor mode is requested. Axel did it right when introducing minor mode but I messed it up in b1f9e876862d when moving code around. Fix it. Link: https://lkml.kernel.org/r/20221024193336.1233616-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221024193336.1233616-2-peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Cc: Axel Rasmussen Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f07e6998bb68..9df0b9a762cc 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -146,9 +146,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) static inline bool vma_can_userfault(struct vm_area_struct *vma, unsigned long vm_flags) { - if (vm_flags & VM_UFFD_MINOR) - return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); - + if ((vm_flags & VM_UFFD_MINOR) && + (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) + return false; #ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for From 03e5f82ea632af329e32ec03d952b2d99497eeaa Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 24 Oct 2022 16:34:21 +0800 Subject: [PATCH 13/23] mm: migrate: fix return value if all subpages of THPs are migrated successfully During THP migration, if THPs are not migrated but they are split and all subpages are migrated successfully, migrate_pages() will still return the number of THP pages that were not migrated. This will confuse the callers of migrate_pages(). For example, the longterm pinning will failed though all pages are migrated successfully. Thus we should return 0 to indicate that all pages are migrated in this case Link: https://lkml.kernel.org/r/de386aa864be9158d2f3b344091419ea7c38b2f7.1666599848.git.baolin.wang@linux.alibaba.com Fixes: b5bade978e9b ("mm: migrate: fix the return value of migrate_pages()") Signed-off-by: Baolin Wang Reviewed-by: Alistair Popple Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/migrate.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index 1379e1912772..dff333593a8a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1582,6 +1582,13 @@ out: */ list_splice(&ret_pages, from); + /* + * Return 0 in case all subpages of fail-to-migrate THPs are + * migrated successfully. + */ + if (list_empty(from)) + rc = 0; + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); From f59a3ee6912997fc56ecee78613fef53aae668d9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:40 +0200 Subject: [PATCH 14/23] mm: kmsan: export kmsan_copy_page_meta() Certain modules call copy_user_highpage(), which calls kmsan_copy_page_meta() under KMSAN, so we need to export the latter. Link: https://lkml.kernel.org/r/20221024212144.2852069-1-glider@google.com Link: https://github.com/google/kmsan/issues/89 Fixes: b073d7f8aee4 ("mm: kmsan: maintain KMSAN metadata for page operations") Signed-off-by: Alexander Potapenko Signed-off-by: Andrew Morton --- mm/kmsan/shadow.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 21e3e196ec3c..a787c04e9583 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -167,6 +167,7 @@ void kmsan_copy_page_meta(struct page *dst, struct page *src) __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); kmsan_leave_runtime(); } +EXPORT_SYMBOL(kmsan_copy_page_meta); void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { From 42855f588e187a6f22978e54422adbc010ac7630 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:41 +0200 Subject: [PATCH 15/23] x86/purgatory: disable KMSAN instrumentation The stand-alone purgatory.ro does not contain the KMSAN runtime, therefore it can't be built with KMSAN compiler instrumentation. Link: https://lkml.kernel.org/r/20221024212144.2852069-2-glider@google.com Link: https://github.com/google/kmsan/issues/89 Signed-off-by: Alexander Potapenko Cc: Andrew Morton Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton --- arch/x86/purgatory/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 58a200dc762d..17f09dc26381 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -26,6 +26,7 @@ GCOV_PROFILE := n KASAN_SANITIZE := n UBSAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n KCOV_INSTRUMENT := n # These are adjustments to the compiler flags used for objects that From 921757bc9b611efc483a548b86769934384e9c79 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:42 +0200 Subject: [PATCH 16/23] Kconfig.debug: disable CONFIG_FRAME_WARN for KMSAN by default KMSAN adds a lot of instrumentation to the code, which results in increased stack usage (up to 2048 bytes and more in some cases). It's hard to predict how big the stack frames can be, so we disable the warnings for KMSAN instead. Link: https://lkml.kernel.org/r/20221024212144.2852069-3-glider@google.com Link: https://github.com/google/kmsan/issues/89 Signed-off-by: Alexander Potapenko Cc: Kees Cook Cc: Masahiro Yamada Cc: Nick Desaulniers Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3fc7abffc7aa..29280072dc0e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -400,8 +400,9 @@ config FRAME_WARN default 1536 if (!64BIT && XTENSA) default 1024 if !64BIT default 2048 if 64BIT + default 0 if KMSAN help - Tell gcc to warn at build time for stack frames larger than this. + Tell the compiler to warn at build time for stack frames larger than this. Setting this too low will cause a lot of warnings. Setting it to 0 disables the warning. From 59c8a02e24894e75639bcecc3cb1e768a2792220 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:43 +0200 Subject: [PATCH 17/23] x86: asm: make sure __put_user_size() evaluates pointer once User access macros must ensure their arguments are evaluated only once if they are used more than once in the macro body. Adding instrument_put_user() to __put_user_size() resulted in double evaluation of the `ptr` argument, which led to correctness issues when performing e.g. unsafe_put_user(..., p++, ...). To fix those issues, evaluate the `ptr` argument of __put_user_size() at the beginning of the macro. Link: https://lkml.kernel.org/r/20221024212144.2852069-4-glider@google.com Fixes: 888f84a6da4d ("x86: asm: instrument usercopy in get_user() and put_user()") Signed-off-by: Alexander Potapenko Reported-by: youling257 Cc: Borislav Petkov Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/include/asm/uaccess.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8bc614cfe21b..1cc756eafa44 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -254,24 +254,25 @@ extern void __put_user_nocheck_8(void); #define __put_user_size(x, ptr, size, label) \ do { \ __typeof__(*(ptr)) __x = (x); /* eval x once */ \ - __chk_user_ptr(ptr); \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ switch (size) { \ case 1: \ - __put_user_goto(__x, ptr, "b", "iq", label); \ + __put_user_goto(__x, __ptr, "b", "iq", label); \ break; \ case 2: \ - __put_user_goto(__x, ptr, "w", "ir", label); \ + __put_user_goto(__x, __ptr, "w", "ir", label); \ break; \ case 4: \ - __put_user_goto(__x, ptr, "l", "ir", label); \ + __put_user_goto(__x, __ptr, "l", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64(__x, ptr, label); \ + __put_user_goto_u64(__x, __ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ - instrument_put_user(__x, ptr, size); \ + instrument_put_user(__x, __ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT From 78a498c3a227f2ac773a8234b2ce092a4403f2c3 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 24 Oct 2022 23:21:44 +0200 Subject: [PATCH 18/23] x86: fortify: kmsan: fix KMSAN fortify builds Ensure that KMSAN builds replace memset/memcpy/memmove calls with the respective __msan_XXX functions, and that none of the macros are redefined twice. This should allow building kernel with both CONFIG_KMSAN and CONFIG_FORTIFY_SOURCE. Link: https://lkml.kernel.org/r/20221024212144.2852069-5-glider@google.com Link: https://github.com/google/kmsan/issues/89 Signed-off-by: Alexander Potapenko Reported-by: Tamas K Lengyel Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Kees Cook Signed-off-by: Andrew Morton --- arch/x86/include/asm/string_64.h | 11 +++++++---- include/linux/fortify-string.h | 17 +++++++++++++++-- include/linux/kmsan_string.h | 21 +++++++++++++++++++++ mm/kmsan/instrumentation.c | 1 + 4 files changed, 44 insertions(+), 6 deletions(-) create mode 100644 include/linux/kmsan_string.h diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 3b87d889b6e1..888731ccf1f6 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -10,10 +10,13 @@ /* Even with __builtin_ the compiler may decide to use the out of line function. */ +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) +#include +#endif + #define __HAVE_ARCH_MEMCPY 1 -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memcpy -void *__msan_memcpy(void *dst, const void *src, size_t size); #define memcpy __msan_memcpy #else extern void *memcpy(void *to, const void *from, size_t len); @@ -21,7 +24,7 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) extern void *__msan_memset(void *s, int c, size_t n); #undef memset #define memset __msan_memset @@ -67,7 +70,7 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) } #define __HAVE_ARCH_MEMMOVE -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memmove void *__msan_memmove(void *dest, const void *src, size_t len); #define memmove __msan_memmove diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 4029fe368a4f..18a31b125f9d 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -43,11 +43,24 @@ extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else -#define __underlying_memchr __builtin_memchr -#define __underlying_memcmp __builtin_memcmp + +#if defined(__SANITIZE_MEMORY__) +/* + * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the + * corresponding __msan_XXX functions. + */ +#include +#define __underlying_memcpy __msan_memcpy +#define __underlying_memmove __msan_memmove +#define __underlying_memset __msan_memset +#else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset +#endif + +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen diff --git a/include/linux/kmsan_string.h b/include/linux/kmsan_string.h new file mode 100644 index 000000000000..7287da6f52ef --- /dev/null +++ b/include/linux/kmsan_string.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN string functions API used in other headers. + * + * Copyright (C) 2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_STRING_H +#define _LINUX_KMSAN_STRING_H + +/* + * KMSAN overrides the default memcpy/memset/memmove implementations in the + * kernel, which requires having __msan_XXX function prototypes in several other + * headers. Keep them in one place instead of open-coding. + */ +void *__msan_memcpy(void *dst, const void *src, size_t size); +void *__msan_memset(void *s, int c, size_t n); +void *__msan_memmove(void *dest, const void *src, size_t len); + +#endif /* _LINUX_KMSAN_STRING_H */ diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 280d15413268..271f135f97a1 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -14,6 +14,7 @@ #include "kmsan.h" #include +#include #include #include From 5521de7dddd211e3a9403d7bde0b614fd0936ac6 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Sun, 23 Oct 2022 21:34:52 -0700 Subject: [PATCH 19/23] mm/userfaultfd: replace kmap/kmap_atomic() with kmap_local_page() kmap() and kmap_atomic() are being deprecated in favor of kmap_local_page() which is appropriate for any thread local context.[1] A recent locking bug report with userfaultfd showed that the conversion of the kmap_atomic()'s in those code flows requires care with regard to the prevention of deadlock.[2] git archaeology implied that the recursion may not be an actual bug.[3] However, depending on the implementation of the mmap_lock and the condition of the call there may still be a deadlock.[4] So this is not purely a lockdep issue. Considering a single threaded call stack there are 3 options. 1) Different mm's are in play (no issue) 2) Readlock implementation is recursive and same mm is in play (no issue) 3) Readlock implementation is _not_ recursive (issue) The mmap_lock is recursive so with a single thread there is no issue. However, Matthew pointed out a deadlock scenario when you consider additional process' and threads thusly. "The readlock implementation is only recursive if nobody else has taken a write lock. If you have a multithreaded process, one of the other threads can call mmap() and that will prevent recursion (due to fairness). Even if it's a different process that you're trying to acquire the mmap read lock on, you can still get into a deadly embrace. eg: process A thread 1 takes read lock on own mmap_lock process A thread 2 calls mmap, blocks taking write lock process B thread 1 takes page fault, read lock on own mmap lock process B thread 2 calls mmap, blocks taking write lock process A thread 1 blocks taking read lock on process B process B thread 1 blocks taking read lock on process A Now all four threads are blocked waiting for each other." Regardless using pagefault_disable() ensures that no matter what locking implementation is used a deadlock will not occur. Complete kmap conversion in userfaultfd by replacing the kmap() and kmap_atomic() calls with kmap_local_page(). When replacing the kmap_atomic() call ensure page faults continue to be disabled to support the correct fall back behavior and add a comment to inform future souls of the requirement. [1] https://lore.kernel.org/all/20220813220034.806698-1-ira.weiny@intel.com/ [2] https://lore.kernel.org/all/Y1Mh2S7fUGQ%2FiKFR@iweiny-desk3/ [3] https://lore.kernel.org/all/Y1MymJ%2FINb45AdaY@iweiny-desk3/ [4] https://lore.kernel.org/lkml/Y1bXBtGTCym77%2FoD@casper.infradead.org/ [ira.weiny@intel.com: v2] Link: https://lkml.kernel.org/r/20221025220136.2366143-1-ira.weiny@intel.com Link: https://lkml.kernel.org/r/20221024043452.1491677-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Cc: Matthew Wilcox Cc: Andrew Morton Cc: Andrea Arcangeli Cc: Peter Xu Cc: Axel Rasmussen Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e24e8a47ce8a..3d0fef3980b3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -157,11 +157,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_page(page); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -646,11 +663,11 @@ retry: mmap_read_unlock(dst_mm); BUG_ON(!page); - page_kaddr = kmap(page); + page_kaddr = kmap_local_page(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap(page); + kunmap_local(page_kaddr); if (unlikely(err)) { err = -EFAULT; goto out; From 5dc21f0c0b1c02ea2c9014cbe7cd3b28884ff306 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 25 Oct 2022 15:01:08 -0700 Subject: [PATCH 20/23] mm/shmem: ensure proper fallback if page faults The kernel test robot flagged a recursive lock as a result of a conversion from kmap_atomic() to kmap_local_folio()[Link] The cause was due to the code depending on the kmap_atomic() side effect of disabling page faults. In that case the code expects the fault to fail and take the fallback case. git archaeology implied that the recursion may not be an actual bug.[1] However, depending on the implementation of the mmap_lock and the condition of the call there may still be a deadlock.[2] So this is not purely a lockdep issue. Considering a single threaded call stack there are 3 options. 1) Different mm's are in play (no issue) 2) Readlock implementation is recursive and same mm is in play (no issue) 3) Readlock implementation is _not_ recursive (issue) The mmap_lock is recursive so with a single thread there is no issue. However, Matthew pointed out a deadlock scenario when you consider additional process' and threads thusly. "The readlock implementation is only recursive if nobody else has taken a write lock. If you have a multithreaded process, one of the other threads can call mmap() and that will prevent recursion (due to fairness). Even if it's a different process that you're trying to acquire the mmap read lock on, you can still get into a deadly embrace. eg: process A thread 1 takes read lock on own mmap_lock process A thread 2 calls mmap, blocks taking write lock process B thread 1 takes page fault, read lock on own mmap lock process B thread 2 calls mmap, blocks taking write lock process A thread 1 blocks taking read lock on process B process B thread 1 blocks taking read lock on process A Now all four threads are blocked waiting for each other." Regardless using pagefault_disable() ensures that no matter what locking implementation is used a deadlock will not occur. Add an explicit pagefault_disable() and a big comment to explain this for future souls looking at this code. [1] https://lore.kernel.org/all/Y1MymJ%2FINb45AdaY@iweiny-desk3/ [2] https://lore.kernel.org/lkml/Y1bXBtGTCym77%2FoD@casper.infradead.org/ Link: https://lkml.kernel.org/r/20221025220108.2366043-1-ira.weiny@intel.com Link: https://lore.kernel.org/r/202210211215.9dc6efb5-yujie.liu@intel.com Fixes: 7a7256d5f512 ("shmem: convert shmem_mfill_atomic_pte() to use a folio") Signed-off-by: Ira Weiny Reported-by: Matthew Wilcox (Oracle) Reported-by: kernel test robot Cc: Randy Dunlap Cc: Peter Xu Cc: Andrea Arcangeli Signed-off-by: Andrew Morton --- mm/shmem.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index 8280a5cb48df..c1d8b8a1aa3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2424,9 +2424,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!zeropage) { /* COPY */ page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); + pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ From 1db43d3f3733351849ddca4b573c037c7821bfd8 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 25 Oct 2022 16:12:49 +0000 Subject: [PATCH 21/23] mmap: fix remap_file_pages() regression When using the VMA iterator, the final execution will set the variable 'next' to NULL which causes the function to fail out. Restore the break in the loop to exit the VMA iterator early without clearing NULL fixes the issue. Link: https://lore.kernel.org/lkml/29344.1666681759@jrobl/ Link: https://lkml.kernel.org/r/20221025161222.2634030-1-Liam.Howlett@oracle.com Fixes: 763ecb035029 (mm: remove the vma linked list) Signed-off-by: Liam R. Howlett Reported-by: "J. R. Okajima" Tested-by: "J. R. Okajima" Signed-off-by: Andrew Morton --- mm/mmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index e270057ed04e..2def55555e05 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2852,6 +2852,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; + if (start + size <= next->vm_end) + break; + prev = next; } From 1b9c918318476b4441ddd754ee6699b5367bb5ee Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 26 Oct 2022 14:00:29 +0200 Subject: [PATCH 22/23] lib: maple_tree: remove unneeded initialization in mtree_range_walk() Before the do-while loop in mtree_range_walk(), the variables next, min, max need to be initialized. The variables last, prev_min and prev_max are set within the loop body before they are eventually used after exiting the loop body. As it is a do-while loop, the loop body is executed at least once, so the variables last, prev_min and prev_max do not need to be initialized before the loop body. Remove unneeded initialization of last and prev_min. The needless initialization was reported by clang-analyzer as Dead Stores. As the compiler already identifies these assignments as unneeded, it optimizes the assignments away. Hence: No functional change. No change in object code. Link: https://lkml.kernel.org/r/20221026120029.12555-2-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index e1743803c851..fbde494444b8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2903,8 +2903,8 @@ static inline void *mtree_range_walk(struct ma_state *mas) unsigned long max, min; unsigned long prev_max, prev_min; - last = next = mas->node; - prev_min = min = mas->min; + next = mas->node; + min = mas->min; max = mas->max; do { offset = 0; From dda1c41a07b4a4c3f99b5b28c1e8c485205fe860 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 26 Oct 2022 15:48:30 +0200 Subject: [PATCH 23/23] mm: multi-gen LRU: move lru_gen_add_mm() out of IRQ-off region lru_gen_add_mm() has been added within an IRQ-off region in the commit mentioned below. The other invocations of lru_gen_add_mm() are not within an IRQ-off region. The invocation within IRQ-off region is problematic on PREEMPT_RT because the function is using a spin_lock_t which must not be used within IRQ-disabled regions. The other invocations of lru_gen_add_mm() occur while task_struct::alloc_lock is acquired. Move lru_gen_add_mm() after interrupts are enabled and before task_unlock(). Link: https://lkml.kernel.org/r/20221026134830.711887-1-bigeasy@linutronix.de Fixes: bd74fdaea1460 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Sebastian Andrzej Siewior Acked-by: Yu Zhao Cc: Al Viro Cc: "Eric W . Biederman" Cc: Kees Cook Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 349a5da91efe..7ab1f27b805d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1012,7 +1012,6 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; - lru_gen_add_mm(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1025,6 +1024,7 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); + lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) {