mirror of
https://gitee.com/bianbu-linux/linux-6.6
synced 2025-04-24 14:07:52 -04:00
mm->mm_lock_seq effectively functions as a read/write lock; therefore it
must be used with acquire/release semantics.
A specific example is the interaction between userfaultfd_register() and
lock_vma_under_rcu().
userfaultfd_register() does the following from the point where it changes
a VMA's flags to the point where concurrent readers are permitted again
(in a simple scenario where only a single private VMA is accessed and no
merging/splitting is involved):
userfaultfd_register
  userfaultfd_set_vm_flags
    vm_flags_reset
      vma_start_write
        down_write(&vma->vm_lock->lock)
        vma->vm_lock_seq = mm_lock_seq [marks VMA as busy]
        up_write(&vma->vm_lock->lock)
      vm_flags_init
        [sets VM_UFFD_* in __vm_flags]
  vma->vm_userfaultfd_ctx.ctx = ctx
  mmap_write_unlock
    vma_end_write_all
      WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1) [unlocks VMA]
There are no memory barriers in between the __vm_flags update and the
mm->mm_lock_seq update that unlocks the VMA, so the unlock can be
reordered to above the `vm_flags_init()` call, which means from the
perspective of a concurrent reader, a VMA can be marked as a userfaultfd
VMA while it is not VMA-locked. That's bad, we definitely need a
store-release for the unlock operation.
The non-atomic write to vma->vm_lock_seq in vma_start_write() is mostly
fine because all accesses to vma->vm_lock_seq that matter are always
protected by the VMA lock. There is a racy read in vma_start_read()
though that can tolerate false-positives, so we should be using
WRITE_ONCE() to keep things tidy and data-race-free (including for KCSAN).
On the other side, lock_vma_under_rcu() works as follows in the relevant
region for locking and userfaultfd check:
lock_vma_under_rcu
  vma_start_read
    vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [early bailout]
    down_read_trylock(&vma->vm_lock->lock)
    vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq) [main check]
  userfaultfd_armed
    checks vma->vm_flags & __VM_UFFD_FLAGS
Here, the interesting aspect is how far down the mm->mm_lock_seq read can
be reordered - if this read is reordered down below the vma->vm_flags
access, this could cause lock_vma_under_rcu() to partly operate on
information that was read while the VMA was supposed to be locked. To
prevent this kind of downwards bleeding of the mm->mm_lock_seq read, we
need to read it with a load-acquire.
Some of the comment wording is based on suggestions by Suren.
BACKPORT WARNING: One of the functions changed by this patch (which I've
written against Linus' tree) is vma_try_start_write(), but this function
no longer exists in mm/mm-everything. I don't know whether the merged
version of this patch will be ordered before or after the patch that
removes vma_try_start_write(). If you're backporting this patch to a tree
with vma_try_start_write(), make sure this patch changes that function.
Link: https://lkml.kernel.org/r/20230721225107.942336-1-jannh@google.com
Fixes: 5e31275cc9 ("mm: add per-VMA lock and helper functions to control it")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
189 lines
4.8 KiB
C
189 lines
4.8 KiB
C
#ifndef _LINUX_MMAP_LOCK_H
|
|
#define _LINUX_MMAP_LOCK_H
|
|
|
|
#include <linux/lockdep.h>
|
|
#include <linux/mm_types.h>
|
|
#include <linux/mmdebug.h>
|
|
#include <linux/rwsem.h>
|
|
#include <linux/tracepoint-defs.h>
|
|
#include <linux/types.h>
|
|
|
|
/*
 * Designated-initializer fragment that statically initializes the
 * .mmap_lock member of the object @name. The expansion ends with a comma,
 * so it is meant to be dropped into a larger initializer list.
 */
#define MMAP_LOCK_INITIALIZER(name) \
	.mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
|
|
|
|
/*
 * Tracepoints tested with tracepoint_enabled() by the
 * __mmap_lock_trace_*() inline helpers in this header.
 */
DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);
|
|
|
|
#ifdef CONFIG_TRACING
|
|
|
|
/*
 * Trace emitters, only declared here (their definitions live outside this
 * header). The inline wrappers in this header call them only after checking
 * that the matching tracepoint is enabled.
 */
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
					   bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);
|
|
|
|
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
|
|
bool write)
|
|
{
|
|
if (tracepoint_enabled(mmap_lock_start_locking))
|
|
__mmap_lock_do_trace_start_locking(mm, write);
|
|
}
|
|
|
|
/*
 * Report the outcome of an mmap_lock acquisition attempt on @mm.
 * @write and @success are passed through unchanged to the trace emitter.
 * Nothing is emitted unless the mmap_lock_acquire_returned tracepoint is
 * enabled.
 */
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
						      bool write, bool success)
{
	if (!tracepoint_enabled(mmap_lock_acquire_returned))
		return;

	__mmap_lock_do_trace_acquire_returned(mm, write, success);
}
|
|
|
|
/*
 * Report that @mm's mmap_lock is being released. @write is passed through
 * unchanged to the trace emitter. Nothing is emitted unless the
 * mmap_lock_released tracepoint is enabled.
 */
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
	if (!tracepoint_enabled(mmap_lock_released))
		return;

	__mmap_lock_do_trace_released(mm, write);
}
|
|
|
|
#else /* !CONFIG_TRACING */
|
|
|
|
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
|
|
bool write)
|
|
{
|
|
}
|
|
|
|
/* !CONFIG_TRACING variant: same signature, deliberately empty. */
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
						      bool write, bool success)
{
	/* Tracing is compiled out; nothing to report. */
}
|
|
|
|
/* !CONFIG_TRACING variant: same signature, deliberately empty. */
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
	/* Tracing is compiled out; nothing to report. */
}
|
|
|
|
#endif /* CONFIG_TRACING */
|
|
|
|
static inline void mmap_assert_locked(struct mm_struct *mm)
|
|
{
|
|
lockdep_assert_held(&mm->mmap_lock);
|
|
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
|
|
}
|
|
|
|
static inline void mmap_assert_write_locked(struct mm_struct *mm)
|
|
{
|
|
lockdep_assert_held_write(&mm->mmap_lock);
|
|
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
|
|
}
|
|
|
|
#ifdef CONFIG_PER_VMA_LOCK
|
|
static inline void vma_end_write_all(struct mm_struct *mm)
|
|
{
|
|
mmap_assert_write_locked(mm);
|
|
/*
|
|
* Nobody can concurrently modify mm->mm_lock_seq due to exclusive
|
|
* mmap_lock being held.
|
|
* We need RELEASE semantics here to ensure that preceding stores into
|
|
* the VMA take effect before we unlock it with this store.
|
|
* Pairs with ACQUIRE semantics in vma_start_read().
|
|
*/
|
|
smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1);
|
|
}
|
|
#else
|
|
/* No-op variant used when CONFIG_PER_VMA_LOCK is not enabled. */
static inline void vma_end_write_all(struct mm_struct *mm)
{
}
|
|
#endif
|
|
|
|
static inline void mmap_init_lock(struct mm_struct *mm)
|
|
{
|
|
init_rwsem(&mm->mmap_lock);
|
|
}
|
|
|
|
static inline void mmap_write_lock(struct mm_struct *mm)
|
|
{
|
|
__mmap_lock_trace_start_locking(mm, true);
|
|
down_write(&mm->mmap_lock);
|
|
__mmap_lock_trace_acquire_returned(mm, true, true);
|
|
}
|
|
|
|
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
|
|
{
|
|
__mmap_lock_trace_start_locking(mm, true);
|
|
down_write_nested(&mm->mmap_lock, subclass);
|
|
__mmap_lock_trace_acquire_returned(mm, true, true);
|
|
}
|
|
|
|
static inline int mmap_write_lock_killable(struct mm_struct *mm)
|
|
{
|
|
int ret;
|
|
|
|
__mmap_lock_trace_start_locking(mm, true);
|
|
ret = down_write_killable(&mm->mmap_lock);
|
|
__mmap_lock_trace_acquire_returned(mm, true, ret == 0);
|
|
return ret;
|
|
}
|
|
|
|
static inline bool mmap_write_trylock(struct mm_struct *mm)
|
|
{
|
|
bool ret;
|
|
|
|
__mmap_lock_trace_start_locking(mm, true);
|
|
ret = down_write_trylock(&mm->mmap_lock) != 0;
|
|
__mmap_lock_trace_acquire_returned(mm, true, ret);
|
|
return ret;
|
|
}
|
|
|
|
static inline void mmap_write_unlock(struct mm_struct *mm)
|
|
{
|
|
__mmap_lock_trace_released(mm, true);
|
|
vma_end_write_all(mm);
|
|
up_write(&mm->mmap_lock);
|
|
}
|
|
|
|
static inline void mmap_write_downgrade(struct mm_struct *mm)
|
|
{
|
|
__mmap_lock_trace_acquire_returned(mm, false, true);
|
|
vma_end_write_all(mm);
|
|
downgrade_write(&mm->mmap_lock);
|
|
}
|
|
|
|
static inline void mmap_read_lock(struct mm_struct *mm)
|
|
{
|
|
__mmap_lock_trace_start_locking(mm, false);
|
|
down_read(&mm->mmap_lock);
|
|
__mmap_lock_trace_acquire_returned(mm, false, true);
|
|
}
|
|
|
|
static inline int mmap_read_lock_killable(struct mm_struct *mm)
|
|
{
|
|
int ret;
|
|
|
|
__mmap_lock_trace_start_locking(mm, false);
|
|
ret = down_read_killable(&mm->mmap_lock);
|
|
__mmap_lock_trace_acquire_returned(mm, false, ret == 0);
|
|
return ret;
|
|
}
|
|
|
|
static inline bool mmap_read_trylock(struct mm_struct *mm)
|
|
{
|
|
bool ret;
|
|
|
|
__mmap_lock_trace_start_locking(mm, false);
|
|
ret = down_read_trylock(&mm->mmap_lock) != 0;
|
|
__mmap_lock_trace_acquire_returned(mm, false, ret);
|
|
return ret;
|
|
}
|
|
|
|
static inline void mmap_read_unlock(struct mm_struct *mm)
|
|
{
|
|
__mmap_lock_trace_released(mm, false);
|
|
up_read(&mm->mmap_lock);
|
|
}
|
|
|
|
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
|
|
{
|
|
__mmap_lock_trace_released(mm, false);
|
|
up_read_non_owner(&mm->mmap_lock);
|
|
}
|
|
|
|
static inline int mmap_lock_is_contended(struct mm_struct *mm)
|
|
{
|
|
return rwsem_is_contended(&mm->mmap_lock);
|
|
}
|
|
|
|
#endif /* _LINUX_MMAP_LOCK_H */
|