bianbu-linux-6.6/arch/nds32/mm/fault.c
Peter Xu 4064b98270 mm: allow VM_FAULT_RETRY for multiple times
The idea comes from a discussion between Linus and Andrea [1].

Before this patch we only allow a page fault to retry once.  We achieved
this by clearing the FAULT_FLAG_ALLOW_RETRY flag when doing
handle_mm_fault() the second time.  This was majorly used to avoid
unexpected starvation of the system by looping over forever to handle the
page fault on a single page.  However that should hardly happen, and after
all for each code path to return a VM_FAULT_RETRY we'll first wait for a
condition (during which time we should possibly yield the cpu) to happen
before VM_FAULT_RETRY is really returned.

This patch removes the restriction by keeping the FAULT_FLAG_ALLOW_RETRY
flag when we receive VM_FAULT_RETRY.  It means that the page fault handler
now can retry the page fault for multiple times if necessary without the
need to generate another page fault event.  Meanwhile we still keep the
FAULT_FLAG_TRIED flag so page fault handler can still identify whether a
page fault is the first attempt or not.

Then we'll have these combinations of fault flags (only considering
ALLOW_RETRY flag and TRIED flag):

  - ALLOW_RETRY and !TRIED:  this means the page fault allows to
                             retry, and this is the first try

  - ALLOW_RETRY and TRIED:   this means the page fault allows to
                             retry, and this is not the first try

  - !ALLOW_RETRY and !TRIED: this means the page fault does not allow
                             to retry at all

  - !ALLOW_RETRY and TRIED:  this is forbidden and should never be used

In existing code we have multiple places that has taken special care of
the first condition above by checking against (fault_flags &
FAULT_FLAG_ALLOW_RETRY).  This patch introduces a simple helper to detect
the first retry of a page fault by checking against both (fault_flags &
FAULT_FLAG_ALLOW_RETRY) and !(fault_flag & FAULT_FLAG_TRIED) because now
even the 2nd try will have the ALLOW_RETRY set, then use that helper in
all existing special paths.  One example is in __lock_page_or_retry(), now
we'll drop the mmap_sem only in the first attempt of page fault and we'll
keep it in follow up retries, so old locking behavior will be retained.

This will be a nice enhancement for current code [2] at the same time a
supporting material for the future userfaultfd-writeprotect work, since in
that work there will always be an explicit userfault writeprotect retry
for protected pages, and if that cannot resolve the page fault (e.g., when
userfaultfd-writeprotect is used in conjunction with swapped pages) then
we'll possibly need a 3rd retry of the page fault.  It might also benefit
other potential users who will have similar requirement like userfault
write-protection.

GUP code is not touched yet and will be covered in follow up patch.

Please read the thread below for more information.

[1] https://lore.kernel.org/lkml/20171102193644.GB22686@redhat.com/
[2] https://lore.kernel.org/lkml/20181230154648.GB9832@redhat.com/

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Suggested-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Brian Geffon <bgeffon@google.com>
Cc: Bobby Powers <bobbypowers@gmail.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Denis Plotnikov <dplotnikov@virtuozzo.com>
Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Martin Cracauer <cracauer@cons.org>
Cc: Marty McFadden <mcfadden8@llnl.gov>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Maya Gokhale <gokhale2@llnl.gov>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Link: http://lkml.kernel.org/r/20200220160246.9790-1-peterx@redhat.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-04-02 09:35:30 -07:00

416 lines
9.3 KiB
C

// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2005-2017 Andes Technology Corporation
#include <linux/extable.h>
#include <linux/module.h>
#include <linux/signal.h>
#include <linux/ptrace.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/hardirq.h>
#include <linux/uaccess.h>
#include <linux/perf_event.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
extern void die(const char *str, struct pt_regs *regs, long err);
/*
* This is useful to dump out the page tables associated with
* 'addr' in mm 'mm'.
*/
void show_pte(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
if (!mm)
mm = &init_mm;
pr_alert("pgd = %p\n", mm->pgd);
pgd = pgd_offset(mm, addr);
pr_alert("[%08lx] *pgd=%08lx", addr, pgd_val(*pgd));
do {
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
if (pgd_none(*pgd))
break;
if (pgd_bad(*pgd)) {
pr_alert("(bad)");
break;
}
p4d = p4d_offset(pgd, addr);
pud = pud_offset(p4d, addr);
pmd = pmd_offset(pud, addr);
#if PTRS_PER_PMD != 1
pr_alert(", *pmd=%08lx", pmd_val(*pmd));
#endif
if (pmd_none(*pmd))
break;
if (pmd_bad(*pmd)) {
pr_alert("(bad)");
break;
}
if (IS_ENABLED(CONFIG_HIGHMEM))
{
pte_t *pte;
/* We must not map this if we have highmem enabled */
pte = pte_offset_map(pmd, addr);
pr_alert(", *pte=%08lx", pte_val(*pte));
pte_unmap(pte);
}
} while (0);
pr_alert("\n");
}
void do_page_fault(unsigned long entry, unsigned long addr,
unsigned int error_code, struct pt_regs *regs)
{
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct *vma;
int si_code;
vm_fault_t fault;
unsigned int mask = VM_READ | VM_WRITE | VM_EXEC;
unsigned int flags = FAULT_FLAG_DEFAULT;
error_code = error_code & (ITYPE_mskINST | ITYPE_mskETYPE);
tsk = current;
mm = tsk->mm;
si_code = SEGV_MAPERR;
/*
* We fault-in kernel-space virtual memory on-demand. The
* 'reference' page table is init_mm.pgd.
*
* NOTE! We MUST NOT take any locks for this case. We may
* be in an interrupt or a critical region, and should
* only copy the information from the master page table,
* nothing more.
*/
if (addr >= TASK_SIZE) {
if (user_mode(regs))
goto bad_area_nosemaphore;
if (addr >= TASK_SIZE && addr < VMALLOC_END
&& (entry == ENTRY_PTE_NOT_PRESENT))
goto vmalloc_fault;
else
goto no_context;
}
/* Send a signal to the task for handling the unalignment access. */
if (entry == ENTRY_GENERAL_EXCPETION
&& error_code == ETYPE_ALIGNMENT_CHECK) {
if (user_mode(regs))
goto bad_area_nosemaphore;
else
goto no_context;
}
/*
* If we're in an interrupt or have no user
* context, we must not take the fault..
*/
if (unlikely(faulthandler_disabled() || !mm))
goto no_context;
/*
* As per x86, we may deadlock here. However, since the kernel only
* validly references user space from well defined areas of the code,
* we can bug out early if this is from code which shouldn't.
*/
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if (!user_mode(regs) &&
!search_exception_tables(instruction_pointer(regs)))
goto no_context;
retry:
down_read(&mm->mmap_sem);
} else {
/*
* The above down_read_trylock() might have succeeded in which
* case, we'll have missed the might_sleep() from down_read().
*/
might_sleep();
if (IS_ENABLED(CONFIG_DEBUG_VM)) {
if (!user_mode(regs) &&
!search_exception_tables(instruction_pointer(regs)))
goto no_context;
}
}
vma = find_vma(mm, addr);
if (unlikely(!vma))
goto bad_area;
if (vma->vm_start <= addr)
goto good_area;
if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
goto bad_area;
if (unlikely(expand_stack(vma, addr)))
goto bad_area;
/*
* Ok, we have a good vm_area for this memory access, so
* we can handle it..
*/
good_area:
si_code = SEGV_ACCERR;
/* first do some preliminary protection checks */
if (entry == ENTRY_PTE_NOT_PRESENT) {
if (error_code & ITYPE_mskINST)
mask = VM_EXEC;
else {
mask = VM_READ | VM_WRITE;
}
} else if (entry == ENTRY_TLB_MISC) {
switch (error_code & ITYPE_mskETYPE) {
case RD_PROT:
mask = VM_READ;
break;
case WRT_PROT:
mask = VM_WRITE;
flags |= FAULT_FLAG_WRITE;
break;
case NOEXEC:
mask = VM_EXEC;
break;
case PAGE_MODIFY:
mask = VM_WRITE;
flags |= FAULT_FLAG_WRITE;
break;
case ACC_BIT:
BUG();
default:
break;
}
}
if (!(vma->vm_flags & mask))
goto bad_area;
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
fault = handle_mm_fault(vma, addr, flags);
/*
* If we need to retry but a fatal signal is pending, handle the
* signal first. We do not need to release the mmap_sem because it
* would already be released in __lock_page_or_retry in mm/filemap.c.
*/
if (fault_signal_pending(fault, regs)) {
if (!user_mode(regs))
goto no_context;
return;
}
if (unlikely(fault & VM_FAULT_ERROR)) {
if (fault & VM_FAULT_OOM)
goto out_of_memory;
else if (fault & VM_FAULT_SIGBUS)
goto do_sigbus;
else
goto bad_area;
}
/*
* Major/minor page fault accounting is only done on the initial
* attempt. If we go through a retry, it is extremely likely that the
* page will be found in page cache at that point.
*/
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
if (flags & FAULT_FLAG_ALLOW_RETRY) {
if (fault & VM_FAULT_MAJOR) {
tsk->maj_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ,
1, regs, addr);
} else {
tsk->min_flt++;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN,
1, regs, addr);
}
if (fault & VM_FAULT_RETRY) {
flags |= FAULT_FLAG_TRIED;
/* No need to up_read(&mm->mmap_sem) as we would
* have already released it in __lock_page_or_retry
* in mm/filemap.c.
*/
goto retry;
}
}
up_read(&mm->mmap_sem);
return;
/*
* Something tried to access memory that isn't in our memory map..
* Fix it, but check if it's kernel or user first..
*/
bad_area:
up_read(&mm->mmap_sem);
bad_area_nosemaphore:
/* User mode accesses just cause a SIGSEGV */
if (user_mode(regs)) {
tsk->thread.address = addr;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = entry;
force_sig_fault(SIGSEGV, si_code, (void __user *)addr);
return;
}
no_context:
/* Are we prepared to handle this kernel fault?
*
* (The kernel has valid exception-points in the source
* when it acesses user-memory. When it fails in one
* of those points, we find it in a table and do a jump
* to some fixup code that loads an appropriate error
* code)
*/
{
const struct exception_table_entry *entry;
if ((entry =
search_exception_tables(instruction_pointer(regs))) !=
NULL) {
/* Adjust the instruction pointer in the stackframe */
instruction_pointer(regs) = entry->fixup;
return;
}
}
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
*/
bust_spinlocks(1);
pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
(addr < PAGE_SIZE) ? "NULL pointer dereference" :
"paging request", addr);
show_pte(mm, addr);
die("Oops", regs, error_code);
bust_spinlocks(0);
do_exit(SIGKILL);
return;
/*
* We ran out of memory, or some other thing happened to us that made
* us unable to handle the page fault gracefully.
*/
out_of_memory:
up_read(&mm->mmap_sem);
if (!user_mode(regs))
goto no_context;
pagefault_out_of_memory();
return;
do_sigbus:
up_read(&mm->mmap_sem);
/* Kernel mode? Handle exceptions or die */
if (!user_mode(regs))
goto no_context;
/*
* Send a sigbus
*/
tsk->thread.address = addr;
tsk->thread.error_code = error_code;
tsk->thread.trap_no = entry;
force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr);
return;
vmalloc_fault:
{
/*
* Synchronize this task's top level page-table
* with the 'reference' page table.
*
* Use current_pgd instead of tsk->active_mm->pgd
* since the latter might be unavailable if this
* code is executed in a misfortunately run irq
* (like inside schedule() between switch_mm and
* switch_to...).
*/
unsigned int index = pgd_index(addr);
pgd_t *pgd, *pgd_k;
p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
pte_t *pte_k;
pgd = (pgd_t *) __va(__nds32__mfsr(NDS32_SR_L1_PPTB)) + index;
pgd_k = init_mm.pgd + index;
if (!pgd_present(*pgd_k))
goto no_context;
p4d = p4d_offset(pgd, addr);
p4d_k = p4d_offset(pgd_k, addr);
if (!p4d_present(*p4d_k))
goto no_context;
pud = pud_offset(p4d, addr);
pud_k = pud_offset(p4d_k, addr);
if (!pud_present(*pud_k))
goto no_context;
pmd = pmd_offset(pud, addr);
pmd_k = pmd_offset(pud_k, addr);
if (!pmd_present(*pmd_k))
goto no_context;
if (!pmd_present(*pmd))
set_pmd(pmd, *pmd_k);
else
BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
/*
* Since the vmalloc area is global, we don't
* need to copy individual PTE's, it is enough to
* copy the pgd pointer into the pte page of the
* root task. If that is there, we'll find our pte if
* it exists.
*/
/* Make sure the actual PTE exists as well to
* catch kernel vmalloc-area accesses to non-mapped
* addres. If we don't do this, this will just
* silently loop forever.
*/
pte_k = pte_offset_kernel(pmd_k, addr);
if (!pte_present(*pte_k))
goto no_context;
return;
}
}