bianbu-linux-6.6/arch/riscv/kernel/vdso.c
Mike Kravetz e9adcfecf5 mm: remove zap_page_range and create zap_vma_pages
zap_page_range was originally designed to unmap pages within an address
range that could span multiple vmas.  While working on [1], it was
discovered that all callers of zap_page_range pass a range entirely within
a single vma.  In addition, the mmu notification call within zap_page
range does not correctly handle ranges that span multiple vmas.  When
crossing a vma boundary, a new mmu_notifier_range_init/end call pair with
the new vma should be made.

Instead of fixing zap_page_range, do the following:
- Create a new routine zap_vma_pages() that will remove all pages within
  the passed vma.  Most users of zap_page_range pass the entire vma and
  can use this new routine.
- For callers of zap_page_range not passing the entire vma, instead call
  zap_page_range_single().
- Remove zap_page_range.

[1] https://lore.kernel.org/linux-mm/20221114235507.294320-2-mike.kravetz@oracle.com/
Link: https://lkml.kernel.org/r/20230104002732.232573-1-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Suggested-by: Peter Xu <peterx@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Peter Xu <peterx@redhat.com>
Acked-by: Heiko Carstens <hca@linux.ibm.com>	[s390]
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2023-01-18 17:12:55 -08:00

298 lines
6.6 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp.
* <benh@kernel.crashing.org>
* Copyright (C) 2012 ARM Limited
* Copyright (C) 2015 Regents of the University of California
*/
#include <linux/elf.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/binfmts.h>
#include <linux/err.h>
#include <asm/page.h>
#include <asm/vdso.h>
#include <linux/time_namespace.h>
#ifdef CONFIG_GENERIC_TIME_VSYSCALL
#include <vdso/datapage.h>
#else
struct vdso_data {
};
#endif
extern char vdso_start[], vdso_end[];
#ifdef CONFIG_COMPAT
extern char compat_vdso_start[], compat_vdso_end[];
#endif
enum vvar_pages {
VVAR_DATA_PAGE_OFFSET,
VVAR_TIMENS_PAGE_OFFSET,
VVAR_NR_PAGES,
};
enum rv_vdso_map {
RV_VDSO_MAP_VVAR,
RV_VDSO_MAP_VDSO,
};
#define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT)
/*
* The vDSO data page.
*/
static union {
struct vdso_data data;
u8 page[PAGE_SIZE];
} vdso_data_store __page_aligned_data;
struct vdso_data *vdso_data = &vdso_data_store.data;
struct __vdso_info {
const char *name;
const char *vdso_code_start;
const char *vdso_code_end;
unsigned long vdso_pages;
/* Data Mapping */
struct vm_special_mapping *dm;
/* Code Mapping */
struct vm_special_mapping *cm;
};
static struct __vdso_info vdso_info;
#ifdef CONFIG_COMPAT
static struct __vdso_info compat_vdso_info;
#endif
static int vdso_mremap(const struct vm_special_mapping *sm,
struct vm_area_struct *new_vma)
{
current->mm->context.vdso = (void *)new_vma->vm_start;
return 0;
}
static void __init __vdso_init(struct __vdso_info *vdso_info)
{
unsigned int i;
struct page **vdso_pagelist;
unsigned long pfn;
if (memcmp(vdso_info->vdso_code_start, "\177ELF", 4))
panic("vDSO is not a valid ELF object!\n");
vdso_info->vdso_pages = (
vdso_info->vdso_code_end -
vdso_info->vdso_code_start) >>
PAGE_SHIFT;
vdso_pagelist = kcalloc(vdso_info->vdso_pages,
sizeof(struct page *),
GFP_KERNEL);
if (vdso_pagelist == NULL)
panic("vDSO kcalloc failed!\n");
/* Grab the vDSO code pages. */
pfn = sym_to_pfn(vdso_info->vdso_code_start);
for (i = 0; i < vdso_info->vdso_pages; i++)
vdso_pagelist[i] = pfn_to_page(pfn + i);
vdso_info->cm->pages = vdso_pagelist;
}
#ifdef CONFIG_TIME_NS
struct vdso_data *arch_get_vdso_data(void *vvar_page)
{
return (struct vdso_data *)(vvar_page);
}
/*
* The vvar mapping contains data for a specific time namespace, so when a task
* changes namespace we must unmap its vvar data for the old namespace.
* Subsequent faults will map in data for the new namespace.
*
* For more details see timens_setup_vdso_data().
*/
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
struct mm_struct *mm = task->mm;
struct vm_area_struct *vma;
VMA_ITERATOR(vmi, mm, 0);
mmap_read_lock(mm);
for_each_vma(vmi, vma) {
if (vma_is_special_mapping(vma, vdso_info.dm))
zap_vma_pages(vma);
#ifdef CONFIG_COMPAT
if (vma_is_special_mapping(vma, compat_vdso_info.dm))
zap_vma_pages(vma);
#endif
}
mmap_read_unlock(mm);
return 0;
}
#endif
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *timens_page = find_timens_vvar_page(vma);
unsigned long pfn;
switch (vmf->pgoff) {
case VVAR_DATA_PAGE_OFFSET:
if (timens_page)
pfn = page_to_pfn(timens_page);
else
pfn = sym_to_pfn(vdso_data);
break;
#ifdef CONFIG_TIME_NS
case VVAR_TIMENS_PAGE_OFFSET:
/*
* If a task belongs to a time namespace then a namespace
* specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
* the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
* offset.
* See also the comment near timens_setup_vdso_data().
*/
if (!timens_page)
return VM_FAULT_SIGBUS;
pfn = sym_to_pfn(vdso_data);
break;
#endif /* CONFIG_TIME_NS */
default:
return VM_FAULT_SIGBUS;
}
return vmf_insert_pfn(vma, vmf->address, pfn);
}
static struct vm_special_mapping rv_vdso_maps[] __ro_after_init = {
[RV_VDSO_MAP_VVAR] = {
.name = "[vvar]",
.fault = vvar_fault,
},
[RV_VDSO_MAP_VDSO] = {
.name = "[vdso]",
.mremap = vdso_mremap,
},
};
static struct __vdso_info vdso_info __ro_after_init = {
.name = "vdso",
.vdso_code_start = vdso_start,
.vdso_code_end = vdso_end,
.dm = &rv_vdso_maps[RV_VDSO_MAP_VVAR],
.cm = &rv_vdso_maps[RV_VDSO_MAP_VDSO],
};
#ifdef CONFIG_COMPAT
static struct vm_special_mapping rv_compat_vdso_maps[] __ro_after_init = {
[RV_VDSO_MAP_VVAR] = {
.name = "[vvar]",
.fault = vvar_fault,
},
[RV_VDSO_MAP_VDSO] = {
.name = "[vdso]",
.mremap = vdso_mremap,
},
};
static struct __vdso_info compat_vdso_info __ro_after_init = {
.name = "compat_vdso",
.vdso_code_start = compat_vdso_start,
.vdso_code_end = compat_vdso_end,
.dm = &rv_compat_vdso_maps[RV_VDSO_MAP_VVAR],
.cm = &rv_compat_vdso_maps[RV_VDSO_MAP_VDSO],
};
#endif
static int __init vdso_init(void)
{
__vdso_init(&vdso_info);
#ifdef CONFIG_COMPAT
__vdso_init(&compat_vdso_info);
#endif
return 0;
}
arch_initcall(vdso_init);
static int __setup_additional_pages(struct mm_struct *mm,
struct linux_binprm *bprm,
int uses_interp,
struct __vdso_info *vdso_info)
{
unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
void *ret;
BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
vdso_text_len = vdso_info->vdso_pages << PAGE_SHIFT;
/* Be sure to map the data page */
vdso_mapping_len = vdso_text_len + VVAR_SIZE;
vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
if (IS_ERR_VALUE(vdso_base)) {
ret = ERR_PTR(vdso_base);
goto up_fail;
}
ret = _install_special_mapping(mm, vdso_base, VVAR_SIZE,
(VM_READ | VM_MAYREAD | VM_PFNMAP), vdso_info->dm);
if (IS_ERR(ret))
goto up_fail;
vdso_base += VVAR_SIZE;
mm->context.vdso = (void *)vdso_base;
ret =
_install_special_mapping(mm, vdso_base, vdso_text_len,
(VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC),
vdso_info->cm);
if (IS_ERR(ret))
goto up_fail;
return 0;
up_fail:
mm->context.vdso = NULL;
return PTR_ERR(ret);
}
#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp)
{
struct mm_struct *mm = current->mm;
int ret;
if (mmap_write_lock_killable(mm))
return -EINTR;
ret = __setup_additional_pages(mm, bprm, uses_interp,
&compat_vdso_info);
mmap_write_unlock(mm);
return ret;
}
#endif
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
struct mm_struct *mm = current->mm;
int ret;
if (mmap_write_lock_killable(mm))
return -EINTR;
ret = __setup_additional_pages(mm, bprm, uses_interp, &vdso_info);
mmap_write_unlock(mm);
return ret;
}