Merge 4.15.0-rc6 into usb-next

We want the USB fixes in here, and this resolves a merge issue with the
vhci_rx.c file.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Greg Kroah-Hartman 2018-01-02 15:13:41 +01:00
commit 01f1918833
499 changed files with 7321 additions and 2601 deletions

View file

@@ -109,6 +109,7 @@ parameter is applicable::
 IPV6 IPv6 support is enabled.
 ISAPNP ISA PnP code is enabled.
 ISDN Appropriate ISDN support is enabled.
+ISOL CPU Isolation is enabled.
 JOY Appropriate joystick support is enabled.
 KGDB Kernel debugger support is enabled.
 KVM Kernel Virtual Machine support is enabled.

View file

@@ -328,11 +328,15 @@
 not play well with APC CPU idle - disable it if you have
 APC and your system crashes randomly.

-apic= [APIC,X86-32] Advanced Programmable Interrupt Controller
+apic= [APIC,X86] Advanced Programmable Interrupt Controller
 Change the output verbosity whilst booting
 Format: { quiet (default) | verbose | debug }
 Change the amount of debugging information output
 when initialising the APIC and IO-APIC components.
+For X86-32, this can also be used to specify an APIC
+driver name.
+Format: apic=driver_name
+Examples: apic=bigsmp

 apic_extnmi= [APIC,X86] External NMI delivery setting
 Format: { bsp (default) | all | none }
@@ -1737,7 +1741,7 @@
 isapnp= [ISAPNP]
 Format: <RDP>,<reset>,<pci_scan>,<verbosity>

-isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance.
+isolcpus= [KNL,SMP,ISOL] Isolate a given set of CPUs from disturbance.
 [Deprecated - use cpusets instead]
 Format: [flag-list,]<cpu-list>
@@ -2662,7 +2666,7 @@
 Valid arguments: on, off
 Default: on

-nohz_full= [KNL,BOOT]
+nohz_full= [KNL,BOOT,SMP,ISOL]
 The argument is a cpu list, as described above.
 In kernels built with CONFIG_NO_HZ_FULL=y, set
 the specified list of CPUs whose tick will be stopped
@@ -2708,6 +2712,8 @@
 steal time is computed, but won't influence scheduler
 behaviour

+nopti [X86-64] Disable kernel page table isolation
+
 nolapic [X86-32,APIC] Do not enable or use the local APIC.
 nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3288,12 @@
 pt. [PARIDE]
 See Documentation/blockdev/paride.txt.

+pti= [X86_64]
+Control user/kernel address space isolation:
+on - enable
+off - disable
+auto - default setting
+
 pty.legacy_count=
 [KNL] Number of legacy pty's. Overwrites compiled-in
 default number.
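The pti= option documented in the hunk above is a simple tri-state (on/off/auto) boot parameter. Purely as an illustration -- this is not code contained in this merge, and the real PTI code parses the option much earlier in boot with different helpers -- a handler for such a tri-state option could be registered with the kernel's early_param() mechanism along these lines (all names below are hypothetical):

/* Hypothetical sketch only: a tri-state boot option handler. */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/string.h>

enum example_pti_mode { EXAMPLE_PTI_AUTO, EXAMPLE_PTI_ON, EXAMPLE_PTI_OFF };

static enum example_pti_mode example_pti_mode __initdata = EXAMPLE_PTI_AUTO;

static int __init example_pti_setup(char *str)
{
	if (!str)
		return -EINVAL;
	if (!strcmp(str, "on"))
		example_pti_mode = EXAMPLE_PTI_ON;
	else if (!strcmp(str, "off"))
		example_pti_mode = EXAMPLE_PTI_OFF;
	else if (!strcmp(str, "auto"))
		example_pti_mode = EXAMPLE_PTI_AUTO;
	else
		return -EINVAL;
	return 0;
}
early_param("pti", example_pti_setup);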

View file

@@ -230,7 +230,7 @@ If supported by your machine this will be exposed by the WMI bus with
 a sysfs attribute called "force_power".

 For example the intel-wmi-thunderbolt driver exposes this attribute in:
-/sys/devices/platform/PNP0C14:00/wmi_bus/wmi_bus-PNP0C14:00/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
+/sys/bus/wmi/devices/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power

 To force the power to on, write 1 to this attribute file.
 To disable force power, write 0 to this attribute file.
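As a usage illustration only (nothing in this merge adds such a tool), here is a minimal user-space sketch that writes "1" or "0" to the force_power attribute at the new sysfs path quoted in the hunk above; it needs root and assumes the intel-wmi-thunderbolt driver is bound:

#include <stdio.h>

int main(int argc, char **argv)
{
	/* Path taken verbatim from the documentation hunk above. */
	const char *path =
		"/sys/bus/wmi/devices/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power";
	const char *val = (argc > 1) ? argv[1] : "1"; /* "1" forces power on, "0" releases it */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fputs(val, f) == EOF) {
		perror("fputs");
		fclose(f);
		return 1;
	}
	if (fclose(f) != 0) {
		perror("fclose");
		return 1;
	}
	return 0;
}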

View file

@@ -13,7 +13,6 @@ Required properties:
 at25df321a
 at25df641
 at26df081a
-en25s64
 mr25h128
 mr25h256
 mr25h10

@@ -33,7 +32,6 @@ Required properties:
 s25fl008k
 s25fl064k
 sst25vf040b
-sst25wf040b
 m25p40
 m25p80
 m25p16

View file

@@ -73,7 +73,7 @@ Example:
 compatible = "dlg,da7218";
 reg = <0x1a>;
 interrupt-parent = <&gpio6>;
-interrupts = <11 IRQ_TYPE_LEVEL_HIGH>;
+interrupts = <11 IRQ_TYPE_LEVEL_LOW>;
 wakeup-source;

 VDD-supply = <&reg_audio>;

View file

@@ -77,7 +77,7 @@ Example:
 reg = <0x1a>;
 interrupt-parent = <&gpio6>;
-interrupts = <11 IRQ_TYPE_LEVEL_HIGH>;
+interrupts = <11 IRQ_TYPE_LEVEL_LOW>;

 VDD-supply = <&reg_audio>;
 VDDMIC-supply = <&reg_audio>;

View file

@@ -12,24 +12,30 @@ Required properties:
 - "fsl,imx53-ecspi" for SPI compatible with the one integrated on i.MX53 and later Soc
 - reg : Offset and length of the register set for the device
 - interrupts : Should contain CSPI/eCSPI interrupt
-- cs-gpios : Specifies the gpio pins to be used for chipselects.
 - clocks : Clock specifiers for both ipg and per clocks.
 - clock-names : Clock names should include both "ipg" and "per"
   See the clock consumer binding,
   Documentation/devicetree/bindings/clock/clock-bindings.txt
-- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
-  Documentation/devicetree/bindings/dma/dma.txt
-- dma-names: DMA request names should include "tx" and "rx" if present.

-Obsolete properties:
-- fsl,spi-num-chipselects : Contains the number of the chipselect
+Recommended properties:
+- cs-gpios : GPIOs to use as chip selects, see spi-bus.txt. While the native chip
+  select lines can be used, they appear to always generate a pulse between each
+  word of a transfer. Most use cases will require GPIO based chip selects to
+  generate a valid transaction.

 Optional properties:
+- num-cs : Number of total chip selects, see spi-bus.txt.
+- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
+  Documentation/devicetree/bindings/dma/dma.txt.
+- dma-names: DMA request names, if present, should include "tx" and "rx".
 - fsl,spi-rdy-drctl: Integer, representing the value of DRCTL, the register
   controlling the SPI_READY handling. Note that to enable the DRCTL consideration,
   the SPI_READY mode-flag needs to be set too.
   Valid values are: 0 (disabled), 1 (edge-triggered burst) and 2 (level-triggered burst).

+Obsolete properties:
+- fsl,spi-num-chipselects : Contains the number of the chipselect

 Example:

 ecspi@70010000 {

View file

@@ -1,6 +1,4 @@
-<previous description obsolete, deleted>
-
 Virtual memory map with 4 level page tables:

 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm

@@ -14,13 +12,16 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable)
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

 Virtual memory map with 5 level page tables:
@@ -29,26 +30,29 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space
+[fixmap start] - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole

 Architecture defines a 64-bit virtual address. Implementations can support
 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
-through to the most-significant implemented bit are set to either all ones
-or all zero. This causes hole between user space and kernel addresses.
+through to the most-significant implemented bit are sign extended.
+This causes hole between user space and kernel addresses if you interpret them
+as unsigned.

 The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
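To make the sign-extension wording above concrete, here is a small illustrative C sketch (not taken from the kernel tree): with 48-bit virtual addresses, bits 63..47 must all be copies of bit 47, which is exactly what splits the address space into a low user half, a high kernel half, and the non-canonical hole in between.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* A 48-bit virtual address is canonical when bits 63..47 are a copy of
 * bit 47.  Sign-extending from bit 47 and comparing with the original
 * value is one way to check that (relies on an arithmetic right shift,
 * which common compilers provide for signed types). */
static bool is_canonical_48(uint64_t addr)
{
	return (uint64_t)(((int64_t)(addr << 16)) >> 16) == addr;
}

int main(void)
{
	printf("%d\n", is_canonical_48(0x00007fffffffffffULL)); /* 1: top of user space */
	printf("%d\n", is_canonical_48(0x0000800000000000ULL)); /* 0: start of the hole */
	printf("%d\n", is_canonical_48(0xffff800000000000ULL)); /* 1: start of kernel space */
	return 0;
}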
@@ -58,9 +62,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of
 the processes using the page fault handler, with init_top_pgt as
 reference.

-Current X86-64 implementations support up to 46 bits of address space (64 TB),
-which is our current limit. This expands into MBZ space in the page tables.
-
 We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual
 memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available

@@ -72,5 +73,3 @@ following fixmap section.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
-
--Andi Kleen, Jul 2004

View file

@@ -2621,24 +2621,22 @@ F: fs/bfs/
 F: include/uapi/linux/bfs_fs.h

 BLACKFIN ARCHITECTURE
-M: Steven Miao <realmz6@gmail.com>
 L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 T: git git://git.code.sf.net/p/adi-linux/code
 W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
 F: arch/blackfin/

 BLACKFIN EMAC DRIVER
 L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
 F: drivers/net/ethernet/adi/

 BLACKFIN MEDIA DRIVER
-M: Scott Jiang <scott.jiang.linux@gmail.com>
 L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W: http://blackfin.uclinux.org/
-S: Supported
+S: Orphan
 F: drivers/media/platform/blackfin/
 F: drivers/media/i2c/adv7183*
 F: drivers/media/i2c/vs6624*
@@ -2646,25 +2644,25 @@ F: drivers/media/i2c/vs6624*
 BLACKFIN RTC DRIVER
 L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
 F: drivers/rtc/rtc-bfin.c

 BLACKFIN SDH DRIVER
 L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
 F: drivers/mmc/host/bfin_sdh.c

 BLACKFIN SERIAL DRIVER
 L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
 F: drivers/tty/serial/bfin_uart.c

 BLACKFIN WATCHDOG DRIVER
 L: adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W: http://blackfin.uclinux.org
-S: Supported
+S: Orphan
 F: drivers/watchdog/bfin_wdt.c

 BLINKM RGB LED DRIVER
@@ -13493,6 +13491,7 @@ M: Mika Westerberg <mika.westerberg@linux.intel.com>
 M: Yehezkel Bernat <yehezkel.bernat@intel.com>
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
 S: Maintained
+F: Documentation/admin-guide/thunderbolt.rst
 F: drivers/thunderbolt/
 F: include/linux/thunderbolt.h

View file

@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc4
+EXTRAVERSION = -rc6
 NAME = Fearless Coyote

 # *DOCUMENTATION*

@@ -789,6 +789,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
 # disable invalid "can't wrap" optimizations for signed / pointers
 KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)

+# Make sure -fstack-check isn't enabled (like gentoo apparently did)
+KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
+
 # conserve stack if available
 KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)

View file

@@ -85,7 +85,11 @@
 .pushsection .text.fixup,"ax"
 .align 4
 9001: mov r4, #-EFAULT
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+ldr r5, [sp, #9*4] @ *err_ptr
+#else
 ldr r5, [sp, #8*4] @ *err_ptr
+#endif
 str r4, [r5]
 ldmia sp, {r1, r2} @ retrieve dst, len
 add r2, r2, r1

View file

@@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
 {
 u64 reg;

+/* Clear pmscr in case of early return */
+*pmscr_el1 = 0;
+
 /* SPE present on this CPU? */
 if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
 ID_AA64DFR0_PMSVER_SHIFT))

View file

@@ -123,8 +123,8 @@ int puts(const char *s)
 while ((nuline = strchr(s, '\n')) != NULL) {
 if (nuline != s)
 pdc_iodc_print(s, nuline - s);
 pdc_iodc_print("\r\n", 2);
 s = nuline + 1;
 }
 if (*s != '\0')
 pdc_iodc_print(s, strlen(s));

View file

@@ -35,7 +35,12 @@ struct thread_info {

 /* thread information allocation */
+#ifdef CONFIG_IRQSTACKS
+#define THREAD_SIZE_ORDER 2 /* PA-RISC requires at least 16k stack */
+#else
 #define THREAD_SIZE_ORDER 3 /* PA-RISC requires at least 32k stack */
+#endif

 /* Be sure to hunt all references to this down when you change the size of
 * the kernel stack */
 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)

View file

@@ -878,9 +878,6 @@ ENTRY_CFI(syscall_exit_rfi)
 STREG %r19,PT_SR7(%r16)

 intr_return:
-/* NOTE: Need to enable interrupts incase we schedule. */
-ssm PSW_SM_I, %r0
-
 /* check for reschedule */
 mfctl %cr30,%r1
 LDREG TI_FLAGS(%r1),%r19 /* sched.h: TIF_NEED_RESCHED */

@@ -907,6 +904,11 @@ intr_check_sig:
 LDREG PT_IASQ1(%r16), %r20
 cmpib,COND(=),n 0,%r20,intr_restore /* backward */

+/* NOTE: We need to enable interrupts if we have to deliver
+ * signals. We used to do this earlier but it caused kernel
+ * stack overflows. */
+ssm PSW_SM_I, %r0
+
 copy %r0, %r25 /* long in_syscall = 0 */
 #ifdef CONFIG_64BIT
 ldo -16(%r30),%r29 /* Reference param save area */

@@ -958,6 +960,10 @@ intr_do_resched:
 cmpib,COND(=) 0, %r20, intr_do_preempt
 nop

+/* NOTE: We need to enable interrupts if we schedule. We used
+ * to do this earlier but it caused kernel stack overflows. */
+ssm PSW_SM_I, %r0
+
 #ifdef CONFIG_64BIT
 ldo -16(%r30),%r29 /* Reference param save area */
 #endif

View file

@@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc)
 __INITRODATA
+.align 4
 .export os_hpmc_size
 os_hpmc_size:
 .word .os_hpmc_end-.os_hpmc

View file

@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 #include <linux/kallsyms.h>
 #include <linux/sort.h>
-#include <linux/sched.h>
 #include <linux/uaccess.h>

 #include <asm/assembly.h>

View file

@@ -16,9 +16,7 @@
 #include <linux/preempt.h>
 #include <linux/init.h>

-#include <asm/processor.h>
 #include <asm/delay.h>
 #include <asm/special_insns.h> /* for mfctl() */
 #include <asm/processor.h> /* for boot_cpu_data */

View file

@@ -160,9 +160,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }

-static inline void arch_dup_mmap(struct mm_struct *oldmm,
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
 struct mm_struct *mm)
 {
+return 0;
 }

 #ifndef CONFIG_PPC_BOOK3S_64

View file

@@ -1403,7 +1403,7 @@ void show_regs(struct pt_regs * regs)
 printk("NIP: "REG" LR: "REG" CTR: "REG"\n",
 regs->nip, regs->link, regs->ctr);
-printk("REGS: %p TRAP: %04lx %s (%s)\n",
+printk("REGS: %px TRAP: %04lx %s (%s)\n",
 regs, regs->trap, print_tainted(), init_utsname()->release);
 printk("MSR: "REG" ", regs->msr);
 print_msr_bits(regs->msr);

View file

@@ -725,7 +725,8 @@ u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)

 /* Return the per-cpu state for state saving/migration */
 return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
-(u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT;
+(u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
+(u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
 }

 int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)

@@ -1558,7 +1559,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
 /*
 * Restore P and Q. If the interrupt was pending, we
-* force both P and Q, which will trigger a resend.
+* force Q and !P, which will trigger a resend.
 *
 * That means that a guest that had both an interrupt
 * pending (queued) and Q set will restore with only

@@ -1566,7 +1567,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
 * is perfectly fine as coalescing interrupts that haven't
 * been presented yet is always allowed.
 */
-if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
 state->old_p = true;
 if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
 state->old_q = true;

View file

@@ -763,7 +763,8 @@ emit_clear:
 func = (u8 *) __bpf_call_base + imm;

 /* Save skb pointer if we need to re-cache skb data */
-if (bpf_helper_changes_pkt_data(func))
+if ((ctx->seen & SEEN_SKB) &&
+    bpf_helper_changes_pkt_data(func))
 PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));

 bpf_jit_emit_func_call(image, ctx, (u64)func);

@@ -772,7 +773,8 @@ emit_clear:
 PPC_MR(b2p[BPF_REG_0], 3);

 /* refresh skb cache */
-if (bpf_helper_changes_pkt_data(func)) {
+if ((ctx->seen & SEEN_SKB) &&
+    bpf_helper_changes_pkt_data(func)) {
 /* reload skb pointer to r3 */
 PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
 bpf_jit_emit_skb_loads(image, ctx);

View file

@@ -410,8 +410,12 @@ static __u64 power_pmu_bhrb_to(u64 addr)
 int ret;
 __u64 target;

-if (is_kernel_addr(addr))
-return branch_target((unsigned int *)addr);
+if (is_kernel_addr(addr)) {
+if (probe_kernel_read(&instr, (void *)addr, sizeof(instr)))
+return 0;
+
+return branch_target(&instr);
+}

 /* Userspace: need copy instruction here then translate it */
 pagefault_disable();

View file

@@ -309,6 +309,19 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask))
 return 0;

+/*
+ * Check whether nest_imc is registered. We could end up here if the
+ * cpuhotplug callback registration fails. i.e, callback invokes the
+ * offline path for all successfully registered nodes. At this stage,
+ * nest_imc pmu will not be registered and we should return here.
+ *
+ * We return with a zero since this is not an offline failure. And
+ * cpuhp_setup_state() returns the actual failure reason to the caller,
+ * which in turn will call the cleanup routine.
+ */
+if (!nest_pmus)
+return 0;
+
 /*
 * Now that this cpu is one of the designated,
 * find a next cpu a) which is online and b) in same chip.

@@ -1171,6 +1184,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
 if (nest_pmus == 1) {
 cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
 kfree(nest_imc_refc);
+kfree(per_nest_pmu_arr);
 }

 if (nest_pmus > 0)

@@ -1195,7 +1209,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
 kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
 kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
 kfree(pmu_ptr);
-kfree(per_nest_pmu_arr);
 return;
 }

@@ -1309,6 +1322,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id
 ret = nest_pmu_cpumask_init();
 if (ret) {
 mutex_unlock(&nest_init_lock);
+kfree(nest_imc_refc);
+kfree(per_nest_pmu_arr);
 goto err_free;
 }
 }

View file

@@ -354,6 +354,7 @@ static int fsl_of_msi_remove(struct platform_device *ofdev)
 }

 static struct lock_class_key fsl_msi_irq_class;
+static struct lock_class_key fsl_msi_irq_request_class;

 static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
 int offset, int irq_index)

@@ -373,7 +374,8 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
 dev_err(&dev->dev, "No memory for MSI cascade data\n");
 return -ENOMEM;
 }
-irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class);
+irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class,
+&fsl_msi_irq_request_class);
 cascade_data->index = offset;
 cascade_data->msi_data = msi;
 cascade_data->virq = virt_msir;

View file

@@ -55,8 +55,7 @@ struct bpf_jit {
 #define SEEN_LITERAL 8 /* code uses literals */
 #define SEEN_FUNC 16 /* calls C functions */
 #define SEEN_TAIL_CALL 32 /* code uses tail calls */
-#define SEEN_SKB_CHANGE 64 /* code changes skb data */
-#define SEEN_REG_AX 128 /* code uses constant blinding */
+#define SEEN_REG_AX 64 /* code uses constant blinding */
 #define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB)

 /*

@@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
 EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
 REG_15, 152);
 }
-if (jit->seen & SEEN_SKB)
+if (jit->seen & SEEN_SKB) {
 emit_load_skb_data_hlen(jit);
-if (jit->seen & SEEN_SKB_CHANGE)
 /* stg %b1,ST_OFF_SKBP(%r0,%r15) */
 EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
 STK_OFF_SKBP);
+}
 }

 /*

@@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 EMIT2(0x0d00, REG_14, REG_W1);
 /* lgr %b0,%r2: load return value into %b0 */
 EMIT4(0xb9040000, BPF_REG_0, REG_2);
-if (bpf_helper_changes_pkt_data((void *)func)) {
-jit->seen |= SEEN_SKB_CHANGE;
+if ((jit->seen & SEEN_SKB) &&
+    bpf_helper_changes_pkt_data((void *)func)) {
 /* lg %b1,ST_OFF_SKBP(%r15) */
 EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
 REG_15, STK_OFF_SKBP);

View file

@@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
 .previous

 ENTRY(__arch_hweight64)
-sethi %hi(__sw_hweight16), %g1
-jmpl %g1 + %lo(__sw_hweight16), %g0
+sethi %hi(__sw_hweight64), %g1
+jmpl %g1 + %lo(__sw_hweight64), %g0
 nop
 ENDPROC(__arch_hweight64)
 EXPORT_SYMBOL(__arch_hweight64)

View file

@@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
 if (!printk_ratelimit())
 return;

-printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 tsk->comm, task_pid_nr(tsk), address,
 (void *)regs->pc, (void *)regs->u_regs[UREG_I7],

View file

@@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
 if (!printk_ratelimit())
 return;

-printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 tsk->comm, task_pid_nr(tsk), address,
 (void *)regs->tpc, (void *)regs->u_regs[UREG_I7],

View file

@@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 u8 *func = ((u8 *)__bpf_call_base) + imm;

 ctx->saw_call = true;
+if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);

 emit_call((u32 *)func, ctx);
 emit_nop(ctx);

 emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);

-if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind)
-load_skb_regs(ctx, bpf2sparc[BPF_REG_6]);
+if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+load_skb_regs(ctx, L7);
 break;
 }

View file

@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm);
 /*
 * Needed since we do not use the asm-generic/mm_hooks.h:
 */
-static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
 uml_setup_stubs(mm);
+return 0;
 }
 extern void arch_exit_mmap(struct mm_struct *mm);
 static inline void arch_unmap(struct mm_struct *mm,

View file

@@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs)
 if (!printk_ratelimit())
 return;

-printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x",
+printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
 (void *)UPT_IP(regs), (void *)UPT_SP(regs),

View file

@@ -81,9 +81,10 @@ do { \
 } \
 } while (0)

-static inline void arch_dup_mmap(struct mm_struct *oldmm,
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
 struct mm_struct *mm)
 {
+return 0;
 }

 static inline void arch_unmap(struct mm_struct *mm,

View file

@@ -926,7 +926,8 @@ config MAXSMP
 config NR_CPUS
 int "Maximum number of CPUs" if SMP && !MAXSMP
 range 2 8 if SMP && X86_32 && !X86_BIGSMP
-range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK
+range 2 64 if SMP && X86_32 && X86_BIGSMP
+range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
 range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
 default "1" if !SMP
 default "8192" if MAXSMP

View file

@@ -23,6 +23,9 @@
 */
 #undef CONFIG_AMD_MEM_ENCRYPT

+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
 #include "misc.h"

 /* These actually do the work of building the kernel identity maps. */

View file

@@ -80,39 +80,43 @@ genfdimage288() {
 mcopy $FBZIMAGE w:linux
 }

-genisoimage() {
+geniso() {
 tmp_dir=`dirname $FIMAGE`/isoimage
 rm -rf $tmp_dir
 mkdir $tmp_dir
-for i in lib lib64 share end ; do
+for i in lib lib64 share ; do
 for j in syslinux ISOLINUX ; do
 if [ -f /usr/$i/$j/isolinux.bin ] ; then
 isolinux=/usr/$i/$j/isolinux.bin
-cp $isolinux $tmp_dir
 fi
 done
 for j in syslinux syslinux/modules/bios ; do
 if [ -f /usr/$i/$j/ldlinux.c32 ]; then
 ldlinux=/usr/$i/$j/ldlinux.c32
-cp $ldlinux $tmp_dir
 fi
 done
 if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
 break
 fi
-if [ $i = end -a -z "$isolinux" ] ; then
-echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
-exit 1
-fi
 done
+if [ -z "$isolinux" ] ; then
+echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
+exit 1
+fi
+if [ -z "$ldlinux" ] ; then
+echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.'
+exit 1
+fi
+cp $isolinux $tmp_dir
+cp $ldlinux $tmp_dir
 cp $FBZIMAGE $tmp_dir/linux
 echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
 if [ -f "$FDINITRD" ] ; then
 cp "$FDINITRD" $tmp_dir/initrd.img
 fi
-mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \
-	-c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \
-	$tmp_dir
+genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \
+	-b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \
+	-boot-info-table $tmp_dir
 isohybrid $FIMAGE 2>/dev/null || true
 rm -rf $tmp_dir
 }

@@ -121,6 +125,6 @@ case $1 in
 bzdisk) genbzdisk;;
 fdimage144) genfdimage144;;
 fdimage288) genfdimage288;;
-isoimage) genisoimage;;
+isoimage) geniso;;
 *) echo 'Unknown image format'; exit 1;
 esac

View file

@@ -1,6 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/jump_label.h>
 #include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>

 /*

@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two
* halves:
*/
#define PTI_SWITCH_PGTABLES_MASK (1<<PAGE_SHIFT)
#define PTI_SWITCH_MASK (PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
.macro SET_NOFLUSH_BIT reg:req
bts $X86_CR3_PCID_NOFLUSH_BIT, \reg
.endm
.macro ADJUST_KERNEL_CR3 reg:req
ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
andq $(~PTI_SWITCH_MASK), \reg
.endm
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
ADJUST_KERNEL_CR3 \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
.endm
#define THIS_CPU_user_pcid_flush_mask \
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
/*
* Test if the ASID needs a flush.
*/
movq \scratch_reg, \scratch_reg2
andq $(0x7FF), \scratch_reg /* mask ASID */
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
jnc .Lnoflush_\@
/* Flush needed, clear the bit */
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
movq \scratch_reg2, \scratch_reg
jmp .Lwrcr3_\@
.Lnoflush_\@:
movq \scratch_reg2, \scratch_reg
SET_NOFLUSH_BIT \scratch_reg
.Lwrcr3_\@:
/* Flip the PGD and ASID to the user version */
orq $(PTI_SWITCH_MASK), \scratch_reg
mov \scratch_reg, %cr3
.Lend_\@:
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
pushq %rax
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
popq %rax
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
movq %cr3, \scratch_reg
movq \scratch_reg, \save_reg
/*
* Is the "switch mask" all zero? That means that both of
* these are zero:
*
* 1. The user/kernel PCID bit, and
* 2. The user/kernel "bit" that points CR3 to the
* bottom half of the 8k PGD
*
* That indicates a kernel CR3 value, not a user CR3.
*/
testq $(PTI_SWITCH_MASK), \scratch_reg
jz .Ldone_\@
ADJUST_KERNEL_CR3 \scratch_reg
movq \scratch_reg, %cr3
.Ldone_\@:
.endm
.macro RESTORE_CR3 scratch_reg:req save_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
/*
* KERNEL pages can always resume with NOFLUSH as we do
* explicit flushes.
*/
bt $X86_CR3_PTI_SWITCH_BIT, \save_reg
jnc .Lnoflush_\@
/*
* Check if there's a pending flush for the user ASID we're
* about to set.
*/
movq \save_reg, \scratch_reg
andq $(0x7FF), \scratch_reg
bt \scratch_reg, THIS_CPU_user_pcid_flush_mask
jnc .Lnoflush_\@
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
jmp .Lwrcr3_\@
.Lnoflush_\@:
SET_NOFLUSH_BIT \save_reg
.Lwrcr3_\@:
/*
* The CR3 write could be avoided when not changing its value,
* but would require a CR3 read *and* a scratch register.
*/
movq \save_reg, %cr3
.Lend_\@:
.endm
#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
.endm
.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
.endm
.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
.endm
.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
.endm
.macro RESTORE_CR3 scratch_reg:req save_reg:req
.endm
#endif
 #endif /* CONFIG_X86_64 */

 /*

View file

@@ -941,9 +941,10 @@ ENTRY(debug)
 movl %esp, %eax # pt_regs pointer

 /* Are we currently on the SYSENTER stack? */
-PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
-subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
-cmpl $SIZEOF_SYSENTER_stack, %ecx
+movl PER_CPU_VAR(cpu_entry_area), %ecx
+addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+cmpl $SIZEOF_entry_stack, %ecx
 jb .Ldebug_from_sysenter_stack

 TRACE_IRQS_OFF

@@ -984,9 +985,10 @@ ENTRY(nmi)
 movl %esp, %eax # pt_regs pointer

 /* Are we currently on the SYSENTER stack? */
-PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
-subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */
-cmpl $SIZEOF_SYSENTER_stack, %ecx
+movl PER_CPU_VAR(cpu_entry_area), %ecx
+addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+subl %eax, %ecx /* ecx = (end of entry_stack) - esp */
+cmpl $SIZEOF_entry_stack, %ecx
 jb .Lnmi_from_sysenter_stack

 /* Not on SYSENTER stack. */

View file

@@ -23,7 +23,6 @@
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>

@@ -40,6 +39,8 @@
 #include <asm/frame.h>
 #include <linux/err.h>

+#include "calling.h"
+
 .code64
 .section .entry.text, "ax"
@@ -140,6 +141,67 @@ END(native_usergs_sysret64)
 * with them due to bugs in both AMD and Intel CPUs.
 */
.pushsection .entry_trampoline, "ax"
/*
* The code in here gets remapped into cpu_entry_area's trampoline. This means
* that the assembler and linker have the wrong idea as to where this code
* lives (and, in fact, it's mapped more than once, so it's not even at a
* fixed address). So we can't reference any symbols outside the entry
* trampoline and expect it to work.
*
* Instead, we carefully abuse %rip-relative addressing.
* _entry_trampoline(%rip) refers to the start of the remapped) entry
* trampoline. We can thus find cpu_entry_area with this macro:
*/
#define CPU_ENTRY_AREA \
_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
ENTRY(entry_SYSCALL_64_trampoline)
UNWIND_HINT_EMPTY
swapgs
/* Stash the user RSP. */
movq %rsp, RSP_SCRATCH
/* Note: using %rsp as a scratch reg. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
/* Load the top of the task stack into RSP */
movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
/* Start building the simulated IRET frame. */
pushq $__USER_DS /* pt_regs->ss */
pushq RSP_SCRATCH /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
/*
* x86 lacks a near absolute jump, and we can't jump to the real
* entry text with a relative jump. We could push the target
* address and then use retq, but this destroys the pipeline on
* many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
* spill RDI and restore it in a second-stage trampoline.
*/
pushq %rdi
movq $entry_SYSCALL_64_stage2, %rdi
jmp *%rdi
END(entry_SYSCALL_64_trampoline)
.popsection
ENTRY(entry_SYSCALL_64_stage2)
UNWIND_HINT_EMPTY
popq %rdi
jmp entry_SYSCALL_64_after_hwframe
END(entry_SYSCALL_64_stage2)
 ENTRY(entry_SYSCALL_64)
 UNWIND_HINT_EMPTY
 /*

@@ -149,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
 */
 swapgs
/*
* This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
* is not required to switch CR3.
*/
 movq %rsp, PER_CPU_VAR(rsp_scratch)
 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

@@ -330,8 +396,25 @@ syscall_return_via_sysret:
 popq %rsi /* skip rcx */
 popq %rdx
 popq %rsi
/*
* Now all regs are restored except RSP and RDI.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
 popq %rdi
-movq RSP-ORIG_RAX(%rsp), %rsp
+popq %rsp
 USERGS_SYSRET64
 END(entry_SYSCALL_64)
@@ -466,12 +549,13 @@ END(irq_entries_start)
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
 #ifdef CONFIG_DEBUG_ENTRY
-pushfq
-testl $X86_EFLAGS_IF, (%rsp)
+pushq %rax
+SAVE_FLAGS(CLBR_RAX)
+testl $X86_EFLAGS_IF, %eax
 jz .Lokay_\@
 ud2
 .Lokay_\@:
-addq $8, %rsp
+popq %rax
 #endif
 .endm
@@ -563,6 +647,13 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
 .macro interrupt func
 cld
testb $3, CS-ORIG_RAX(%rsp)
jz 1f
SWAPGS
call switch_to_thread_stack
1:
 ALLOC_PT_GPREGS_ON_STACK
 SAVE_C_REGS
 SAVE_EXTRA_REGS

@@ -572,12 +663,8 @@ END(irq_entries_start)
 jz 1f

 /*
-* IRQ from user mode. Switch to kernel gsbase and inform context
-* tracking that we're in kernel mode.
-*/
-SWAPGS
-
-/*
+* IRQ from user mode.
+*
 * We need to tell lockdep that IRQs are off. We can't do this until
 * we fix gsbase, and we should do it before enter_from_user_mode
 * (which can take locks). Since TRACE_IRQS_OFF idempotent,
@@ -630,10 +717,43 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 ud2
 1:
 #endif
-SWAPGS
 POP_EXTRA_REGS
-POP_C_REGS
-addq $8, %rsp /* skip regs->orig_ax */
+popq %r11
+popq %r10
popq %r9
popq %r8
popq %rax
popq %rcx
popq %rdx
popq %rsi
/*
* The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
/* Copy the IRET frame to the trampoline stack. */
pushq 6*8(%rdi) /* SS */
pushq 5*8(%rdi) /* RSP */
pushq 4*8(%rdi) /* EFLAGS */
pushq 3*8(%rdi) /* CS */
pushq 2*8(%rdi) /* RIP */
/* Push user RDI on the trampoline stack. */
pushq (%rdi)
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
/* Restore RDI. */
popq %rdi
SWAPGS
 INTERRUPT_RETURN

@@ -713,7 +833,9 @@ native_irq_return_ldt:
 */
 pushq %rdi /* Stash user RDI */
-SWAPGS
+SWAPGS /* to kernel GS */
+SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */
 movq PER_CPU_VAR(espfix_waddr), %rdi
 movq %rax, (0*8)(%rdi) /* user RAX */
 movq (1*8)(%rsp), %rax /* user RIP */
@@ -729,7 +851,6 @@ native_irq_return_ldt:
 /* Now RAX == RSP. */
 andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */

-popq %rdi /* Restore user RDI */
-
 /*
 * espfix_stack[31:16] == 0. The page tables are set up such that
@@ -740,7 +861,11 @@ native_irq_return_ldt:
 * still points to an RO alias of the ESPFIX stack.
 */
 orq PER_CPU_VAR(espfix_stack), %rax
-SWAPGS
+
+SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+SWAPGS /* to user GS */
+popq %rdi /* Restore user RDI */
+
 movq %rax, %rsp
 UNWIND_HINT_IRET_REGS offset=8
@@ -829,7 +954,35 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
 /*
 * Exception entry points.
 */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
/*
* Switch to the thread stack. This is called with the IRET frame and
* orig_ax on the stack. (That is, RDI..R12 are not on the stack and
* space has not been allocated for them.)
*/
ENTRY(switch_to_thread_stack)
UNWIND_HINT_FUNC
pushq %rdi
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
pushq 7*8(%rdi) /* regs->ss */
pushq 6*8(%rdi) /* regs->rsp */
pushq 5*8(%rdi) /* regs->eflags */
pushq 4*8(%rdi) /* regs->cs */
pushq 3*8(%rdi) /* regs->ip */
pushq 2*8(%rdi) /* regs->orig_ax */
pushq 8(%rdi) /* return address */
UNWIND_HINT_FUNC
movq (%rdi), %rdi
ret
END(switch_to_thread_stack)
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)

@@ -848,11 +1001,12 @@ ENTRY(\sym)
 ALLOC_PT_GPREGS_ON_STACK

-.if \paranoid
-.if \paranoid == 1
+.if \paranoid < 2
 testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
-jnz 1f
+jnz .Lfrom_usermode_switch_stack_\@
 .endif

+.if \paranoid
 call paranoid_entry
 .else
 call error_entry
@@ -894,20 +1048,15 @@ ENTRY(\sym)
 jmp error_exit
 .endif

-.if \paranoid == 1
+.if \paranoid < 2
 /*
-* Paranoid entry from userspace. Switch stacks and treat it
+* Entry from userspace. Switch stacks and treat it
 * as a normal entry. This means that paranoid handlers
 * run in real process context if user_mode(regs).
 */
-1:
+.Lfrom_usermode_switch_stack_\@:
 call error_entry
-
-movq %rsp, %rdi /* pt_regs pointer */
-call sync_regs
-movq %rax, %rsp /* switch stack */

 movq %rsp, %rdi /* pt_regs pointer */

 .if \has_error_code
@@ -1119,7 +1268,11 @@ ENTRY(paranoid_entry)
 js 1f /* negative -> in kernel */
 SWAPGS
 xorl %ebx, %ebx
-1: ret
+
+1:
+SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+ret
 END(paranoid_entry)

 /*
@@ -1141,6 +1294,7 @@ ENTRY(paranoid_exit)
 testl %ebx, %ebx /* swapgs needed? */
 jnz .Lparanoid_exit_no_swapgs
 TRACE_IRQS_IRETQ
+RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
 SWAPGS_UNSAFE_STACK
 jmp .Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
@@ -1168,8 +1322,18 @@ ENTRY(error_entry)
 * from user mode due to an IRET fault.
 */
 SWAPGS
+/* We have user CR3. Change to kernel CR3. */
+SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

 .Lerror_entry_from_usermode_after_swapgs:
/* Put us onto the real thread stack. */
popq %r12 /* save return addr in %12 */
movq %rsp, %rdi /* arg0 = pt_regs pointer */
call sync_regs
movq %rax, %rsp /* switch stack */
ENCODE_FRAME_POINTER
pushq %r12
 /*
 * We need to tell lockdep that IRQs are off. We can't do this until
 * we fix gsbase, and we should do it before enter_from_user_mode

@@ -1206,6 +1370,7 @@ ENTRY(error_entry)
 * .Lgs_change's error handler with kernel gsbase.
 */
 SWAPGS
+SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 jmp .Lerror_entry_done
 .Lbstep_iret:

@@ -1215,10 +1380,11 @@ ENTRY(error_entry)
 .Lerror_bad_iret:
 /*
-* We came from an IRET to user mode, so we have user gsbase.
-* Switch to kernel gsbase:
+* We came from an IRET to user mode, so we have user
+* gsbase and CR3. Switch to kernel gsbase and CR3:
 */
 SWAPGS
+SWITCH_TO_KERNEL_CR3 scratch_reg=%rax

 /*
 * Pretend that the exception came from user mode: set up pt_regs
@@ -1250,6 +1416,10 @@ END(error_exit)
 /*
 * Runs on exception stack. Xen PV does not go through this path at all,
 * so we can use real assembly here.
*
* Registers:
* %r14: Used to save/restore the CR3 of the interrupted context
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
 */
 ENTRY(nmi)
 UNWIND_HINT_IRET_REGS
@@ -1313,6 +1483,7 @@ ENTRY(nmi)
 swapgs
 cld
+SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
 movq %rsp, %rdx
 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1565,6 +1736,8 @@ end_repeat_nmi:
 movq $-1, %rsi
 call do_nmi

+RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
 testl %ebx, %ebx /* swapgs needed? */
 jnz nmi_restore
 nmi_swapgs:

View file

@@ -48,7 +48,11 @@
 */
 ENTRY(entry_SYSENTER_compat)
 /* Interrupts are off on entry. */
-SWAPGS_UNSAFE_STACK
+SWAPGS
+
+/* We are about to clobber %rsp anyway, clobbering here is OK */
+SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp

 /*
@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
pushq $0 /* pt_regs->r14 = 0 */ pushq $0 /* pt_regs->r14 = 0 */
pushq $0 /* pt_regs->r15 = 0 */ pushq $0 /* pt_regs->r15 = 0 */
/*
* We just saved %rdi so it is safe to clobber. It is not
* preserved during the C calls inside TRACE_IRQS_OFF anyway.
*/
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
/* /*
* User mode is traced as though IRQs are on, and SYSENTER * User mode is traced as though IRQs are on, and SYSENTER
* turned them off. * turned them off.
@ -256,10 +266,22 @@ sysret32_from_system_call:
* when the system call started, which is already known to user * when the system call started, which is already known to user
* code. We zero R8-R10 to avoid info leaks. * code. We zero R8-R10 to avoid info leaks.
*/ */
movq RSP-ORIG_RAX(%rsp), %rsp
/*
* The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
* on the process stack which is not mapped to userspace and
* not readable after we SWITCH_TO_USER_CR3. Delay the CR3
* switch until after the last reference to the process
* stack.
*
* %r8/%r9 are zeroed before the sysret, thus safe to clobber.
*/
SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
xorq %r8, %r8 xorq %r8, %r8
xorq %r9, %r9 xorq %r9, %r9
xorq %r10, %r10 xorq %r10, %r10
movq RSP-ORIG_RAX(%rsp), %rsp
swapgs swapgs
sysretl sysretl
END(entry_SYSCALL_compat) END(entry_SYSCALL_compat)
@ -306,8 +328,11 @@ ENTRY(entry_INT80_compat)
*/ */
movl %eax, %eax movl %eax, %eax
/* Construct struct pt_regs on stack (iret frame is already on stack) */
pushq %rax /* pt_regs->orig_ax */ pushq %rax /* pt_regs->orig_ax */
/* switch to thread stack expects orig_ax to be pushed */
call switch_to_thread_stack
pushq %rdi /* pt_regs->di */ pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */ pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */ pushq %rdx /* pt_regs->dx */

View file

@ -37,6 +37,7 @@
#include <asm/unistd.h> #include <asm/unistd.h>
#include <asm/fixmap.h> #include <asm/fixmap.h>
#include <asm/traps.h> #include <asm/traps.h>
#include <asm/paravirt.h>
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include "vsyscall_trace.h" #include "vsyscall_trace.h"
@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
WARN_ON_ONCE(address != regs->ip); WARN_ON_ONCE(address != regs->ip);
/* This should be unreachable in NATIVE mode. */
if (WARN_ON(vsyscall_mode == NATIVE))
return false;
if (vsyscall_mode == NONE) { if (vsyscall_mode == NONE) {
warn_bad_vsyscall(KERN_INFO, regs, warn_bad_vsyscall(KERN_INFO, regs,
"vsyscall attempted with vsyscall=none"); "vsyscall attempted with vsyscall=none");
@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr)
return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
} }
/*
* The VSYSCALL page is the only user-accessible page in the kernel address
* range. Normally, the kernel page tables can have _PAGE_USER clear, but
* the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
* are enabled.
*
* Some day we may create a "minimal" vsyscall mode in which we emulate
* vsyscalls but leave the page not present. If so, we skip calling
* this.
*/
void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
p4d = p4d_offset(pgd, VSYSCALL_ADDR);
#if CONFIG_PGTABLE_LEVELS >= 5
p4d->p4d |= _PAGE_USER;
#endif
pud = pud_offset(p4d, VSYSCALL_ADDR);
set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
pmd = pmd_offset(pud, VSYSCALL_ADDR);
set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
}
void __init map_vsyscall(void) void __init map_vsyscall(void)
{ {
extern char __vsyscall_page; extern char __vsyscall_page;
unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
if (vsyscall_mode != NONE) if (vsyscall_mode != NONE) {
__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
vsyscall_mode == NATIVE vsyscall_mode == NATIVE
? PAGE_KERNEL_VSYSCALL ? PAGE_KERNEL_VSYSCALL
: PAGE_KERNEL_VVAR); : PAGE_KERNEL_VVAR);
set_vsyscall_pgtable_user_bits(swapper_pg_dir);
}
BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
(unsigned long)VSYSCALL_ADDR); (unsigned long)VSYSCALL_ADDR);

View file

@ -3847,6 +3847,8 @@ static struct attribute *intel_pmu_attrs[] = {
__init int intel_pmu_init(void) __init int intel_pmu_init(void)
{ {
struct attribute **extra_attr = NULL;
struct attribute **to_free = NULL;
union cpuid10_edx edx; union cpuid10_edx edx;
union cpuid10_eax eax; union cpuid10_eax eax;
union cpuid10_ebx ebx; union cpuid10_ebx ebx;
@ -3854,7 +3856,6 @@ __init int intel_pmu_init(void)
unsigned int unused; unsigned int unused;
struct extra_reg *er; struct extra_reg *er;
int version, i; int version, i;
struct attribute **extra_attr = NULL;
char *name; char *name;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@ -4294,6 +4295,7 @@ __init int intel_pmu_init(void)
extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
hsw_format_attr : nhm_format_attr; hsw_format_attr : nhm_format_attr;
extra_attr = merge_attr(extra_attr, skl_format_attr); extra_attr = merge_attr(extra_attr, skl_format_attr);
to_free = extra_attr;
x86_pmu.cpu_events = get_hsw_events_attrs(); x86_pmu.cpu_events = get_hsw_events_attrs();
intel_pmu_pebs_data_source_skl( intel_pmu_pebs_data_source_skl(
boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
@ -4401,6 +4403,7 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, "); pr_cont("full-width counters, ");
} }
kfree(to_free);
return 0; return 0;
} }

View file

@ -3,16 +3,18 @@
#include <linux/types.h> #include <linux/types.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <asm/cpu_entry_area.h>
#include <asm/perf_event.h> #include <asm/perf_event.h>
#include <asm/insn.h> #include <asm/insn.h>
#include "../perf_event.h" #include "../perf_event.h"
/* Waste a full page so it can be mapped into the cpu_entry_area */
DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
/* The size of a BTS record in bytes: */ /* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24 #define BTS_RECORD_SIZE 24
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_FIXUP_SIZE PAGE_SIZE #define PEBS_FIXUP_SIZE PAGE_SIZE
/* /*
@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
static DEFINE_PER_CPU(void *, insn_buffer); static DEFINE_PER_CPU(void *, insn_buffer);
static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
{
phys_addr_t pa;
size_t msz = 0;
pa = virt_to_phys(addr);
for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
cea_set_pte(cea, pa, prot);
}
static void ds_clear_cea(void *cea, size_t size)
{
size_t msz = 0;
for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
cea_set_pte(cea, 0, PAGE_NONE);
}
static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
{
unsigned int order = get_order(size);
int node = cpu_to_node(cpu);
struct page *page;
page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
return page ? page_address(page) : NULL;
}
static void dsfree_pages(const void *buffer, size_t size)
{
if (buffer)
free_pages((unsigned long)buffer, get_order(size));
}
static int alloc_pebs_buffer(int cpu) static int alloc_pebs_buffer(int cpu)
{ {
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
int node = cpu_to_node(cpu); struct debug_store *ds = hwev->ds;
int max; size_t bsiz = x86_pmu.pebs_buffer_size;
void *buffer, *ibuffer; int max, node = cpu_to_node(cpu);
void *buffer, *ibuffer, *cea;
if (!x86_pmu.pebs) if (!x86_pmu.pebs)
return 0; return 0;
buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
if (unlikely(!buffer)) if (unlikely(!buffer))
return -ENOMEM; return -ENOMEM;
@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
if (x86_pmu.intel_cap.pebs_format < 2) { if (x86_pmu.intel_cap.pebs_format < 2) {
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
if (!ibuffer) { if (!ibuffer) {
kfree(buffer); dsfree_pages(buffer, bsiz);
return -ENOMEM; return -ENOMEM;
} }
per_cpu(insn_buffer, cpu) = ibuffer; per_cpu(insn_buffer, cpu) = ibuffer;
} }
hwev->ds_pebs_vaddr = buffer;
max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; /* Update the cpu entry area mapping */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds->pebs_buffer_base = (u64)(unsigned long)buffer; ds->pebs_buffer_base = (unsigned long) cea;
ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
ds->pebs_index = ds->pebs_buffer_base; ds->pebs_index = ds->pebs_buffer_base;
ds->pebs_absolute_maximum = ds->pebs_buffer_base + max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
max * x86_pmu.pebs_record_size; ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
return 0; return 0;
} }
static void release_pebs_buffer(int cpu) static void release_pebs_buffer(int cpu)
{ {
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
void *cea;
if (!ds || !x86_pmu.pebs) if (!ds || !x86_pmu.pebs)
return; return;
@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
kfree(per_cpu(insn_buffer, cpu)); kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL; per_cpu(insn_buffer, cpu) = NULL;
kfree((void *)(unsigned long)ds->pebs_buffer_base); /* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0; ds->pebs_buffer_base = 0;
dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
hwev->ds_pebs_vaddr = NULL;
} }
static int alloc_bts_buffer(int cpu) static int alloc_bts_buffer(int cpu)
{ {
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
int node = cpu_to_node(cpu); struct debug_store *ds = hwev->ds;
int max, thresh; void *buffer, *cea;
void *buffer; int max;
if (!x86_pmu.bts) if (!x86_pmu.bts)
return 0; return 0;
buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
if (unlikely(!buffer)) { if (unlikely(!buffer)) {
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM; return -ENOMEM;
} }
hwev->ds_bts_vaddr = buffer;
max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; /* Update the fixmap */
thresh = max / 16; cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
ds->bts_buffer_base = (unsigned long) cea;
ds->bts_buffer_base = (u64)(unsigned long)buffer; ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
ds->bts_index = ds->bts_buffer_base; ds->bts_index = ds->bts_buffer_base;
ds->bts_absolute_maximum = ds->bts_buffer_base + max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
max * BTS_RECORD_SIZE; ds->bts_absolute_maximum = ds->bts_buffer_base + max;
ds->bts_interrupt_threshold = ds->bts_absolute_maximum - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
thresh * BTS_RECORD_SIZE;
return 0; return 0;
} }
static void release_bts_buffer(int cpu) static void release_bts_buffer(int cpu)
{ {
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
struct debug_store *ds = hwev->ds;
void *cea;
if (!ds || !x86_pmu.bts) if (!ds || !x86_pmu.bts)
return; return;
kfree((void *)(unsigned long)ds->bts_buffer_base); /* Clear the fixmap */
cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
ds_clear_cea(cea, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0; ds->bts_buffer_base = 0;
dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
hwev->ds_bts_vaddr = NULL;
} }
static int alloc_ds_buffer(int cpu) static int alloc_ds_buffer(int cpu)
{ {
int node = cpu_to_node(cpu); struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
struct debug_store *ds;
ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
if (unlikely(!ds))
return -ENOMEM;
memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds; per_cpu(cpu_hw_events, cpu).ds = ds;
return 0; return 0;
} }
static void release_ds_buffer(int cpu) static void release_ds_buffer(int cpu)
{ {
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
if (!ds)
return;
per_cpu(cpu_hw_events, cpu).ds = NULL; per_cpu(cpu_hw_events, cpu).ds = NULL;
kfree(ds);
} }
void release_ds_buffers(void) void release_ds_buffers(void)

View file

@ -14,6 +14,8 @@
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include <asm/intel_ds.h>
/* To enable MSR tracing please use the generic trace points. */ /* To enable MSR tracing please use the generic trace points. */
/* /*
@ -77,8 +79,6 @@ struct amd_nb {
struct event_constraint event_constraints[X86_PMC_IDX_MAX]; struct event_constraint event_constraints[X86_PMC_IDX_MAX];
}; };
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 8
#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1)
/* /*
@ -95,23 +95,6 @@ struct amd_nb {
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
*/
struct debug_store {
u64 bts_buffer_base;
u64 bts_index;
u64 bts_absolute_maximum;
u64 bts_interrupt_threshold;
u64 pebs_buffer_base;
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
u64 pebs_event_reset[MAX_PEBS_EVENTS];
};
#define PEBS_REGS \ #define PEBS_REGS \
(PERF_REG_X86_AX | \ (PERF_REG_X86_AX | \
PERF_REG_X86_BX | \ PERF_REG_X86_BX | \
@ -216,6 +199,8 @@ struct cpu_hw_events {
* Intel DebugStore bits * Intel DebugStore bits
*/ */
struct debug_store *ds; struct debug_store *ds;
void *ds_pebs_vaddr;
void *ds_bts_vaddr;
u64 pebs_enabled; u64 pebs_enabled;
int n_pebs; int n_pebs;
int n_large_pebs; int n_large_pebs;

View file

@ -136,6 +136,7 @@
#endif #endif
#ifndef __ASSEMBLY__ #ifndef __ASSEMBLY__
#ifndef __BPF__
/* /*
* This output constraint should be used for any inline asm which has a "call" * This output constraint should be used for any inline asm which has a "call"
* instruction. Otherwise the asm may be inserted before the frame pointer * instruction. Otherwise the asm may be inserted before the frame pointer
@ -145,5 +146,6 @@
register unsigned long current_stack_pointer asm(_ASM_SP); register unsigned long current_stack_pointer asm(_ASM_SP);
#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
#endif #endif
#endif
#endif /* _ASM_X86_ASM_H */ #endif /* _ASM_X86_ASM_H */

View file

@ -0,0 +1,81 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef _ASM_X86_CPU_ENTRY_AREA_H
#define _ASM_X86_CPU_ENTRY_AREA_H
#include <linux/percpu-defs.h>
#include <asm/processor.h>
#include <asm/intel_ds.h>
/*
* cpu_entry_area is a percpu region that contains things needed by the CPU
* and early entry/exit code. Real types aren't used for all fields here
* to avoid circular header dependencies.
*
* Every field is a virtual alias of some other allocated backing store.
* There is no direct allocation of a struct cpu_entry_area.
*/
struct cpu_entry_area {
char gdt[PAGE_SIZE];
/*
* The GDT is just below entry_stack and thus serves (on x86_64) as
* a read-only guard page.
*/
struct entry_stack_page entry_stack_page;
/*
* On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because
* we need task switches to work, and task switches write to the TSS.
*/
struct tss_struct tss;
char entry_trampoline[PAGE_SIZE];
#ifdef CONFIG_X86_64
/*
* Exception stacks used for IST entries.
*
* In the future, this should have a separate slot for each stack
* with guard pages between them.
*/
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
#endif
#ifdef CONFIG_CPU_SUP_INTEL
/*
* Per CPU debug store for Intel performance monitoring. Wastes a
* full page at the moment.
*/
struct debug_store cpu_debug_store;
/*
* The actual PEBS/BTS buffers must be mapped to user space
* Reserve enough fixmap PTEs.
*/
struct debug_store_buffers cpu_debug_buffers;
#endif
};
#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
extern void setup_cpu_entry_areas(void);
extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
#define CPU_ENTRY_AREA_MAP_SIZE \
(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
static inline struct entry_stack *cpu_entry_stack(int cpu)
{
return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
}
#endif
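Fields of the structure above are reached through get_cpu_entry_area(); a minimal sketch of that access pattern, using only the declarations in this header (the helper name below is illustrative, not part of the patch):
/*
 * Sketch: locate the top of a given CPU's entry stack through its
 * cpu_entry_area alias.
 */
static inline unsigned long example_entry_stack_top(int cpu)
{
	struct entry_stack *stack = cpu_entry_stack(cpu);

	return (unsigned long)(stack + 1);	/* one past the last word */
}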

View file

@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
set_bit(bit, (unsigned long *)cpu_caps_set); \ set_bit(bit, (unsigned long *)cpu_caps_set); \
} while (0) } while (0)
#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
/* /*
* Static testing of CPU features. Used the same as boot_cpu_has(). * Static testing of CPU features. Used the same as boot_cpu_has().

View file

@ -197,11 +197,12 @@
#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */
#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */
#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */
@ -340,5 +341,6 @@
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */ #define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
#endif /* _ASM_X86_CPUFEATURES_H */ #endif /* _ASM_X86_CPUFEATURES_H */

View file

@ -7,6 +7,7 @@
#include <asm/mmu.h> #include <asm/mmu.h>
#include <asm/fixmap.h> #include <asm/fixmap.h>
#include <asm/irq_vectors.h> #include <asm/irq_vectors.h>
#include <asm/cpu_entry_area.h>
#include <linux/smp.h> #include <linux/smp.h>
#include <linux/percpu.h> #include <linux/percpu.h>
@ -20,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
desc->type = (info->read_exec_only ^ 1) << 1; desc->type = (info->read_exec_only ^ 1) << 1;
desc->type |= info->contents << 2; desc->type |= info->contents << 2;
/* Set the ACCESS bit so it can be mapped RO */
desc->type |= 1;
desc->s = 1; desc->s = 1;
desc->dpl = 0x3; desc->dpl = 0x3;
@ -60,17 +63,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
return this_cpu_ptr(&gdt_page)->gdt; return this_cpu_ptr(&gdt_page)->gdt;
} }
/* Get the fixmap index for a specific processor */
static inline unsigned int get_cpu_gdt_ro_index(int cpu)
{
return FIX_GDT_REMAP_BEGIN + cpu;
}
/* Provide the fixmap address of the remapped GDT */ /* Provide the fixmap address of the remapped GDT */
static inline struct desc_struct *get_cpu_gdt_ro(int cpu) static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{ {
unsigned int idx = get_cpu_gdt_ro_index(cpu); return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
return (struct desc_struct *)__fix_to_virt(idx);
} }
/* Provide the current read-only GDT */ /* Provide the current read-only GDT */
@ -185,7 +181,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
#endif #endif
} }
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
{ {
struct desc_struct *d = get_cpu_gdt_rw(cpu); struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss; tss_desc tss;

View file

@ -50,6 +50,12 @@
# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
#endif #endif
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define DISABLE_PTI 0
#else
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
/* /*
* Make sure to add features to the correct mask * Make sure to add features to the correct mask
*/ */
@ -60,7 +66,7 @@
#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK4 (DISABLE_PCID)
#define DISABLED_MASK5 0 #define DISABLED_MASK5 0
#define DISABLED_MASK6 0 #define DISABLED_MASK6 0
#define DISABLED_MASK7 0 #define DISABLED_MASK7 (DISABLE_PTI)
#define DISABLED_MASK8 0 #define DISABLED_MASK8 0
#define DISABLED_MASK9 (DISABLE_MPX) #define DISABLED_MASK9 (DISABLE_MPX)
#define DISABLED_MASK10 0 #define DISABLED_MASK10 0

View file

@ -2,7 +2,7 @@
#ifndef _ASM_X86_ESPFIX_H #ifndef _ASM_X86_ESPFIX_H
#define _ASM_X86_ESPFIX_H #define _ASM_X86_ESPFIX_H
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_ESPFIX64
#include <asm/percpu.h> #include <asm/percpu.h>
@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
extern void init_espfix_bsp(void); extern void init_espfix_bsp(void);
extern void init_espfix_ap(int cpu); extern void init_espfix_ap(int cpu);
#else
#endif /* CONFIG_X86_64 */ static inline void init_espfix_ap(int cpu) { }
#endif
#endif /* _ASM_X86_ESPFIX_H */ #endif /* _ASM_X86_ESPFIX_H */

View file

@ -44,7 +44,6 @@ extern unsigned long __FIXADDR_TOP;
PAGE_SIZE) PAGE_SIZE)
#endif #endif
/* /*
* Here we define all the compile-time 'special' virtual * Here we define all the compile-time 'special' virtual
* addresses. The point is to have a constant address at * addresses. The point is to have a constant address at
@ -84,7 +83,6 @@ enum fixed_addresses {
FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_0,
FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
#endif #endif
FIX_RO_IDT, /* Virtual mapping for read-only IDT */
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
@ -100,9 +98,6 @@ enum fixed_addresses {
#ifdef CONFIG_X86_INTEL_MID #ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC, FIX_LNW_VRTC,
#endif #endif
/* Fixmap entries to remap the GDTs, one per processor. */
FIX_GDT_REMAP_BEGIN,
FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
#ifdef CONFIG_ACPI_APEI_GHES #ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */ /* Used for GHES mapping from assorted contexts */
@ -143,7 +138,7 @@ enum fixed_addresses {
extern void reserve_top_address(unsigned long reserve); extern void reserve_top_address(unsigned long reserve);
#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) #define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
extern int fixmaps_set; extern int fixmaps_set;

View file

@ -20,16 +20,7 @@
#ifndef _ASM_X86_HYPERVISOR_H #ifndef _ASM_X86_HYPERVISOR_H
#define _ASM_X86_HYPERVISOR_H #define _ASM_X86_HYPERVISOR_H
#ifdef CONFIG_HYPERVISOR_GUEST /* x86 hypervisor types */
#include <asm/kvm_para.h>
#include <asm/x86_init.h>
#include <asm/xen/hypervisor.h>
/*
* x86 hypervisor information
*/
enum x86_hypervisor_type { enum x86_hypervisor_type {
X86_HYPER_NATIVE = 0, X86_HYPER_NATIVE = 0,
X86_HYPER_VMWARE, X86_HYPER_VMWARE,
@ -39,6 +30,12 @@ enum x86_hypervisor_type {
X86_HYPER_KVM, X86_HYPER_KVM,
}; };
#ifdef CONFIG_HYPERVISOR_GUEST
#include <asm/kvm_para.h>
#include <asm/x86_init.h>
#include <asm/xen/hypervisor.h>
struct hypervisor_x86 { struct hypervisor_x86 {
/* Hypervisor name */ /* Hypervisor name */
const char *name; const char *name;
@ -58,7 +55,15 @@ struct hypervisor_x86 {
extern enum x86_hypervisor_type x86_hyper_type; extern enum x86_hypervisor_type x86_hyper_type;
extern void init_hypervisor_platform(void); extern void init_hypervisor_platform(void);
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
return x86_hyper_type == type;
}
#else #else
static inline void init_hypervisor_platform(void) { } static inline void init_hypervisor_platform(void) { }
static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
{
return type == X86_HYPER_NATIVE;
}
#endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* CONFIG_HYPERVISOR_GUEST */
#endif /* _ASM_X86_HYPERVISOR_H */ #endif /* _ASM_X86_HYPERVISOR_H */
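Because the enum now lives outside the #ifdef, callers can test the hypervisor type unconditionally; a hedged usage sketch (the wrapper name is an assumption, not from this patch):
/*
 * Sketch: on a !CONFIG_HYPERVISOR_GUEST build this collapses to a
 * compile-time "false", since only X86_HYPER_NATIVE can match there.
 */
static inline bool example_running_on_kvm(void)
{
	return hypervisor_is_type(X86_HYPER_KVM);
}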

View file

@ -0,0 +1,36 @@
#ifndef _ASM_INTEL_DS_H
#define _ASM_INTEL_DS_H
#include <linux/percpu-defs.h>
#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4)
/* The maximal number of PEBS events: */
#define MAX_PEBS_EVENTS 8
/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
*/
struct debug_store {
u64 bts_buffer_base;
u64 bts_index;
u64 bts_absolute_maximum;
u64 bts_interrupt_threshold;
u64 pebs_buffer_base;
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
u64 pebs_event_reset[MAX_PEBS_EVENTS];
} __aligned(PAGE_SIZE);
DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
struct debug_store_buffers {
char bts_buffer[BTS_BUFFER_SIZE];
char pebs_buffer[PEBS_BUFFER_SIZE];
};
#endif
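The page-sized alignment is what lets the structure and both buffers be aliased page-by-page into the cpu_entry_area; a compile-time sanity sketch, assuming PAGE_SIZE and BUILD_BUG_ON are in scope (the helper is illustrative, not part of this header):
/*
 * Sketch: __aligned(PAGE_SIZE) pads struct debug_store to exactly one
 * page, and both buffers are whole numbers of pages, so each piece can
 * be mapped with one PTE per page.
 */
static inline void example_debug_store_layout_check(void)
{
	BUILD_BUG_ON(sizeof(struct debug_store) != PAGE_SIZE);
	BUILD_BUG_ON(BTS_BUFFER_SIZE % PAGE_SIZE);
	BUILD_BUG_ON(PEBS_BUFFER_SIZE % PAGE_SIZE);
}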

View file

@ -0,0 +1,53 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_INVPCID
#define _ASM_X86_INVPCID
static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type)
{
struct { u64 d[2]; } desc = { { pcid, addr } };
/*
* The memory clobber is because the whole point is to invalidate
* stale TLB entries and, especially if we're flushing global
* mappings, we don't want the compiler to reorder any subsequent
* memory accesses before the TLB flush.
*
* The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
* invpcid (%rcx), %rax in long mode.
*/
asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
: : "m" (desc), "a" (type), "c" (&desc) : "memory");
}
#define INVPCID_TYPE_INDIV_ADDR 0
#define INVPCID_TYPE_SINGLE_CTXT 1
#define INVPCID_TYPE_ALL_INCL_GLOBAL 2
#define INVPCID_TYPE_ALL_NON_GLOBAL 3
/* Flush all mappings for a given pcid and addr, not including globals. */
static inline void invpcid_flush_one(unsigned long pcid,
unsigned long addr)
{
__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
}
/* Flush all mappings for a given PCID, not including globals. */
static inline void invpcid_flush_single_context(unsigned long pcid)
{
__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
}
/* Flush all mappings, including globals, for all PCIDs. */
static inline void invpcid_flush_all(void)
{
__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
}
/* Flush all mappings for all PCIDs except globals. */
static inline void invpcid_flush_all_nonglobals(void)
{
__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
}
#endif /* _ASM_X86_INVPCID */
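INVPCID is an optional instruction, so every user of these wrappers has to gate on the CPUID feature bit; a minimal, hypothetical usage sketch assuming the usual cpufeature helpers are visible (the function name and fallback policy are not part of this header):
/*
 * Sketch: flush a single address for one PCID when the CPU supports
 * INVPCID; otherwise report failure so the caller can fall back to a
 * CR3/CR4 based flush.
 */
static inline bool example_try_invpcid_flush_one(unsigned long pcid,
						 unsigned long addr)
{
	if (!static_cpu_has(X86_FEATURE_INVPCID))
		return false;

	invpcid_flush_one(pcid, addr);
	return true;
}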

View file

@ -44,7 +44,7 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq, extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs); unsigned int nr_irqs);
extern int mp_irqdomain_activate(struct irq_domain *domain, extern int mp_irqdomain_activate(struct irq_domain *domain,
struct irq_data *irq_data, bool early); struct irq_data *irq_data, bool reserve);
extern void mp_irqdomain_deactivate(struct irq_domain *domain, extern void mp_irqdomain_deactivate(struct irq_domain *domain,
struct irq_data *irq_data); struct irq_data *irq_data);
extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain); extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);

View file

@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
swapgs; \ swapgs; \
sysretl sysretl
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(x) pushfq; popq %rax
#endif
#else #else
#define INTERRUPT_RETURN iret #define INTERRUPT_RETURN iret
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit

View file

@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long); extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs); extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, int all); extern void __show_regs(struct pt_regs *regs, int all);
extern void show_iret_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void); extern unsigned long oops_begin(void);
extern void oops_end(unsigned long, struct pt_regs *, int signr); extern void oops_end(unsigned long, struct pt_regs *, int signr);

View file

@ -3,6 +3,7 @@
#define _ASM_X86_MMU_H #define _ASM_X86_MMU_H
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/atomic.h> #include <linux/atomic.h>
@ -27,7 +28,8 @@ typedef struct {
atomic64_t tlb_gen; atomic64_t tlb_gen;
#ifdef CONFIG_MODIFY_LDT_SYSCALL #ifdef CONFIG_MODIFY_LDT_SYSCALL
struct ldt_struct *ldt; struct rw_semaphore ldt_usr_sem;
struct ldt_struct *ldt;
#endif #endif
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64

View file

@ -50,22 +50,53 @@ struct ldt_struct {
* call gates. On native, we could merge the ldt_struct and LDT * call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize. * allocations, but it's not worth trying to optimize.
*/ */
struct desc_struct *entries; struct desc_struct *entries;
unsigned int nr_entries; unsigned int nr_entries;
/*
* If PTI is in use, then the entries array is not mapped while we're
* in user mode. The whole array will be aliased at the address
* given by ldt_slot_va(slot). We use two slots so that we can allocate
* and map, and enable a new LDT without invalidating the mapping
* of an older, still-in-use LDT.
*
* slot will be -1 if this LDT doesn't have an alias mapping.
*/
int slot;
}; };
/* This is a multiple of PAGE_SIZE. */
#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
static inline void *ldt_slot_va(int slot)
{
#ifdef CONFIG_X86_64
return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
#else
BUG();
#endif
}
/* /*
* Used for LDT copy/destruction. * Used for LDT copy/destruction.
*/ */
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); static inline void init_new_context_ldt(struct mm_struct *mm)
{
mm->context.ldt = NULL;
init_rwsem(&mm->context.ldt_usr_sem);
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm); void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */ #else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline int init_new_context_ldt(struct task_struct *tsk, static inline void init_new_context_ldt(struct mm_struct *mm) { }
struct mm_struct *mm) static inline int ldt_dup_context(struct mm_struct *oldmm,
struct mm_struct *mm)
{ {
return 0; return 0;
} }
static inline void destroy_context_ldt(struct mm_struct *mm) {} static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif #endif
static inline void load_mm_ldt(struct mm_struct *mm) static inline void load_mm_ldt(struct mm_struct *mm)
@ -90,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
* that we can see. * that we can see.
*/ */
if (unlikely(ldt)) if (unlikely(ldt)) {
set_ldt(ldt->entries, ldt->nr_entries); if (static_cpu_has(X86_FEATURE_PTI)) {
else if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
/*
* Whoops -- either the new LDT isn't mapped
* (if slot == -1) or is mapped into a bogus
* slot (if slot > 1).
*/
clear_LDT();
return;
}
/*
* If page table isolation is enabled, ldt->entries
* will not be mapped in the userspace pagetables.
* Tell the CPU to access the LDT through the alias
* at ldt_slot_va(ldt->slot).
*/
set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
} else {
set_ldt(ldt->entries, ldt->nr_entries);
}
} else {
clear_LDT(); clear_LDT();
}
#else #else
clear_LDT(); clear_LDT();
#endif #endif
@ -132,18 +184,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
static inline int init_new_context(struct task_struct *tsk, static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm) struct mm_struct *mm)
{ {
mutex_init(&mm->context.lock);
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0); atomic64_set(&mm->context.tlb_gen, 0);
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and always allocated */ /* pkey 0 is the default and always allocated */
mm->context.pkey_allocation_map = 0x1; mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */ /* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1; mm->context.execute_only_pkey = -1;
} }
#endif #endif
return init_new_context_ldt(tsk, mm); init_new_context_ldt(mm);
return 0;
} }
static inline void destroy_context(struct mm_struct *mm) static inline void destroy_context(struct mm_struct *mm)
{ {
@ -176,15 +231,16 @@ do { \
} while (0) } while (0)
#endif #endif
static inline void arch_dup_mmap(struct mm_struct *oldmm, static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
struct mm_struct *mm)
{ {
paravirt_arch_dup_mmap(oldmm, mm); paravirt_arch_dup_mmap(oldmm, mm);
return ldt_dup_context(oldmm, mm);
} }
static inline void arch_exit_mmap(struct mm_struct *mm) static inline void arch_exit_mmap(struct mm_struct *mm)
{ {
paravirt_arch_exit_mmap(mm); paravirt_arch_exit_mmap(mm);
ldt_arch_exit_mmap(mm);
} }
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
@ -281,33 +337,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write); return __pkru_allows_pkey(vma_pkey(vma), write);
} }
/*
* If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
* bits. This serves two purposes. It prevents a nasty situation in
* which PCID-unaware code saves CR3, loads some other value (with PCID
* == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
* the saved ASID was nonzero. It also means that any bugs involving
* loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
* deterministically.
*/
static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
{
if (static_cpu_has(X86_FEATURE_PCID)) {
VM_WARN_ON_ONCE(asid > 4094);
return __sme_pa(mm->pgd) | (asid + 1);
} else {
VM_WARN_ON_ONCE(asid != 0);
return __sme_pa(mm->pgd);
}
}
static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
{
VM_WARN_ON_ONCE(asid > 4094);
return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
}
/* /*
* This can be used from process context to figure out what the value of * This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3(). * CR3 is without needing to do a (slow) __read_cr3().
@ -317,7 +346,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
*/ */
static inline unsigned long __get_current_cr3_fast(void) static inline unsigned long __get_current_cr3_fast(void)
{ {
unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
this_cpu_read(cpu_tlbstate.loaded_mm_asid)); this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */ /* For now, be very restrictive about when this can be called. */

View file

@ -927,6 +927,15 @@ extern void default_banner(void);
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \ CLBR_NONE, \
jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
#ifdef CONFIG_DEBUG_ENTRY
#define SAVE_FLAGS(clobbers) \
PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \
PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
#endif
#endif /* CONFIG_X86_32 */ #endif /* CONFIG_X86_32 */
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */

View file

@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
*/ */
extern gfp_t __userpte_alloc_gfp; extern gfp_t __userpte_alloc_gfp;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Instead of one PGD, we acquire two PGDs. Being order-1, it is
* both 8k in size and 8k-aligned. That lets us just flip bit 12
* in a pointer to swap between the two 4k halves.
*/
#define PGD_ALLOCATION_ORDER 1
#else
#define PGD_ALLOCATION_ORDER 0
#endif
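A hedged sketch of what an order-aware top-level allocation looks like; the names and GFP flags are illustrative, and the real allocator in pgtable.c carries extra bookkeeping:
/*
 * Sketch: with PTI the PGD is an order-1 (8k) block, so the kernel and
 * user halves come from one allocation and must be freed with the same
 * order.
 */
static inline pgd_t *example_alloc_pgd_pages(void)
{
	return (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
					 PGD_ALLOCATION_ORDER);
}

static inline void example_free_pgd_pages(pgd_t *pgd)
{
	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}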
/* /*
* Allocate and free page tables. * Allocate and free page tables.
*/ */

View file

@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
void ptdump_walk_pgd_level_checkwx(void); void ptdump_walk_pgd_level_checkwx(void);
#ifdef CONFIG_DEBUG_WX #ifdef CONFIG_DEBUG_WX
@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
static inline int p4d_bad(p4d_t p4d) static inline int p4d_bad(p4d_t p4d)
{ {
return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
return (p4d_flags(p4d) & ~ignore_flags) != 0;
} }
#endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */
@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd) static inline int pgd_bad(pgd_t pgd)
{ {
return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; unsigned long ignore_flags = _PAGE_USER;
if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
ignore_flags |= _PAGE_NX;
return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
} }
static inline int pgd_none(pgd_t pgd) static inline int pgd_none(pgd_t pgd)
@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)
* pgd_offset() returns a (pgd_t *) * pgd_offset() returns a (pgd_t *)
* pgd_index() is used get the offset into the pgd page's array of pgd_t's; * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
*/ */
#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) #define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
/*
* a shortcut to get a pgd_t in a given mm
*/
#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
/* /*
* a shortcut which implies the use of the kernel's pgd, instead * a shortcut which implies the use of the kernel's pgd, instead
* of a process's * of a process's
@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud)
*/ */
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{ {
memcpy(dst, src, count * sizeof(pgd_t)); memcpy(dst, src, count * sizeof(pgd_t));
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/* Clone the user space pgd as well */
memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
count * sizeof(pgd_t));
#endif
} }
#define PTE_SHIFT ilog2(PTRS_PER_PTE) #define PTE_SHIFT ilog2(PTRS_PER_PTE)

View file

@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
#define LAST_PKMAP 1024 #define LAST_PKMAP 1024
#endif #endif
#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ /*
& PMD_MASK) * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
* to avoid include recursion hell
*/
#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40)
#define CPU_ENTRY_AREA_BASE \
((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
#define PKMAP_BASE \
((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
#ifdef CONFIG_HIGHMEM #ifdef CONFIG_HIGHMEM
# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
#else #else
# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) # define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)
#endif #endif
#define MODULES_VADDR VMALLOC_START #define MODULES_VADDR VMALLOC_START

View file

@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
#endif #endif
} }
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
* (8k-aligned and 8k in size). The kernel one is at the beginning 4k and
* the user one is in the last 4k. To switch between them, you
* just need to flip the 12th bit in their addresses.
*/
#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT
/*
* This generates better code than the inline assembly in
* __set_bit().
*/
static inline void *ptr_set_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr |= BIT(bit);
return (void *)__ptr;
}
static inline void *ptr_clear_bit(void *ptr, int bit)
{
unsigned long __ptr = (unsigned long)ptr;
__ptr &= ~BIT(bit);
return (void *)__ptr;
}
static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
{
return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
{
return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
}
#endif /* CONFIG_PAGE_TABLE_ISOLATION */
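A small illustrative check of the bit-flipping trick, valid only under CONFIG_PAGE_TABLE_ISOLATION and for an 8k-aligned kernel PGD (the helper name is an assumption, not part of this patch):
/*
 * Sketch: the user alias of an 8k-aligned kernel PGD pointer sits
 * exactly one page higher, and the conversion is lossless in both
 * directions.
 */
static inline bool example_pgd_alias_roundtrips(pgd_t *kernel_pgdp)
{
	pgd_t *user_pgdp = kernel_to_user_pgdp(kernel_pgdp);

	return user_pgdp == (pgd_t *)((unsigned long)kernel_pgdp + PAGE_SIZE) &&
	       user_to_kernel_pgdp(user_pgdp) == kernel_pgdp;
}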
/*
* Page table pages are page-aligned. The lower half of the top
* level is used for userspace and the top half for the kernel.
*
* Returns true for parts of the PGD that map userspace and
* false for the parts that map the kernel.
*/
static inline bool pgdp_maps_userspace(void *__ptr)
{
unsigned long ptr = (unsigned long)__ptr;
return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
}
#ifdef CONFIG_PAGE_TABLE_ISOLATION
pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
/*
* Take a PGD location (pgdp) and a pgd value that needs to be set there.
* Populates the user PGD and returns the resulting PGD that must be set in
* the kernel copy of the page tables.
*/
static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
if (!static_cpu_has(X86_FEATURE_PTI))
return pgd;
return __pti_set_user_pgd(pgdp, pgd);
}
#else
static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
return pgd;
}
#endif
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{ {
#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
#else
*p4dp = p4d; *p4dp = p4d;
#endif
} }
static inline void native_p4d_clear(p4d_t *p4d) static inline void native_p4d_clear(p4d_t *p4d)
@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{ {
#ifdef CONFIG_PAGE_TABLE_ISOLATION
*pgdp = pti_set_user_pgd(pgdp, pgd);
#else
*pgdp = pgd; *pgdp = pgd;
#endif
} }
static inline void native_pgd_clear(pgd_t *pgd) static inline void native_pgd_clear(pgd_t *pgd)

View file

@ -76,32 +76,45 @@ typedef struct { pteval_t pte; } pte_t;
#define PGDIR_MASK (~(PGDIR_SIZE - 1)) #define PGDIR_MASK (~(PGDIR_SIZE - 1))
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#ifdef CONFIG_X86_5LEVEL #ifdef CONFIG_X86_5LEVEL
#define VMALLOC_SIZE_TB _AC(16384, UL) # define VMALLOC_SIZE_TB _AC(12800, UL)
#define __VMALLOC_BASE _AC(0xff92000000000000, UL) # define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
# define LDT_PGD_ENTRY _AC(-112, UL)
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#else #else
#define VMALLOC_SIZE_TB _AC(32, UL) # define VMALLOC_SIZE_TB _AC(32, UL)
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) # define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
# define LDT_PGD_ENTRY _AC(-4, UL)
# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
#endif #endif
#ifdef CONFIG_RANDOMIZE_MEMORY #ifdef CONFIG_RANDOMIZE_MEMORY
#define VMALLOC_START vmalloc_base # define VMALLOC_START vmalloc_base
#define VMEMMAP_START vmemmap_base # define VMEMMAP_START vmemmap_base
#else #else
#define VMALLOC_START __VMALLOC_BASE # define VMALLOC_START __VMALLOC_BASE
#define VMEMMAP_START __VMEMMAP_BASE # define VMEMMAP_START __VMEMMAP_BASE
#endif /* CONFIG_RANDOMIZE_MEMORY */ #endif /* CONFIG_RANDOMIZE_MEMORY */
#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
/* The module sections ends with the start of the fixmap */ /* The module sections ends with the start of the fixmap */
#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) #define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
#define MODULES_LEN (MODULES_END - MODULES_VADDR) #define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) #define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
#define CPU_ENTRY_AREA_PGD _AC(-3, UL)
#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))
#define EARLY_DYNAMIC_PAGE_TABLES 64 #define EARLY_DYNAMIC_PAGE_TABLES 64

View file

@ -38,6 +38,11 @@
#define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull)
#define CR3_PCID_MASK 0xFFFull #define CR3_PCID_MASK 0xFFFull
#define CR3_NOFLUSH BIT_ULL(63) #define CR3_NOFLUSH BIT_ULL(63)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define X86_CR3_PTI_SWITCH_BIT 11
#endif
#else #else
/* /*
* CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save

View file

@ -163,9 +163,9 @@ enum cpuid_regs_idx {
extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data; extern struct cpuinfo_x86 new_cpu_data;
extern struct tss_struct doublefault_tss; extern struct x86_hw_tss doublefault_tss;
extern __u32 cpu_caps_cleared[NCAPINTS]; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
extern __u32 cpu_caps_set[NCAPINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)
write_cr3(__sme_pa(pgdir)); write_cr3(__sme_pa(pgdir));
} }
/*
* Note that while the legacy 'TSS' name comes from 'Task State Segment',
* on modern x86 CPUs the TSS also holds information important to 64-bit mode,
* unrelated to the task-switch mechanism:
*/
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
/* This is the TSS defined by the hardware. */ /* This is the TSS defined by the hardware. */
struct x86_hw_tss { struct x86_hw_tss {
@ -305,7 +310,13 @@ struct x86_hw_tss {
struct x86_hw_tss { struct x86_hw_tss {
u32 reserved1; u32 reserved1;
u64 sp0; u64 sp0;
/*
* We store cpu_current_top_of_stack in sp1 so it's always accessible.
* Linux does not use ring 1, so sp1 is not otherwise needed.
*/
u64 sp1; u64 sp1;
u64 sp2; u64 sp2;
u64 reserved2; u64 reserved2;
u64 ist[7]; u64 ist[7];
@ -323,12 +334,22 @@ struct x86_hw_tss {
#define IO_BITMAP_BITS 65536 #define IO_BITMAP_BITS 65536
#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) #define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
#define INVALID_IO_BITMAP_OFFSET 0x8000 #define INVALID_IO_BITMAP_OFFSET 0x8000
struct entry_stack {
unsigned long words[64];
};
struct entry_stack_page {
struct entry_stack stack;
} __aligned(PAGE_SIZE);
struct tss_struct { struct tss_struct {
/* /*
* The hardware state: * The fixed hardware portion. This must not cross a page boundary
* at risk of violating the SDM's advice and potentially triggering
* errata.
*/ */
struct x86_hw_tss x86_tss; struct x86_hw_tss x86_tss;
@ -339,18 +360,9 @@ struct tss_struct {
* be within the limit. * be within the limit.
*/ */
unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
} __aligned(PAGE_SIZE);
#ifdef CONFIG_X86_32 DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
/*
* Space for the temporary SYSENTER stack.
*/
unsigned long SYSENTER_stack_canary;
unsigned long SYSENTER_stack[64];
#endif
} ____cacheline_aligned;
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
/* /*
* sizeof(unsigned long) coming from an extra "long" at the end * sizeof(unsigned long) coming from an extra "long" at the end
@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else
/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif #endif
/* /*
@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
static inline void static inline void
native_load_sp0(unsigned long sp0) native_load_sp0(unsigned long sp0)
{ {
this_cpu_write(cpu_tss.x86_tss.sp0, sp0); this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
} }
static inline void native_swapgs(void) static inline void native_swapgs(void)
@ -535,12 +550,12 @@ static inline void native_swapgs(void)
static inline unsigned long current_top_of_stack(void) static inline unsigned long current_top_of_stack(void)
{ {
#ifdef CONFIG_X86_64 /*
return this_cpu_read_stable(cpu_tss.x86_tss.sp0); * We can't read directly from tss.sp0: sp0 on x86_32 is special in
#else * and around vm86 mode and sp0 on x86_64 is special because of the
/* sp0 on x86_32 is special in and around vm86 mode. */ * entry trampoline.
*/
return this_cpu_read_stable(cpu_current_top_of_stack); return this_cpu_read_stable(cpu_current_top_of_stack);
#endif
} }
static inline bool on_thread_stack(void) static inline bool on_thread_stack(void)
@ -837,13 +852,22 @@ static inline void spin_lock_prefetch(const void *x)
#else #else
/* /*
* User space process size. 47bits minus one guard page. The guard * User space process size. This is the first address outside the user range.
* page is necessary on Intel CPUs: if a SYSCALL instruction is at * There are a few constraints that determine this:
* the highest possible canonical userspace address, then that *
* syscall will enter the kernel with a non-canonical return * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
* address, and SYSRET will explode dangerously. We avoid this * address, then that syscall will enter the kernel with a
* particular problem by preventing anything from being mapped * non-canonical return address, and SYSRET will explode dangerously.
* at the maximum canonical address. * We avoid this particular problem by preventing anything executable
* from being mapped at the maximum canonical address.
*
* On AMD CPUs in the Ryzen family, there's a nasty bug in which the
* CPUs malfunction if they execute code from the highest canonical page.
* They'll speculate right off the end of the canonical space, and
* bad things happen. This is worked around in the same way as the
* Intel problem.
*
* With page table isolation enabled, we map the LDT in ... [stay tuned]
*/ */
#define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)

View file

@ -0,0 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef _ASM_X86_PTI_H
#define _ASM_X86_PTI_H
#ifndef __ASSEMBLY__
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern void pti_init(void);
extern void pti_check_boottime_disable(void);
#else
static inline void pti_check_boottime_disable(void) { }
#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PTI_H */
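A hedged sketch of the intended call order; the function name and call site are assumptions (the real hook-up happens in early boot code, not in this header):
/*
 * Sketch: honour pti=/nopti first, then build the shadow page tables
 * only if the PTI feature bit survived that check.
 */
static inline void example_pti_boot_setup(void)
{
	pti_check_boottime_disable();

#ifdef CONFIG_PAGE_TABLE_ISOLATION
	if (boot_cpu_has(X86_FEATURE_PTI))
		pti_init();
#endif
}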

View file

@ -16,6 +16,7 @@ enum stack_type {
STACK_TYPE_TASK, STACK_TYPE_TASK,
STACK_TYPE_IRQ, STACK_TYPE_IRQ,
STACK_TYPE_SOFTIRQ, STACK_TYPE_SOFTIRQ,
STACK_TYPE_ENTRY,
STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION,
STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
}; };
@ -28,6 +29,8 @@ struct stack_info {
bool in_task_stack(unsigned long *stack, struct task_struct *task, bool in_task_stack(unsigned long *stack, struct task_struct *task,
struct stack_info *info); struct stack_info *info);
bool in_entry_stack(unsigned long *stack, struct stack_info *info);
int get_stack_info(unsigned long *stack, struct task_struct *task, int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask); struct stack_info *info, unsigned long *visit_mask);

View file

@ -16,8 +16,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
struct tss_struct *tss); struct tss_struct *tss);
/* This runs on the previous thread's stack. */ /* This runs on the previous thread's stack. */
static inline void prepare_switch_to(struct task_struct *prev, static inline void prepare_switch_to(struct task_struct *next)
struct task_struct *next)
{ {
#ifdef CONFIG_VMAP_STACK #ifdef CONFIG_VMAP_STACK
/* /*
@ -70,7 +69,7 @@ struct fork_frame {
#define switch_to(prev, next, last) \ #define switch_to(prev, next, last) \
do { \ do { \
prepare_switch_to(prev, next); \ prepare_switch_to(next); \
\ \
((last) = __switch_to_asm((prev), (next))); \ ((last) = __switch_to_asm((prev), (next))); \
} while (0) } while (0)
@ -79,10 +78,10 @@ do { \
static inline void refresh_sysenter_cs(struct thread_struct *thread) static inline void refresh_sysenter_cs(struct thread_struct *thread)
{ {
/* Only happens when SEP is enabled, no need to test "SEP"arately: */ /* Only happens when SEP is enabled, no need to test "SEP"arately: */
if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs)) if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
return; return;
this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs); this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
} }
#endif #endif
@ -90,10 +89,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
/* This is used when switching tasks or entering/exiting vm86 mode. */ /* This is used when switching tasks or entering/exiting vm86 mode. */
static inline void update_sp0(struct task_struct *task) static inline void update_sp0(struct task_struct *task)
{ {
/* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
load_sp0(task->thread.sp0); load_sp0(task->thread.sp0);
#else #else
load_sp0(task_top_of_stack(task)); if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif #endif
} }

View file

@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#else /* !__ASSEMBLY__ */ #else /* !__ASSEMBLY__ */
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) # define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
#endif #endif
#endif #endif

View file

@ -9,70 +9,130 @@
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/special_insns.h> #include <asm/special_insns.h>
#include <asm/smp.h> #include <asm/smp.h>
#include <asm/invpcid.h>
#include <asm/pti.h>
#include <asm/processor-flags.h>
static inline void __invpcid(unsigned long pcid, unsigned long addr, /*
unsigned long type) * The x86 feature is called PCID (Process Context IDentifier). It is similar
* to what is traditionally called ASID on the RISC processors.
*
* We don't use the traditional ASID implementation, where each process/mm gets
* its own ASID and flush/restart when we run out of ASID space.
*
* Instead we have a small per-cpu array of ASIDs and cache the last few mm's
* that came by on this CPU, allowing cheaper switch_mm between processes on
* this CPU.
*
* We end up with different spaces for different things. To avoid confusion we
* use different names for each of them:
*
* ASID - [0, TLB_NR_DYN_ASIDS-1]
* the canonical identifier for an mm
*
* kPCID - [1, TLB_NR_DYN_ASIDS]
* the value we write into the PCID part of CR3; corresponds to the
* ASID+1, because PCID 0 is special.
*
* uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS]
* for KPTI each mm has two address spaces and thus needs two
* PCID values, but we can still do with a single ASID denomination
* for each mm. Corresponds to kPCID + 2048.
*
*/
/* There are 12 bits of space for ASIDS in CR3 */
#define CR3_HW_ASID_BITS 12
/*
* When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
* user/kernel switches
*/
#ifdef CONFIG_PAGE_TABLE_ISOLATION
# define PTI_CONSUMED_PCID_BITS 1
#else
# define PTI_CONSUMED_PCID_BITS 0
#endif
#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
/*
* ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account
* for them being zero-based. Another -1 is because PCID 0 is reserved for
* use by non-PCID-aware users.
*/
#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
/*
* 6 because 6 should be plenty and struct tlb_state will fit in two cache
* lines.
*/
#define TLB_NR_DYN_ASIDS 6
/*
* Given @asid, compute kPCID
*/
static inline u16 kern_pcid(u16 asid)
{ {
struct { u64 d[2]; } desc = { { pcid, addr } }; VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Make sure that the dynamic ASID space does not conflict with the
* bit we are using to switch between user and kernel ASIDs.
*/
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
/* /*
* The memory clobber is because the whole point is to invalidate * The ASID being passed in here should have respected the
* stale TLB entries and, especially if we're flushing global * MAX_ASID_AVAILABLE and thus never have the switch bit set.
* mappings, we don't want the compiler to reorder any subsequent */
* memory accesses before the TLB flush. VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
#endif
/*
* The dynamically-assigned ASIDs that get passed in are small
* (<TLB_NR_DYN_ASIDS). They never have the high switch bit set,
* so do not bother to clear it.
* *
* The hex opcode is invpcid (%ecx), %eax in 32-bit mode and * If PCID is on, ASID-aware code paths put the ASID+1 into the
* invpcid (%rcx), %rax in long mode. * PCID bits. This serves two purposes. It prevents a nasty
* situation in which PCID-unaware code saves CR3, loads some other
* value (with PCID == 0), and then restores CR3, thus corrupting
* the TLB for ASID 0 if the saved ASID was nonzero. It also means
* that any bugs involving loading a PCID-enabled CR3 with
* CR4.PCIDE off will trigger deterministically.
*/ */
asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" return asid + 1;
: : "m" (desc), "a" (type), "c" (&desc) : "memory");
} }
#define INVPCID_TYPE_INDIV_ADDR 0 /*
#define INVPCID_TYPE_SINGLE_CTXT 1 * Given @asid, compute uPCID
#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 */
#define INVPCID_TYPE_ALL_NON_GLOBAL 3 static inline u16 user_pcid(u16 asid)
/* Flush all mappings for a given pcid and addr, not including globals. */
static inline void invpcid_flush_one(unsigned long pcid,
unsigned long addr)
{ {
__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); u16 ret = kern_pcid(asid);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
#endif
return ret;
} }
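The naming scheme described above reduces to simple arithmetic: kPCID is ASID + 1 because PCID 0 is reserved, and uPCID additionally sets one high bit, which has to be bit 11 given the "+ 2048" in the comment. A standalone sketch of that arithmetic, not kernel code (the bit position is inferred, and the kernel's VM_WARN/BUILD_BUG checks are elided):

#include <assert.h>
#include <stdio.h>

#define TLB_NR_DYN_ASIDS  6
#define PTI_SWITCH_BIT    11   /* assumed: 1 << 11 == 2048 */

static unsigned int kern_pcid(unsigned int asid)
{
	assert(asid < TLB_NR_DYN_ASIDS);
	return asid + 1;   /* PCID 0 is reserved for non-PCID-aware use */
}

static unsigned int user_pcid(unsigned int asid)
{
	return kern_pcid(asid) | (1u << PTI_SWITCH_BIT);
}

int main(void)
{
	for (unsigned int asid = 0; asid < TLB_NR_DYN_ASIDS; asid++)
		printf("ASID %u -> kPCID %u, uPCID %u\n",
		       asid, kern_pcid(asid), user_pcid(asid));
	return 0;
}

Running it reproduces the ranges in the comment: kPCIDs 1..6 and uPCIDs 2049..2054.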
/* Flush all mappings for a given PCID, not including globals. */ struct pgd_t;
static inline void invpcid_flush_single_context(unsigned long pcid) static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
{ {
__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); if (static_cpu_has(X86_FEATURE_PCID)) {
return __sme_pa(pgd) | kern_pcid(asid);
} else {
VM_WARN_ON_ONCE(asid != 0);
return __sme_pa(pgd);
}
} }
/* Flush all mappings, including globals, for all PCIDs. */ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
static inline void invpcid_flush_all(void)
{ {
__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
} VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID));
return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
/* Flush all mappings for all PCIDs except globals. */
static inline void invpcid_flush_all_nonglobals(void)
{
__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
}
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
u64 new_tlb_gen;
/*
* Bump the generation count. This also serves as a full barrier
* that synchronizes with switch_mm(): callers are required to order
* their read of mm_cpumask after their writes to the paging
* structures.
*/
smp_mb__before_atomic();
new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
smp_mb__after_atomic();
return new_tlb_gen;
} }
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
@ -99,12 +159,6 @@ static inline bool tlb_defer_switch_to_init_mm(void)
return !static_cpu_has(X86_FEATURE_PCID); return !static_cpu_has(X86_FEATURE_PCID);
} }
/*
* 6 because 6 should be plenty and struct tlb_state will fit in
* two cache lines.
*/
#define TLB_NR_DYN_ASIDS 6
struct tlb_context { struct tlb_context {
u64 ctx_id; u64 ctx_id;
u64 tlb_gen; u64 tlb_gen;
@ -138,6 +192,24 @@ struct tlb_state {
*/ */
bool is_lazy; bool is_lazy;
/*
* If set we changed the page tables in such a way that we
* needed an invalidation of all contexts (aka. PCIDs / ASIDs).
* This tells us to go invalidate all the non-loaded ctxs[]
* on the next context switch.
*
* The current ctx was kept up-to-date as it ran and does not
* need to be invalidated.
*/
bool invalidate_other;
/*
* Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate
* the corresponding user PCID needs a flush next time we
* switch to it; see SWITCH_TO_USER_CR3.
*/
unsigned short user_pcid_flush_mask;
/* /*
* Access to this CR4 shadow and to H/W CR4 is protected by * Access to this CR4 shadow and to H/W CR4 is protected by
* disabling interrupts when modifying either one. * disabling interrupts when modifying either one.
@ -218,6 +290,14 @@ static inline unsigned long cr4_read_shadow(void)
return this_cpu_read(cpu_tlbstate.cr4); return this_cpu_read(cpu_tlbstate.cr4);
} }
/*
* Mark all other ASIDs as invalid, preserves the current.
*/
static inline void invalidate_other_asid(void)
{
this_cpu_write(cpu_tlbstate.invalidate_other, true);
}
/* /*
* Save some of cr4 feature set we're using (e.g. Pentium 4MB * Save some of cr4 feature set we're using (e.g. Pentium 4MB
* enable and PPro Global page enable), so that any CPU's that boot * enable and PPro Global page enable), so that any CPU's that boot
@ -237,37 +317,63 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
extern void initialize_tlbstate_and_flush(void); extern void initialize_tlbstate_and_flush(void);
/*
* Given an ASID, flush the corresponding user ASID. We can delay this
* until the next time we switch to it.
*
* See SWITCH_TO_USER_CR3.
*/
static inline void invalidate_user_asid(u16 asid)
{
/* There is no user ASID if address space separation is off */
if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
return;
/*
* We only have a single ASID if PCID is off and the CR3
* write will have flushed it.
*/
if (!cpu_feature_enabled(X86_FEATURE_PCID))
return;
if (!static_cpu_has(X86_FEATURE_PTI))
return;
__set_bit(kern_pcid(asid),
(unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask));
}
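invalidate_user_asid() never touches the TLB directly; it only records in a per-CPU bitmask that the user half of the given ASID is stale, and the entry code pays for the flush the next time it switches to that user CR3. A simplified, single-CPU sketch of that mark-now/flush-later pattern (hypothetical names, not the kernel implementation):

#include <stdio.h>

static unsigned short pcid_flush_mask;  /* one "needs flush" bit per ASID */

static void mark_user_asid_stale(unsigned int asid)
{
	pcid_flush_mask |= 1u << asid;  /* cheap: no TLB work here */
}

static void switch_to_user_asid(unsigned int asid)
{
	if (pcid_flush_mask & (1u << asid)) {
		pcid_flush_mask &= ~(1u << asid);
		printf("ASID %u: flushing CR3 write\n", asid);
	} else {
		printf("ASID %u: CR3 write with NOFLUSH set\n", asid);
	}
}

int main(void)
{
	mark_user_asid_stale(3);
	switch_to_user_asid(3);  /* pays the flush once */
	switch_to_user_asid(3);  /* already clean: no flush */
	return 0;
}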
/*
* flush the entire current user mapping
*/
static inline void __native_flush_tlb(void) static inline void __native_flush_tlb(void)
{ {
/* /*
* If current->mm == NULL then we borrow a mm which may change during a * Preemption or interrupts must be disabled to protect the access
* task switch and therefore we must not be preempted while we write CR3 * to the per CPU variable and to prevent being preempted between
* back: * read_cr3() and write_cr3().
*/ */
preempt_disable(); WARN_ON_ONCE(preemptible());
invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* If current->mm == NULL then the read_cr3() "borrows" an mm */
native_write_cr3(__native_read_cr3()); native_write_cr3(__native_read_cr3());
preempt_enable();
}
static inline void __native_flush_tlb_global_irq_disabled(void)
{
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* clear PGE */
native_write_cr4(cr4 & ~X86_CR4_PGE);
/* write old PGE again and flush TLBs */
native_write_cr4(cr4);
} }
/*
* flush everything
*/
static inline void __native_flush_tlb_global(void) static inline void __native_flush_tlb_global(void)
{ {
unsigned long flags; unsigned long cr4, flags;
if (static_cpu_has(X86_FEATURE_INVPCID)) { if (static_cpu_has(X86_FEATURE_INVPCID)) {
/* /*
* Using INVPCID is considerably faster than a pair of writes * Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore. * to CR4 sandwiched inside an IRQ flag save/restore.
*
* Note, this works with CR4.PCIDE=0 or 1.
*/ */
invpcid_flush_all(); invpcid_flush_all();
return; return;
@ -280,36 +386,69 @@ static inline void __native_flush_tlb_global(void)
*/ */
raw_local_irq_save(flags); raw_local_irq_save(flags);
__native_flush_tlb_global_irq_disabled(); cr4 = this_cpu_read(cpu_tlbstate.cr4);
/* toggle PGE */
native_write_cr4(cr4 ^ X86_CR4_PGE);
/* write old PGE again and flush TLBs */
native_write_cr4(cr4);
raw_local_irq_restore(flags); raw_local_irq_restore(flags);
} }
/*
* flush one page in the user mapping
*/
static inline void __native_flush_tlb_single(unsigned long addr) static inline void __native_flush_tlb_single(unsigned long addr)
{ {
asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
}
static inline void __flush_tlb_all(void) asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
{
if (boot_cpu_has(X86_FEATURE_PGE)) if (!static_cpu_has(X86_FEATURE_PTI))
__flush_tlb_global(); return;
else
__flush_tlb();
/* /*
* Note: if we somehow had PCID but not PGE, then this wouldn't work -- * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
* we'd end up flushing kernel translations for the current ASID but * Just use invalidate_user_asid() in case we are called early.
* we might fail to flush kernel translations for other cached ASIDs.
*
* To avoid this issue, we force PCID off if PGE is off.
*/ */
if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
invalidate_user_asid(loaded_mm_asid);
else
invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
} }
/*
* flush everything
*/
static inline void __flush_tlb_all(void)
{
if (boot_cpu_has(X86_FEATURE_PGE)) {
__flush_tlb_global();
} else {
/*
* !PGE -> !PCID (setup_pcid()), thus every flush is total.
*/
__flush_tlb();
}
}
/*
* flush one page in the kernel mapping
*/
static inline void __flush_tlb_one(unsigned long addr) static inline void __flush_tlb_one(unsigned long addr)
{ {
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
__flush_tlb_single(addr); __flush_tlb_single(addr);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
/*
* __flush_tlb_single() will have cleared the TLB entry for this ASID,
* but since kernel space is replicated across all ASIDs, we must also
* invalidate all others.
*/
invalidate_other_asid();
} }
#define TLB_FLUSH_ALL -1UL #define TLB_FLUSH_ALL -1UL
@ -370,6 +509,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
void native_flush_tlb_others(const struct cpumask *cpumask, void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info); const struct flush_tlb_info *info);
static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
{
/*
* Bump the generation count. This also serves as a full barrier
* that synchronizes with switch_mm(): callers are required to order
* their read of mm_cpumask after their writes to the paging
* structures.
*/
return atomic64_inc_return(&mm->context.tlb_gen);
}
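The generation counter bumped here is half of a compare scheme: each CPU remembers the mm generation it has already flushed, so it can tell whether it is behind and whether a pending flush still has work to do. A reduced illustration of the idea under that assumption (the real consumer is the flush IPI handler, which is not part of this hunk):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct mm_ctx {
	atomic_ullong tlb_gen;           /* bumped on every page-table change */
};

struct cpu_tlb_state {
	unsigned long long flushed_gen;  /* last generation this CPU flushed */
};

static unsigned long long inc_mm_tlb_gen(struct mm_ctx *mm)
{
	return atomic_fetch_add(&mm->tlb_gen, 1) + 1;
}

static bool needs_flush(struct mm_ctx *mm, struct cpu_tlb_state *cpu)
{
	return cpu->flushed_gen < atomic_load(&mm->tlb_gen);
}

int main(void)
{
	struct mm_ctx mm;
	struct cpu_tlb_state cpu = { .flushed_gen = 1 };

	atomic_init(&mm.tlb_gen, 1);
	inc_mm_tlb_gen(&mm);             /* page tables changed */
	if (needs_flush(&mm, &cpu)) {
		cpu.flushed_gen = atomic_load(&mm.tlb_gen);
		printf("flushed up to gen %llu\n", cpu.flushed_gen);
	}
	return 0;
}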
static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
struct mm_struct *mm) struct mm_struct *mm)
{ {

View file

@ -283,34 +283,34 @@ TRACE_EVENT(vector_alloc_managed,
DECLARE_EVENT_CLASS(vector_activate, DECLARE_EVENT_CLASS(vector_activate,
TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve, TP_PROTO(unsigned int irq, bool is_managed, bool can_reserve,
bool early), bool reserve),
TP_ARGS(irq, is_managed, can_reserve, early), TP_ARGS(irq, is_managed, can_reserve, reserve),
TP_STRUCT__entry( TP_STRUCT__entry(
__field( unsigned int, irq ) __field( unsigned int, irq )
__field( bool, is_managed ) __field( bool, is_managed )
__field( bool, can_reserve ) __field( bool, can_reserve )
__field( bool, early ) __field( bool, reserve )
), ),
TP_fast_assign( TP_fast_assign(
__entry->irq = irq; __entry->irq = irq;
__entry->is_managed = is_managed; __entry->is_managed = is_managed;
__entry->can_reserve = can_reserve; __entry->can_reserve = can_reserve;
__entry->early = early; __entry->reserve = reserve;
), ),
TP_printk("irq=%u is_managed=%d can_reserve=%d early=%d", TP_printk("irq=%u is_managed=%d can_reserve=%d reserve=%d",
__entry->irq, __entry->is_managed, __entry->can_reserve, __entry->irq, __entry->is_managed, __entry->can_reserve,
__entry->early) __entry->reserve)
); );
#define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name) \ #define DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(name) \
DEFINE_EVENT_FN(vector_activate, name, \ DEFINE_EVENT_FN(vector_activate, name, \
TP_PROTO(unsigned int irq, bool is_managed, \ TP_PROTO(unsigned int irq, bool is_managed, \
bool can_reserve, bool early), \ bool can_reserve, bool reserve), \
TP_ARGS(irq, is_managed, can_reserve, early), NULL, NULL); \ TP_ARGS(irq, is_managed, can_reserve, reserve), NULL, NULL); \
DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate); DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_activate);
DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate); DEFINE_IRQ_VECTOR_ACTIVATE_EVENT(vector_deactivate);

View file

@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
dotraplinkage void do_stack_segment(struct pt_regs *, long); dotraplinkage void do_stack_segment(struct pt_regs *, long);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
dotraplinkage void do_double_fault(struct pt_regs *, long); dotraplinkage void do_double_fault(struct pt_regs *, long);
asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
#endif #endif
dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_general_protection(struct pt_regs *, long);
dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);

View file

@ -7,6 +7,9 @@
#include <asm/ptrace.h> #include <asm/ptrace.h>
#include <asm/stacktrace.h> #include <asm/stacktrace.h>
#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
struct unwind_state { struct unwind_state {
struct stack_info stack_info; struct stack_info stack_info;
unsigned long stack_mask; unsigned long stack_mask;
@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
} }
#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
/*
* WARNING: The entire pt_regs may not be safe to dereference. In some cases,
* only the iret frame registers are accessible. Use with caution!
*/
static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
{ {
if (unwind_done(state)) if (unwind_done(state))

View file

@ -7,6 +7,7 @@
#ifdef CONFIG_X86_VSYSCALL_EMULATION #ifdef CONFIG_X86_VSYSCALL_EMULATION
extern void map_vsyscall(void); extern void map_vsyscall(void);
extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
/* /*
* Called on instruction fetch fault in vsyscall page. * Called on instruction fetch fault in vsyscall page.

View file

@ -78,7 +78,12 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
#define X86_CR3_PCID_BITS 12
#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL))
#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
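Taken together, these definitions describe the CR3 layout used throughout the series: the page-table base in the upper address bits, a 12-bit PCID in bits 0-11, and bit 63 as the "do not flush" hint. A userspace sketch of assembling such a value (illustrative only; __sme_pa() and the VM_WARN checks in build_cr3*() are omitted):

#include <stdint.h>
#include <stdio.h>

#define CR3_PCID_BITS  12
#define CR3_PCID_MASK  ((1ULL << CR3_PCID_BITS) - 1)  /* bits 0-11 */
#define CR3_NOFLUSH    (1ULL << 63)  /* do not flush this PCID's entries on the write */

static uint64_t build_cr3(uint64_t pgd_pa, unsigned int pcid, int noflush)
{
	uint64_t cr3 = (pgd_pa & ~CR3_PCID_MASK) | (pcid & CR3_PCID_MASK);

	return noflush ? (cr3 | CR3_NOFLUSH) : cr3;
}

int main(void)
{
	printf("%#llx\n", (unsigned long long)build_cr3(0x1234000, 1, 0));
	printf("%#llx\n", (unsigned long long)build_cr3(0x1234000, 1, 1));
	return 0;
}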
/* /*
* Intel CPU features in CR4 * Intel CPU features in CR4

View file

@ -2626,11 +2626,13 @@ static int __init apic_set_verbosity(char *arg)
apic_verbosity = APIC_DEBUG; apic_verbosity = APIC_DEBUG;
else if (strcmp("verbose", arg) == 0) else if (strcmp("verbose", arg) == 0)
apic_verbosity = APIC_VERBOSE; apic_verbosity = APIC_VERBOSE;
#ifdef CONFIG_X86_64
else { else {
pr_warning("APIC Verbosity level %s not recognised" pr_warning("APIC Verbosity level %s not recognised"
" use apic=verbose or apic=debug\n", arg); " use apic=verbose or apic=debug\n", arg);
return -EINVAL; return -EINVAL;
} }
#endif
return 0; return 0;
} }

View file

@ -151,7 +151,7 @@ static struct apic apic_flat __ro_after_init = {
.apic_id_valid = default_apic_id_valid, .apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered, .apic_id_registered = flat_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio, .irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 1, /* logical */ .irq_dest_mode = 1, /* logical */
.disable_esr = 0, .disable_esr = 0,

View file

@ -110,7 +110,7 @@ struct apic apic_noop __ro_after_init = {
.apic_id_valid = default_apic_id_valid, .apic_id_valid = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered, .apic_id_registered = noop_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio, .irq_delivery_mode = dest_Fixed,
/* logical delivery broadcast to all CPUs: */ /* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1, .irq_dest_mode = 1,

View file

@ -2988,7 +2988,7 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
} }
int mp_irqdomain_activate(struct irq_domain *domain, int mp_irqdomain_activate(struct irq_domain *domain,
struct irq_data *irq_data, bool early) struct irq_data *irq_data, bool reserve)
{ {
unsigned long flags; unsigned long flags;

View file

@ -39,17 +39,13 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
((apic->irq_dest_mode == 0) ? ((apic->irq_dest_mode == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL : MSI_ADDR_DEST_MODE_PHYSICAL :
MSI_ADDR_DEST_MODE_LOGICAL) | MSI_ADDR_DEST_MODE_LOGICAL) |
((apic->irq_delivery_mode != dest_LowestPrio) ? MSI_ADDR_REDIRECTION_CPU |
MSI_ADDR_REDIRECTION_CPU :
MSI_ADDR_REDIRECTION_LOWPRI) |
MSI_ADDR_DEST_ID(cfg->dest_apicid); MSI_ADDR_DEST_ID(cfg->dest_apicid);
msg->data = msg->data =
MSI_DATA_TRIGGER_EDGE | MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT | MSI_DATA_LEVEL_ASSERT |
((apic->irq_delivery_mode != dest_LowestPrio) ? MSI_DATA_DELIVERY_FIXED |
MSI_DATA_DELIVERY_FIXED :
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(cfg->vector); MSI_DATA_VECTOR(cfg->vector);
} }

View file

@ -105,7 +105,7 @@ static struct apic apic_default __ro_after_init = {
.apic_id_valid = default_apic_id_valid, .apic_id_valid = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered, .apic_id_registered = default_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio, .irq_delivery_mode = dest_Fixed,
/* logical delivery broadcast to all CPUs: */ /* logical delivery broadcast to all CPUs: */
.irq_dest_mode = 1, .irq_dest_mode = 1,

View file

@ -184,6 +184,7 @@ static void reserve_irq_vector_locked(struct irq_data *irqd)
irq_matrix_reserve(vector_matrix); irq_matrix_reserve(vector_matrix);
apicd->can_reserve = true; apicd->can_reserve = true;
apicd->has_reserved = true; apicd->has_reserved = true;
irqd_set_can_reserve(irqd);
trace_vector_reserve(irqd->irq, 0); trace_vector_reserve(irqd->irq, 0);
vector_assign_managed_shutdown(irqd); vector_assign_managed_shutdown(irqd);
} }
@ -368,8 +369,18 @@ static int activate_reserved(struct irq_data *irqd)
int ret; int ret;
ret = assign_irq_vector_any_locked(irqd); ret = assign_irq_vector_any_locked(irqd);
if (!ret) if (!ret) {
apicd->has_reserved = false; apicd->has_reserved = false;
/*
* Core might have disabled reservation mode after
* allocating the irq descriptor. Ideally this should
* happen before allocation time, but that would require
* completely convoluted ways of transporting that
* information.
*/
if (!irqd_can_reserve(irqd))
apicd->can_reserve = false;
}
return ret; return ret;
} }
@ -398,21 +409,21 @@ static int activate_managed(struct irq_data *irqd)
} }
static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd, static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
bool early) bool reserve)
{ {
struct apic_chip_data *apicd = apic_chip_data(irqd); struct apic_chip_data *apicd = apic_chip_data(irqd);
unsigned long flags; unsigned long flags;
int ret = 0; int ret = 0;
trace_vector_activate(irqd->irq, apicd->is_managed, trace_vector_activate(irqd->irq, apicd->is_managed,
apicd->can_reserve, early); apicd->can_reserve, reserve);
/* Nothing to do for fixed assigned vectors */ /* Nothing to do for fixed assigned vectors */
if (!apicd->can_reserve && !apicd->is_managed) if (!apicd->can_reserve && !apicd->is_managed)
return 0; return 0;
raw_spin_lock_irqsave(&vector_lock, flags); raw_spin_lock_irqsave(&vector_lock, flags);
if (early || irqd_is_managed_and_shutdown(irqd)) if (reserve || irqd_is_managed_and_shutdown(irqd))
vector_assign_managed_shutdown(irqd); vector_assign_managed_shutdown(irqd);
else if (apicd->is_managed) else if (apicd->is_managed)
ret = activate_managed(irqd); ret = activate_managed(irqd);
@ -478,6 +489,7 @@ static bool vector_configure_legacy(unsigned int virq, struct irq_data *irqd,
} else { } else {
/* Release the vector */ /* Release the vector */
apicd->can_reserve = true; apicd->can_reserve = true;
irqd_set_can_reserve(irqd);
clear_irq_vector(irqd); clear_irq_vector(irqd);
realloc = true; realloc = true;
} }

View file

@ -184,7 +184,7 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.apic_id_valid = x2apic_apic_id_valid, .apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered, .apic_id_registered = x2apic_apic_id_registered,
.irq_delivery_mode = dest_LowestPrio, .irq_delivery_mode = dest_Fixed,
.irq_dest_mode = 1, /* logical */ .irq_dest_mode = 1, /* logical */
.disable_esr = 0, .disable_esr = 0,

View file

@ -17,6 +17,7 @@
#include <asm/sigframe.h> #include <asm/sigframe.h>
#include <asm/bootparam.h> #include <asm/bootparam.h>
#include <asm/suspend.h> #include <asm/suspend.h>
#include <asm/tlbflush.h>
#ifdef CONFIG_XEN #ifdef CONFIG_XEN
#include <xen/interface/xen.h> #include <xen/interface/xen.h>
@ -93,4 +94,13 @@ void common(void) {
BLANK(); BLANK();
DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
/* TLB state for the entry code */
OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask);
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
} }

View file

@ -47,13 +47,8 @@ void foo(void)
BLANK(); BLANK();
/* Offset from the sysenter stack to tss.sp0 */ /* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
offsetofend(struct tss_struct, SYSENTER_stack)); offsetofend(struct cpu_entry_area, entry_stack_page.stack));
/* Offset from cpu_tss to SYSENTER_stack */
OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
/* Size of SYSENTER_stack */
DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
#ifdef CONFIG_CC_STACKPROTECTOR #ifdef CONFIG_CC_STACKPROTECTOR
BLANK(); BLANK();

View file

@ -23,6 +23,9 @@ int main(void)
#ifdef CONFIG_PARAVIRT #ifdef CONFIG_PARAVIRT
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
#endif
BLANK(); BLANK();
#endif #endif
@ -63,6 +66,7 @@ int main(void)
OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_ist, tss_struct, x86_tss.ist);
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
BLANK(); BLANK();
#ifdef CONFIG_CC_STACKPROTECTOR #ifdef CONFIG_CC_STACKPROTECTOR

View file

@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
return NULL; /* Not found */ return NULL; /* Not found */
} }
__u32 cpu_caps_cleared[NCAPINTS]; __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
__u32 cpu_caps_set[NCAPINTS]; __u32 cpu_caps_set[NCAPINTS + NBUGINTS];
void load_percpu_segment(int cpu) void load_percpu_segment(int cpu)
{ {
@ -490,28 +490,23 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment(); load_stack_canary_segment();
} }
/* Setup the fixmap mapping only once per-processor */ #ifdef CONFIG_X86_32
static inline void setup_fixmap_gdt(int cpu) /* The 32-bit entry code needs to find cpu_entry_area. */
{ DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#ifdef CONFIG_X86_64
/* On 64-bit systems, we use a read-only fixmap GDT. */
pgprot_t prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the GDT
* is read-only, that will triple fault.
*
* On Xen PV, the GDT must be read-only because the hypervisor requires
* it.
*/
pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
#endif #endif
__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); #ifdef CONFIG_X86_64
} /*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
#endif
/* Load the original GDT from the per-cpu structure */ /* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu) void load_direct_gdt(int cpu)
@ -747,7 +742,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
{ {
int i; int i;
for (i = 0; i < NCAPINTS; i++) { for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i]; c->x86_capability[i] |= cpu_caps_set[i];
} }
@ -927,6 +922,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
} }
setup_force_cpu_cap(X86_FEATURE_ALWAYS); setup_force_cpu_cap(X86_FEATURE_ALWAYS);
/* Assume for now that ALL x86 CPUs are insecure */
setup_force_cpu_bug(X86_BUG_CPU_INSECURE);
fpu__init_system(c); fpu__init_system(c);
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
@ -1250,7 +1249,7 @@ void enable_sep_cpu(void)
return; return;
cpu = get_cpu(); cpu = get_cpu();
tss = &per_cpu(cpu_tss, cpu); tss = &per_cpu(cpu_tss_rw, cpu);
/* /*
* We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@ -1259,11 +1258,7 @@ void enable_sep_cpu(void)
tss->x86_tss.ss1 = __KERNEL_CS; tss->x86_tss.ss1 = __KERNEL_CS;
wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
wrmsr(MSR_IA32_SYSENTER_ESP,
(unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
0);
wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
put_cpu(); put_cpu();
@ -1357,25 +1352,22 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count); EXPORT_PER_CPU_SYMBOL(__preempt_count);
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */ /* May not be marked __init: used by software suspend */
void syscall_init(void) void syscall_init(void)
{ {
extern char _entry_trampoline[];
extern char entry_SYSCALL_64_trampoline[];
int cpu = smp_processor_id();
unsigned long SYSCALL64_entry_trampoline =
(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
(entry_SYSCALL_64_trampoline - _entry_trampoline);
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (static_cpu_has(X86_FEATURE_PTI))
wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
else
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION #ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@ -1386,7 +1378,7 @@ void syscall_init(void)
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/ */
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else #else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@ -1530,7 +1522,7 @@ void cpu_init(void)
if (cpu) if (cpu)
load_ucode_ap(); load_ucode_ap();
t = &per_cpu(cpu_tss, cpu); t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu); oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
@ -1569,7 +1561,7 @@ void cpu_init(void)
* set up and load the per-CPU TSS * set up and load the per-CPU TSS
*/ */
if (!oist->ist[0]) { if (!oist->ist[0]) {
char *estacks = per_cpu(exception_stacks, cpu); char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
for (v = 0; v < N_EXCEPTION_STACKS; v++) { for (v = 0; v < N_EXCEPTION_STACKS; v++) {
estacks += exception_stack_sizes[v]; estacks += exception_stack_sizes[v];
@ -1580,7 +1572,7 @@ void cpu_init(void)
} }
} }
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
/* /*
* <= is required because the CPU will access up to * <= is required because the CPU will access up to
@ -1596,11 +1588,12 @@ void cpu_init(void)
enter_lazy_tlb(&init_mm, me); enter_lazy_tlb(&init_mm, me);
/* /*
* Initialize the TSS. Don't bother initializing sp0, as the initial * Initialize the TSS. sp0 points to the entry trampoline stack
* task never enters user mode. * regardless of what task is running.
*/ */
set_tss_desc(cpu, t); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc(); load_TR_desc();
load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
load_mm_ldt(&init_mm); load_mm_ldt(&init_mm);
@ -1612,7 +1605,6 @@ void cpu_init(void)
if (is_uv_system()) if (is_uv_system())
uv_cpu_init(); uv_cpu_init();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu); load_fixmap_gdt(cpu);
} }
@ -1622,7 +1614,7 @@ void cpu_init(void)
{ {
int cpu = smp_processor_id(); int cpu = smp_processor_id();
struct task_struct *curr = current; struct task_struct *curr = current;
struct tss_struct *t = &per_cpu(cpu_tss, cpu); struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
wait_for_master_cpu(cpu); wait_for_master_cpu(cpu);
@ -1657,12 +1649,12 @@ void cpu_init(void)
* Initialize the TSS. Don't bother initializing sp0, as the initial * Initialize the TSS. Don't bother initializing sp0, as the initial
* task never enters user mode. * task never enters user mode.
*/ */
set_tss_desc(cpu, t); set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
load_TR_desc(); load_TR_desc();
load_mm_ldt(&init_mm); load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
#ifdef CONFIG_DOUBLEFAULT #ifdef CONFIG_DOUBLEFAULT
/* Set up doublefault TSS pointer in the GDT */ /* Set up doublefault TSS pointer in the GDT */
@ -1674,7 +1666,6 @@ void cpu_init(void)
fpu__init_cpu(); fpu__init_cpu();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu); load_fixmap_gdt(cpu);
} }
#endif #endif

View file

@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
} }
#else #else
/*
* Flush global tlb. We only do this in x86_64 where paging has been enabled
* already and PGE should be enabled as well.
*/
static inline void flush_tlb_early(void)
{
__native_flush_tlb_global_irq_disabled();
}
static inline void print_ucode(struct ucode_cpu_info *uci) static inline void print_ucode(struct ucode_cpu_info *uci)
{ {
struct microcode_intel *mc; struct microcode_intel *mc;
@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
if (rev != mc->hdr.rev) if (rev != mc->hdr.rev)
return -1; return -1;
#ifdef CONFIG_X86_64
/* Flush global tlb. This is precaution. */
flush_tlb_early();
#endif
uci->cpu_sig.rev = rev; uci->cpu_sig.rev = rev;
if (early) if (early)

View file

@ -50,25 +50,23 @@ static void doublefault_fn(void)
cpu_relax(); cpu_relax();
} }
struct tss_struct doublefault_tss __cacheline_aligned = { struct x86_hw_tss doublefault_tss __cacheline_aligned = {
.x86_tss = { .sp0 = STACK_START,
.sp0 = STACK_START, .ss0 = __KERNEL_DS,
.ss0 = __KERNEL_DS, .ldt = 0,
.ldt = 0, .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
.ip = (unsigned long) doublefault_fn, .ip = (unsigned long) doublefault_fn,
/* 0x2 bit is always set */ /* 0x2 bit is always set */
.flags = X86_EFLAGS_SF | 0x2, .flags = X86_EFLAGS_SF | 0x2,
.sp = STACK_START, .sp = STACK_START,
.es = __USER_DS, .es = __USER_DS,
.cs = __KERNEL_CS, .cs = __KERNEL_CS,
.ss = __KERNEL_DS, .ss = __KERNEL_DS,
.ds = __USER_DS, .ds = __USER_DS,
.fs = __KERNEL_PERCPU, .fs = __KERNEL_PERCPU,
.__cr3 = __pa_nodebug(swapper_pg_dir), .__cr3 = __pa_nodebug(swapper_pg_dir),
}
}; };
/* dummy for do_double_fault() call */ /* dummy for do_double_fault() call */

View file

@ -18,6 +18,7 @@
#include <linux/nmi.h> #include <linux/nmi.h>
#include <linux/sysfs.h> #include <linux/sysfs.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h> #include <asm/stacktrace.h>
#include <asm/unwind.h> #include <asm/unwind.h>
@ -43,6 +44,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true; return true;
} }
bool in_entry_stack(unsigned long *stack, struct stack_info *info)
{
struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
void *begin = ss;
void *end = ss + 1;
if ((void *)stack < begin || (void *)stack >= end)
return false;
info->type = STACK_TYPE_ENTRY;
info->begin = begin;
info->end = end;
info->next_sp = NULL;
return true;
}
static void printk_stack_address(unsigned long address, int reliable, static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl) char *log_lvl)
{ {
@ -50,6 +69,28 @@ static void printk_stack_address(unsigned long address, int reliable,
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
} }
void show_iret_regs(struct pt_regs *regs)
{
printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
regs->sp, regs->flags);
}
static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
{
if (on_stack(info, regs, sizeof(*regs)))
__show_regs(regs, 0);
else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
IRET_FRAME_SIZE)) {
/*
* When an interrupt or exception occurs in entry code, the
* full pt_regs might not have been saved yet. In that case
* just print the iret frame.
*/
show_iret_regs(regs);
}
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl) unsigned long *stack, char *log_lvl)
{ {
@ -71,31 +112,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - task stack * - task stack
* - interrupt stack * - interrupt stack
* - HW exception stacks (double fault, nmi, debug, mce) * - HW exception stacks (double fault, nmi, debug, mce)
* - entry stack
* *
* x86-32 can have up to three stacks: * x86-32 can have up to four stacks:
* - task stack * - task stack
* - softirq stack * - softirq stack
* - hardirq stack * - hardirq stack
* - entry stack
*/ */
for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
const char *stack_name; const char *stack_name;
/* if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
* If we overflowed the task stack into a guard page, jump back /*
* to the bottom of the usable stack. * We weren't on a valid stack. It's possible that
*/ * we overflowed a valid stack into a guard page.
if (task_stack_page(task) - (void *)stack < PAGE_SIZE) * See if the next page up is valid so that we can
stack = task_stack_page(task); * generate some kind of backtrace if this happens.
*/
if (get_stack_info(stack, task, &stack_info, &visit_mask)) stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
break; if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
}
stack_name = stack_type_name(stack_info.type); stack_name = stack_type_name(stack_info.type);
if (stack_name) if (stack_name)
printk("%s <%s>\n", log_lvl, stack_name); printk("%s <%s>\n", log_lvl, stack_name);
if (regs && on_stack(&stack_info, regs, sizeof(*regs))) if (regs)
__show_regs(regs, 0); show_regs_safe(&stack_info, regs);
/* /*
* Scan the stack, printing any text addresses we find. At the * Scan the stack, printing any text addresses we find. At the
@ -119,7 +164,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
/* /*
* Don't print regs->ip again if it was already printed * Don't print regs->ip again if it was already printed
* by __show_regs() below. * by show_regs_safe() below.
*/ */
if (regs && stack == &regs->ip) if (regs && stack == &regs->ip)
goto next; goto next;
@ -155,8 +200,8 @@ next:
/* if the frame has entry regs, print them */ /* if the frame has entry regs, print them */
regs = unwind_get_entry_regs(&state); regs = unwind_get_entry_regs(&state);
if (regs && on_stack(&stack_info, regs, sizeof(*regs))) if (regs)
__show_regs(regs, 0); show_regs_safe(&stack_info, regs);
} }
if (stack_name) if (stack_name)
@ -252,11 +297,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
unsigned long sp; unsigned long sp;
#endif #endif
printk(KERN_DEFAULT printk(KERN_DEFAULT
"%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
if (notify_die(DIE_OOPS, str, regs, err, if (notify_die(DIE_OOPS, str, regs, err,
current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)

View file

@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_SOFTIRQ) if (type == STACK_TYPE_SOFTIRQ)
return "SOFTIRQ"; return "SOFTIRQ";
if (type == STACK_TYPE_ENTRY)
return "ENTRY_TRAMPOLINE";
return NULL; return NULL;
} }
@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (task != current) if (task != current)
goto unknown; goto unknown;
if (in_entry_stack(stack, info))
goto recursion_check;
if (in_hardirq_stack(stack, info)) if (in_hardirq_stack(stack, info))
goto recursion_check; goto recursion_check;

View file

@ -37,6 +37,15 @@ const char *stack_type_name(enum stack_type type)
if (type == STACK_TYPE_IRQ) if (type == STACK_TYPE_IRQ)
return "IRQ"; return "IRQ";
if (type == STACK_TYPE_ENTRY) {
/*
* On 64-bit, we have a generic entry stack that we
* use for all the kernel entry points, including
* SYSENTER.
*/
return "ENTRY_TRAMPOLINE";
}
if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
return exception_stack_names[type - STACK_TYPE_EXCEPTION]; return exception_stack_names[type - STACK_TYPE_EXCEPTION];
@ -115,6 +124,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
if (in_irq_stack(stack, info)) if (in_irq_stack(stack, info))
goto recursion_check; goto recursion_check;
if (in_entry_stack(stack, info))
goto recursion_check;
goto unknown; goto unknown;
recursion_check: recursion_check:

View file

@ -341,6 +341,27 @@ GLOBAL(early_recursion_flag)
.balign PAGE_SIZE; \ .balign PAGE_SIZE; \
GLOBAL(name) GLOBAL(name)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
* Each PGD needs to be 8k long and 8k aligned. We do not
* ever go out to userspace with these, so we do not
* strictly *need* the second page, but this allows us to
* have a single set_pgd() implementation that does not
* need to worry about whether it has 4k or 8k to work
* with.
*
* This ensures PGDs are 8k long:
*/
#define PTI_USER_PGD_FILL 512
/* This ensures they are 8k-aligned: */
#define NEXT_PGD_PAGE(name) \
.balign 2 * PAGE_SIZE; \
GLOBAL(name)
#else
#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
#define PTI_USER_PGD_FILL 0
#endif
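The 8k size and 8k alignment set up here are what make the user copy of a PGD reachable as "the next page": with a 2*PAGE_SIZE-aligned allocation, setting bit 12 of the kernel PGD pointer lands on the second 4k page, which is how helpers such as kernel_to_user_pgdp() used later in this diff can work. A hedged sketch of that pointer arithmetic, not the kernel implementation:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT  12
#define PAGE_SIZE   (1UL << PAGE_SHIFT)

/* With an 8 KiB-aligned, 8 KiB PGD, the user half is the second page. */
static void *kernel_to_user_pgd(void *kernel_pgd)
{
	return (void *)((uintptr_t)kernel_pgd | PAGE_SIZE);  /* set bit 12 */
}

int main(void)
{
	void *pgd;

	/* Stand-in for the kernel's 2*PAGE_SIZE-aligned PGD allocation. */
	if (posix_memalign(&pgd, 2 * PAGE_SIZE, 2 * PAGE_SIZE))
		return 1;

	printf("kernel half %p, user half %p\n", pgd, kernel_to_user_pgd(pgd));
	free(pgd);
	return 0;
}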
/* Automate the creation of 1 to 1 mapping pmd entries */ /* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \ #define PMDS(START, PERM, COUNT) \
i = 0 ; \ i = 0 ; \
@ -350,13 +371,14 @@ GLOBAL(name)
.endr .endr
__INITDATA __INITDATA
NEXT_PAGE(early_top_pgt) NEXT_PGD_PAGE(early_top_pgt)
.fill 511,8,0 .fill 511,8,0
#ifdef CONFIG_X86_5LEVEL #ifdef CONFIG_X86_5LEVEL
.quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#else #else
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
#endif #endif
.fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts) NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@ -364,13 +386,14 @@ NEXT_PAGE(early_dynamic_pgts)
.data .data
#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) #if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt) NEXT_PGD_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_START_KERNEL*8, 0 .org init_top_pgt + PGD_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
.fill PTI_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt) NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
@ -381,8 +404,9 @@ NEXT_PAGE(level2_ident_pgt)
*/ */
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#else #else
NEXT_PAGE(init_top_pgt) NEXT_PGD_PAGE(init_top_pgt)
.fill 512,8,0 .fill 512,8,0
.fill PTI_USER_PGD_FILL,8,0
#endif #endif
#ifdef CONFIG_X86_5LEVEL #ifdef CONFIG_X86_5LEVEL

View file

@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap * because the ->io_bitmap_max value must match the bitmap
* contents: * contents:
*/ */
tss = &per_cpu(cpu_tss, get_cpu()); tss = &per_cpu(cpu_tss_rw, get_cpu());
if (turn_on) if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num); bitmap_clear(t->io_bitmap_ptr, from, num);

View file

@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
/* high bit used in ret_from_ code */ /* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax; unsigned vector = ~regs->orig_ax;
/*
* NB: Unlike exception entries, IRQ entries do not reliably
* handle context tracking in the low-level entry code. This is
* because syscall entries execute briefly with IRQs on before
* updating context tracking state, so we can take an IRQ from
* kernel mode with CONTEXT_USER. The low-level entry code only
* updates the context if we came from user mode, so we won't
* switch to CONTEXT_KERNEL. We'll fix that once the syscall
* code is cleaned up enough that we can cleanly defer enabling
* IRQs.
*/
entering_irq(); entering_irq();
/* entering_irq() tells RCU that we're not quiescent. Check it. */ /* entering_irq() tells RCU that we're not quiescent. Check it. */

View file

@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (regs->sp >= estack_top && regs->sp <= estack_bottom) if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return; return;
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp, current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom, irq_stack_top, irq_stack_bottom,
estack_top, estack_bottom); estack_top, estack_bottom, (void *)regs->ip);
if (sysctl_panic_on_stackoverflow) if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n"); panic("low stack detected by irq handler - check messages\n");

View file

@ -5,6 +5,11 @@
* Copyright (C) 2002 Andi Kleen * Copyright (C) 2002 Andi Kleen
* *
* This handles calls from both 32bit and 64bit mode. * This handles calls from both 32bit and 64bit mode.
*
* Lock order:
* context.ldt_usr_sem
* mmap_sem
* context.lock
*/ */
#include <linux/errno.h> #include <linux/errno.h>
@ -19,6 +24,7 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <asm/ldt.h> #include <asm/ldt.h>
#include <asm/tlb.h>
#include <asm/desc.h> #include <asm/desc.h>
#include <asm/mmu_context.h> #include <asm/mmu_context.h>
#include <asm/syscalls.h> #include <asm/syscalls.h>
@ -42,17 +48,15 @@ static void refresh_ldt_segments(void)
#endif #endif
} }
/* context.lock is held for us, so we don't need any locking. */ /* context.lock is held by the task which issued the smp function call */
static void flush_ldt(void *__mm) static void flush_ldt(void *__mm)
{ {
struct mm_struct *mm = __mm; struct mm_struct *mm = __mm;
mm_context_t *pc;
if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
return; return;
pc = &mm->context; load_mm_ldt(mm);
set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
refresh_ldt_segments(); refresh_ldt_segments();
} }
@ -89,25 +93,143 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
return NULL; return NULL;
} }
/* The new LDT isn't aliased for PTI yet. */
new_ldt->slot = -1;
new_ldt->nr_entries = num_entries; new_ldt->nr_entries = num_entries;
return new_ldt; return new_ldt;
} }
/*
* If PTI is enabled, this maps the LDT into the kernelmode and
* usermode tables for the given mm.
*
* There is no corresponding unmap function. Even if the LDT is freed, we
* leave the PTEs around until the slot is reused or the mm is destroyed.
* This is harmless: the LDT is always in ordinary memory, and no one will
* access the freed slot.
*
* If we wanted to unmap freed LDTs, we'd also need to do a flush to make
* it useful, and the flush would slow down modify_ldt().
*/
static int
map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
bool is_vmalloc, had_top_level_entry;
unsigned long va;
spinlock_t *ptl;
pgd_t *pgd;
int i;
if (!static_cpu_has(X86_FEATURE_PTI))
return 0;
/*
* Any given ldt_struct should have map_ldt_struct() called at most
* once.
*/
WARN_ON(ldt->slot != -1);
/*
* Did we already have the top level entry allocated? We can't
* use pgd_none() for this because it doesn't do anything on
* 4-level page table kernels.
*/
pgd = pgd_offset(mm, LDT_BASE_ADDR);
had_top_level_entry = (pgd->pgd != 0);
is_vmalloc = is_vmalloc_addr(ldt->entries);
for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
unsigned long offset = i << PAGE_SHIFT;
const void *src = (char *)ldt->entries + offset;
unsigned long pfn;
pte_t pte, *ptep;
va = (unsigned long)ldt_slot_va(slot) + offset;
pfn = is_vmalloc ? vmalloc_to_pfn(src) :
page_to_pfn(virt_to_page(src));
/*
* Treat the PTI LDT range as a *userspace* range.
* get_locked_pte() will allocate all needed pagetables
* and account for them in this mm.
*/
ptep = get_locked_pte(mm, va, &ptl);
if (!ptep)
return -ENOMEM;
/*
* Map it RO so that this easy-to-find address is not a primary
* target via some kernel interface which misses a
* permission check.
*/
pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL));
set_pte_at(mm, va, ptep, pte);
pte_unmap_unlock(ptep, ptl);
}
if (mm->context.ldt) {
/*
* We already had an LDT. The top-level entry should already
* have been allocated and synchronized with the usermode
* tables.
*/
WARN_ON(!had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI))
WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
} else {
/*
* This is the first time we're mapping an LDT for this process.
* Sync the pgd to the usermode tables.
*/
WARN_ON(had_top_level_entry);
if (static_cpu_has(X86_FEATURE_PTI)) {
WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
set_pgd(kernel_to_user_pgdp(pgd), *pgd);
}
}
va = (unsigned long)ldt_slot_va(slot);
flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
ldt->slot = slot;
#endif
return 0;
}
static void free_ldt_pgtables(struct mm_struct *mm)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
struct mmu_gather tlb;
unsigned long start = LDT_BASE_ADDR;
unsigned long end = start + (1UL << PGDIR_SHIFT);
if (!static_cpu_has(X86_FEATURE_PTI))
return;
tlb_gather_mmu(&tlb, mm, start, end);
free_pgd_range(&tlb, start, end, start, end);
tlb_finish_mmu(&tlb, start, end);
#endif
}
/* After calling this, the LDT is immutable. */ /* After calling this, the LDT is immutable. */
static void finalize_ldt_struct(struct ldt_struct *ldt) static void finalize_ldt_struct(struct ldt_struct *ldt)
{ {
paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
} }
/* context.lock is held */ static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
static void install_ldt(struct mm_struct *current_mm,
struct ldt_struct *ldt)
{ {
/* Synchronizes with READ_ONCE in load_mm_ldt. */ mutex_lock(&mm->context.lock);
smp_store_release(&current_mm->context.ldt, ldt);
/* Activate the LDT for all CPUs using current_mm. */ /* Synchronizes with READ_ONCE in load_mm_ldt. */
on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); smp_store_release(&mm->context.ldt, ldt);
/* Activate the LDT for all CPUs using current's mm. */
on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
mutex_unlock(&mm->context.lock);
} }
static void free_ldt_struct(struct ldt_struct *ldt) static void free_ldt_struct(struct ldt_struct *ldt)
@ -124,27 +246,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
} }
/* /*
* we do not have to muck with descriptors here, that is * Called on fork from arch_dup_mmap(). Just copy the current LDT state,
* done in switch_mm() as needed. * the new task is not running, so nothing can be installed.
*/ */
int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
{ {
struct ldt_struct *new_ldt; struct ldt_struct *new_ldt;
struct mm_struct *old_mm;
int retval = 0; int retval = 0;
mutex_init(&mm->context.lock); if (!old_mm)
old_mm = current->mm;
if (!old_mm) {
mm->context.ldt = NULL;
return 0; return 0;
}
mutex_lock(&old_mm->context.lock); mutex_lock(&old_mm->context.lock);
if (!old_mm->context.ldt) { if (!old_mm->context.ldt)
mm->context.ldt = NULL;
goto out_unlock; goto out_unlock;
}
new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
if (!new_ldt) { if (!new_ldt) {
@ -156,6 +271,12 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
new_ldt->nr_entries * LDT_ENTRY_SIZE); new_ldt->nr_entries * LDT_ENTRY_SIZE);
finalize_ldt_struct(new_ldt); finalize_ldt_struct(new_ldt);
retval = map_ldt_struct(mm, new_ldt, 0);
if (retval) {
free_ldt_pgtables(mm);
free_ldt_struct(new_ldt);
goto out_unlock;
}
mm->context.ldt = new_ldt; mm->context.ldt = new_ldt;
out_unlock: out_unlock:
@ -174,13 +295,18 @@ void destroy_context_ldt(struct mm_struct *mm)
mm->context.ldt = NULL; mm->context.ldt = NULL;
} }
void ldt_arch_exit_mmap(struct mm_struct *mm)
{
free_ldt_pgtables(mm);
}
static int read_ldt(void __user *ptr, unsigned long bytecount) static int read_ldt(void __user *ptr, unsigned long bytecount)
{ {
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
unsigned long entries_size; unsigned long entries_size;
int retval; int retval;
mutex_lock(&mm->context.lock); down_read(&mm->context.ldt_usr_sem);
if (!mm->context.ldt) { if (!mm->context.ldt) {
retval = 0; retval = 0;
@ -209,7 +335,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
retval = bytecount; retval = bytecount;
out_unlock: out_unlock:
mutex_unlock(&mm->context.lock); up_read(&mm->context.ldt_usr_sem);
return retval; return retval;
} }
@ -269,7 +395,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
ldt.avl = 0; ldt.avl = 0;
} }
mutex_lock(&mm->context.lock); if (down_write_killable(&mm->context.ldt_usr_sem))
return -EINTR;
old_ldt = mm->context.ldt; old_ldt = mm->context.ldt;
old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
@ -286,12 +413,31 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
new_ldt->entries[ldt_info.entry_number] = ldt; new_ldt->entries[ldt_info.entry_number] = ldt;
finalize_ldt_struct(new_ldt); finalize_ldt_struct(new_ldt);
/*
* If we are using PTI, map the new LDT into the userspace pagetables.
* If there is already an LDT, use the other slot so that other CPUs
* will continue to use the old LDT until install_ldt() switches
* them over to the new LDT.
*/
error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
if (error) {
/*
* This can only fail for the first LDT setup. If an LDT is
* already installed then the PTE page is already
* populated. Mop up a half populated page table.
*/
if (!WARN_ON_ONCE(old_ldt))
free_ldt_pgtables(mm);
free_ldt_struct(new_ldt);
goto out_unlock;
}
install_ldt(mm, new_ldt); install_ldt(mm, new_ldt);
free_ldt_struct(old_ldt); free_ldt_struct(old_ldt);
error = 0; error = 0;
out_unlock: out_unlock:
mutex_unlock(&mm->context.lock); up_write(&mm->context.ldt_usr_sem);
out: out:
return error; return error;
} }

Some files were not shown because too many files have changed in this diff.