From e8aabb0b546d5271d7c6549562664f6f90e21c9a Mon Sep 17 00:00:00 2001 From: zhangyuanhang Date: Mon, 27 Jun 2022 13:41:29 +0800 Subject: [PATCH] RPI-RT: add preempt-RT to openEuler 5.10.0-99.0.0 Signed-off-by: zhangyuanhang --- 0001-apply-preempt-RT-patch.patch | 26704 ++++++++++++++++ ...-bcm2711_defconfig-for-rt-rpi-kernel.patch | 34 + _multibuild | 1 + raspberrypi-kernel-rt.spec | 2784 ++ 4 files changed, 29523 insertions(+) create mode 100644 0001-apply-preempt-RT-patch.patch create mode 100644 0002-modify-bcm2711_defconfig-for-rt-rpi-kernel.patch create mode 100644 raspberrypi-kernel-rt.spec diff --git a/0001-apply-preempt-RT-patch.patch b/0001-apply-preempt-RT-patch.patch new file mode 100644 index 0000000..938e28f --- /dev/null +++ b/0001-apply-preempt-RT-patch.patch @@ -0,0 +1,26704 @@ +From 2cd60809901c9dd08588b128a1805f16ee34404d Mon Sep 17 00:00:00 2001 +From: root +Date: Mon, 6 Jun 2022 13:15:08 +0800 +Subject: [PATCH] apply preempt rt patch + +--- + .../Expedited-Grace-Periods.rst | 4 +- + .../RCU/Design/Requirements/Requirements.rst | 26 +- + Documentation/RCU/checklist.rst | 2 +- + Documentation/RCU/rcubarrier.rst | 6 +- + Documentation/RCU/stallwarn.rst | 4 +- + Documentation/RCU/whatisRCU.rst | 10 +- + .../admin-guide/kernel-parameters.txt | 11 + + Documentation/driver-api/io-mapping.rst | 92 +- + arch/Kconfig | 8 +- + arch/alpha/include/asm/kmap_types.h | 15 - + arch/alpha/include/asm/spinlock_types.h | 4 - + arch/arc/Kconfig | 1 + + arch/arc/include/asm/highmem.h | 26 +- + arch/arc/include/asm/kmap_types.h | 14 - + arch/arc/mm/highmem.c | 54 +- + arch/arm/Kconfig | 6 +- + arch/arm/include/asm/fixmap.h | 4 +- + arch/arm/include/asm/hardirq.h | 11 +- + arch/arm/include/asm/highmem.h | 34 +- + arch/arm/include/asm/irq.h | 2 + + arch/arm/include/asm/kmap_types.h | 10 - + arch/arm/include/asm/spinlock_types.h | 4 - + arch/arm/include/asm/thread_info.h | 10 +- + arch/arm/kernel/asm-offsets.c | 1 + + arch/arm/kernel/entry-armv.S | 19 +- + arch/arm/kernel/entry-common.S | 9 +- + arch/arm/kernel/signal.c | 3 +- + arch/arm/kernel/smp.c | 2 - + arch/arm/mm/Makefile | 1 - + arch/arm/mm/cache-feroceon-l2.c | 6 +- + arch/arm/mm/cache-xsc3l2.c | 4 +- + arch/arm/mm/fault.c | 3 + + arch/arm/mm/highmem.c | 121 -- + arch/arm64/Kconfig | 3 + + arch/arm64/include/asm/hardirq.h | 7 +- + arch/arm64/include/asm/preempt.h | 28 +- + arch/arm64/include/asm/spinlock_types.h | 4 - + arch/arm64/include/asm/thread_info.h | 7 +- + arch/arm64/kernel/asm-offsets.c | 1 + + arch/arm64/kernel/entry.S | 13 +- + arch/arm64/kernel/fpsimd.c | 18 +- + arch/arm64/kernel/ipi_nmi.c | 2 - + arch/arm64/kernel/signal.c | 2 +- + arch/arm64/kvm/arm.c | 6 +- + arch/csky/Kconfig | 1 + + arch/csky/include/asm/fixmap.h | 4 +- + arch/csky/include/asm/highmem.h | 6 +- + arch/csky/mm/highmem.c | 75 +- + arch/hexagon/include/asm/spinlock_types.h | 4 - + arch/ia64/include/asm/kmap_types.h | 13 - + arch/ia64/include/asm/spinlock_types.h | 4 - + arch/ia64/kernel/time.c | 20 +- + arch/microblaze/Kconfig | 1 + + arch/microblaze/include/asm/fixmap.h | 4 +- + arch/microblaze/include/asm/highmem.h | 6 +- + arch/microblaze/mm/Makefile | 1 - + arch/microblaze/mm/highmem.c | 78 - + arch/microblaze/mm/init.c | 6 - + arch/mips/Kconfig | 1 + + arch/mips/include/asm/fixmap.h | 4 +- + arch/mips/include/asm/highmem.h | 6 +- + arch/mips/include/asm/kmap_types.h | 13 - + arch/mips/kernel/crash_dump.c | 42 +- + arch/mips/mm/highmem.c | 77 - + arch/mips/mm/init.c | 4 - + arch/nds32/Kconfig.cpu | 1 + + arch/nds32/include/asm/fixmap.h | 4 +- + 
arch/nds32/include/asm/highmem.h | 22 +- + arch/nds32/mm/Makefile | 1 - + arch/nds32/mm/highmem.c | 48 - + arch/openrisc/mm/init.c | 1 - + arch/openrisc/mm/ioremap.c | 1 - + arch/parisc/include/asm/hardirq.h | 1 - + arch/parisc/include/asm/kmap_types.h | 13 - + arch/powerpc/Kconfig | 4 + + arch/powerpc/include/asm/cmpxchg.h | 2 +- + arch/powerpc/include/asm/fixmap.h | 4 +- + arch/powerpc/include/asm/highmem.h | 7 +- + arch/powerpc/include/asm/kmap_types.h | 13 - + .../include/asm/simple_spinlock_types.h | 2 +- + arch/powerpc/include/asm/spinlock_types.h | 4 - + arch/powerpc/include/asm/stackprotector.h | 4 + + arch/powerpc/include/asm/thread_info.h | 17 +- + arch/powerpc/kernel/asm-offsets.c | 1 + + arch/powerpc/kernel/entry_32.S | 23 +- + arch/powerpc/kernel/exceptions-64e.S | 16 +- + arch/powerpc/kernel/irq.c | 2 + + arch/powerpc/kernel/misc_32.S | 2 + + arch/powerpc/kernel/misc_64.S | 2 + + arch/powerpc/kernel/nvram_64.c | 12 +- + arch/powerpc/kernel/syscall_64.c | 10 +- + arch/powerpc/kernel/time.c | 56 +- + arch/powerpc/kernel/traps.c | 8 +- + arch/powerpc/kernel/watchdog.c | 5 - + arch/powerpc/kexec/crash.c | 3 - + arch/powerpc/kvm/Kconfig | 1 + + arch/powerpc/mm/Makefile | 1 - + arch/powerpc/mm/highmem.c | 67 - + arch/powerpc/mm/mem.c | 7 - + arch/powerpc/platforms/powernv/opal-kmsg.c | 3 +- + arch/powerpc/platforms/pseries/iommu.c | 31 +- + arch/powerpc/xmon/xmon.c | 6 +- + arch/s390/Kconfig | 1 + + arch/s390/include/asm/spinlock_types.h | 4 - + arch/s390/include/asm/vtime.h | 1 - + arch/s390/kernel/vtime.c | 51 +- + arch/sh/include/asm/fixmap.h | 8 - + arch/sh/include/asm/hardirq.h | 14 +- + arch/sh/include/asm/kmap_types.h | 15 - + arch/sh/include/asm/spinlock_types.h | 4 - + arch/sh/kernel/irq.c | 4 +- + arch/sh/kernel/traps.c | 2 +- + arch/sh/mm/init.c | 8 - + arch/sparc/Kconfig | 1 + + arch/sparc/include/asm/highmem.h | 8 +- + arch/sparc/include/asm/kmap_types.h | 11 - + arch/sparc/include/asm/vaddrs.h | 4 +- + arch/sparc/kernel/irq_64.c | 2 + + arch/sparc/mm/Makefile | 3 - + arch/sparc/mm/highmem.c | 115 -- + arch/sparc/mm/srmmu.c | 2 - + arch/um/include/asm/fixmap.h | 1 - + arch/um/include/asm/hardirq.h | 17 +- + arch/um/include/asm/kmap_types.h | 13 - + arch/um/kernel/kmsg_dump.c | 13 +- + arch/x86/Kconfig | 3 + + arch/x86/crypto/aesni-intel_glue.c | 22 +- + arch/x86/crypto/cast5_avx_glue.c | 21 +- + arch/x86/crypto/glue_helper.c | 26 +- + arch/x86/include/asm/fixmap.h | 5 +- + arch/x86/include/asm/fpu/api.h | 24 +- + arch/x86/include/asm/highmem.h | 13 +- + arch/x86/include/asm/iomap.h | 13 +- + arch/x86/include/asm/kmap_types.h | 13 - + arch/x86/include/asm/paravirt_types.h | 1 - + arch/x86/include/asm/preempt.h | 37 +- + arch/x86/include/asm/signal.h | 13 + + arch/x86/include/asm/stackprotector.h | 8 +- + arch/x86/include/asm/thread_info.h | 11 + + arch/x86/kernel/cpu/mshyperv.c | 3 +- + arch/x86/kernel/crash_dump_32.c | 48 +- + arch/x86/kernel/fpu/core.c | 12 + + arch/x86/kernel/irq_32.c | 2 + + arch/x86/kernel/irq_64.c | 2 + + arch/x86/kvm/x86.c | 8 + + arch/x86/mm/highmem_32.c | 59 - + arch/x86/mm/init_32.c | 15 - + arch/x86/mm/iomap_32.c | 57 +- + arch/xtensa/Kconfig | 1 + + arch/xtensa/include/asm/fixmap.h | 4 +- + arch/xtensa/include/asm/highmem.h | 12 +- + arch/xtensa/include/asm/spinlock_types.h | 4 - + arch/xtensa/mm/highmem.c | 46 +- + block/blk-mq.c | 124 +- + crypto/cryptd.c | 19 +- + drivers/atm/eni.c | 2 +- + drivers/block/zram/zram_drv.c | 36 + + drivers/block/zram/zram_drv.h | 1 + + drivers/char/random.c | 11 +- + drivers/char/tpm/tpm-dev-common.c | 1 - + 
drivers/char/tpm/tpm_tis.c | 29 +- + drivers/firewire/ohci.c | 4 +- + drivers/firmware/efi/efi.c | 5 +- + drivers/gpu/drm/i915/display/intel_sprite.c | 15 +- + .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 7 +- + drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 5 +- + drivers/gpu/drm/i915/gt/intel_engine_pm.c | 8 +- + drivers/gpu/drm/i915/i915_gem.c | 40 +- + drivers/gpu/drm/i915/i915_irq.c | 2 + + drivers/gpu/drm/i915/i915_trace.h | 6 +- + drivers/gpu/drm/i915/selftests/i915_gem.c | 4 +- + drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 8 +- + .../drm/nouveau/nvkm/subdev/devinit/fbmem.h | 8 +- + drivers/gpu/drm/qxl/qxl_image.c | 18 +- + drivers/gpu/drm/qxl/qxl_ioctl.c | 27 +- + drivers/gpu/drm/qxl/qxl_object.c | 12 +- + drivers/gpu/drm/qxl/qxl_object.h | 4 +- + drivers/gpu/drm/qxl/qxl_release.c | 4 +- + drivers/gpu/drm/radeon/radeon_display.c | 2 + + drivers/gpu/drm/ttm/ttm_bo_util.c | 20 +- + drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 30 +- + drivers/hv/hyperv_vmbus.h | 1 + + drivers/hv/vmbus_drv.c | 10 +- + drivers/leds/trigger/Kconfig | 1 + + drivers/md/raid5.c | 7 +- + drivers/md/raid5.h | 1 + + drivers/mtd/mtdoops.c | 5 +- + drivers/net/arcnet/arc-rimi.c | 4 +- + drivers/net/arcnet/arcdevice.h | 6 + + drivers/net/arcnet/arcnet.c | 66 +- + drivers/net/arcnet/com20020-isa.c | 4 +- + drivers/net/arcnet/com20020-pci.c | 2 +- + drivers/net/arcnet/com20020_cs.c | 2 +- + drivers/net/arcnet/com90io.c | 4 +- + drivers/net/arcnet/com90xx.c | 4 +- + drivers/net/ethernet/chelsio/cxgb/common.h | 6 +- + drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 54 +- + drivers/net/ethernet/chelsio/cxgb/sge.c | 53 +- + drivers/net/ethernet/chelsio/cxgb/sge.h | 3 +- + drivers/net/ethernet/chelsio/cxgb/subr.c | 64 +- + drivers/net/ethernet/dlink/sundance.c | 2 +- + drivers/net/ethernet/jme.c | 10 +- + drivers/net/ethernet/jme.h | 2 +- + drivers/net/wireless/ath/ath9k/beacon.c | 2 +- + drivers/pci/controller/pci-hyperv.c | 2 +- + drivers/scsi/fcoe/fcoe.c | 16 +- + drivers/scsi/fcoe/fcoe_ctlr.c | 4 +- + drivers/scsi/libfc/fc_exch.c | 4 +- + drivers/tty/serial/8250/8250.h | 47 +- + drivers/tty/serial/8250/8250_core.c | 17 +- + drivers/tty/serial/8250/8250_fsl.c | 9 + + drivers/tty/serial/8250/8250_ingenic.c | 7 + + drivers/tty/serial/8250/8250_mtk.c | 29 +- + drivers/tty/serial/8250/8250_port.c | 92 +- + drivers/tty/serial/amba-pl011.c | 17 +- + drivers/tty/serial/omap-serial.c | 12 +- + drivers/tty/tty_buffer.c | 2 - + fs/afs/dir_silly.c | 2 +- + fs/aio.c | 3 +- + fs/btrfs/ctree.h | 1 - + fs/cifs/readdir.c | 2 +- + fs/dcache.c | 39 +- + fs/eventfd.c | 12 +- + fs/fscache/internal.h | 1 - + fs/fscache/main.c | 6 - + fs/fscache/object.c | 13 +- + fs/fuse/readdir.c | 2 +- + fs/inode.c | 2 +- + fs/namei.c | 4 +- + fs/namespace.c | 8 +- + fs/nfs/dir.c | 4 +- + fs/nfs/unlink.c | 4 +- + fs/proc/array.c | 4 +- + fs/proc/base.c | 3 +- + fs/proc/proc_sysctl.c | 2 +- + fs/pstore/platform.c | 5 +- + include/asm-generic/Kbuild | 2 +- + include/asm-generic/hardirq.h | 6 +- + include/asm-generic/kmap_size.h | 12 + + include/asm-generic/kmap_types.h | 11 - + include/asm-generic/preempt.h | 3 + + include/linux/blkdev.h | 2 +- + include/linux/bottom_half.h | 8 +- + include/linux/console.h | 11 + + include/linux/cpuhotplug.h | 1 + + include/linux/cpumask.h | 6 + + include/linux/dcache.h | 4 +- + include/linux/debug_locks.h | 3 +- + include/linux/delay.h | 6 + + include/linux/entry-common.h | 2 +- + include/linux/eventfd.h | 11 +- + include/linux/fs.h | 2 +- + include/linux/hardirq.h | 7 +- + include/linux/highmem-internal.h | 222 +++ + 
include/linux/highmem.h | 294 ++- + include/linux/interrupt.h | 34 +- + include/linux/io-mapping.h | 28 +- + include/linux/irq_cpustat.h | 28 - + include/linux/irq_work.h | 13 + + include/linux/irqdesc.h | 1 + + include/linux/irqflags.h | 23 +- + include/linux/kernel.h | 26 +- + include/linux/kmsg_dump.h | 52 +- + include/linux/local_lock_internal.h | 111 +- + include/linux/mm_types.h | 4 + + include/linux/mutex.h | 34 +- + include/linux/mutex_rt.h | 130 ++ + include/linux/nfs_xdr.h | 2 +- + include/linux/notifier.h | 6 +- + include/linux/pid.h | 1 + + include/linux/preempt.h | 190 +- + include/linux/printk.h | 30 +- + include/linux/random.h | 2 +- + include/linux/rbtree.h | 27 +- + include/linux/rbtree_type.h | 31 + + include/linux/rcupdate.h | 10 +- + include/linux/rtmutex.h | 46 +- + include/linux/rwlock_rt.h | 109 ++ + include/linux/rwlock_types.h | 4 + + include/linux/rwlock_types_rt.h | 56 + + include/linux/rwsem-rt.h | 70 + + include/linux/rwsem.h | 12 + + include/linux/sched.h | 123 +- + include/linux/sched/hotplug.h | 2 + + include/linux/sched/mm.h | 11 + + include/linux/sched/rt.h | 8 - + include/linux/sched/wake_q.h | 13 +- + include/linux/serial_8250.h | 5 + + include/linux/shmem_fs.h | 2 +- + include/linux/signal.h | 1 + + include/linux/skbuff.h | 7 + + include/linux/smp.h | 3 + + include/linux/spinlock.h | 12 +- + include/linux/spinlock_api_smp.h | 4 +- + include/linux/spinlock_rt.h | 155 ++ + include/linux/spinlock_types.h | 92 +- + include/linux/spinlock_types_nort.h | 39 + + include/linux/spinlock_types_raw.h | 65 + + include/linux/spinlock_types_rt.h | 38 + + include/linux/spinlock_types_up.h | 2 +- + include/linux/stop_machine.h | 5 + + include/linux/thread_info.h | 12 +- + include/linux/trace_events.h | 65 +- + include/linux/u64_stats_sync.h | 42 +- + include/linux/vmstat.h | 4 + + include/linux/vtime.h | 42 +- + include/linux/wait.h | 1 + + include/linux/ww_mutex.h | 8 + + include/net/gen_stats.h | 11 +- + include/net/net_seq_lock.h | 15 + + include/net/netns/xfrm.h | 2 +- + include/net/sch_generic.h | 27 +- + include/trace/events/sched.h | 12 + + init/Kconfig | 7 +- + kernel/Kconfig.locks | 2 +- + kernel/Kconfig.preempt | 7 + + kernel/cgroup/cpuset.c | 82 +- + kernel/cgroup/rstat.c | 5 +- + kernel/cpu.c | 9 +- + kernel/debug/kdb/kdb_main.c | 10 +- + kernel/entry/common.c | 14 +- + kernel/exit.c | 2 +- + kernel/fork.c | 28 +- + kernel/futex.c | 87 +- + kernel/irq/handle.c | 8 +- + kernel/irq/manage.c | 12 +- + kernel/irq/spurious.c | 8 + + kernel/irq_work.c | 136 +- + kernel/kexec_core.c | 1 - + kernel/ksysfs.c | 12 + + kernel/kthread.c | 16 +- + kernel/locking/Makefile | 10 +- + kernel/locking/lockdep.c | 2 + + kernel/locking/mutex-rt.c | 224 +++ + kernel/locking/rtmutex-debug.c | 102 - + kernel/locking/rtmutex-debug.h | 11 - + kernel/locking/rtmutex.c | 941 +++++++-- + kernel/locking/rtmutex.h | 7 - + kernel/locking/rtmutex_common.h | 36 +- + kernel/locking/rwlock-rt.c | 334 ++++ + kernel/locking/rwsem-rt.c | 317 ++++ + kernel/locking/rwsem.c | 6 + + kernel/locking/spinlock.c | 7 + + kernel/locking/spinlock_debug.c | 5 + + kernel/notifier.c | 12 +- + kernel/panic.c | 33 +- + kernel/printk/Makefile | 1 - + kernel/printk/internal.h | 37 - + kernel/printk/printk.c | 1680 +++++++++-------- + kernel/printk/printk_safe.c | 425 ----- + kernel/ptrace.c | 32 +- + kernel/rcu/Kconfig | 4 +- + kernel/rcu/tree.c | 4 +- + kernel/rcu/update.c | 4 +- + kernel/sched/core.c | 1270 ++++++++++--- + kernel/sched/cpudeadline.c | 4 +- + kernel/sched/cpupri.c | 4 +- + kernel/sched/cputime.c 
| 36 +- + kernel/sched/deadline.c | 47 +- + kernel/sched/fair.c | 16 +- + kernel/sched/features.h | 8 + + kernel/sched/rt.c | 81 +- + kernel/sched/sched.h | 80 +- + kernel/sched/swait.c | 1 + + kernel/sched/topology.c | 1 + + kernel/signal.c | 105 +- + kernel/smp.c | 14 +- + kernel/softirq.c | 428 ++++- + kernel/stop_machine.c | 27 +- + kernel/time/hrtimer.c | 30 + + kernel/time/tick-sched.c | 2 +- + kernel/time/timer.c | 9 +- + kernel/trace/trace.c | 93 +- + kernel/trace/trace.h | 19 - + kernel/trace/trace_events.c | 2 + + kernel/trace/trace_output.c | 19 +- + kernel/workqueue.c | 4 + + lib/Kconfig.debug | 2 +- + lib/bug.c | 1 + + lib/cpumask.c | 18 + + lib/debugobjects.c | 5 +- + lib/dump_stack.c | 2 + + lib/irq_poll.c | 5 + + lib/locking-selftest.c | 51 + + lib/nmi_backtrace.c | 6 - + lib/scatterlist.c | 2 +- + lib/smp_processor_id.c | 5 + + lib/test_lockup.c | 16 + + mm/Kconfig | 5 +- + mm/highmem.c | 262 ++- + mm/memcontrol.c | 66 +- + mm/page_alloc.c | 184 +- + mm/shmem.c | 31 +- + mm/slab.c | 90 +- + mm/slab.h | 2 +- + mm/slub.c | 148 +- + mm/vmalloc.c | 13 +- + mm/vmstat.c | 12 + + mm/workingset.c | 5 +- + mm/z3fold.c | 17 +- + mm/zsmalloc.c | 85 +- + mm/zswap.c | 1 + + net/Kconfig | 2 +- + net/core/dev.c | 33 +- + net/core/gen_estimator.c | 6 +- + net/core/gen_stats.c | 12 +- + net/core/sock.c | 6 +- + net/ipv4/inet_hashtables.c | 19 +- + net/ipv6/inet6_hashtables.c | 5 +- + net/sched/sch_api.c | 2 +- + net/sched/sch_generic.c | 10 + + net/sunrpc/svc_xprt.c | 4 +- + net/xfrm/xfrm_state.c | 3 +- + 414 files changed, 9028 insertions(+), 4928 deletions(-) + delete mode 100644 arch/alpha/include/asm/kmap_types.h + delete mode 100644 arch/arc/include/asm/kmap_types.h + delete mode 100644 arch/arm/include/asm/kmap_types.h + delete mode 100644 arch/arm/mm/highmem.c + delete mode 100644 arch/ia64/include/asm/kmap_types.h + delete mode 100644 arch/microblaze/mm/highmem.c + delete mode 100644 arch/mips/include/asm/kmap_types.h + delete mode 100644 arch/nds32/mm/highmem.c + delete mode 100644 arch/parisc/include/asm/kmap_types.h + delete mode 100644 arch/powerpc/include/asm/kmap_types.h + delete mode 100644 arch/powerpc/mm/highmem.c + delete mode 100644 arch/sh/include/asm/kmap_types.h + delete mode 100644 arch/sparc/include/asm/kmap_types.h + delete mode 100644 arch/sparc/mm/highmem.c + delete mode 100644 arch/um/include/asm/kmap_types.h + delete mode 100644 arch/x86/include/asm/kmap_types.h + create mode 100644 include/asm-generic/kmap_size.h + delete mode 100644 include/asm-generic/kmap_types.h + create mode 100644 include/linux/highmem-internal.h + delete mode 100644 include/linux/irq_cpustat.h + create mode 100644 include/linux/mutex_rt.h + create mode 100644 include/linux/rbtree_type.h + create mode 100644 include/linux/rwlock_rt.h + create mode 100644 include/linux/rwlock_types_rt.h + create mode 100644 include/linux/rwsem-rt.h + create mode 100644 include/linux/spinlock_rt.h + create mode 100644 include/linux/spinlock_types_nort.h + create mode 100644 include/linux/spinlock_types_raw.h + create mode 100644 include/linux/spinlock_types_rt.h + create mode 100644 include/net/net_seq_lock.h + create mode 100644 kernel/locking/mutex-rt.c + create mode 100644 kernel/locking/rwlock-rt.c + create mode 100644 kernel/locking/rwsem-rt.c + +diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +index 72f0f6fbd..6f89cf1e5 100644 +--- 
a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst ++++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +@@ -38,7 +38,7 @@ sections. + RCU-preempt Expedited Grace Periods + =================================== + +-``CONFIG_PREEMPT=y`` kernels implement RCU-preempt. ++``CONFIG_PREEMPTION=y`` kernels implement RCU-preempt. + The overall flow of the handling of a given CPU by an RCU-preempt + expedited grace period is shown in the following diagram: + +@@ -112,7 +112,7 @@ things. + RCU-sched Expedited Grace Periods + --------------------------------- + +-``CONFIG_PREEMPT=n`` kernels implement RCU-sched. The overall flow of ++``CONFIG_PREEMPTION=n`` kernels implement RCU-sched. The overall flow of + the handling of a given CPU by an RCU-sched expedited grace period is + shown in the following diagram: + +diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst +index 1ae79a10a..17d38480e 100644 +--- a/Documentation/RCU/Design/Requirements/Requirements.rst ++++ b/Documentation/RCU/Design/Requirements/Requirements.rst +@@ -78,7 +78,7 @@ RCU treats a nested set as one big RCU read-side critical section. + Production-quality implementations of ``rcu_read_lock()`` and + ``rcu_read_unlock()`` are extremely lightweight, and in fact have + exactly zero overhead in Linux kernels built for production use with +-``CONFIG_PREEMPT=n``. ++``CONFIG_PREEMPTION=n``. + + This guarantee allows ordering to be enforced with extremely low + overhead to readers, for example: +@@ -1182,7 +1182,7 @@ and has become decreasingly so as memory sizes have expanded and memory + costs have plummeted. However, as I learned from Matt Mackall's + `bloatwatch `__ efforts, memory + footprint is critically important on single-CPU systems with +-non-preemptible (``CONFIG_PREEMPT=n``) kernels, and thus `tiny ++non-preemptible (``CONFIG_PREEMPTION=n``) kernels, and thus `tiny + RCU `__ + was born. Josh Triplett has since taken over the small-memory banner + with his `Linux kernel tinification `__ +@@ -1498,7 +1498,7 @@ limitations. + + Implementations of RCU for which ``rcu_read_lock()`` and + ``rcu_read_unlock()`` generate no code, such as Linux-kernel RCU when +-``CONFIG_PREEMPT=n``, can be nested arbitrarily deeply. After all, there ++``CONFIG_PREEMPTION=n``, can be nested arbitrarily deeply. After all, there + is no overhead. Except that if all these instances of + ``rcu_read_lock()`` and ``rcu_read_unlock()`` are visible to the + compiler, compilation will eventually fail due to exhausting memory, +@@ -1771,7 +1771,7 @@ implementation can be a no-op. + + However, once the scheduler has spawned its first kthread, this early + boot trick fails for ``synchronize_rcu()`` (as well as for +-``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPT=y`` kernels. The ++``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPTION=y`` kernels. The + reason is that an RCU read-side critical section might be preempted, + which means that a subsequent ``synchronize_rcu()`` really does have to + wait for something, as opposed to simply returning immediately. 
+@@ -2010,7 +2010,7 @@ the following: + 5 rcu_read_unlock(); + 6 do_something_with(v, user_v); + +-If the compiler did make this transformation in a ``CONFIG_PREEMPT=n`` kernel ++If the compiler did make this transformation in a ``CONFIG_PREEMPTION=n`` kernel + build, and if ``get_user()`` did page fault, the result would be a quiescent + state in the middle of an RCU read-side critical section. This misplaced + quiescent state could result in line 4 being a use-after-free access, +@@ -2289,10 +2289,10 @@ decides to throw at it. + + The Linux kernel is used for real-time workloads, especially in + conjunction with the `-rt +-patchset `__. The ++patchset `__. The + real-time-latency response requirements are such that the traditional + approach of disabling preemption across RCU read-side critical sections +-is inappropriate. Kernels built with ``CONFIG_PREEMPT=y`` therefore use ++is inappropriate. Kernels built with ``CONFIG_PREEMPTION=y`` therefore use + an RCU implementation that allows RCU read-side critical sections to be + preempted. This requirement made its presence known after users made it + clear that an earlier `real-time +@@ -2414,7 +2414,7 @@ includes ``rcu_read_lock_bh()``, ``rcu_read_unlock_bh()``, + ``call_rcu_bh()``, ``rcu_barrier_bh()``, and + ``rcu_read_lock_bh_held()``. However, the update-side APIs are now + simple wrappers for other RCU flavors, namely RCU-sched in +-CONFIG_PREEMPT=n kernels and RCU-preempt otherwise. ++CONFIG_PREEMPTION=n kernels and RCU-preempt otherwise. + + Sched Flavor (Historical) + ~~~~~~~~~~~~~~~~~~~~~~~~~ +@@ -2432,11 +2432,11 @@ not have this property, given that any point in the code outside of an + RCU read-side critical section can be a quiescent state. Therefore, + *RCU-sched* was created, which follows “classic” RCU in that an + RCU-sched grace period waits for pre-existing interrupt and NMI +-handlers. In kernels built with ``CONFIG_PREEMPT=n``, the RCU and ++handlers. In kernels built with ``CONFIG_PREEMPTION=n``, the RCU and + RCU-sched APIs have identical implementations, while kernels built with +-``CONFIG_PREEMPT=y`` provide a separate implementation for each. ++``CONFIG_PREEMPTION=y`` provide a separate implementation for each. + +-Note well that in ``CONFIG_PREEMPT=y`` kernels, ++Note well that in ``CONFIG_PREEMPTION=y`` kernels, + ``rcu_read_lock_sched()`` and ``rcu_read_unlock_sched()`` disable and + re-enable preemption, respectively. This means that if there was a + preemption attempt during the RCU-sched read-side critical section, +@@ -2599,10 +2599,10 @@ userspace execution also delimit tasks-RCU read-side critical sections. + + The tasks-RCU API is quite compact, consisting only of + ``call_rcu_tasks()``, ``synchronize_rcu_tasks()``, and +-``rcu_barrier_tasks()``. In ``CONFIG_PREEMPT=n`` kernels, trampolines ++``rcu_barrier_tasks()``. In ``CONFIG_PREEMPTION=n`` kernels, trampolines + cannot be preempted, so these APIs map to ``call_rcu()``, + ``synchronize_rcu()``, and ``rcu_barrier()``, respectively. In +-``CONFIG_PREEMPT=y`` kernels, trampolines can be preempted, and these ++``CONFIG_PREEMPTION=y`` kernels, trampolines can be preempted, and these + three APIs are therefore implemented by separate functions that check + for voluntary context switches. 
+ +diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst +index 2efed9926..7ed495604 100644 +--- a/Documentation/RCU/checklist.rst ++++ b/Documentation/RCU/checklist.rst +@@ -214,7 +214,7 @@ over a rather long period of time, but improvements are always welcome! + the rest of the system. + + 7. As of v4.20, a given kernel implements only one RCU flavor, +- which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y. ++ which is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y. + If the updater uses call_rcu() or synchronize_rcu(), + then the corresponding readers my use rcu_read_lock() and + rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), +diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst +index f64f4413a..3b4a24877 100644 +--- a/Documentation/RCU/rcubarrier.rst ++++ b/Documentation/RCU/rcubarrier.rst +@@ -9,7 +9,7 @@ RCU (read-copy update) is a synchronization mechanism that can be thought + of as a replacement for read-writer locking (among other things), but with + very low-overhead readers that are immune to deadlock, priority inversion, + and unbounded latency. RCU read-side critical sections are delimited +-by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT ++by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPTION + kernels, generate no code whatsoever. + + This means that RCU writers are unaware of the presence of concurrent +@@ -329,10 +329,10 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last + to smp_call_function() and further to smp_call_function_on_cpu(), + causing this latter to spin until the cross-CPU invocation of + rcu_barrier_func() has completed. This by itself would prevent +- a grace period from completing on non-CONFIG_PREEMPT kernels, ++ a grace period from completing on non-CONFIG_PREEMPTION kernels, + since each CPU must undergo a context switch (or other quiescent + state) before the grace period can complete. However, this is +- of no use in CONFIG_PREEMPT kernels. ++ of no use in CONFIG_PREEMPTION kernels. + + Therefore, on_each_cpu() disables preemption across its call + to smp_call_function() and also across the local call to +diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst +index c9ab6af4d..e97d1b487 100644 +--- a/Documentation/RCU/stallwarn.rst ++++ b/Documentation/RCU/stallwarn.rst +@@ -25,7 +25,7 @@ warnings: + + - A CPU looping with bottom halves disabled. + +-- For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel ++- For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel + without invoking schedule(). If the looping in the kernel is + really expected and desirable behavior, you might need to add + some calls to cond_resched(). +@@ -44,7 +44,7 @@ warnings: + result in the ``rcu_.*kthread starved for`` console-log message, + which will include additional debugging information. + +-- A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might ++- A CPU-bound real-time task in a CONFIG_PREEMPTION kernel, which might + happen to preempt a low-priority task in the middle of an RCU + read-side critical section. 
This is especially damaging if + that low-priority task is not permitted to run on any other CPU, +diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst +index fb3ff76c3..3b2b1479f 100644 +--- a/Documentation/RCU/whatisRCU.rst ++++ b/Documentation/RCU/whatisRCU.rst +@@ -684,7 +684,7 @@ Quick Quiz #1: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + This section presents a "toy" RCU implementation that is based on + "classic RCU". It is also short on performance (but only for updates) and +-on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT ++on features such as hotplug CPU and the ability to run in CONFIG_PREEMPTION + kernels. The definitions of rcu_dereference() and rcu_assign_pointer() + are the same as those shown in the preceding section, so they are omitted. + :: +@@ -740,7 +740,7 @@ Quick Quiz #2: + Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in +- PREEMPT_RT, where normal spinlocks can block??? ++ CONFIG_PREEMPT_RT, where normal spinlocks can block??? + + :ref:`Answers to Quick Quiz <8_whatisRCU>` + +@@ -1094,7 +1094,7 @@ Quick Quiz #2: + overhead is **negative**. + + Answer: +- Imagine a single-CPU system with a non-CONFIG_PREEMPT ++ Imagine a single-CPU system with a non-CONFIG_PREEMPTION + kernel where a routing table is used by process-context + code, but can be updated by irq-context code (for example, + by an "ICMP REDIRECT" packet). The usual way of handling +@@ -1121,10 +1121,10 @@ Answer: + Quick Quiz #3: + If it is illegal to block in an RCU read-side + critical section, what the heck do you do in +- PREEMPT_RT, where normal spinlocks can block??? ++ CONFIG_PREEMPT_RT, where normal spinlocks can block??? + + Answer: +- Just as PREEMPT_RT permits preemption of spinlock ++ Just as CONFIG_PREEMPT_RT permits preemption of spinlock + critical sections, it permits preemption of RCU + read-side critical sections. It also permits + spinlocks blocking while in RCU read-side critical +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 98199d3ae..34a611303 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4268,6 +4268,10 @@ + value, meaning that RCU_SOFTIRQ is used by default. + Specify rcutree.use_softirq=0 to use rcuc kthreads. + ++ But note that CONFIG_PREEMPT_RT=y kernels disable ++ this kernel boot parameter, forcibly setting it ++ to zero. ++ + rcutree.rcu_fanout_exact= [KNL] + Disable autobalancing of the rcu_node combining + tree. This is used by rcutorture, and might +@@ -4646,6 +4650,13 @@ + only normal grace-period primitives. No effect + on CONFIG_TINY_RCU kernels. + ++ But note that CONFIG_PREEMPT_RT=y kernels enables ++ this kernel boot parameter, forcibly setting ++ it to the value one, that is, converting any ++ post-boot attempt at an expedited RCU grace ++ period to instead use normal non-expedited ++ grace-period processing. 
++ + rcupdate.rcu_task_ipi_delay= [KNL] + Set time in jiffies during which RCU tasks will + avoid sending IPIs, starting with the beginning +diff --git a/Documentation/driver-api/io-mapping.rst b/Documentation/driver-api/io-mapping.rst +index a966239f0..a7830c594 100644 +--- a/Documentation/driver-api/io-mapping.rst ++++ b/Documentation/driver-api/io-mapping.rst +@@ -20,78 +20,64 @@ A mapping object is created during driver initialization using:: + mappable, while 'size' indicates how large a mapping region to + enable. Both are in bytes. + +-This _wc variant provides a mapping which may only be used +-with the io_mapping_map_atomic_wc or io_mapping_map_wc. ++This _wc variant provides a mapping which may only be used with ++io_mapping_map_local_wc() or io_mapping_map_wc(). + +-With this mapping object, individual pages can be mapped either atomically +-or not, depending on the necessary scheduling environment. Of course, atomic +-maps are more efficient:: ++With this mapping object, individual pages can be mapped either temporarily ++or long term, depending on the requirements. Of course, temporary maps are ++more efficient. + +- void *io_mapping_map_atomic_wc(struct io_mapping *mapping, +- unsigned long offset) ++ void *io_mapping_map_local_wc(struct io_mapping *mapping, ++ unsigned long offset) + +-'offset' is the offset within the defined mapping region. +-Accessing addresses beyond the region specified in the +-creation function yields undefined results. Using an offset +-which is not page aligned yields an undefined result. The +-return value points to a single page in CPU address space. ++'offset' is the offset within the defined mapping region. Accessing ++addresses beyond the region specified in the creation function yields ++undefined results. Using an offset which is not page aligned yields an ++undefined result. The return value points to a single page in CPU address ++space. + +-This _wc variant returns a write-combining map to the +-page and may only be used with mappings created by +-io_mapping_create_wc ++This _wc variant returns a write-combining map to the page and may only be ++used with mappings created by io_mapping_create_wc() + +-Note that the task may not sleep while holding this page +-mapped. ++Temporary mappings are only valid in the context of the caller. The mapping ++is not guaranteed to be globaly visible. + +-:: ++io_mapping_map_local_wc() has a side effect on X86 32bit as it disables ++migration to make the mapping code work. No caller can rely on this side ++effect. + +- void io_mapping_unmap_atomic(void *vaddr) ++Nested mappings need to be undone in reverse order because the mapping ++code uses a stack for keeping track of them:: + +-'vaddr' must be the value returned by the last +-io_mapping_map_atomic_wc call. This unmaps the specified +-page and allows the task to sleep once again. ++ addr1 = io_mapping_map_local_wc(map1, offset1); ++ addr2 = io_mapping_map_local_wc(map2, offset2); ++ ... ++ io_mapping_unmap_local(addr2); ++ io_mapping_unmap_local(addr1); + +-If you need to sleep while holding the lock, you can use the non-atomic +-variant, although they may be significantly slower. ++The mappings are released with:: + +-:: ++ void io_mapping_unmap_local(void *vaddr) ++ ++'vaddr' must be the value returned by the last io_mapping_map_local_wc() ++call. This unmaps the specified mapping and undoes eventual side effects of ++the mapping function. 
++ ++If you need to sleep while holding a mapping, you can use the regular ++variant, although this may be significantly slower:: + + void *io_mapping_map_wc(struct io_mapping *mapping, + unsigned long offset) + +-This works like io_mapping_map_atomic_wc except it allows +-the task to sleep while holding the page mapped. ++This works like io_mapping_map_local_wc() except it has no side effects and ++the pointer is globaly visible. + +- +-:: ++The mappings are released with:: + + void io_mapping_unmap(void *vaddr) + +-This works like io_mapping_unmap_atomic, except it is used +-for pages mapped with io_mapping_map_wc. ++Use for pages mapped with io_mapping_map_wc(). + + At driver close time, the io_mapping object must be freed:: + + void io_mapping_free(struct io_mapping *mapping) +- +-Current Implementation +-====================== +- +-The initial implementation of these functions uses existing mapping +-mechanisms and so provides only an abstraction layer and no new +-functionality. +- +-On 64-bit processors, io_mapping_create_wc calls ioremap_wc for the whole +-range, creating a permanent kernel-visible mapping to the resource. The +-map_atomic and map functions add the requested offset to the base of the +-virtual address returned by ioremap_wc. +- +-On 32-bit processors with HIGHMEM defined, io_mapping_map_atomic_wc uses +-kmap_atomic_pfn to map the specified page in an atomic fashion; +-kmap_atomic_pfn isn't really supposed to be used with device pages, but it +-provides an efficient mapping for this usage. +- +-On 32-bit processors without HIGHMEM defined, io_mapping_map_atomic_wc and +-io_mapping_map_wc both use ioremap_wc, a terribly inefficient function which +-performs an IPI to inform all processors about the new mapping. This results +-in a significant performance penalty. +diff --git a/arch/Kconfig b/arch/Kconfig +index 7a8e3d45b..4dbc4c659 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -50,6 +50,7 @@ config OPROFILE + tristate "OProfile system profiling" + depends on PROFILING + depends on HAVE_OPROFILE ++ depends on !PREEMPT_RT + select RING_BUFFER + select RING_BUFFER_ALLOW_SWAP + help +@@ -673,6 +674,12 @@ config HAVE_TIF_NOHZ + config HAVE_VIRT_CPU_ACCOUNTING + bool + ++config HAVE_VIRT_CPU_ACCOUNTING_IDLE ++ bool ++ help ++ Architecture has its own way to account idle CPU time and therefore ++ doesn't implement vtime_account_idle(). ++ + config ARCH_HAS_SCALED_CPUTIME + bool + +@@ -687,7 +694,6 @@ config HAVE_VIRT_CPU_ACCOUNTING_GEN + some 32-bit arches may require multiple accesses, so proper + locking is needed to protect against concurrent accesses. + +- + config HAVE_IRQ_TIME_ACCOUNTING + bool + help +diff --git a/arch/alpha/include/asm/kmap_types.h b/arch/alpha/include/asm/kmap_types.h +deleted file mode 100644 +index 651714b45..000000000 +--- a/arch/alpha/include/asm/kmap_types.h ++++ /dev/null +@@ -1,15 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-/* Dummy header just to define km_type. 
*/ +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include +- +-#undef __WITH_KM_FENCE +- +-#endif +diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h +index 1d5716bc0..6883bc952 100644 +--- a/arch/alpha/include/asm/spinlock_types.h ++++ b/arch/alpha/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef _ALPHA_SPINLOCK_TYPES_H + #define _ALPHA_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + volatile unsigned int lock; + } arch_spinlock_t; +diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig +index 0a89cc9de..d8804001d 100644 +--- a/arch/arc/Kconfig ++++ b/arch/arc/Kconfig +@@ -507,6 +507,7 @@ config LINUX_RAM_BASE + config HIGHMEM + bool "High Memory Support" + select ARCH_DISCONTIGMEM_ENABLE ++ select KMAP_LOCAL + help + With ARC 2G:2G address split, only upper 2G is directly addressable by + kernel. Enable this to potentially allow access to rest of 2G and PAE +diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h +index 6e5eafb3a..a6b8e2c35 100644 +--- a/arch/arc/include/asm/highmem.h ++++ b/arch/arc/include/asm/highmem.h +@@ -9,17 +9,29 @@ + #ifdef CONFIG_HIGHMEM + + #include +-#include ++#include ++ ++#define FIXMAP_SIZE PGDIR_SIZE ++#define PKMAP_SIZE PGDIR_SIZE + + /* start after vmalloc area */ + #define FIXMAP_BASE (PAGE_OFFSET - FIXMAP_SIZE - PKMAP_SIZE) +-#define FIXMAP_SIZE PGDIR_SIZE /* only 1 PGD worth */ +-#define KM_TYPE_NR ((FIXMAP_SIZE >> PAGE_SHIFT)/NR_CPUS) +-#define FIXMAP_ADDR(nr) (FIXMAP_BASE + ((nr) << PAGE_SHIFT)) ++ ++#define FIX_KMAP_SLOTS (KM_MAX_IDX * NR_CPUS) ++#define FIX_KMAP_BEGIN (0UL) ++#define FIX_KMAP_END ((FIX_KMAP_BEGIN + FIX_KMAP_SLOTS) - 1) ++ ++#define FIXADDR_TOP (FIXMAP_BASE + (FIX_KMAP_END << PAGE_SHIFT)) ++ ++/* ++ * This should be converted to the asm-generic version, but of course this ++ * is needlessly different from all other architectures. Sigh - tglx ++ */ ++#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) ++#define __virt_to_fix(x) (((FIXADDR_TOP - ((x) & PAGE_MASK))) >> PAGE_SHIFT) + + /* start after fixmap area */ + #define PKMAP_BASE (FIXMAP_BASE + FIXMAP_SIZE) +-#define PKMAP_SIZE PGDIR_SIZE + #define LAST_PKMAP (PKMAP_SIZE >> PAGE_SHIFT) + #define LAST_PKMAP_MASK (LAST_PKMAP - 1) + #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) +@@ -29,11 +41,13 @@ + + extern void kmap_init(void); + ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE) ++ + static inline void flush_cache_kmaps(void) + { + flush_cache_all(); + } +- + #endif + + #endif +diff --git a/arch/arc/include/asm/kmap_types.h b/arch/arc/include/asm/kmap_types.h +deleted file mode 100644 +index fecf7851e..000000000 +--- a/arch/arc/include/asm/kmap_types.h ++++ /dev/null +@@ -1,14 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-/* +- * Copyright (C) 2015 Synopsys, Inc. (www.synopsys.com) +- */ +- +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-/* +- * We primarily need to define KM_TYPE_NR here but that in turn +- * is a function of PGDIR_SIZE etc. 
+- * To avoid circular deps issue, put everything in asm/highmem.h +- */ +-#endif +diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c +index 1b9f473c6..c79912a6b 100644 +--- a/arch/arc/mm/highmem.c ++++ b/arch/arc/mm/highmem.c +@@ -36,9 +36,8 @@ + * This means each only has 1 PGDIR_SIZE worth of kvaddr mappings, which means + * 2M of kvaddr space for typical config (8K page and 11:8:13 traversal split) + * +- * - fixmap anyhow needs a limited number of mappings. So 2M kvaddr == 256 PTE +- * slots across NR_CPUS would be more than sufficient (generic code defines +- * KM_TYPE_NR as 20). ++ * - The fixed KMAP slots for kmap_local/atomic() require KM_MAX_IDX slots per ++ * CPU. So the number of CPUs sharing a single PTE page is limited. + * + * - pkmap being preemptible, in theory could do with more than 256 concurrent + * mappings. However, generic pkmap code: map_new_virtual(), doesn't traverse +@@ -47,48 +46,6 @@ + */ + + extern pte_t * pkmap_page_table; +-static pte_t * fixmap_page_table; +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- int idx, cpu_idx; +- unsigned long vaddr; +- +- cpu_idx = kmap_atomic_idx_push(); +- idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); +- vaddr = FIXMAP_ADDR(idx); +- +- set_pte_at(&init_mm, vaddr, fixmap_page_table + idx, +- mk_pte(page, prot)); +- +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kv) +-{ +- unsigned long kvaddr = (unsigned long)kv; +- +- if (kvaddr >= FIXMAP_BASE && kvaddr < (FIXMAP_BASE + FIXMAP_SIZE)) { +- +- /* +- * Because preemption is disabled, this vaddr can be associated +- * with the current allocated index. +- * But in case of multiple live kmap_atomic(), it still relies on +- * callers to unmap in right order. 
+- */ +- int cpu_idx = kmap_atomic_idx(); +- int idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); +- +- WARN_ON(kvaddr != FIXMAP_ADDR(idx)); +- +- pte_clear(&init_mm, kvaddr, fixmap_page_table + idx); +- local_flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE); +- +- kmap_atomic_idx_pop(); +- } +-} +-EXPORT_SYMBOL(kunmap_atomic_high); + + static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) + { +@@ -108,10 +65,9 @@ void __init kmap_init(void) + { + /* Due to recursive include hell, we can't do this in processor.h */ + BUILD_BUG_ON(PAGE_OFFSET < (VMALLOC_END + FIXMAP_SIZE + PKMAP_SIZE)); ++ BUILD_BUG_ON(LAST_PKMAP > PTRS_PER_PTE); ++ BUILD_BUG_ON(FIX_KMAP_SLOTS > PTRS_PER_PTE); + +- BUILD_BUG_ON(KM_TYPE_NR > PTRS_PER_PTE); + pkmap_page_table = alloc_kmap_pgtable(PKMAP_BASE); +- +- BUILD_BUG_ON(LAST_PKMAP > PTRS_PER_PTE); +- fixmap_page_table = alloc_kmap_pgtable(FIXMAP_BASE); ++ alloc_kmap_pgtable(FIXMAP_BASE); + } +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 29a634b7d..77c96bea6 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -31,6 +31,7 @@ config ARM + select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 + select ARCH_SUPPORTS_ATOMIC_RMW ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU +@@ -66,7 +67,7 @@ config ARM + select HARDIRQS_SW_RESEND + select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT + select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 +- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU ++ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT + select HAVE_ARCH_KFENCE if MMU + select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL +@@ -108,6 +109,7 @@ config ARM + select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RSEQ +@@ -123,6 +125,7 @@ config ARM + select OLD_SIGSUSPEND3 + select PCI_SYSCALL if PCI + select PERF_USE_VMALLOC ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select RTC_LIB + select SET_FS + select SYS_SUPPORTS_APM_EMULATION +@@ -1512,6 +1515,7 @@ config HAVE_ARCH_PFN_VALID + config HIGHMEM + bool "High Memory Support" + depends on MMU ++ select KMAP_LOCAL + help + The address space of ARM processors is only 4 Gigabytes large + and it has to accommodate user address space, kernel address +diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h +index 9575b4040..707068f85 100644 +--- a/arch/arm/include/asm/fixmap.h ++++ b/arch/arm/include/asm/fixmap.h +@@ -7,14 +7,14 @@ + #define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE) + + #include +-#include ++#include + + enum fixed_addresses { + FIX_EARLYCON_MEM_BASE, + __end_of_permanent_fixed_addresses, + + FIX_KMAP_BEGIN = __end_of_permanent_fixed_addresses, +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + + /* Support writing RO kernel text via kprobes, jump labels, etc. 
*/ + FIX_TEXT_POKE0, +diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h +index b95848ed2..706efafbf 100644 +--- a/arch/arm/include/asm/hardirq.h ++++ b/arch/arm/include/asm/hardirq.h +@@ -2,16 +2,11 @@ + #ifndef __ASM_HARDIRQ_H + #define __ASM_HARDIRQ_H + +-#include +-#include + #include + +-typedef struct { +- unsigned int __softirq_pending; +-} ____cacheline_aligned irq_cpustat_t; +- +-#include /* Standard mappings for irq_cpustat_t above */ +- + #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 ++#define ack_bad_irq ack_bad_irq ++ ++#include + + #endif /* __ASM_HARDIRQ_H */ +diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h +index 31811be38..b22dffa8c 100644 +--- a/arch/arm/include/asm/highmem.h ++++ b/arch/arm/include/asm/highmem.h +@@ -2,7 +2,8 @@ + #ifndef _ASM_HIGHMEM_H + #define _ASM_HIGHMEM_H + +-#include ++#include ++#include + + #define PKMAP_BASE (PAGE_OFFSET - PMD_SIZE) + #define LAST_PKMAP PTRS_PER_PTE +@@ -46,19 +47,32 @@ extern pte_t *pkmap_page_table; + + #ifdef ARCH_NEEDS_KMAP_HIGH_GET + extern void *kmap_high_get(struct page *page); +-#else ++ ++static inline void *arch_kmap_local_high_get(struct page *page) ++{ ++ if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !cache_is_vivt()) ++ return NULL; ++ return kmap_high_get(page); ++} ++#define arch_kmap_local_high_get arch_kmap_local_high_get ++ ++#else /* ARCH_NEEDS_KMAP_HIGH_GET */ + static inline void *kmap_high_get(struct page *page) + { + return NULL; + } +-#endif ++#endif /* !ARCH_NEEDS_KMAP_HIGH_GET */ + +-/* +- * The following functions are already defined by +- * when CONFIG_HIGHMEM is not set. +- */ +-#ifdef CONFIG_HIGHMEM +-extern void *kmap_atomic_pfn(unsigned long pfn); +-#endif ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ local_flush_tlb_kernel_page(vaddr) ++ ++#define arch_kmap_local_pre_unmap(vaddr) \ ++do { \ ++ if (cache_is_vivt()) \ ++ __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); \ ++} while (0) ++ ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_kernel_page(vaddr) + + #endif +diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h +index 54b0180c8..36d040c68 100644 +--- a/arch/arm/include/asm/irq.h ++++ b/arch/arm/include/asm/irq.h +@@ -31,6 +31,8 @@ void handle_IRQ(unsigned int, struct pt_regs *); + void init_IRQ(void); + + #ifdef CONFIG_SMP ++#include ++ + extern bool arch_trigger_cpumask_backtrace(const cpumask_t *mask, + bool exclude_self); + #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace +diff --git a/arch/arm/include/asm/kmap_types.h b/arch/arm/include/asm/kmap_types.h +deleted file mode 100644 +index 5590940ee..000000000 +--- a/arch/arm/include/asm/kmap_types.h ++++ /dev/null +@@ -1,10 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef __ARM_KMAP_TYPES_H +-#define __ARM_KMAP_TYPES_H +- +-/* +- * This is the "bare minimum". AIO seems to require this. 
+- */ +-#define KM_TYPE_NR 16 +- +-#endif +diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h +index 597695864..a37c08039 100644 +--- a/arch/arm/include/asm/spinlock_types.h ++++ b/arch/arm/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + #define TICKET_SHIFT 16 + + typedef struct { +diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h +index 9f7ca79cc..0bf67b7f0 100644 +--- a/arch/arm/include/asm/thread_info.h ++++ b/arch/arm/include/asm/thread_info.h +@@ -55,6 +55,7 @@ struct cpu_context_save { + struct thread_info { + unsigned long flags; /* low level flags */ + int preempt_count; /* 0 => preemptable, <0 => bug */ ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + mm_segment_t addr_limit; /* address limit */ + struct task_struct *task; /* main task structure */ + __u32 cpu; /* cpu */ +@@ -145,8 +146,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define TIF_SYSCALL_TRACE 4 /* syscall trace active */ + #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ + #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ +-#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ +-#define TIF_PATCH_PENDING 8 /* pending live patching update */ ++#define TIF_NEED_RESCHED_LAZY 7 ++#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ ++#define TIF_PATCH_PENDING 9 /* pending live patching update */ + + #define TIF_USING_IWMMXT 17 + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ +@@ -155,6 +157,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) + #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) + #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_UPROBE (1 << TIF_UPROBE) + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) + #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +@@ -171,7 +174,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + * Change these and you break ASM code in entry-common.S + */ + #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +- _TIF_NOTIFY_RESUME | _TIF_UPROBE) ++ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ ++ _TIF_NEED_RESCHED_LAZY) + + #endif /* __KERNEL__ */ + #endif /* __ASM_ARM_THREAD_INFO_H */ +diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c +index 70993af22..024c65c3a 100644 +--- a/arch/arm/kernel/asm-offsets.c ++++ b/arch/arm/kernel/asm-offsets.c +@@ -43,6 +43,7 @@ int main(void) + BLANK(); + DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); + DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); ++ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); + DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); + DEFINE(TI_TASK, offsetof(struct thread_info, task)); + DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); +diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S +index d74678d95..875f8ed46 100644 +--- a/arch/arm/kernel/entry-armv.S ++++ b/arch/arm/kernel/entry-armv.S +@@ -207,11 +207,18 @@ __irq_svc: + + #ifdef CONFIG_PREEMPTION + ldr r8, [tsk, #TI_PREEMPT] @ get preempt count +- ldr r0, [tsk, #TI_FLAGS] @ get flags + teq r8, #0 @ if preempt count != 0 ++ bne 1f @ return from exeption ++ ldr r0, [tsk, 
#TI_FLAGS] @ get flags ++ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set ++ blne svc_preempt @ preempt! ++ ++ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count ++ teq r8, #0 @ if preempt lazy count != 0 + movne r0, #0 @ force flags to 0 +- tst r0, #_TIF_NEED_RESCHED ++ tst r0, #_TIF_NEED_RESCHED_LAZY + blne svc_preempt ++1: + #endif + + svc_exit r5, irq = 1 @ return from exception +@@ -226,8 +233,14 @@ svc_preempt: + 1: bl preempt_schedule_irq @ irq en/disable is done inside + ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS + tst r0, #_TIF_NEED_RESCHED ++ bne 1b ++ tst r0, #_TIF_NEED_RESCHED_LAZY + reteq r8 @ go again +- b 1b ++ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count ++ teq r0, #0 @ if preempt lazy count != 0 ++ beq 1b ++ ret r8 @ go again ++ + #endif + + __und_fault: +diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S +index 7a2e63dfb..02fae4a70 100644 +--- a/arch/arm/kernel/entry-common.S ++++ b/arch/arm/kernel/entry-common.S +@@ -54,7 +54,9 @@ __ret_fast_syscall: + cmp r2, r1 + blne addr_limit_check_failed + ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing +- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK ++ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) ++ bne fast_work_pending ++ tst r1, #_TIF_SECCOMP + bne fast_work_pending + + +@@ -92,8 +94,11 @@ __ret_fast_syscall: + cmp r2, r1 + blne addr_limit_check_failed + ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing +- tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK ++ tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) ++ bne do_slower_path ++ tst r1, #_TIF_SECCOMP + beq no_work_pending ++do_slower_path: + UNWIND(.fnend ) + ENDPROC(ret_fast_syscall) + +diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c +index 2f81d3af5..6e69f7b3d 100644 +--- a/arch/arm/kernel/signal.c ++++ b/arch/arm/kernel/signal.c +@@ -649,7 +649,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) + */ + trace_hardirqs_off(); + do { +- if (likely(thread_flags & _TIF_NEED_RESCHED)) { ++ if (likely(thread_flags & (_TIF_NEED_RESCHED | ++ _TIF_NEED_RESCHED_LAZY))) { + schedule(); + } else { + if (unlikely(!user_mode(regs))) +diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c +index d94e39a21..44383bfbf 100644 +--- a/arch/arm/kernel/smp.c ++++ b/arch/arm/kernel/smp.c +@@ -671,9 +671,7 @@ static void do_handle_IPI(int ipinr) + break; + + case IPI_CPU_BACKTRACE: +- printk_nmi_enter(); + nmi_cpu_backtrace(get_irq_regs()); +- printk_nmi_exit(); + break; + + default: +diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile +index 4536159bc..3510503bc 100644 +--- a/arch/arm/mm/Makefile ++++ b/arch/arm/mm/Makefile +@@ -21,7 +21,6 @@ KASAN_SANITIZE_physaddr.o := n + obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o + + obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o +-obj-$(CONFIG_HIGHMEM) += highmem.o + obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + obj-$(CONFIG_ARM_PV_FIXUP) += pv-fixup-asm.o + +diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c +index 10f909744..fd9e3e740 100644 +--- a/arch/arm/mm/cache-feroceon-l2.c ++++ b/arch/arm/mm/cache-feroceon-l2.c +@@ -49,9 +49,9 @@ static inline unsigned long l2_get_va(unsigned long paddr) + * we simply install a virtual mapping for it only for the + * TLB lookup to occur, hence no need to flush the untouched + * memory mapping afterwards (note: a cache flush may happen +- * in some circumstances depending on the path taken in kunmap_atomic). 
++ * in some circumstances depending on the path taken in kunmap_local). + */ +- void *vaddr = kmap_atomic_pfn(paddr >> PAGE_SHIFT); ++ void *vaddr = kmap_local_pfn(paddr >> PAGE_SHIFT); + return (unsigned long)vaddr + (paddr & ~PAGE_MASK); + #else + return __phys_to_virt(paddr); +@@ -61,7 +61,7 @@ static inline unsigned long l2_get_va(unsigned long paddr) + static inline void l2_put_va(unsigned long vaddr) + { + #ifdef CONFIG_HIGHMEM +- kunmap_atomic((void *)vaddr); ++ kunmap_local((void *)vaddr); + #endif + } + +diff --git a/arch/arm/mm/cache-xsc3l2.c b/arch/arm/mm/cache-xsc3l2.c +index 581473165..f34845da3 100644 +--- a/arch/arm/mm/cache-xsc3l2.c ++++ b/arch/arm/mm/cache-xsc3l2.c +@@ -59,7 +59,7 @@ static inline void l2_unmap_va(unsigned long va) + { + #ifdef CONFIG_HIGHMEM + if (va != -1) +- kunmap_atomic((void *)va); ++ kunmap_local((void *)va); + #endif + } + +@@ -75,7 +75,7 @@ static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va) + * in place for it. + */ + l2_unmap_va(prev_va); +- va = (unsigned long)kmap_atomic_pfn(pa >> PAGE_SHIFT); ++ va = (unsigned long)kmap_local_pfn(pa >> PAGE_SHIFT); + } + return va + (pa_offset >> (32 - PAGE_SHIFT)); + #else +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index 91965fb04..d34166682 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + if (addr < TASK_SIZE) + return do_page_fault(addr, fsr, regs); + ++ if (interrupts_enabled(regs)) ++ local_irq_enable(); ++ + if (user_mode(regs)) + goto bad_area; + +diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c +deleted file mode 100644 +index 187fab227..000000000 +--- a/arch/arm/mm/highmem.c ++++ /dev/null +@@ -1,121 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-only +-/* +- * arch/arm/mm/highmem.c -- ARM highmem support +- * +- * Author: Nicolas Pitre +- * Created: september 8, 2008 +- * Copyright: Marvell Semiconductors Inc. +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include "mm.h" +- +-static inline void set_fixmap_pte(int idx, pte_t pte) +-{ +- unsigned long vaddr = __fix_to_virt(idx); +- pte_t *ptep = virt_to_kpte(vaddr); +- +- set_pte_ext(ptep, pte, 0); +- local_flush_tlb_kernel_page(vaddr); +-} +- +-static inline pte_t get_fixmap_pte(unsigned long vaddr) +-{ +- pte_t *ptep = virt_to_kpte(vaddr); +- +- return *ptep; +-} +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned int idx; +- unsigned long vaddr; +- void *kmap; +- int type; +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- /* +- * There is no cache coherency issue when non VIVT, so force the +- * dedicated kmap usage for better debugging purposes in that case. +- */ +- if (!cache_is_vivt()) +- kmap = NULL; +- else +-#endif +- kmap = kmap_high_get(page); +- if (kmap) +- return kmap; +- +- type = kmap_atomic_idx_push(); +- +- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); +- vaddr = __fix_to_virt(idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- /* +- * With debugging enabled, kunmap_atomic forces that entry to 0. +- * Make sure it was indeed properly unmapped. +- */ +- BUG_ON(!pte_none(get_fixmap_pte(vaddr))); +-#endif +- /* +- * When debugging is off, kunmap_atomic leaves the previous mapping +- * in place, so the contained TLB flush ensures the TLB is updated +- * with the new mapping. 
+- */ +- set_fixmap_pte(idx, mk_pte(page, prot)); +- +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int idx, type; +- +- if (kvaddr >= (void *)FIXADDR_START) { +- type = kmap_atomic_idx(); +- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); +- +- if (cache_is_vivt()) +- __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(vaddr != __fix_to_virt(idx)); +- set_fixmap_pte(idx, __pte(0)); +-#else +- (void) idx; /* to kill a warning */ +-#endif +- kmap_atomic_idx_pop(); +- } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { +- /* this address was obtained through kmap_high_get() */ +- kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); +- } +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +- +-void *kmap_atomic_pfn(unsigned long pfn) +-{ +- unsigned long vaddr; +- int idx, type; +- struct page *page = pfn_to_page(pfn); +- +- preempt_disable(); +- pagefault_disable(); +- if (!PageHighMem(page)) +- return page_address(page); +- +- type = kmap_atomic_idx_push(); +- idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); +- vaddr = __fix_to_virt(idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(get_fixmap_pte(vaddr))); +-#endif +- set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); +- +- return (void *)vaddr; +-} +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index e253fdba1..7dc4e9079 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -76,6 +76,7 @@ config ARM64 + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG) + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +@@ -178,6 +179,7 @@ config ARM64 + select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_FUNCTION_ARG_ACCESS_API + select HAVE_FUTEX_CMPXCHG if FUTEX +@@ -200,6 +202,7 @@ config ARM64 + select PCI_DOMAINS_GENERIC if PCI + select PCI_ECAM if (ACPI && PCI) + select PCI_SYSCALL if PCI ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select POWER_RESET + select POWER_SUPPLY + select SPARSE_IRQ +diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h +index 5ffa4bacd..cbfa7b6f2 100644 +--- a/arch/arm64/include/asm/hardirq.h ++++ b/arch/arm64/include/asm/hardirq.h +@@ -13,11 +13,8 @@ + #include + #include + +-typedef struct { +- unsigned int __softirq_pending; +-} ____cacheline_aligned irq_cpustat_t; +- +-#include /* Standard mappings for irq_cpustat_t above */ ++#define ack_bad_irq ack_bad_irq ++#include + + #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 + +diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h +index e83f0982b..7a5770d82 100644 +--- a/arch/arm64/include/asm/preempt.h ++++ b/arch/arm64/include/asm/preempt.h +@@ -70,17 +70,43 @@ static inline bool __preempt_count_dec_and_test(void) + * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE + * pair. 
+ */ +- return !pc || !READ_ONCE(ti->preempt_count); ++ if (!pc || !READ_ONCE(ti->preempt_count)) ++ return true; ++#ifdef CONFIG_PREEMPT_LAZY ++ if ((pc & ~PREEMPT_NEED_RESCHED)) ++ return false; ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else ++ return false; ++#endif + } + + static inline bool should_resched(int preempt_offset) + { ++#ifdef CONFIG_PREEMPT_LAZY ++ u64 pc = READ_ONCE(current_thread_info()->preempt_count); ++ if (pc == preempt_offset) ++ return true; ++ ++ if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset) ++ return false; ++ ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else + u64 pc = READ_ONCE(current_thread_info()->preempt_count); + return pc == preempt_offset; ++#endif + } + + #ifdef CONFIG_PREEMPTION + void preempt_schedule(void); ++#ifdef CONFIG_PREEMPT_RT ++void preempt_schedule_lock(void); ++#endif + #define __preempt_schedule() preempt_schedule() + void preempt_schedule_notrace(void); + #define __preempt_schedule_notrace() preempt_schedule_notrace() +diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h +index 18782f0c4..6672b0535 100644 +--- a/arch/arm64/include/asm/spinlock_types.h ++++ b/arch/arm64/include/asm/spinlock_types.h +@@ -5,10 +5,6 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +-# error "please don't include this file directly" +-#endif +- + #include + #include + +diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h +index af49b6190..f8ba7a6ec 100644 +--- a/arch/arm64/include/asm/thread_info.h ++++ b/arch/arm64/include/asm/thread_info.h +@@ -27,6 +27,7 @@ struct thread_info { + #ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ + #endif ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + union { + u64 preempt_count; /* 0 => preemptible, <0 => bug */ + struct { +@@ -69,6 +70,7 @@ void arch_release_task_struct(struct task_struct *tsk); + #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ + #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ + #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ ++#define TIF_NEED_RESCHED_LAZY 6 + #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ + #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ + #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ +@@ -101,14 +103,17 @@ void arch_release_task_struct(struct task_struct *tsk); + #define _TIF_32BIT (1 << TIF_32BIT) + #define _TIF_SVE (1 << TIF_SVE) + #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_32BIT_AARCH64 (1 << TIF_32BIT_AARCH64) + #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) + #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) + + #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ +- _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT) ++ _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ ++ _TIF_NEED_RESCHED_LAZY) + ++#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ + _TIF_SYSCALL_EMU) +diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c +index 
5f59e24c9..d8d41b5f9 100644 +--- a/arch/arm64/kernel/asm-offsets.c ++++ b/arch/arm64/kernel/asm-offsets.c +@@ -31,6 +31,7 @@ int main(void) + DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); ++ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); + #ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); + #endif +diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S +index 64145bfab..70abdfd6f 100644 +--- a/arch/arm64/kernel/entry.S ++++ b/arch/arm64/kernel/entry.S +@@ -521,9 +521,18 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING + orr x24, x24, x0 + alternative_else_nop_endif + #endif +- cbnz x24, 1f // preempt count != 0 || NMI return path +- bl arm64_preempt_schedule_irq // irq en/disable is done inside ++ ++ cbz x24, 1f // (need_resched + count) == 0 ++ cbnz w24, 2f // count != 0 ++ ++ ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count ++ cbnz w24, 2f // preempt lazy count != 0 ++ ++ ldr x0, [tsk, #TSK_TI_FLAGS] // get flags ++ tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling? + 1: ++ bl arm64_preempt_schedule_irq // irq en/disable is done inside ++2: + #endif + + mov x0, sp +diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c +index 5335a6bd1..aa631771e 100644 +--- a/arch/arm64/kernel/fpsimd.c ++++ b/arch/arm64/kernel/fpsimd.c +@@ -180,7 +180,7 @@ static void __get_cpu_fpsimd_context(void) + */ + static void get_cpu_fpsimd_context(void) + { +- local_bh_disable(); ++ preempt_disable(); + __get_cpu_fpsimd_context(); + } + +@@ -201,7 +201,7 @@ static void __put_cpu_fpsimd_context(void) + static void put_cpu_fpsimd_context(void) + { + __put_cpu_fpsimd_context(); +- local_bh_enable(); ++ preempt_enable(); + } + + static bool have_cpu_fpsimd_context(void) +@@ -226,6 +226,16 @@ static void sve_free(struct task_struct *task) + __sve_free(task); + } + ++static void *sve_free_atomic(struct task_struct *task) ++{ ++ void *sve_state = task->thread.sve_state; ++ ++ WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); ++ ++ task->thread.sve_state = NULL; ++ return sve_state; ++} ++ + /* + * TIF_SVE controls whether a task can use SVE without trapping while + * in userspace, and also the way a task's FPSIMD/SVE state is stored +@@ -1022,6 +1032,7 @@ void fpsimd_thread_switch(struct task_struct *next) + void fpsimd_flush_thread(void) + { + int vl, supported_vl; ++ void *mem = NULL; + + if (!system_supports_fpsimd()) + return; +@@ -1034,7 +1045,7 @@ void fpsimd_flush_thread(void) + + if (system_supports_sve()) { + clear_thread_flag(TIF_SVE); +- sve_free(current); ++ mem = sve_free_atomic(current); + + /* + * Reset the task vector length as required. 
+@@ -1068,6 +1079,7 @@ void fpsimd_flush_thread(void) + } + + put_cpu_fpsimd_context(); ++ kfree(mem); + } + + /* +diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c +index 2cf28e511..fc58fada5 100644 +--- a/arch/arm64/kernel/ipi_nmi.c ++++ b/arch/arm64/kernel/ipi_nmi.c +@@ -35,9 +35,7 @@ void arm64_send_nmi(cpumask_t *mask) + + static void ipi_cpu_backtrace(void *info) + { +- printk_safe_enter(); + nmi_cpu_backtrace(get_irq_regs()); +- printk_safe_exit(); + } + + static void arm64_send_ipi(cpumask_t *mask) +diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c +index d288bb4a1..3e5b354dd 100644 +--- a/arch/arm64/kernel/signal.c ++++ b/arch/arm64/kernel/signal.c +@@ -692,7 +692,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, + unsigned long thread_flags) + { + do { +- if (thread_flags & _TIF_NEED_RESCHED) { ++ if (thread_flags & _TIF_NEED_RESCHED_MASK) { + /* Unmask Debug and SError for the next task */ + local_daif_restore(DAIF_PROCCTX_NOIRQ); + +diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c +index d7745ff2e..3c99b499e 100644 +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -779,7 +779,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + * involves poking the GIC, which must be done in a + * non-preemptible context. + */ +- preempt_disable(); ++ migrate_disable(); + + kvm_pmu_flush_hwstate(vcpu); + +@@ -828,7 +828,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + kvm_timer_sync_user(vcpu); + kvm_vgic_sync_hwstate(vcpu); + local_irq_enable(); +- preempt_enable(); ++ migrate_enable(); + continue; + } + +@@ -907,7 +907,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + /* Exit types that need handling before we can be preempted */ + handle_exit_early(vcpu, ret); + +- preempt_enable(); ++ migrate_enable(); + + /* + * The ARMv8 architecture doesn't give the hypervisor +diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig +index 7bf0a617e..c9f2533cc 100644 +--- a/arch/csky/Kconfig ++++ b/arch/csky/Kconfig +@@ -286,6 +286,7 @@ config NR_CPUS + config HIGHMEM + bool "High Memory Support" + depends on !CPU_CK610 ++ select KMAP_LOCAL + default y + + config FORCE_MAX_ZONEORDER +diff --git a/arch/csky/include/asm/fixmap.h b/arch/csky/include/asm/fixmap.h +index 81f9477d5..4b589cc20 100644 +--- a/arch/csky/include/asm/fixmap.h ++++ b/arch/csky/include/asm/fixmap.h +@@ -8,7 +8,7 @@ + #include + #ifdef CONFIG_HIGHMEM + #include +-#include ++#include + #endif + + enum fixed_addresses { +@@ -17,7 +17,7 @@ enum fixed_addresses { + #endif + #ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h +index 14645e3d5..1f4ed3f4c 100644 +--- a/arch/csky/include/asm/highmem.h ++++ b/arch/csky/include/asm/highmem.h +@@ -9,7 +9,7 @@ + #include + #include + #include +-#include ++#include + #include + + /* undef for production */ +@@ -32,10 +32,12 @@ extern pte_t *pkmap_page_table; + + #define ARCH_HAS_KMAP_FLUSH_TLB + extern void kmap_flush_tlb(unsigned long addr); +-extern void *kmap_atomic_pfn(unsigned long pfn); + + #define flush_cache_kmaps() do {} while (0) + ++#define arch_kmap_local_post_map(vaddr, pteval) kmap_flush_tlb(vaddr) ++#define arch_kmap_local_post_unmap(vaddr) kmap_flush_tlb(vaddr) ++ + extern void kmap_init(void); + + #endif /* __KERNEL__ */ +diff --git a/arch/csky/mm/highmem.c 
b/arch/csky/mm/highmem.c +index 89c10800a..4161df3c6 100644 +--- a/arch/csky/mm/highmem.c ++++ b/arch/csky/mm/highmem.c +@@ -9,8 +9,6 @@ + #include + #include + +-static pte_t *kmap_pte; +- + unsigned long highstart_pfn, highend_pfn; + + void kmap_flush_tlb(unsigned long addr) +@@ -19,67 +17,7 @@ void kmap_flush_tlb(unsigned long addr) + } + EXPORT_SYMBOL(kmap_flush_tlb); + +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte - idx))); +-#endif +- set_pte(kmap_pte-idx, mk_pte(page, prot)); +- flush_tlb_one((unsigned long)vaddr); +- +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int idx; +- +- if (vaddr < FIXADDR_START) +- return; +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx(); +- +- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +- +- pte_clear(&init_mm, vaddr, kmap_pte - idx); +- flush_tlb_one(vaddr); +-#else +- (void) idx; /* to kill a warning */ +-#endif +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +- +-/* +- * This is the same as kmap_atomic() but can map memory that doesn't +- * have a struct page associated with it. +- */ +-void *kmap_atomic_pfn(unsigned long pfn) +-{ +- unsigned long vaddr; +- int idx, type; +- +- pagefault_disable(); +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); +- flush_tlb_one(vaddr); +- +- return (void *) vaddr; +-} +- +-static void __init kmap_pages_init(void) ++void __init kmap_init(void) + { + unsigned long vaddr; + pgd_t *pgd; +@@ -96,14 +34,3 @@ static void __init kmap_pages_init(void) + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; + } +- +-void __init kmap_init(void) +-{ +- unsigned long vaddr; +- +- kmap_pages_init(); +- +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN); +- +- kmap_pte = pte_offset_kernel((pmd_t *)pgd_offset_k(vaddr), vaddr); +-} +diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h +index 19d233497..de72fb230 100644 +--- a/arch/hexagon/include/asm/spinlock_types.h ++++ b/arch/hexagon/include/asm/spinlock_types.h +@@ -8,10 +8,6 @@ + #ifndef _ASM_SPINLOCK_TYPES_H + #define _ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + volatile unsigned int lock; + } arch_spinlock_t; +diff --git a/arch/ia64/include/asm/kmap_types.h b/arch/ia64/include/asm/kmap_types.h +deleted file mode 100644 +index 5c268cf7c..000000000 +--- a/arch/ia64/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_IA64_KMAP_TYPES_H +-#define _ASM_IA64_KMAP_TYPES_H +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include +- +-#undef __WITH_KM_FENCE +- +-#endif /* _ASM_IA64_KMAP_TYPES_H */ +diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h +index 6e345fefc..681408d68 100644 +--- a/arch/ia64/include/asm/spinlock_types.h ++++ b/arch/ia64/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef _ASM_IA64_SPINLOCK_TYPES_H + 
#define _ASM_IA64_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + volatile unsigned int lock; + } arch_spinlock_t; +diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c +index 7abc5f37b..733e0e332 100644 +--- a/arch/ia64/kernel/time.c ++++ b/arch/ia64/kernel/time.c +@@ -138,12 +138,8 @@ void vtime_account_kernel(struct task_struct *tsk) + struct thread_info *ti = task_thread_info(tsk); + __u64 stime = vtime_delta(tsk); + +- if ((tsk->flags & PF_VCPU) && !irq_count()) ++ if (tsk->flags & PF_VCPU) + ti->gtime += stime; +- else if (hardirq_count()) +- ti->hardirq_time += stime; +- else if (in_serving_softirq()) +- ti->softirq_time += stime; + else + ti->stime += stime; + } +@@ -156,6 +152,20 @@ void vtime_account_idle(struct task_struct *tsk) + ti->idle_time += vtime_delta(tsk); + } + ++void vtime_account_softirq(struct task_struct *tsk) ++{ ++ struct thread_info *ti = task_thread_info(tsk); ++ ++ ti->softirq_time += vtime_delta(tsk); ++} ++ ++void vtime_account_hardirq(struct task_struct *tsk) ++{ ++ struct thread_info *ti = task_thread_info(tsk); ++ ++ ti->hardirq_time += vtime_delta(tsk); ++} ++ + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + + static irqreturn_t +diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig +index 33925ffed..7f6ca0ab4 100644 +--- a/arch/microblaze/Kconfig ++++ b/arch/microblaze/Kconfig +@@ -155,6 +155,7 @@ config XILINX_UNCACHED_SHADOW + config HIGHMEM + bool "High memory support" + depends on MMU ++ select KMAP_LOCAL + help + The address space of Microblaze processors is only 4 Gigabytes large + and it has to accommodate user address space, kernel address +diff --git a/arch/microblaze/include/asm/fixmap.h b/arch/microblaze/include/asm/fixmap.h +index 0379ce522..e6e9288bf 100644 +--- a/arch/microblaze/include/asm/fixmap.h ++++ b/arch/microblaze/include/asm/fixmap.h +@@ -20,7 +20,7 @@ + #include + #ifdef CONFIG_HIGHMEM + #include +-#include ++#include + #endif + + #define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) +@@ -47,7 +47,7 @@ enum fixed_addresses { + FIX_HOLE, + #ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * num_possible_cpus()) - 1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * num_possible_cpus()) - 1, + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h +index 284ca8fb5..4418633fb 100644 +--- a/arch/microblaze/include/asm/highmem.h ++++ b/arch/microblaze/include/asm/highmem.h +@@ -25,7 +25,6 @@ + #include + #include + +-extern pte_t *kmap_pte; + extern pte_t *pkmap_page_table; + + /* +@@ -52,6 +51,11 @@ extern pte_t *pkmap_page_table; + + #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } + ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ local_flush_tlb_page(NULL, vaddr); ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_page(NULL, vaddr); ++ + #endif /* __KERNEL__ */ + + #endif /* _ASM_HIGHMEM_H */ +diff --git a/arch/microblaze/mm/Makefile b/arch/microblaze/mm/Makefile +index 1b16875ce..8ced71100 100644 +--- a/arch/microblaze/mm/Makefile ++++ b/arch/microblaze/mm/Makefile +@@ -6,4 +6,3 @@ + obj-y := consistent.o init.o + + obj-$(CONFIG_MMU) += pgtable.o mmu_context.o fault.o +-obj-$(CONFIG_HIGHMEM) += highmem.o +diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c +deleted file mode 100644 +index 92e089041..000000000 
+--- a/arch/microblaze/mm/highmem.c ++++ /dev/null +@@ -1,78 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * highmem.c: virtual kernel memory mappings for high memory +- * +- * PowerPC version, stolen from the i386 version. +- * +- * Used in CONFIG_HIGHMEM systems for memory pages which +- * are not addressable by direct kernel virtual addresses. +- * +- * Copyright (C) 1999 Gerhard Wichert, Siemens AG +- * Gerhard.Wichert@pdb.siemens.de +- * +- * +- * Redesigned the x86 32-bit VM architecture to deal with +- * up to 16 Terrabyte physical memory. With current x86 CPUs +- * we now support up to 64 Gigabytes physical RAM. +- * +- * Copyright (C) 1999 Ingo Molnar +- * +- * Reworked for PowerPC by various contributors. Moved from +- * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. +- */ +- +-#include +-#include +- +-/* +- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap +- * gives a more generic (and caching) interface. But kmap_atomic can +- * be used in IRQ contexts, so in some (very limited) cases we need +- * it. +- */ +-#include +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte-idx))); +-#endif +- set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); +- local_flush_tlb_page(NULL, vaddr); +- +- return (void *) vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int type; +- unsigned int idx; +- +- if (vaddr < __fix_to_virt(FIX_KMAP_END)) +- return; +- +- type = kmap_atomic_idx(); +- +- idx = type + KM_TYPE_NR * smp_processor_id(); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +-#endif +- /* +- * force other mappings to Oops if they'll try to access +- * this pte without first remap it +- */ +- pte_clear(&init_mm, vaddr, kmap_pte-idx); +- local_flush_tlb_page(NULL, vaddr); +- +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c +index 45da639bd..1f4b5b34e 100644 +--- a/arch/microblaze/mm/init.c ++++ b/arch/microblaze/mm/init.c +@@ -49,17 +49,11 @@ unsigned long lowmem_size; + EXPORT_SYMBOL(min_low_pfn); + EXPORT_SYMBOL(max_low_pfn); + +-#ifdef CONFIG_HIGHMEM +-pte_t *kmap_pte; +-EXPORT_SYMBOL(kmap_pte); +- + static void __init highmem_init(void) + { + pr_debug("%x\n", (u32)PKMAP_BASE); + map_page(PKMAP_BASE, 0, 0); /* XXX gross */ + pkmap_page_table = virt_to_kpte(PKMAP_BASE); +- +- kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + } + + static void highmem_setup(void) +diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig +index 896a29df1..1b3593d53 100644 +--- a/arch/mips/Kconfig ++++ b/arch/mips/Kconfig +@@ -2727,6 +2727,7 @@ config WAR_MIPS34K_MISSED_ITLB + config HIGHMEM + bool "High Memory Support" + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA ++ select KMAP_LOCAL + + config CPU_SUPPORTS_HIGHMEM + bool +diff --git a/arch/mips/include/asm/fixmap.h b/arch/mips/include/asm/fixmap.h +index 743535be7..beea14761 100644 +--- a/arch/mips/include/asm/fixmap.h ++++ b/arch/mips/include/asm/fixmap.h +@@ -17,7 +17,7 @@ + #include + #ifdef CONFIG_HIGHMEM + #include +-#include ++#include + #endif + + /* +@@ -52,7 +52,7 @@ 
enum fixed_addresses { + #ifdef CONFIG_HIGHMEM + /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_BEGIN = FIX_CMAP_END + 1, +- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h +index 9f021cf51..1716181ea 100644 +--- a/arch/mips/include/asm/highmem.h ++++ b/arch/mips/include/asm/highmem.h +@@ -24,7 +24,7 @@ + #include + #include + #include +-#include ++#include + + /* declarations for highmem.c */ + extern unsigned long highstart_pfn, highend_pfn; +@@ -48,11 +48,11 @@ extern pte_t *pkmap_page_table; + + #define ARCH_HAS_KMAP_FLUSH_TLB + extern void kmap_flush_tlb(unsigned long addr); +-extern void *kmap_atomic_pfn(unsigned long pfn); + + #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) + +-extern void kmap_init(void); ++#define arch_kmap_local_post_map(vaddr, pteval) local_flush_tlb_one(vaddr) ++#define arch_kmap_local_post_unmap(vaddr) local_flush_tlb_one(vaddr) + + #endif /* __KERNEL__ */ + +diff --git a/arch/mips/include/asm/kmap_types.h b/arch/mips/include/asm/kmap_types.h +deleted file mode 100644 +index 16665dc24..000000000 +--- a/arch/mips/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include +- +-#undef __WITH_KM_FENCE +- +-#endif +diff --git a/arch/mips/kernel/crash_dump.c b/arch/mips/kernel/crash_dump.c +index 01b2bd95b..9aba83e1e 100644 +--- a/arch/mips/kernel/crash_dump.c ++++ b/arch/mips/kernel/crash_dump.c +@@ -5,8 +5,6 @@ + #include + #include + +-static void *kdump_buf_page; +- + /** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied +@@ -17,51 +15,25 @@ static void *kdump_buf_page; + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * +- * Copy a page from "oldmem". For this page, there is no pte mapped ++ * Copy a page from "oldmem". For this page, there might be no pte mapped + * in the current kernel. +- * +- * Calling copy_to_user() in atomic context is not desirable. Hence first +- * copying the data to a pre-allocated kernel page and then copying to user +- * space in non-atomic context. 
+ */ +-ssize_t copy_oldmem_page(unsigned long pfn, char *buf, +- size_t csize, unsigned long offset, int userbuf) ++ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, ++ unsigned long offset, int userbuf) + { + void *vaddr; + + if (!csize) + return 0; + +- vaddr = kmap_atomic_pfn(pfn); ++ vaddr = kmap_local_pfn(pfn); + + if (!userbuf) { +- memcpy(buf, (vaddr + offset), csize); +- kunmap_atomic(vaddr); ++ memcpy(buf, vaddr + offset, csize); + } else { +- if (!kdump_buf_page) { +- pr_warn("Kdump: Kdump buffer page not allocated\n"); +- +- return -EFAULT; +- } +- copy_page(kdump_buf_page, vaddr); +- kunmap_atomic(vaddr); +- if (copy_to_user(buf, (kdump_buf_page + offset), csize)) +- return -EFAULT; ++ if (copy_to_user(buf, vaddr + offset, csize)) ++ csize = -EFAULT; + } + + return csize; + } +- +-static int __init kdump_buf_page_init(void) +-{ +- int ret = 0; +- +- kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); +- if (!kdump_buf_page) { +- pr_warn("Kdump: Failed to allocate kdump buffer page\n"); +- ret = -ENOMEM; +- } +- +- return ret; +-} +-arch_initcall(kdump_buf_page_init); +diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c +index 5fec7f45d..57e2f08f0 100644 +--- a/arch/mips/mm/highmem.c ++++ b/arch/mips/mm/highmem.c +@@ -8,8 +8,6 @@ + #include + #include + +-static pte_t *kmap_pte; +- + unsigned long highstart_pfn, highend_pfn; + + void kmap_flush_tlb(unsigned long addr) +@@ -17,78 +15,3 @@ void kmap_flush_tlb(unsigned long addr) + flush_tlb_one(addr); + } + EXPORT_SYMBOL(kmap_flush_tlb); +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte - idx))); +-#endif +- set_pte(kmap_pte-idx, mk_pte(page, prot)); +- local_flush_tlb_one((unsigned long)vaddr); +- +- return (void*) vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int type __maybe_unused; +- +- if (vaddr < FIXADDR_START) +- return; +- +- type = kmap_atomic_idx(); +-#ifdef CONFIG_DEBUG_HIGHMEM +- { +- int idx = type + KM_TYPE_NR * smp_processor_id(); +- +- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +- +- /* +- * force other mappings to Oops if they'll try to access +- * this pte without first remap it +- */ +- pte_clear(&init_mm, vaddr, kmap_pte-idx); +- local_flush_tlb_one(vaddr); +- } +-#endif +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +- +-/* +- * This is the same as kmap_atomic() but can map memory that doesn't +- * have a struct page associated with it. 
+- */ +-void *kmap_atomic_pfn(unsigned long pfn) +-{ +- unsigned long vaddr; +- int idx, type; +- +- preempt_disable(); +- pagefault_disable(); +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); +- flush_tlb_one(vaddr); +- +- return (void*) vaddr; +-} +- +-void __init kmap_init(void) +-{ +- unsigned long kmap_vstart; +- +- /* cache the first kmap pte */ +- kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); +- kmap_pte = virt_to_kpte(kmap_vstart); +-} +diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c +index 07e84a774..bc80893e5 100644 +--- a/arch/mips/mm/init.c ++++ b/arch/mips/mm/init.c +@@ -36,7 +36,6 @@ + #include + #include + #include +-#include + #include + #include + #include +@@ -402,9 +401,6 @@ void __init paging_init(void) + + pagetable_init(); + +-#ifdef CONFIG_HIGHMEM +- kmap_init(); +-#endif + #ifdef CONFIG_ZONE_DMA + max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; + #endif +diff --git a/arch/nds32/Kconfig.cpu b/arch/nds32/Kconfig.cpu +index f88a12fdf..c10759952 100644 +--- a/arch/nds32/Kconfig.cpu ++++ b/arch/nds32/Kconfig.cpu +@@ -157,6 +157,7 @@ config HW_SUPPORT_UNALIGNMENT_ACCESS + config HIGHMEM + bool "High Memory Support" + depends on MMU && !CPU_CACHE_ALIASING ++ select KMAP_LOCAL + help + The address space of Andes processors is only 4 Gigabytes large + and it has to accommodate user address space, kernel address +diff --git a/arch/nds32/include/asm/fixmap.h b/arch/nds32/include/asm/fixmap.h +index 5a4bf11e5..2fa09a2de 100644 +--- a/arch/nds32/include/asm/fixmap.h ++++ b/arch/nds32/include/asm/fixmap.h +@@ -6,7 +6,7 @@ + + #ifdef CONFIG_HIGHMEM + #include +-#include ++#include + #endif + + enum fixed_addresses { +@@ -14,7 +14,7 @@ enum fixed_addresses { + FIX_KMAP_RESERVED, + FIX_KMAP_BEGIN, + #ifdef CONFIG_HIGHMEM +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS), ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #endif + FIX_EARLYCON_MEM_BASE, + __end_of_fixed_addresses +diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h +index fe986d0e6..16159a871 100644 +--- a/arch/nds32/include/asm/highmem.h ++++ b/arch/nds32/include/asm/highmem.h +@@ -5,7 +5,6 @@ + #define _ASM_HIGHMEM_H + + #include +-#include + #include + + /* +@@ -45,11 +44,22 @@ extern pte_t *pkmap_page_table; + extern void kmap_init(void); + + /* +- * The following functions are already defined by +- * when CONFIG_HIGHMEM is not set. ++ * FIXME: The below looks broken vs. a kmap_atomic() in task context which ++ * is interupted and another kmap_atomic() happens in interrupt context. ++ * But what do I know about nds32. 
-- tglx + */ +-#ifdef CONFIG_HIGHMEM +-extern void *kmap_atomic_pfn(unsigned long pfn); +-#endif ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ do { \ ++ __nds32__tlbop_inv(vaddr); \ ++ __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN); \ ++ __nds32__tlbop_rwr(pteval); \ ++ __nds32__isb(); \ ++ } while (0) ++ ++#define arch_kmap_local_pre_unmap(vaddr) \ ++ do { \ ++ __nds32__tlbop_inv(vaddr); \ ++ __nds32__isb(); \ ++ } while (0) + + #endif +diff --git a/arch/nds32/mm/Makefile b/arch/nds32/mm/Makefile +index 897ecaf5c..14fb2e8eb 100644 +--- a/arch/nds32/mm/Makefile ++++ b/arch/nds32/mm/Makefile +@@ -3,7 +3,6 @@ obj-y := extable.o tlb.o fault.o init.o mmap.o \ + mm-nds32.o cacheflush.o proc.o + + obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o +-obj-$(CONFIG_HIGHMEM) += highmem.o + + ifdef CONFIG_FUNCTION_TRACER + CFLAGS_REMOVE_proc.o = $(CC_FLAGS_FTRACE) +diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c +deleted file mode 100644 +index 4284cd59e..000000000 +--- a/arch/nds32/mm/highmem.c ++++ /dev/null +@@ -1,48 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-// Copyright (C) 2005-2017 Andes Technology Corporation +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned int idx; +- unsigned long vaddr, pte; +- int type; +- pte_t *ptep; +- +- type = kmap_atomic_idx_push(); +- +- idx = type + KM_TYPE_NR * smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- pte = (page_to_pfn(page) << PAGE_SHIFT) | prot; +- ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); +- set_pte(ptep, pte); +- +- __nds32__tlbop_inv(vaddr); +- __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN); +- __nds32__tlbop_rwr(pte); +- __nds32__isb(); +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- if (kvaddr >= (void *)FIXADDR_START) { +- unsigned long vaddr = (unsigned long)kvaddr; +- pte_t *ptep; +- kmap_atomic_idx_pop(); +- __nds32__tlbop_inv(vaddr); +- __nds32__isb(); +- ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); +- set_pte(ptep, 0); +- } +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c +index 5e88c351e..f3fa02b88 100644 +--- a/arch/openrisc/mm/init.c ++++ b/arch/openrisc/mm/init.c +@@ -33,7 +33,6 @@ + #include + #include + #include +-#include + #include + #include + #include +diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c +index a978590d8..5aed97a18 100644 +--- a/arch/openrisc/mm/ioremap.c ++++ b/arch/openrisc/mm/ioremap.c +@@ -15,7 +15,6 @@ + #include + #include + #include +-#include + #include + #include + #include +diff --git a/arch/parisc/include/asm/hardirq.h b/arch/parisc/include/asm/hardirq.h +index 7f7039516..fad29aa6f 100644 +--- a/arch/parisc/include/asm/hardirq.h ++++ b/arch/parisc/include/asm/hardirq.h +@@ -32,7 +32,6 @@ typedef struct { + DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); + + #define __ARCH_IRQ_STAT +-#define __IRQ_STAT(cpu, member) (irq_stat[cpu].member) + #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) + #define __inc_irq_stat(member) __this_cpu_inc(irq_stat.member) + #define ack_bad_irq(irq) WARN(1, "unexpected IRQ trap at vector %02x\n", irq) +diff --git a/arch/parisc/include/asm/kmap_types.h b/arch/parisc/include/asm/kmap_types.h +deleted file mode 100644 +index 3e70b5cd1..000000000 +--- a/arch/parisc/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* 
SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include +- +-#undef __WITH_KM_FENCE +- +-#endif +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 4e6f30473..42851014e 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -146,6 +146,7 @@ config PPC + select ARCH_MIGHT_HAVE_PC_SERIO + select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX + select ARCH_SUPPORTS_ATOMIC_RMW ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS +@@ -234,6 +235,7 @@ config PPC + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_PAGE_SIZE + select HAVE_REGS_AND_STACK_ACCESS_API +@@ -241,6 +243,7 @@ config PPC + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING + select HAVE_IRQ_TIME_ACCOUNTING ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_RSEQ + select IOMMU_HELPER if PPC64 + select IRQ_DOMAIN +@@ -414,6 +417,7 @@ menu "Kernel options" + config HIGHMEM + bool "High memory support" + depends on PPC32 ++ select KMAP_LOCAL + + source "kernel/Kconfig.hz" + +diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h +index cf091c4c2..7371f7e23 100644 +--- a/arch/powerpc/include/asm/cmpxchg.h ++++ b/arch/powerpc/include/asm/cmpxchg.h +@@ -5,7 +5,7 @@ + #ifdef __KERNEL__ + #include + #include +-#include ++#include + + #ifdef __BIG_ENDIAN + #define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) +diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h +index 897cc6875..a832aeafe 100644 +--- a/arch/powerpc/include/asm/fixmap.h ++++ b/arch/powerpc/include/asm/fixmap.h +@@ -20,7 +20,7 @@ + #include + #ifdef CONFIG_HIGHMEM + #include +-#include ++#include + #endif + + #ifdef CONFIG_PPC64 +@@ -61,7 +61,7 @@ enum fixed_addresses { + FIX_EARLY_DEBUG_BASE = FIX_EARLY_DEBUG_TOP+(ALIGN(SZ_128K, PAGE_SIZE)/PAGE_SIZE)-1, + #ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ +- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #endif + #ifdef CONFIG_PPC_8xx + /* For IMMR we need an aligned 512K area */ +diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h +index 104026f7d..80a5ae771 100644 +--- a/arch/powerpc/include/asm/highmem.h ++++ b/arch/powerpc/include/asm/highmem.h +@@ -24,12 +24,10 @@ + #ifdef __KERNEL__ + + #include +-#include + #include + #include + #include + +-extern pte_t *kmap_pte; + extern pte_t *pkmap_page_table; + + /* +@@ -60,6 +58,11 @@ extern pte_t *pkmap_page_table; + + #define flush_cache_kmaps() flush_cache_all() + ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ local_flush_tlb_page(NULL, vaddr) ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_page(NULL, vaddr) ++ + #endif /* __KERNEL__ */ + + #endif /* _ASM_HIGHMEM_H */ +diff --git a/arch/powerpc/include/asm/kmap_types.h b/arch/powerpc/include/asm/kmap_types.h +deleted file mode 100644 +index c8fa182d4..000000000 +--- a/arch/powerpc/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* 
SPDX-License-Identifier: GPL-2.0-or-later */ +-#ifndef _ASM_POWERPC_KMAP_TYPES_H +-#define _ASM_POWERPC_KMAP_TYPES_H +- +-#ifdef __KERNEL__ +- +-/* +- */ +- +-#define KM_TYPE_NR 16 +- +-#endif /* __KERNEL__ */ +-#endif /* _ASM_POWERPC_KMAP_TYPES_H */ +diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h +index 0f3cdd8fa..d45561e9e 100644 +--- a/arch/powerpc/include/asm/simple_spinlock_types.h ++++ b/arch/powerpc/include/asm/simple_spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H) + # error "please don't include this file directly" + #endif + +diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h +index c5d742f18..cc6922a01 100644 +--- a/arch/powerpc/include/asm/spinlock_types.h ++++ b/arch/powerpc/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H + #define _ASM_POWERPC_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + #ifdef CONFIG_PPC_QUEUED_SPINLOCKS + #include + #include +diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h +index 1c8460e23..b1653c160 100644 +--- a/arch/powerpc/include/asm/stackprotector.h ++++ b/arch/powerpc/include/asm/stackprotector.h +@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) + unsigned long canary; + + /* Try to get a semi random initial value. */ ++#ifdef CONFIG_PREEMPT_RT ++ canary = (unsigned long)&canary; ++#else + canary = get_random_canary(); ++#endif + canary ^= mftb(); + canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; +diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h +index 28d2908af..3a8128d2b 100644 +--- a/arch/powerpc/include/asm/thread_info.h ++++ b/arch/powerpc/include/asm/thread_info.h +@@ -48,6 +48,8 @@ + struct thread_info { + int preempt_count; /* 0 => preemptable, + <0 => BUG */ ++ int preempt_lazy_count; /* 0 => preemptable, ++ <0 => BUG */ + #ifdef CONFIG_SMP + unsigned int cpu; + #endif +@@ -100,11 +102,12 @@ void arch_setup_new_exec(void); + #define TIF_SINGLESTEP 8 /* singlestepping active */ + #define TIF_NOHZ 9 /* in adaptive nohz mode */ + #define TIF_SECCOMP 10 /* secure computing */ +-#define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ +-#define TIF_NOERROR 12 /* Force successful syscall return */ ++ ++#define TIF_NEED_RESCHED_LAZY 11 /* lazy rescheduling necessary */ ++#define TIF_SYSCALL_TRACEPOINT 12 /* syscall tracepoint instrumentation */ ++ + #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ + #define TIF_UPROBE 14 /* breakpointed or single-stepping */ +-#define TIF_SYSCALL_TRACEPOINT 15 /* syscall tracepoint instrumentation */ + #define TIF_EMULATE_STACK_STORE 16 /* Is an instruction emulation + for stack store? 
*/ + #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ +@@ -113,6 +116,9 @@ void arch_setup_new_exec(void); + #endif + #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ + #define TIF_32BIT 20 /* 32 bit binary */ ++#define TIF_RESTOREALL 21 /* Restore all regs (implies NOERROR) */ ++#define TIF_NOERROR 22 /* Force successful syscall return */ ++ + + /* as above, but as bit values */ + #define _TIF_SYSCALL_TRACE (1<version = cpu_to_be16(OOPS_HDR_VERSION); +diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c +index 310bcd768..ae3212dcf 100644 +--- a/arch/powerpc/kernel/syscall_64.c ++++ b/arch/powerpc/kernel/syscall_64.c +@@ -193,7 +193,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3, + ti_flags = READ_ONCE(*ti_flagsp); + while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { + local_irq_enable(); +- if (ti_flags & _TIF_NEED_RESCHED) { ++ if (ti_flags & _TIF_NEED_RESCHED_MASK) { + schedule(); + } else { + /* +@@ -277,7 +277,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned + ti_flags = READ_ONCE(*ti_flagsp); + while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { + local_irq_enable(); /* returning to user: may enable */ +- if (ti_flags & _TIF_NEED_RESCHED) { ++ if (ti_flags & _TIF_NEED_RESCHED_MASK) { + schedule(); + } else { + if (ti_flags & _TIF_SIGPENDING) +@@ -361,11 +361,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign + /* Returning to a kernel context with local irqs enabled. */ + WARN_ON_ONCE(!(regs->msr & MSR_EE)); + again: +- if (IS_ENABLED(CONFIG_PREEMPT)) { ++ if (IS_ENABLED(CONFIG_PREEMPTION)) { + /* Return to preemptible kernel context */ + if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED)) { + if (preempt_count() == 0) + preempt_schedule_irq(); ++ } else if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED_LAZY)) { ++ if ((preempt_count() == 0) && ++ (current_thread_info()->preempt_lazy_count == 0)) ++ preempt_schedule_irq(); + } + } + +diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c +index 1d20f0f77..7e0a497a3 100644 +--- a/arch/powerpc/kernel/time.c ++++ b/arch/powerpc/kernel/time.c +@@ -312,12 +312,11 @@ static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, + return stime_scaled; + } + +-static unsigned long vtime_delta(struct task_struct *tsk, ++static unsigned long vtime_delta(struct cpu_accounting_data *acct, + unsigned long *stime_scaled, + unsigned long *steal_time) + { + unsigned long now, stime; +- struct cpu_accounting_data *acct = get_accounting(tsk); + + WARN_ON_ONCE(!irqs_disabled()); + +@@ -332,29 +331,30 @@ static unsigned long vtime_delta(struct task_struct *tsk, + return stime; + } + ++static void vtime_delta_kernel(struct cpu_accounting_data *acct, ++ unsigned long *stime, unsigned long *stime_scaled) ++{ ++ unsigned long steal_time; ++ ++ *stime = vtime_delta(acct, stime_scaled, &steal_time); ++ *stime -= min(*stime, steal_time); ++ acct->steal_time += steal_time; ++} ++ + void vtime_account_kernel(struct task_struct *tsk) + { +- unsigned long stime, stime_scaled, steal_time; + struct cpu_accounting_data *acct = get_accounting(tsk); ++ unsigned long stime, stime_scaled; + +- stime = vtime_delta(tsk, &stime_scaled, &steal_time); +- +- stime -= min(stime, steal_time); +- acct->steal_time += steal_time; ++ vtime_delta_kernel(acct, &stime, &stime_scaled); + +- if ((tsk->flags & PF_VCPU) && !irq_count()) { ++ if (tsk->flags & PF_VCPU) { + 
acct->gtime += stime; + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + acct->utime_scaled += stime_scaled; + #endif + } else { +- if (hardirq_count()) +- acct->hardirq_time += stime; +- else if (in_serving_softirq()) +- acct->softirq_time += stime; +- else +- acct->stime += stime; +- ++ acct->stime += stime; + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + acct->stime_scaled += stime_scaled; + #endif +@@ -367,10 +367,34 @@ void vtime_account_idle(struct task_struct *tsk) + unsigned long stime, stime_scaled, steal_time; + struct cpu_accounting_data *acct = get_accounting(tsk); + +- stime = vtime_delta(tsk, &stime_scaled, &steal_time); ++ stime = vtime_delta(acct, &stime_scaled, &steal_time); + acct->idle_time += stime + steal_time; + } + ++static void vtime_account_irq_field(struct cpu_accounting_data *acct, ++ unsigned long *field) ++{ ++ unsigned long stime, stime_scaled; ++ ++ vtime_delta_kernel(acct, &stime, &stime_scaled); ++ *field += stime; ++#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME ++ acct->stime_scaled += stime_scaled; ++#endif ++} ++ ++void vtime_account_softirq(struct task_struct *tsk) ++{ ++ struct cpu_accounting_data *acct = get_accounting(tsk); ++ vtime_account_irq_field(acct, &acct->softirq_time); ++} ++ ++void vtime_account_hardirq(struct task_struct *tsk) ++{ ++ struct cpu_accounting_data *acct = get_accounting(tsk); ++ vtime_account_irq_field(acct, &acct->hardirq_time); ++} ++ + static void vtime_flush_scaled(struct task_struct *tsk, + struct cpu_accounting_data *acct) + { +diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c +index 069d45124..8eaa8c240 100644 +--- a/arch/powerpc/kernel/traps.c ++++ b/arch/powerpc/kernel/traps.c +@@ -170,7 +170,6 @@ extern void panic_flush_kmsg_start(void) + + extern void panic_flush_kmsg_end(void) + { +- printk_safe_flush_on_panic(); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); + debug_locks_off(); +@@ -260,12 +259,17 @@ static char *get_mmu_str(void) + + static int __die(const char *str, struct pt_regs *regs, long err) + { ++ const char *pr = ""; ++ + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + ++ if (IS_ENABLED(CONFIG_PREEMPTION)) ++ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; ++ + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", + IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", + PAGE_SIZE / 1024, get_mmu_str(), +- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", ++ pr, + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", + debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", +diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c +index 75b2a6c4d..db40e20d0 100644 +--- a/arch/powerpc/kernel/watchdog.c ++++ b/arch/powerpc/kernel/watchdog.c +@@ -185,11 +185,6 @@ static void watchdog_smp_panic(int cpu, u64 tb) + + wd_smp_unlock(&flags); + +- printk_safe_flush(); +- /* +- * printk_safe_flush() seems to require another print +- * before anything actually goes out to console. 
+- */ + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + +diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c +index c9a889880..d488311ef 100644 +--- a/arch/powerpc/kexec/crash.c ++++ b/arch/powerpc/kexec/crash.c +@@ -311,9 +311,6 @@ void default_machine_crash_shutdown(struct pt_regs *regs) + unsigned int i; + int (*old_handler)(struct pt_regs *regs); + +- /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */ +- printk_nmi_enter(); +- + /* + * This function is only called after the system + * has panicked or is otherwise in a critical state. +diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig +index 549591d9a..efb5bfe93 100644 +--- a/arch/powerpc/kvm/Kconfig ++++ b/arch/powerpc/kvm/Kconfig +@@ -178,6 +178,7 @@ config KVM_E500MC + config KVM_MPIC + bool "KVM in-kernel MPIC emulation" + depends on KVM && E500 ++ depends on !PREEMPT_RT + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD + select HAVE_KVM_IRQ_ROUTING +diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile +index 55b4a8bd4..3b4e9e4e2 100644 +--- a/arch/powerpc/mm/Makefile ++++ b/arch/powerpc/mm/Makefile +@@ -16,7 +16,6 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o + obj-$(CONFIG_PPC_MM_SLICES) += slice.o + obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o + obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o +-obj-$(CONFIG_HIGHMEM) += highmem.o + obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o + obj-$(CONFIG_PPC_PTDUMP) += ptdump/ + obj-$(CONFIG_KASAN) += kasan/ +diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c +deleted file mode 100644 +index 624b4438a..000000000 +--- a/arch/powerpc/mm/highmem.c ++++ /dev/null +@@ -1,67 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * highmem.c: virtual kernel memory mappings for high memory +- * +- * PowerPC version, stolen from the i386 version. +- * +- * Used in CONFIG_HIGHMEM systems for memory pages which +- * are not addressable by direct kernel virtual addresses. +- * +- * Copyright (C) 1999 Gerhard Wichert, Siemens AG +- * Gerhard.Wichert@pdb.siemens.de +- * +- * +- * Redesigned the x86 32-bit VM architecture to deal with +- * up to 16 Terrabyte physical memory. With current x86 CPUs +- * we now support up to 64 Gigabytes physical RAM. +- * +- * Copyright (C) 1999 Ingo Molnar +- * +- * Reworked for PowerPC by various contributors. Moved from +- * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. 
+- */ +- +-#include +-#include +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - idx))); +- __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1); +- local_flush_tlb_page(NULL, vaddr); +- +- return (void*) vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- +- if (vaddr < __fix_to_virt(FIX_KMAP_END)) +- return; +- +- if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { +- int type = kmap_atomic_idx(); +- unsigned int idx; +- +- idx = type + KM_TYPE_NR * smp_processor_id(); +- WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +- +- /* +- * force other mappings to Oops if they'll try to access +- * this pte without first remap it +- */ +- pte_clear(&init_mm, vaddr, kmap_pte-idx); +- local_flush_tlb_page(NULL, vaddr); +- } +- +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c +index 22eb1c718..1b74565b3 100644 +--- a/arch/powerpc/mm/mem.c ++++ b/arch/powerpc/mm/mem.c +@@ -62,11 +62,6 @@ + unsigned long long memory_limit; + bool init_mem_is_free; + +-#ifdef CONFIG_HIGHMEM +-pte_t *kmap_pte; +-EXPORT_SYMBOL(kmap_pte); +-#endif +- + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) + { +@@ -236,8 +231,6 @@ void __init paging_init(void) + + map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ + pkmap_page_table = virt_to_kpte(PKMAP_BASE); +- +- kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + #endif /* CONFIG_HIGHMEM */ + + printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", +diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c +index 6c3bc4b4d..ec862846b 100644 +--- a/arch/powerpc/platforms/powernv/opal-kmsg.c ++++ b/arch/powerpc/platforms/powernv/opal-kmsg.c +@@ -20,7 +20,8 @@ + * message, it just ensures that OPAL completely flushes the console buffer. 
+ */ + static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + /* + * Outside of a panic context the pollers will continue to run, +diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c +index 245f1f8df..f05555dde 100644 +--- a/arch/powerpc/platforms/pseries/iommu.c ++++ b/arch/powerpc/platforms/pseries/iommu.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -190,7 +191,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + return ret; + } + +-static DEFINE_PER_CPU(__be64 *, tce_page); ++struct tce_page { ++ __be64 * page; ++ local_lock_t lock; ++}; ++static DEFINE_PER_CPU(struct tce_page, tce_page) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + long npages, unsigned long uaddr, +@@ -212,9 +219,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + direction, attrs); + } + +- local_irq_save(flags); /* to protect tcep and the page behind it */ ++ /* to protect tcep and the page behind it */ ++ local_lock_irqsave(&tce_page.lock, flags); + +- tcep = __this_cpu_read(tce_page); ++ tcep = __this_cpu_read(tce_page.page); + + /* This is safe to do since interrupts are off when we're called + * from iommu_alloc{,_sg}() +@@ -223,12 +231,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + /* If allocation fails, fall back to the loop implementation */ + if (!tcep) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); + return tce_build_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, + npages, uaddr, direction, attrs); + } +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + rpn = __pa(uaddr) >> TCE_SHIFT; +@@ -258,7 +266,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + tcenum += limit; + } while (npages > 0 && !rc); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); + + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; +@@ -429,16 +437,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + DMA_BIDIRECTIONAL, 0); + } + +- local_irq_disable(); /* to protect tcep and the page behind it */ +- tcep = __this_cpu_read(tce_page); ++ /* to protect tcep and the page behind it */ ++ local_lock_irq(&tce_page.lock); ++ tcep = __this_cpu_read(tce_page.page); + + if (!tcep) { + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { +- local_irq_enable(); ++ local_unlock_irq(&tce_page.lock); + return -ENOMEM; + } +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; +@@ -481,7 +490,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + + /* error cleanup: caller will clear whole range */ + +- local_irq_enable(); ++ local_unlock_irq(&tce_page.lock); + return rc; + } + +diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c +index 5559edf36..d62b8e053 100644 +--- a/arch/powerpc/xmon/xmon.c ++++ b/arch/powerpc/xmon/xmon.c +@@ -3005,7 +3005,7 @@ print_address(unsigned long addr) + static void + dump_log_buf(void) + { +- struct kmsg_dumper dumper = { .active = 1 }; ++ struct kmsg_dumper_iter iter = { .active = 1 }; + unsigned char 
buf[128]; + size_t len; + +@@ -3017,9 +3017,9 @@ dump_log_buf(void) + catch_memory_errors = 1; + sync(); + +- kmsg_dump_rewind_nolock(&dumper); ++ kmsg_dump_rewind(&iter); + xmon_start_pagination(); +- while (kmsg_dump_get_line_nolock(&dumper, false, buf, sizeof(buf), &len)) { ++ while (kmsg_dump_get_line(&iter, false, buf, sizeof(buf), &len)) { + buf[len] = '\0'; + printf("%s", buf); + } +diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig +index a7e386000..d3ed5b2ae 100644 +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@ -183,6 +183,7 @@ config S390 + select HAVE_RSEQ + select HAVE_SYSCALL_TRACEPOINTS + select HAVE_VIRT_CPU_ACCOUNTING ++ select HAVE_VIRT_CPU_ACCOUNTING_IDLE + select IOMMU_HELPER if PCI + select IOMMU_SUPPORT if PCI + select MODULES_USE_ELF_RELA +diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h +index cfed272e4..8e28e8176 100644 +--- a/arch/s390/include/asm/spinlock_types.h ++++ b/arch/s390/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + int lock; + } __attribute__ ((aligned (4))) arch_spinlock_t; +diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h +index 3622d4ebc..fac6a6798 100644 +--- a/arch/s390/include/asm/vtime.h ++++ b/arch/s390/include/asm/vtime.h +@@ -2,7 +2,6 @@ + #ifndef _S390_VTIME_H + #define _S390_VTIME_H + +-#define __ARCH_HAS_VTIME_ACCOUNT + #define __ARCH_HAS_VTIME_TASK_SWITCH + + #endif /* _S390_VTIME_H */ +diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c +index 579ec3a8c..9b3c5978b 100644 +--- a/arch/s390/kernel/vtime.c ++++ b/arch/s390/kernel/vtime.c +@@ -223,35 +223,50 @@ void vtime_flush(struct task_struct *tsk) + S390_lowcore.avg_steal_timer = avg_steal; + } + ++static u64 vtime_delta(void) ++{ ++ u64 timer = S390_lowcore.last_update_timer; ++ ++ S390_lowcore.last_update_timer = get_vtimer(); ++ ++ return timer - S390_lowcore.last_update_timer; ++} ++ + /* + * Update process times based on virtual cpu times stored by entry.S + * to the lowcore fields user_timer, system_timer & steal_clock. 
+ */ +-void vtime_account_irq_enter(struct task_struct *tsk) ++void vtime_account_kernel(struct task_struct *tsk) + { +- u64 timer; +- +- timer = S390_lowcore.last_update_timer; +- S390_lowcore.last_update_timer = get_vtimer(); +- timer -= S390_lowcore.last_update_timer; ++ u64 delta = vtime_delta(); + +- if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) +- S390_lowcore.guest_timer += timer; +- else if (hardirq_count()) +- S390_lowcore.hardirq_timer += timer; +- else if (in_serving_softirq()) +- S390_lowcore.softirq_timer += timer; ++ if (tsk->flags & PF_VCPU) ++ S390_lowcore.guest_timer += delta; + else +- S390_lowcore.system_timer += timer; ++ S390_lowcore.system_timer += delta; + +- virt_timer_forward(timer); ++ virt_timer_forward(delta); + } +-EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +- +-void vtime_account_kernel(struct task_struct *tsk) +-__attribute__((alias("vtime_account_irq_enter"))); + EXPORT_SYMBOL_GPL(vtime_account_kernel); + ++void vtime_account_softirq(struct task_struct *tsk) ++{ ++ u64 delta = vtime_delta(); ++ ++ S390_lowcore.softirq_timer += delta; ++ ++ virt_timer_forward(delta); ++} ++ ++void vtime_account_hardirq(struct task_struct *tsk) ++{ ++ u64 delta = vtime_delta(); ++ ++ S390_lowcore.hardirq_timer += delta; ++ ++ virt_timer_forward(delta); ++} ++ + /* + * Sorted add to a list. List is linear searched until first bigger + * element is found. +diff --git a/arch/sh/include/asm/fixmap.h b/arch/sh/include/asm/fixmap.h +index f38adc189..b07fbc7f7 100644 +--- a/arch/sh/include/asm/fixmap.h ++++ b/arch/sh/include/asm/fixmap.h +@@ -13,9 +13,6 @@ + #include + #include + #include +-#ifdef CONFIG_HIGHMEM +-#include +-#endif + + /* + * Here we define all the compile-time 'special' virtual +@@ -53,11 +50,6 @@ enum fixed_addresses { + FIX_CMAP_BEGIN, + FIX_CMAP_END = FIX_CMAP_BEGIN + (FIX_N_COLOURS * NR_CPUS) - 1, + +-#ifdef CONFIG_HIGHMEM +- FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ +- FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, +-#endif +- + #ifdef CONFIG_IOREMAP_FIXED + /* + * FIX_IOREMAP entries are useful for mapping physical address +diff --git a/arch/sh/include/asm/hardirq.h b/arch/sh/include/asm/hardirq.h +index edaea3559..9fe4495a8 100644 +--- a/arch/sh/include/asm/hardirq.h ++++ b/arch/sh/include/asm/hardirq.h +@@ -2,16 +2,10 @@ + #ifndef __ASM_SH_HARDIRQ_H + #define __ASM_SH_HARDIRQ_H + +-#include +-#include +- +-typedef struct { +- unsigned int __softirq_pending; +- unsigned int __nmi_count; /* arch dependent */ +-} ____cacheline_aligned irq_cpustat_t; +- +-#include /* Standard mappings for irq_cpustat_t above */ +- + extern void ack_bad_irq(unsigned int irq); ++#define ack_bad_irq ack_bad_irq ++#define ARCH_WANTS_NMI_IRQSTAT ++ ++#include + + #endif /* __ASM_SH_HARDIRQ_H */ +diff --git a/arch/sh/include/asm/kmap_types.h b/arch/sh/include/asm/kmap_types.h +deleted file mode 100644 +index b78107f92..000000000 +--- a/arch/sh/include/asm/kmap_types.h ++++ /dev/null +@@ -1,15 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef __SH_KMAP_TYPES_H +-#define __SH_KMAP_TYPES_H +- +-/* Dummy header just to define km_type. 
*/ +- +-#ifdef CONFIG_DEBUG_HIGHMEM +-#define __WITH_KM_FENCE +-#endif +- +-#include +- +-#undef __WITH_KM_FENCE +- +-#endif +diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h +index e82369f28..22ca9a98b 100644 +--- a/arch/sh/include/asm/spinlock_types.h ++++ b/arch/sh/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef __ASM_SH_SPINLOCK_TYPES_H + #define __ASM_SH_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H +-# error "please don't include this file directly" +-#endif +- + typedef struct { + volatile unsigned int lock; + } arch_spinlock_t; +diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c +index 5717c7cbd..5db7af565 100644 +--- a/arch/sh/kernel/irq.c ++++ b/arch/sh/kernel/irq.c +@@ -44,7 +44,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) + + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) +- seq_printf(p, "%10u ", nmi_count(j)); ++ seq_printf(p, "%10u ", per_cpu(irq_stat.__nmi_count, j)); + seq_printf(p, " Non-maskable interrupts\n"); + + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); +@@ -148,6 +148,7 @@ void irq_ctx_exit(int cpu) + hardirq_ctx[cpu] = NULL; + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + struct thread_info *curctx; +@@ -175,6 +176,7 @@ void do_softirq_own_stack(void) + "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" + ); + } ++#endif + #else + static inline void handle_one_irq(unsigned int irq) + { +diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c +index 9c3d32b80..f5beecdac 100644 +--- a/arch/sh/kernel/traps.c ++++ b/arch/sh/kernel/traps.c +@@ -186,7 +186,7 @@ BUILD_TRAP_HANDLER(nmi) + arch_ftrace_nmi_enter(); + + nmi_enter(); +- nmi_count(cpu)++; ++ this_cpu_inc(irq_stat.__nmi_count); + + switch (notify_die(DIE_NMI, "NMI", regs, 0, vec & 0xff, SIGINT)) { + case NOTIFY_OK: +diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c +index 3348e0c4d..0db6919af 100644 +--- a/arch/sh/mm/init.c ++++ b/arch/sh/mm/init.c +@@ -362,9 +362,6 @@ void __init mem_init(void) + mem_init_print_info(NULL); + pr_info("virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +-#ifdef CONFIG_HIGHMEM +- " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" +-#endif + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB) (cached)\n" + #ifdef CONFIG_UNCACHED_MAPPING +@@ -376,11 +373,6 @@ void __init mem_init(void) + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, + +-#ifdef CONFIG_HIGHMEM +- PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, +- (LAST_PKMAP*PAGE_SIZE) >> 10, +-#endif +- + (unsigned long)VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, + +diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig +index 530b7ec5d..a38d00d8b 100644 +--- a/arch/sparc/Kconfig ++++ b/arch/sparc/Kconfig +@@ -139,6 +139,7 @@ config MMU + config HIGHMEM + bool + default y if SPARC32 ++ select KMAP_LOCAL + + config ZONE_DMA + bool +diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h +index 6c35f0d27..875116209 100644 +--- a/arch/sparc/include/asm/highmem.h ++++ b/arch/sparc/include/asm/highmem.h +@@ -24,7 +24,6 @@ + #include + #include + #include +-#include + #include + + /* declarations for highmem.c */ +@@ -33,8 +32,6 @@ extern unsigned long highstart_pfn, highend_pfn; + #define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE) + extern pte_t *pkmap_page_table; + +-void kmap_init(void) __init; +- + /* + * Right now we initialize only a single pte table. 
It can be extended + * easily, subsequent pte tables have to be allocated in one physical +@@ -53,6 +50,11 @@ void kmap_init(void) __init; + + #define flush_cache_kmaps() flush_cache_all() + ++/* FIXME: Use __flush_tlb_one(vaddr) instead of flush_cache_all() -- Anton */ ++#define arch_kmap_local_post_map(vaddr, pteval) flush_cache_all() ++#define arch_kmap_local_post_unmap(vaddr) flush_cache_all() ++ ++ + #endif /* __KERNEL__ */ + + #endif /* _ASM_HIGHMEM_H */ +diff --git a/arch/sparc/include/asm/kmap_types.h b/arch/sparc/include/asm/kmap_types.h +deleted file mode 100644 +index 55a99b6bd..000000000 +--- a/arch/sparc/include/asm/kmap_types.h ++++ /dev/null +@@ -1,11 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_KMAP_TYPES_H +-#define _ASM_KMAP_TYPES_H +- +-/* Dummy header just to define km_type. None of this +- * is actually used on sparc. -DaveM +- */ +- +-#include +- +-#endif +diff --git a/arch/sparc/include/asm/vaddrs.h b/arch/sparc/include/asm/vaddrs.h +index 84d054b07..4fec0341e 100644 +--- a/arch/sparc/include/asm/vaddrs.h ++++ b/arch/sparc/include/asm/vaddrs.h +@@ -32,13 +32,13 @@ + #define SRMMU_NOCACHE_ALCRATIO 64 /* 256 pages per 64MB of system RAM */ + + #ifndef __ASSEMBLY__ +-#include ++#include + + enum fixed_addresses { + FIX_HOLE, + #ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, +- FIX_KMAP_END = (KM_TYPE_NR * NR_CPUS), ++ FIX_KMAP_END = (KM_MAX_IDX * NR_CPUS), + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c +index 3ec9f1402..eb21682ab 100644 +--- a/arch/sparc/kernel/irq_64.c ++++ b/arch/sparc/kernel/irq_64.c +@@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) + set_irq_regs(old_regs); + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + void *orig_sp, *sp = softirq_stack[smp_processor_id()]; +@@ -868,6 +869,7 @@ void do_softirq_own_stack(void) + __asm__ __volatile__("mov %0, %%sp" + : : "r" (orig_sp)); + } ++#endif + + #ifdef CONFIG_HOTPLUG_CPU + void fixup_irqs(void) +diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile +index b078205b7..68db1f859 100644 +--- a/arch/sparc/mm/Makefile ++++ b/arch/sparc/mm/Makefile +@@ -15,6 +15,3 @@ obj-$(CONFIG_SPARC32) += leon_mm.o + + # Only used by sparc64 + obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +- +-# Only used by sparc32 +-obj-$(CONFIG_HIGHMEM) += highmem.o +diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c +deleted file mode 100644 +index 8f2a2afb0..000000000 +--- a/arch/sparc/mm/highmem.c ++++ /dev/null +@@ -1,115 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * highmem.c: virtual kernel memory mappings for high memory +- * +- * Provides kernel-static versions of atomic kmap functions originally +- * found as inlines in include/asm-sparc/highmem.h. These became +- * needed as kmap_atomic() and kunmap_atomic() started getting +- * called from within modules. +- * -- Tomas Szepe , September 2002 +- * +- * But kmap_atomic() and kunmap_atomic() cannot be inlined in +- * modules because they are loaded with btfixup-ped functions. +- */ +- +-/* +- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap +- * gives a more generic (and caching) interface. But kmap_atomic can +- * be used in IRQ contexts, so in some (very limited) cases we need it. +- * +- * XXX This is an old text. Actually, it's good to use atomic kmaps, +- * provided you remember that they are atomic and not try to sleep +- * with a kmap taken, much like a spinlock. 
Non-atomic kmaps are +- * shared by CPUs, and so precious, and establishing them requires IPI. +- * Atomic kmaps are lightweight and we may have NCPUS more of them. +- */ +-#include +-#include +-#include +- +-#include +-#include +-#include +- +-static pte_t *kmap_pte; +- +-void __init kmap_init(void) +-{ +- unsigned long address = __fix_to_virt(FIX_KMAP_BEGIN); +- +- /* cache the first kmap pte */ +- kmap_pte = virt_to_kpte(address); +-} +- +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- long idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- +-/* XXX Fix - Anton */ +-#if 0 +- __flush_cache_one(vaddr); +-#else +- flush_cache_all(); +-#endif +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte-idx))); +-#endif +- set_pte(kmap_pte-idx, mk_pte(page, prot)); +-/* XXX Fix - Anton */ +-#if 0 +- __flush_tlb_one(vaddr); +-#else +- flush_tlb_all(); +-#endif +- +- return (void*) vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- int type; +- +- if (vaddr < FIXADDR_START) +- return; +- +- type = kmap_atomic_idx(); +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- { +- unsigned long idx; +- +- idx = type + KM_TYPE_NR * smp_processor_id(); +- BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); +- +- /* XXX Fix - Anton */ +-#if 0 +- __flush_cache_one(vaddr); +-#else +- flush_cache_all(); +-#endif +- +- /* +- * force other mappings to Oops if they'll try to access +- * this pte without first remap it +- */ +- pte_clear(&init_mm, vaddr, kmap_pte-idx); +- /* XXX Fix - Anton */ +-#if 0 +- __flush_tlb_one(vaddr); +-#else +- flush_tlb_all(); +-#endif +- } +-#endif +- +- kmap_atomic_idx_pop(); +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c +index 0070f8b9a..a03caa5f6 100644 +--- a/arch/sparc/mm/srmmu.c ++++ b/arch/sparc/mm/srmmu.c +@@ -971,8 +971,6 @@ void __init srmmu_paging_init(void) + + sparc_context_init(num_contexts); + +- kmap_init(); +- + { + unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; + +diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h +index 2c697a145..2efac5827 100644 +--- a/arch/um/include/asm/fixmap.h ++++ b/arch/um/include/asm/fixmap.h +@@ -3,7 +3,6 @@ + #define __UM_FIXMAP_H + + #include +-#include + #include + #include + #include +diff --git a/arch/um/include/asm/hardirq.h b/arch/um/include/asm/hardirq.h +index b426796d2..52e2c3626 100644 +--- a/arch/um/include/asm/hardirq.h ++++ b/arch/um/include/asm/hardirq.h +@@ -2,22 +2,7 @@ + #ifndef __ASM_UM_HARDIRQ_H + #define __ASM_UM_HARDIRQ_H + +-#include +-#include +- +-typedef struct { +- unsigned int __softirq_pending; +-} ____cacheline_aligned irq_cpustat_t; +- +-#include /* Standard mappings for irq_cpustat_t above */ +-#include +- +-#ifndef ack_bad_irq +-static inline void ack_bad_irq(unsigned int irq) +-{ +- printk(KERN_CRIT "unexpected IRQ trap at vector %02x\n", irq); +-} +-#endif ++#include + + #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 + +diff --git a/arch/um/include/asm/kmap_types.h b/arch/um/include/asm/kmap_types.h +deleted file mode 100644 +index b0bd12de1..000000000 +--- a/arch/um/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-/* +- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) +- */ +- +-#ifndef __UM_KMAP_TYPES_H +-#define __UM_KMAP_TYPES_H +- +-/* No 
more #include "asm/arch/kmap_types.h" ! */ +- +-#define KM_TYPE_NR 14 +- +-#endif +diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c +index e4abac6c9..173999422 100644 +--- a/arch/um/kernel/kmsg_dump.c ++++ b/arch/um/kernel/kmsg_dump.c +@@ -1,15 +1,19 @@ + // SPDX-License-Identifier: GPL-2.0 + #include ++#include + #include + #include + #include + #include + + static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { ++ static DEFINE_SPINLOCK(lock); + static char line[1024]; + struct console *con; ++ unsigned long flags; + size_t len = 0; + + /* only dump kmsg when no console is available */ +@@ -24,11 +28,16 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, + if (con) + return; + ++ if (!spin_trylock_irqsave(&lock, flags)) ++ return; ++ + printf("kmsg_dump:\n"); +- while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) { ++ while (kmsg_dump_get_line(iter, true, line, sizeof(line), &len)) { + line[len] = '\0'; + printf("%s", line); + } ++ ++ spin_unlock_irqrestore(&lock, flags); + } + + static struct kmsg_dumper kmsg_dumper = { +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 040fb7736..79c0da581 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -16,6 +16,7 @@ config X86_32 + select CLKSRC_I8253 + select CLONE_BACKWARDS + select HAVE_DEBUG_STACKOVERFLOW ++ select KMAP_LOCAL + select MODULES_USE_ELF_REL + select OLD_SIGACTION + select GENERIC_VDSO_32 +@@ -95,6 +96,7 @@ config X86 + select ARCH_SUPPORTS_ACPI + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 ++ select ARCH_SUPPORTS_RT + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USE_QUEUED_SPINLOCKS +@@ -216,6 +218,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_REGS_AND_STACK_ACCESS_API +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index be891fdf8..29c716ed1 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -379,14 +379,14 @@ static int ecb_encrypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes)) { ++ kernel_fpu_begin(); + aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } +- kernel_fpu_end(); + + return err; + } +@@ -401,14 +401,14 @@ static int ecb_decrypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes)) { ++ kernel_fpu_begin(); + aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } +- kernel_fpu_end(); + + return err; + } +@@ -423,14 +423,14 @@ static int cbc_encrypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes)) { ++ kernel_fpu_begin(); + aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } 
+- kernel_fpu_end(); + + return err; + } +@@ -445,14 +445,14 @@ static int cbc_decrypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes)) { ++ kernel_fpu_begin(); + aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } +- kernel_fpu_end(); + + return err; + } +@@ -500,18 +500,20 @@ static int ctr_crypt(struct skcipher_request *req) + + err = skcipher_walk_virt(&walk, req, true); + +- kernel_fpu_begin(); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { ++ kernel_fpu_begin(); + aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); ++ kernel_fpu_end(); + nbytes &= AES_BLOCK_SIZE - 1; + err = skcipher_walk_done(&walk, nbytes); + } + if (walk.nbytes) { ++ kernel_fpu_begin(); + ctr_crypt_final(ctx, &walk); ++ kernel_fpu_end(); + err = skcipher_walk_done(&walk, 0); + } +- kernel_fpu_end(); + + return err; + } +diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c +index 384ccb00f..2f8df8ef8 100644 +--- a/arch/x86/crypto/cast5_avx_glue.c ++++ b/arch/x86/crypto/cast5_avx_glue.c +@@ -46,7 +46,7 @@ static inline void cast5_fpu_end(bool fpu_enabled) + + static int ecb_crypt(struct skcipher_request *req, bool enc) + { +- bool fpu_enabled = false; ++ bool fpu_enabled; + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; +@@ -61,7 +61,7 @@ static int ecb_crypt(struct skcipher_request *req, bool enc) + u8 *wsrc = walk.src.virt.addr; + u8 *wdst = walk.dst.virt.addr; + +- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); ++ fpu_enabled = cast5_fpu_begin(false, &walk, nbytes); + + /* Process multi-block batch */ + if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { +@@ -90,10 +90,9 @@ static int ecb_crypt(struct skcipher_request *req, bool enc) + } while (nbytes >= bsize); + + done: ++ cast5_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } +- +- cast5_fpu_end(fpu_enabled); + return err; + } + +@@ -197,7 +196,7 @@ static int cbc_decrypt(struct skcipher_request *req) + { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); +- bool fpu_enabled = false; ++ bool fpu_enabled; + struct skcipher_walk walk; + unsigned int nbytes; + int err; +@@ -205,12 +204,11 @@ static int cbc_decrypt(struct skcipher_request *req) + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes)) { +- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); ++ fpu_enabled = cast5_fpu_begin(false, &walk, nbytes); + nbytes = __cbc_decrypt(ctx, &walk); ++ cast5_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } +- +- cast5_fpu_end(fpu_enabled); + return err; + } + +@@ -277,7 +275,7 @@ static int ctr_crypt(struct skcipher_request *req) + { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); +- bool fpu_enabled = false; ++ bool fpu_enabled; + struct skcipher_walk walk; + unsigned int nbytes; + int err; +@@ -285,13 +283,12 @@ static int ctr_crypt(struct skcipher_request *req) + err = skcipher_walk_virt(&walk, req, false); + + while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { +- fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); ++ fpu_enabled = 
cast5_fpu_begin(false, &walk, nbytes); + nbytes = __ctr_crypt(&walk, ctx); ++ cast5_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } + +- cast5_fpu_end(fpu_enabled); +- + if (walk.nbytes) { + ctr_crypt_final(&walk, ctx); + err = skcipher_walk_done(&walk, 0); +diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c +index d3d91a0ab..6d0774721 100644 +--- a/arch/x86/crypto/glue_helper.c ++++ b/arch/x86/crypto/glue_helper.c +@@ -24,7 +24,7 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; +- bool fpu_enabled = false; ++ bool fpu_enabled; + unsigned int nbytes; + int err; + +@@ -37,7 +37,7 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, + unsigned int i; + + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, +- &walk, fpu_enabled, nbytes); ++ &walk, false, nbytes); + for (i = 0; i < gctx->num_funcs; i++) { + func_bytes = bsize * gctx->funcs[i].num_blocks; + +@@ -55,10 +55,9 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, + if (nbytes < bsize) + break; + } ++ glue_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } +- +- glue_fpu_end(fpu_enabled); + return err; + } + EXPORT_SYMBOL_GPL(glue_ecb_req_128bit); +@@ -101,7 +100,7 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; +- bool fpu_enabled = false; ++ bool fpu_enabled; + unsigned int nbytes; + int err; + +@@ -115,7 +114,7 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, + u128 last_iv; + + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, +- &walk, fpu_enabled, nbytes); ++ &walk, false, nbytes); + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; +@@ -148,10 +147,10 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, + done: + u128_xor(dst, dst, (u128 *)walk.iv); + *(u128 *)walk.iv = last_iv; ++ glue_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } + +- glue_fpu_end(fpu_enabled); + return err; + } + EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit); +@@ -162,7 +161,7 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, + void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); + const unsigned int bsize = 128 / 8; + struct skcipher_walk walk; +- bool fpu_enabled = false; ++ bool fpu_enabled; + unsigned int nbytes; + int err; + +@@ -176,7 +175,7 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, + le128 ctrblk; + + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, +- &walk, fpu_enabled, nbytes); ++ &walk, false, nbytes); + + be128_to_le128(&ctrblk, (be128 *)walk.iv); + +@@ -202,11 +201,10 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, + } + + le128_to_be128((be128 *)walk.iv, &ctrblk); ++ glue_fpu_end(fpu_enabled); + err = skcipher_walk_done(&walk, nbytes); + } + +- glue_fpu_end(fpu_enabled); +- + if (nbytes) { + le128 ctrblk; + u128 tmp; +@@ -306,8 +304,14 @@ int glue_xts_req_128bit(const struct common_glue_ctx *gctx, + tweak_fn(tweak_ctx, walk.iv, walk.iv); + + while (nbytes) { ++ fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, ++ &walk, fpu_enabled, ++ nbytes < bsize ? 
bsize : nbytes); + nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk); + ++ glue_fpu_end(fpu_enabled); ++ fpu_enabled = false; ++ + err = skcipher_walk_done(&walk, nbytes); + nbytes = walk.nbytes; + } +diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h +index 77217bd29..8eba66a33 100644 +--- a/arch/x86/include/asm/fixmap.h ++++ b/arch/x86/include/asm/fixmap.h +@@ -31,7 +31,7 @@ + #include + #ifdef CONFIG_X86_32 + #include +-#include ++#include + #else + #include + #endif +@@ -94,7 +94,7 @@ enum fixed_addresses { + #endif + #ifdef CONFIG_X86_32 + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ +- FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, ++ FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, + #ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, + #endif +@@ -151,7 +151,6 @@ extern void reserve_top_address(unsigned long reserve); + + extern int fixmaps_set; + +-extern pte_t *kmap_pte; + extern pte_t *pkmap_page_table; + + void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); +diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h +index 8b9bfaad6..d31b08865 100644 +--- a/arch/x86/include/asm/fpu/api.h ++++ b/arch/x86/include/asm/fpu/api.h +@@ -28,6 +28,7 @@ extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); + extern void kernel_fpu_end(void); + extern bool irq_fpu_usable(void); + extern void fpregs_mark_activate(void); ++extern void kernel_fpu_resched(void); + + /* Code that is unaware of kernel_fpu_begin_mask() can use this */ + static inline void kernel_fpu_begin(void) +@@ -40,17 +41,32 @@ static inline void kernel_fpu_begin(void) + * A context switch will (and softirq might) save CPU's FPU registers to + * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * a random state. ++ * ++ * local_bh_disable() protects against both preemption and soft interrupts ++ * on !RT kernels. ++ * ++ * On RT kernels local_bh_disable() is not sufficient because it only ++ * serializes soft interrupt related sections via a local lock, but stays ++ * preemptible. Disabling preemption is the right choice here as bottom ++ * half processing is always in thread context on RT kernels so it ++ * implicitly prevents bottom half processing as well. ++ * ++ * Disabling preemption also serializes against kernel_fpu_begin(). 
+ */ + static inline void fpregs_lock(void) + { +- preempt_disable(); +- local_bh_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_bh_disable(); ++ else ++ preempt_disable(); + } + + static inline void fpregs_unlock(void) + { +- local_bh_enable(); +- preempt_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_bh_enable(); ++ else ++ preempt_enable(); + } + + #ifdef CONFIG_X86_DEBUG_FPU +diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h +index 0f420b24e..032e02085 100644 +--- a/arch/x86/include/asm/highmem.h ++++ b/arch/x86/include/asm/highmem.h +@@ -23,7 +23,6 @@ + + #include + #include +-#include + #include + #include + #include +@@ -58,11 +57,17 @@ extern unsigned long highstart_pfn, highend_pfn; + #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) + #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) + +-void *kmap_atomic_pfn(unsigned long pfn); +-void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); +- + #define flush_cache_kmaps() do { } while (0) + ++#define arch_kmap_local_post_map(vaddr, pteval) \ ++ arch_flush_lazy_mmu_mode() ++ ++#define arch_kmap_local_post_unmap(vaddr) \ ++ do { \ ++ flush_tlb_one_kernel((vaddr)); \ ++ arch_flush_lazy_mmu_mode(); \ ++ } while (0) ++ + extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn); + +diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h +index bacf68c4d..e2de092fc 100644 +--- a/arch/x86/include/asm/iomap.h ++++ b/arch/x86/include/asm/iomap.h +@@ -9,19 +9,14 @@ + #include + #include + #include ++#include + #include + #include + +-void __iomem * +-iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); ++void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot); + +-void +-iounmap_atomic(void __iomem *kvaddr); ++int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); + +-int +-iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); +- +-void +-iomap_free(resource_size_t base, unsigned long size); ++void iomap_free(resource_size_t base, unsigned long size); + + #endif /* _ASM_X86_IOMAP_H */ +diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h +deleted file mode 100644 +index 04ab8266e..000000000 +--- a/arch/x86/include/asm/kmap_types.h ++++ /dev/null +@@ -1,13 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_X86_KMAP_TYPES_H +-#define _ASM_X86_KMAP_TYPES_H +- +-#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) +-#define __WITH_KM_FENCE +-#endif +- +-#include +- +-#undef __WITH_KM_FENCE +- +-#endif /* _ASM_X86_KMAP_TYPES_H */ +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index b30b56d47..9632218bf 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -43,7 +43,6 @@ + #ifndef __ASSEMBLY__ + + #include +-#include + #include + #include + +diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h +index 2380df419..aacfaad6c 100644 +--- a/arch/x86/include/asm/preempt.h ++++ b/arch/x86/include/asm/preempt.h +@@ -90,21 +90,54 @@ static __always_inline void __preempt_count_sub(int val) + * a decrement which hits zero means we have no preempt_count and should + * reschedule. 
+ */ +-static __always_inline bool __preempt_count_dec_and_test(void) ++static __always_inline bool ____preempt_count_dec_and_test(void) + { + return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + } + ++static __always_inline bool __preempt_count_dec_and_test(void) ++{ ++ if (____preempt_count_dec_and_test()) ++ return true; ++#ifdef CONFIG_PREEMPT_LAZY ++ if (preempt_count()) ++ return false; ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else ++ return false; ++#endif ++} ++ + /* + * Returns true when we need to resched and can (barring IRQ state). + */ + static __always_inline bool should_resched(int preempt_offset) + { ++#ifdef CONFIG_PREEMPT_LAZY ++ u32 tmp; ++ tmp = raw_cpu_read_4(__preempt_count); ++ if (tmp == preempt_offset) ++ return true; ++ ++ /* preempt count == 0 ? */ ++ tmp &= ~PREEMPT_NEED_RESCHED; ++ if (tmp != preempt_offset) ++ return false; ++ /* XXX PREEMPT_LOCK_OFFSET */ ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); ++#endif + } + + #ifdef CONFIG_PREEMPTION +- ++#ifdef CONFIG_PREEMPT_RT ++ extern void preempt_schedule_lock(void); ++#endif + extern asmlinkage void preempt_schedule(void); + extern asmlinkage void preempt_schedule_thunk(void); + +diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h +index 6fd8410a3..f3bf2f515 100644 +--- a/arch/x86/include/asm/signal.h ++++ b/arch/x86/include/asm/signal.h +@@ -28,6 +28,19 @@ typedef struct { + #define SA_IA32_ABI 0x02000000u + #define SA_X32_ABI 0x01000000u + ++/* ++ * Because some traps use the IST stack, we must keep preemption ++ * disabled while calling do_trap(), but do_trap() may call ++ * force_sig_info() which will grab the signal spin_locks for the ++ * task, which in PREEMPT_RT are mutexes. By defining ++ * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set ++ * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the ++ * trap. ++ */ ++#if defined(CONFIG_PREEMPT_RT) ++#define ARCH_RT_DELAYS_SIGNAL_SEND ++#endif ++ + #ifndef CONFIG_COMPAT + typedef sigset_t compat_sigset_t; + #endif +diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h +index 7fb482f0f..3df0a95c9 100644 +--- a/arch/x86/include/asm/stackprotector.h ++++ b/arch/x86/include/asm/stackprotector.h +@@ -65,7 +65,7 @@ + */ + static __always_inline void boot_init_stack_canary(void) + { +- u64 canary; ++ u64 canary = 0; + u64 tsc; + + #ifdef CONFIG_X86_64 +@@ -76,8 +76,14 @@ static __always_inline void boot_init_stack_canary(void) + * of randomness. The TSC only matters for very early init, + * there it already has some randomness on most systems. Later + * on during the bootup the random pool has true entropy too. ++ * For preempt-rt we need to weaken the randomness a bit, as ++ * we can't call into the random generator from atomic context ++ * due to locking constraints. We just leave canary ++ * uninitialized and use the TSC based randomness on top of it. 
+ */ ++#ifndef CONFIG_PREEMPT_RT + get_random_bytes(&canary, sizeof(canary)); ++#endif + tsc = rdtsc(); + canary += tsc + (tsc << 32UL); + canary &= CANARY_MASK; +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index a225c6e2c..ad34b468a 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -60,6 +60,8 @@ struct thread_info { + #ifdef CONFIG_SMP + u32 cpu; /* current CPU */ + #endif ++ int preempt_lazy_count; /* 0 => lazy preemptable ++ <0 => BUG */ + KABI_RESERVE(1) + KABI_RESERVE(2) + }; +@@ -67,12 +69,17 @@ struct thread_info { + #define INIT_THREAD_INFO(tsk) \ + { \ + .flags = 0, \ ++ .preempt_lazy_count = 0, \ + } + + #else /* !__ASSEMBLY__ */ + + #include + ++#define GET_THREAD_INFO(reg) \ ++ _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ ++ _ASM_SUB $(THREAD_SIZE),reg ; ++ + #endif + + /* +@@ -99,6 +106,7 @@ struct thread_info { + #define TIF_NOTSC 16 /* TSC is not accessible in userland */ + #define TIF_IA32 17 /* IA32 compatibility process */ + #define TIF_SLD 18 /* Restore split lock detection on context switch */ ++#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */ + #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ + #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ + #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +@@ -128,6 +136,7 @@ struct thread_info { + #define _TIF_NOTSC (1 << TIF_NOTSC) + #define _TIF_IA32 (1 << TIF_IA32) + #define _TIF_SLD (1 << TIF_SLD) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) + #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) + #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) +@@ -160,6 +169,8 @@ struct thread_info { + + #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) + ++#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) ++ + #define STACK_WARN (THREAD_SIZE/8) + + /* +diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c +index 65d11711c..180981654 100644 +--- a/arch/x86/kernel/cpu/mshyperv.c ++++ b/arch/x86/kernel/cpu/mshyperv.c +@@ -80,11 +80,12 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); + DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) + { + struct pt_regs *old_regs = set_irq_regs(regs); ++ u64 ip = regs ? instruction_pointer(regs) : 0; + + inc_irq_stat(hyperv_stimer0_count); + if (hv_stimer0_handler) + hv_stimer0_handler(); +- add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); ++ add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0, ip); + ack_APIC_irq(); + + set_irq_regs(old_regs); +diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c +index 33ee47670..5fcac46aa 100644 +--- a/arch/x86/kernel/crash_dump_32.c ++++ b/arch/x86/kernel/crash_dump_32.c +@@ -13,8 +13,6 @@ + + #include + +-static void *kdump_buf_page; +- + static inline bool is_crashed_pfn_valid(unsigned long pfn) + { + #ifndef CONFIG_X86_PAE +@@ -41,15 +39,11 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * +- * Copy a page from "oldmem". For this page, there is no pte mapped +- * in the current kernel. We stitch up a pte, similar to kmap_atomic. +- * +- * Calling copy_to_user() in atomic context is not desirable. Hence first +- * copying the data to a pre-allocated kernel page and then copying to user +- * space in non-atomic context. ++ * Copy a page from "oldmem". 
For this page, there might be no pte mapped ++ * in the current kernel. + */ +-ssize_t copy_oldmem_page(unsigned long pfn, char *buf, +- size_t csize, unsigned long offset, int userbuf) ++ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, ++ unsigned long offset, int userbuf) + { + void *vaddr; + +@@ -59,38 +53,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + +- vaddr = kmap_atomic_pfn(pfn); ++ vaddr = kmap_local_pfn(pfn); + + if (!userbuf) { +- memcpy(buf, (vaddr + offset), csize); +- kunmap_atomic(vaddr); ++ memcpy(buf, vaddr + offset, csize); + } else { +- if (!kdump_buf_page) { +- printk(KERN_WARNING "Kdump: Kdump buffer page not" +- " allocated\n"); +- kunmap_atomic(vaddr); +- return -EFAULT; +- } +- copy_page(kdump_buf_page, vaddr); +- kunmap_atomic(vaddr); +- if (copy_to_user(buf, (kdump_buf_page + offset), csize)) +- return -EFAULT; ++ if (copy_to_user(buf, vaddr + offset, csize)) ++ csize = -EFAULT; + } + +- return csize; +-} ++ kunmap_local(vaddr); + +-static int __init kdump_buf_page_init(void) +-{ +- int ret = 0; +- +- kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); +- if (!kdump_buf_page) { +- printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" +- " page\n"); +- ret = -ENOMEM; +- } +- +- return ret; ++ return csize; + } +-arch_initcall(kdump_buf_page_init); +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 571220ac8..d315d45b6 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -159,6 +159,18 @@ void kernel_fpu_end(void) + } + EXPORT_SYMBOL_GPL(kernel_fpu_end); + ++void kernel_fpu_resched(void) ++{ ++ WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); ++ ++ if (should_resched(PREEMPT_OFFSET)) { ++ kernel_fpu_end(); ++ cond_resched(); ++ kernel_fpu_begin(); ++ } ++} ++EXPORT_SYMBOL_GPL(kernel_fpu_resched); ++ + /* + * Save the FPU state (mark it for reload if necessary): + * +diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c +index 0b79efc87..93c6b88b3 100644 +--- a/arch/x86/kernel/irq_32.c ++++ b/arch/x86/kernel/irq_32.c +@@ -131,6 +131,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) + return 0; + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + struct irq_stack *irqstk; +@@ -147,6 +148,7 @@ void do_softirq_own_stack(void) + + call_on_stack(__do_softirq, isp); + } ++#endif + + void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) + { +diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c +index 440eed558..7cfc4e6b7 100644 +--- a/arch/x86/kernel/irq_64.c ++++ b/arch/x86/kernel/irq_64.c +@@ -72,7 +72,9 @@ int irq_init_percpu_irqstack(unsigned int cpu) + return map_irq_stack(cpu); + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + run_on_irqstack_cond(__do_softirq, NULL); + } ++#endif +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 7da272aea..683ad033f 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8038,6 +8038,14 @@ int kvm_arch_init(void *opaque) + goto out; + } + ++#ifdef CONFIG_PREEMPT_RT ++ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { ++ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); ++ r = -EOPNOTSUPP; ++ goto out; ++ } ++#endif ++ + r = -ENOMEM; + x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), + __alignof__(struct fpu), SLAB_ACCOUNT, +diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c +index 075fe5131..2c54b76d8 100644 +--- a/arch/x86/mm/highmem_32.c ++++ b/arch/x86/mm/highmem_32.c +@@ 
-4,65 +4,6 @@ + #include /* for totalram_pages */ + #include + +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR*smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- BUG_ON(!pte_none(*(kmap_pte-idx))); +- set_pte(kmap_pte-idx, mk_pte(page, prot)); +- arch_flush_lazy_mmu_mode(); +- +- return (void *)vaddr; +-} +-EXPORT_SYMBOL(kmap_atomic_high_prot); +- +-/* +- * This is the same as kmap_atomic() but can map memory that doesn't +- * have a struct page associated with it. +- */ +-void *kmap_atomic_pfn(unsigned long pfn) +-{ +- return kmap_atomic_prot_pfn(pfn, kmap_prot); +-} +-EXPORT_SYMBOL_GPL(kmap_atomic_pfn); +- +-void kunmap_atomic_high(void *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- +- if (vaddr >= __fix_to_virt(FIX_KMAP_END) && +- vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { +- int idx, type; +- +- type = kmap_atomic_idx(); +- idx = type + KM_TYPE_NR * smp_processor_id(); +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +-#endif +- /* +- * Force other mappings to Oops if they'll try to access this +- * pte without first remap it. Keeping stale mappings around +- * is a bad idea also, in case the page changes cacheability +- * attributes or becomes a protected page in a hypervisor. +- */ +- kpte_clear_flush(kmap_pte-idx, vaddr); +- kmap_atomic_idx_pop(); +- arch_flush_lazy_mmu_mode(); +- } +-#ifdef CONFIG_DEBUG_HIGHMEM +- else { +- BUG_ON(vaddr < PAGE_OFFSET); +- BUG_ON(vaddr >= (unsigned long)high_memory); +- } +-#endif +-} +-EXPORT_SYMBOL(kunmap_atomic_high); +- + void __init set_highmem_pages_init(void) + { + struct zone *zone; +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 7c055259d..da31c2635 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -394,19 +394,6 @@ kernel_physical_mapping_init(unsigned long start, + return last_map_addr; + } + +-pte_t *kmap_pte; +- +-static void __init kmap_init(void) +-{ +- unsigned long kmap_vstart; +- +- /* +- * Cache the first kmap pte: +- */ +- kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); +- kmap_pte = virt_to_kpte(kmap_vstart); +-} +- + #ifdef CONFIG_HIGHMEM + static void __init permanent_kmaps_init(pgd_t *pgd_base) + { +@@ -712,8 +699,6 @@ void __init paging_init(void) + + __flush_tlb_all(); + +- kmap_init(); +- + /* + * NOTE: at this point the bootmem allocator is fully available. 
+ */ +diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c +index f60398aeb..9aaa756dd 100644 +--- a/arch/x86/mm/iomap_32.c ++++ b/arch/x86/mm/iomap_32.c +@@ -44,28 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) + } + EXPORT_SYMBOL_GPL(iomap_free); + +-void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +-{ +- unsigned long vaddr; +- int idx, type; +- +- preempt_disable(); +- pagefault_disable(); +- +- type = kmap_atomic_idx_push(); +- idx = type + KM_TYPE_NR * smp_processor_id(); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +- set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); +- arch_flush_lazy_mmu_mode(); +- +- return (void *)vaddr; +-} +- +-/* +- * Map 'pfn' using protections 'prot' +- */ +-void __iomem * +-iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) ++void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot) + { + /* + * For non-PAT systems, translate non-WB request to UC- just in +@@ -81,36 +60,6 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) + /* Filter out unsupported __PAGE_KERNEL* bits: */ + pgprot_val(prot) &= __default_kernel_pte_mask; + +- return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); +-} +-EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); +- +-void +-iounmap_atomic(void __iomem *kvaddr) +-{ +- unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; +- +- if (vaddr >= __fix_to_virt(FIX_KMAP_END) && +- vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { +- int idx, type; +- +- type = kmap_atomic_idx(); +- idx = type + KM_TYPE_NR * smp_processor_id(); +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +-#endif +- /* +- * Force other mappings to Oops if they'll try to access this +- * pte without first remap it. Keeping stale mappings around +- * is a bad idea also, in case the page changes cacheability +- * attributes or becomes a protected page in a hypervisor. +- */ +- kpte_clear_flush(kmap_pte-idx, vaddr); +- kmap_atomic_idx_pop(); +- } +- +- pagefault_enable(); +- preempt_enable(); ++ return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); + } +-EXPORT_SYMBOL_GPL(iounmap_atomic); ++EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot); +diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig +index 87e08ad38..03cbf6b53 100644 +--- a/arch/xtensa/Kconfig ++++ b/arch/xtensa/Kconfig +@@ -666,6 +666,7 @@ endchoice + config HIGHMEM + bool "High Memory Support" + depends on MMU ++ select KMAP_LOCAL + help + Linux can use the full amount of RAM in the system by + default. 
However, the default MMUv2 setup only maps the +diff --git a/arch/xtensa/include/asm/fixmap.h b/arch/xtensa/include/asm/fixmap.h +index a06ffb0c6..92049b61c 100644 +--- a/arch/xtensa/include/asm/fixmap.h ++++ b/arch/xtensa/include/asm/fixmap.h +@@ -16,7 +16,7 @@ + #ifdef CONFIG_HIGHMEM + #include + #include +-#include ++#include + #endif + + /* +@@ -39,7 +39,7 @@ enum fixed_addresses { + /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_BEGIN, + FIX_KMAP_END = FIX_KMAP_BEGIN + +- (KM_TYPE_NR * NR_CPUS * DCACHE_N_COLORS) - 1, ++ (KM_MAX_IDX * NR_CPUS * DCACHE_N_COLORS) - 1, + #endif + __end_of_fixed_addresses + }; +diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h +index eac503215..0fc3b1ceb 100644 +--- a/arch/xtensa/include/asm/highmem.h ++++ b/arch/xtensa/include/asm/highmem.h +@@ -16,9 +16,8 @@ + #include + #include + #include +-#include + +-#define PKMAP_BASE ((FIXADDR_START - \ ++#define PKMAP_BASE ((FIXADDR_START - \ + (LAST_PKMAP + 1) * PAGE_SIZE) & PMD_MASK) + #define LAST_PKMAP (PTRS_PER_PTE * DCACHE_N_COLORS) + #define LAST_PKMAP_MASK (LAST_PKMAP - 1) +@@ -68,6 +67,15 @@ static inline void flush_cache_kmaps(void) + flush_cache_all(); + } + ++enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn); ++#define arch_kmap_local_map_idx kmap_local_map_idx ++ ++enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr); ++#define arch_kmap_local_unmap_idx kmap_local_unmap_idx ++ ++#define arch_kmap_local_post_unmap(vaddr) \ ++ local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE) ++ + void kmap_init(void); + + #endif +diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h +index 64c938925..dc846323b 100644 +--- a/arch/xtensa/include/asm/spinlock_types.h ++++ b/arch/xtensa/include/asm/spinlock_types.h +@@ -2,10 +2,6 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) +-# error "please don't include this file directly" +-#endif +- + #include + #include + +diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c +index 673196fe8..0735ca5e8 100644 +--- a/arch/xtensa/mm/highmem.c ++++ b/arch/xtensa/mm/highmem.c +@@ -12,8 +12,6 @@ + #include + #include + +-static pte_t *kmap_pte; +- + #if DCACHE_WAY_SIZE > PAGE_SIZE + unsigned int last_pkmap_nr_arr[DCACHE_N_COLORS]; + wait_queue_head_t pkmap_map_wait_arr[DCACHE_N_COLORS]; +@@ -33,59 +31,25 @@ static inline void kmap_waitqueues_init(void) + + static inline enum fixed_addresses kmap_idx(int type, unsigned long color) + { +- return (type + KM_TYPE_NR * smp_processor_id()) * DCACHE_N_COLORS + ++ return (type + KM_MAX_IDX * smp_processor_id()) * DCACHE_N_COLORS + + color; + } + +-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) ++enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn) + { +- enum fixed_addresses idx; +- unsigned long vaddr; +- +- idx = kmap_idx(kmap_atomic_idx_push(), +- DCACHE_ALIAS(page_to_phys(page))); +- vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +-#ifdef CONFIG_DEBUG_HIGHMEM +- BUG_ON(!pte_none(*(kmap_pte + idx))); +-#endif +- set_pte(kmap_pte + idx, mk_pte(page, prot)); +- +- return (void *)vaddr; ++ return kmap_idx(type, DCACHE_ALIAS(pfn << PAGE_SHIFT)); + } +-EXPORT_SYMBOL(kmap_atomic_high_prot); + +-void kunmap_atomic_high(void *kvaddr) ++enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr) + { +- if (kvaddr >= (void *)FIXADDR_START && +- kvaddr < (void *)FIXADDR_TOP) 
{ +- int idx = kmap_idx(kmap_atomic_idx(), +- DCACHE_ALIAS((unsigned long)kvaddr)); +- +- /* +- * Force other mappings to Oops if they'll try to access this +- * pte without first remap it. Keeping stale mappings around +- * is a bad idea also, in case the page changes cacheability +- * attributes or becomes a protected page in a hypervisor. +- */ +- pte_clear(&init_mm, kvaddr, kmap_pte + idx); +- local_flush_tlb_kernel_range((unsigned long)kvaddr, +- (unsigned long)kvaddr + PAGE_SIZE); +- +- kmap_atomic_idx_pop(); +- } ++ return kmap_idx(type, DCACHE_ALIAS(addr)); + } +-EXPORT_SYMBOL(kunmap_atomic_high); + + void __init kmap_init(void) + { +- unsigned long kmap_vstart; +- + /* Check if this memory layout is broken because PKMAP overlaps + * page table. + */ + BUILD_BUG_ON(PKMAP_BASE < TLBTEMP_BASE_1 + TLBTEMP_SIZE); +- /* cache the first kmap pte */ +- kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); +- kmap_pte = virt_to_kpte(kmap_vstart); + kmap_waitqueues_init(); + } +diff --git a/block/blk-mq.c b/block/blk-mq.c +index cedc35521..a15e963df 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -44,7 +44,7 @@ + bool mq_unfair_dtag = true; + module_param_named(unfair_dtag, mq_unfair_dtag, bool, 0444); + +-static DEFINE_PER_CPU(struct list_head, blk_cpu_done); ++static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); + + static void blk_mq_poll_stats_start(struct request_queue *q); + static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); +@@ -590,80 +590,29 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) + } + EXPORT_SYMBOL(blk_mq_end_request); + +-/* +- * Softirq action handler - move entries to local list and loop over them +- * while passing them to the queue registered handler. +- */ +-static __latent_entropy void blk_done_softirq(struct softirq_action *h) ++static void blk_complete_reqs(struct llist_head *list) + { +- struct list_head *cpu_list, local_list; +- +- local_irq_disable(); +- cpu_list = this_cpu_ptr(&blk_cpu_done); +- list_replace_init(cpu_list, &local_list); +- local_irq_enable(); +- +- while (!list_empty(&local_list)) { +- struct request *rq; ++ struct llist_node *entry = llist_reverse_order(llist_del_all(list)); ++ struct request *rq, *next; + +- rq = list_entry(local_list.next, struct request, ipi_list); +- list_del_init(&rq->ipi_list); ++ llist_for_each_entry_safe(rq, next, entry, ipi_list) + rq->q->mq_ops->complete(rq); +- } + } + +-static void blk_mq_trigger_softirq(struct request *rq) ++static __latent_entropy void blk_done_softirq(struct softirq_action *h) + { +- struct list_head *list; +- unsigned long flags; +- +- local_irq_save(flags); +- list = this_cpu_ptr(&blk_cpu_done); +- list_add_tail(&rq->ipi_list, list); +- +- /* +- * If the list only contains our just added request, signal a raise of +- * the softirq. If there are already entries there, someone already +- * raised the irq but it hasn't run yet. 
+- */ +- if (list->next == &rq->ipi_list) +- raise_softirq_irqoff(BLOCK_SOFTIRQ); +- local_irq_restore(flags); ++ blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); + } + + static int blk_softirq_cpu_dead(unsigned int cpu) + { +- /* +- * If a CPU goes away, splice its entries to the current CPU +- * and trigger a run of the softirq +- */ +- local_irq_disable(); +- list_splice_init(&per_cpu(blk_cpu_done, cpu), +- this_cpu_ptr(&blk_cpu_done)); +- raise_softirq_irqoff(BLOCK_SOFTIRQ); +- local_irq_enable(); +- ++ blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); + return 0; + } + +- + static void __blk_mq_complete_request_remote(void *data) + { +- struct request *rq = data; +- +- /* +- * For most of single queue controllers, there is only one irq vector +- * for handling I/O completion, and the only irq's affinity is set +- * to all possible CPUs. On most of ARCHs, this affinity means the irq +- * is handled on one specific CPU. +- * +- * So complete I/O requests in softirq context in case of single queue +- * devices to avoid degrading I/O performance due to irqsoff latency. +- */ +- if (rq->q->nr_hw_queues == 1) +- blk_mq_trigger_softirq(rq); +- else +- rq->q->mq_ops->complete(rq); ++ __raise_softirq_irqoff(BLOCK_SOFTIRQ); + } + + static inline bool blk_mq_complete_need_ipi(struct request *rq) +@@ -673,6 +622,14 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) + return false; ++ /* ++ * With force threaded interrupts enabled, raising softirq from an SMP ++ * function call will always result in waking the ksoftirqd thread. ++ * This is probably worse than completing the request on a different ++ * cache domain. ++ */ ++ if (force_irqthreads) ++ return false; + + /* same CPU or cache domain? 
Complete locally */ + if (cpu == rq->mq_ctx->cpu || +@@ -684,6 +641,31 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) + return cpu_online(rq->mq_ctx->cpu); + } + ++static void blk_mq_complete_send_ipi(struct request *rq) ++{ ++ struct llist_head *list; ++ unsigned int cpu; ++ ++ cpu = rq->mq_ctx->cpu; ++ list = &per_cpu(blk_cpu_done, cpu); ++ if (llist_add(&rq->ipi_list, list)) { ++ rq->csd.func = __blk_mq_complete_request_remote; ++ rq->csd.info = rq; ++ smp_call_function_single_async(cpu, &rq->csd); ++ } ++} ++ ++static void blk_mq_raise_softirq(struct request *rq) ++{ ++ struct llist_head *list; ++ ++ preempt_disable(); ++ list = this_cpu_ptr(&blk_cpu_done); ++ if (llist_add(&rq->ipi_list, list)) ++ raise_softirq(BLOCK_SOFTIRQ); ++ preempt_enable(); ++} ++ + bool blk_mq_complete_request_remote(struct request *rq) + { + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); +@@ -696,15 +678,15 @@ bool blk_mq_complete_request_remote(struct request *rq) + return false; + + if (blk_mq_complete_need_ipi(rq)) { +- INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); +- smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); +- } else { +- if (rq->q->nr_hw_queues > 1) +- return false; +- blk_mq_trigger_softirq(rq); ++ blk_mq_complete_send_ipi(rq); ++ return true; + } + +- return true; ++ if (rq->q->nr_hw_queues == 1) { ++ blk_mq_raise_softirq(rq); ++ return true; ++ } ++ return false; + } + EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); + +@@ -1617,14 +1599,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, + return; + + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { +- int cpu = get_cpu(); ++ int cpu = get_cpu_light(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); +- put_cpu(); ++ put_cpu_light(); + return; + } + +- put_cpu(); ++ put_cpu_light(); + } + + /* +@@ -4096,7 +4078,7 @@ static int __init blk_mq_init(void) + int i; + + for_each_possible_cpu(i) +- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); ++ init_llist_head(&per_cpu(blk_cpu_done, i)); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, +diff --git a/crypto/cryptd.c b/crypto/cryptd.c +index a1bea0f4b..5f8ca8c1f 100644 +--- a/crypto/cryptd.c ++++ b/crypto/cryptd.c +@@ -36,6 +36,7 @@ static struct workqueue_struct *cryptd_wq; + struct cryptd_cpu_queue { + struct crypto_queue queue; + struct work_struct work; ++ spinlock_t qlock; + }; + + struct cryptd_queue { +@@ -105,6 +106,7 @@ static int cryptd_init_queue(struct cryptd_queue *queue, + cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu); + crypto_init_queue(&cpu_queue->queue, max_cpu_qlen); + INIT_WORK(&cpu_queue->work, cryptd_queue_worker); ++ spin_lock_init(&cpu_queue->qlock); + } + pr_info("cryptd: max_cpu_qlen set to %d\n", max_cpu_qlen); + return 0; +@@ -129,8 +131,10 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue, + struct cryptd_cpu_queue *cpu_queue; + refcount_t *refcnt; + +- cpu = get_cpu(); +- cpu_queue = this_cpu_ptr(queue->cpu_queue); ++ cpu_queue = raw_cpu_ptr(queue->cpu_queue); ++ spin_lock_bh(&cpu_queue->qlock); ++ cpu = smp_processor_id(); ++ + err = crypto_enqueue_request(&cpu_queue->queue, request); + + refcnt = crypto_tfm_ctx(request->tfm); +@@ -146,7 +150,7 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue, + refcount_inc(refcnt); + + out_put_cpu: +- put_cpu(); ++ spin_unlock_bh(&cpu_queue->qlock); + + return err; + } +@@ -162,16 +166,11 @@ static void cryptd_queue_worker(struct work_struct *work) + 
cpu_queue = container_of(work, struct cryptd_cpu_queue, work); + /* + * Only handle one request at a time to avoid hogging crypto workqueue. +- * preempt_disable/enable is used to prevent being preempted by +- * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent +- * cryptd_enqueue_request() being accessed from software interrupts. + */ +- local_bh_disable(); +- preempt_disable(); ++ spin_lock_bh(&cpu_queue->qlock); + backlog = crypto_get_backlog(&cpu_queue->queue); + req = crypto_dequeue_request(&cpu_queue->queue); +- preempt_enable(); +- local_bh_enable(); ++ spin_unlock_bh(&cpu_queue->qlock); + + if (!req) + return; +diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c +index b574cce98..422753d52 100644 +--- a/drivers/atm/eni.c ++++ b/drivers/atm/eni.c +@@ -2054,7 +2054,7 @@ static int eni_send(struct atm_vcc *vcc,struct sk_buff *skb) + } + submitted++; + ATM_SKB(skb)->vcc = vcc; +- tasklet_disable(&ENI_DEV(vcc->dev)->task); ++ tasklet_disable_in_atomic(&ENI_DEV(vcc->dev)->task); + res = do_tx(skb); + tasklet_enable(&ENI_DEV(vcc->dev)->task); + if (res == enq_ok) return 0; +diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c +index 0636df6b6..1a7523cef 100644 +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index); + static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio); + ++#ifdef CONFIG_PREEMPT_RT ++static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) ++{ ++ size_t index; ++ ++ for (index = 0; index < num_pages; index++) ++ spin_lock_init(&zram->table[index].lock); ++} ++ ++static int zram_slot_trylock(struct zram *zram, u32 index) ++{ ++ int ret; ++ ++ ret = spin_trylock(&zram->table[index].lock); ++ if (ret) ++ __set_bit(ZRAM_LOCK, &zram->table[index].flags); ++ return ret; ++} ++ ++static void zram_slot_lock(struct zram *zram, u32 index) ++{ ++ spin_lock(&zram->table[index].lock); ++ __set_bit(ZRAM_LOCK, &zram->table[index].flags); ++} ++ ++static void zram_slot_unlock(struct zram *zram, u32 index) ++{ ++ __clear_bit(ZRAM_LOCK, &zram->table[index].flags); ++ spin_unlock(&zram->table[index].lock); ++} ++ ++#else ++ ++static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } + + static int zram_slot_trylock(struct zram *zram, u32 index) + { +@@ -74,6 +108,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) + { + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); + } ++#endif + + static inline bool init_done(struct zram *zram) + { +@@ -1165,6 +1200,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) + + if (!huge_class_size) + huge_class_size = zs_huge_class_size(zram->mem_pool); ++ zram_meta_init_table_locks(zram, num_pages); + return true; + } + +diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h +index f2fd46daa..7e4dd447e 100644 +--- a/drivers/block/zram/zram_drv.h ++++ b/drivers/block/zram/zram_drv.h +@@ -63,6 +63,7 @@ struct zram_table_entry { + unsigned long element; + }; + unsigned long flags; ++ spinlock_t lock; + #ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; + #endif +diff --git a/drivers/char/random.c b/drivers/char/random.c +index 8f29cbc08..2cf5ba921 100644 +--- a/drivers/char/random.c ++++ b/drivers/char/random.c +@@ -1272,28 +1272,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs) + return *ptr; + } + +-void add_interrupt_randomness(int irq, int irq_flags) 
++void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) + { + struct entropy_store *r; + struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness); +- struct pt_regs *regs = get_irq_regs(); + unsigned long now = jiffies; + cycles_t cycles = random_get_entropy(); + __u32 c_high, j_high; +- __u64 ip; + unsigned long seed; + int credit = 0; + + if (cycles == 0) +- cycles = get_reg(fast_pool, regs); ++ cycles = get_reg(fast_pool, NULL); + c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0; + j_high = (sizeof(now) > 4) ? now >> 32 : 0; + fast_pool->pool[0] ^= cycles ^ j_high ^ irq; + fast_pool->pool[1] ^= now ^ c_high; +- ip = regs ? instruction_pointer(regs) : _RET_IP_; ++ if (!ip) ++ ip = _RET_IP_; + fast_pool->pool[2] ^= ip; + fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 : +- get_reg(fast_pool, regs); ++ get_reg(fast_pool, NULL); + + fast_mix(fast_pool); + add_interrupt_bench(cycles); +diff --git a/drivers/char/tpm/tpm-dev-common.c b/drivers/char/tpm/tpm-dev-common.c +index 1784530b8..c08cbb306 100644 +--- a/drivers/char/tpm/tpm-dev-common.c ++++ b/drivers/char/tpm/tpm-dev-common.c +@@ -20,7 +20,6 @@ + #include "tpm-dev.h" + + static struct workqueue_struct *tpm_dev_wq; +-static DEFINE_MUTEX(tpm_dev_wq_lock); + + static ssize_t tpm_dev_transmit(struct tpm_chip *chip, struct tpm_space *space, + u8 *buf, size_t bufsiz) +diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c +index 4ed6e6602..c2bd0d40b 100644 +--- a/drivers/char/tpm/tpm_tis.c ++++ b/drivers/char/tpm/tpm_tis.c +@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da + return container_of(data, struct tpm_tis_tcg_phy, priv); + } + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * Flushes previous write operations to chip so that a subsequent ++ * ioread*()s won't stall a cpu. 
++ */ ++static inline void tpm_tis_flush(void __iomem *iobase) ++{ ++ ioread8(iobase + TPM_ACCESS(0)); ++} ++#else ++#define tpm_tis_flush(iobase) do { } while (0) ++#endif ++ ++static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) ++{ ++ iowrite8(b, iobase + addr); ++ tpm_tis_flush(iobase); ++} ++ ++static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) ++{ ++ iowrite32(b, iobase + addr); ++ tpm_tis_flush(iobase); ++} ++ + static int interrupts = -1; + module_param(interrupts, int, 0444); + MODULE_PARM_DESC(interrupts, "Enable interrupts"); +@@ -169,7 +194,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, + struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); + + while (len--) +- iowrite8(*value++, phy->iobase + addr); ++ tpm_tis_iowrite8(*value++, phy->iobase, addr); + + return 0; + } +@@ -196,7 +221,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) + { + struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); + +- iowrite32(value, phy->iobase + addr); ++ tpm_tis_iowrite32(value, phy->iobase, addr); + + return 0; + } +diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c +index 9811c4095..17c9d8251 100644 +--- a/drivers/firewire/ohci.c ++++ b/drivers/firewire/ohci.c +@@ -2545,7 +2545,7 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet) + struct driver_data *driver_data = packet->driver_data; + int ret = -ENOENT; + +- tasklet_disable(&ctx->tasklet); ++ tasklet_disable_in_atomic(&ctx->tasklet); + + if (packet->ack != 0) + goto out; +@@ -3465,7 +3465,7 @@ static int ohci_flush_iso_completions(struct fw_iso_context *base) + struct iso_context *ctx = container_of(base, struct iso_context, base); + int ret = 0; + +- tasklet_disable(&ctx->context.tasklet); ++ tasklet_disable_in_atomic(&ctx->context.tasklet); + + if (!test_and_set_bit_lock(0, &ctx->flushing_completions)) { + context_tasklet((unsigned long)&ctx->context); +diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c +index 28d35b6c6..659367aec 100644 +--- a/drivers/firmware/efi/efi.c ++++ b/drivers/firmware/efi/efi.c +@@ -66,7 +66,7 @@ struct mm_struct efi_mm = { + + struct workqueue_struct *efi_rts_wq; + +-static bool disable_runtime; ++static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); + static int __init setup_noefi(char *arg) + { + disable_runtime = true; +@@ -97,6 +97,9 @@ static int __init parse_efi_cmdline(char *str) + if (parse_option_str(str, "noruntime")) + disable_runtime = true; + ++ if (parse_option_str(str, "runtime")) ++ disable_runtime = false; ++ + if (parse_option_str(str, "nosoftreserve")) + set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); + +diff --git a/drivers/gpu/drm/i915/display/intel_sprite.c b/drivers/gpu/drm/i915/display/intel_sprite.c +index 12f7128b7..a65061e3e 100644 +--- a/drivers/gpu/drm/i915/display/intel_sprite.c ++++ b/drivers/gpu/drm/i915/display/intel_sprite.c +@@ -118,7 +118,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + "PSR idle timed out 0x%x, atomic update may fail\n", + psr_status); + +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + + crtc->debug.min_vbl = min; + crtc->debug.max_vbl = max; +@@ -143,11 +144,13 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + break; + } + +- local_irq_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + + timeout = schedule_timeout(timeout); + +- local_irq_disable(); ++ if 
(!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + } + + finish_wait(wq, &wait); +@@ -180,7 +183,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + return; + + irq_disable: +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + } + + /** +@@ -218,7 +222,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) + new_crtc_state->uapi.event = NULL; + } + +- local_irq_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + + if (intel_vgpu_active(dev_priv)) + return; +diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +index 0c083af5a..2abf043d3 100644 +--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c ++++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +@@ -1080,7 +1080,7 @@ static void reloc_cache_reset(struct reloc_cache *cache, struct i915_execbuffer + struct i915_ggtt *ggtt = cache_to_ggtt(cache); + + intel_gt_flush_ggtt_writes(ggtt->vm.gt); +- io_mapping_unmap_atomic((void __iomem *)vaddr); ++ io_mapping_unmap_local((void __iomem *)vaddr); + + if (drm_mm_node_allocated(&cache->node)) { + ggtt->vm.clear_range(&ggtt->vm, +@@ -1146,7 +1146,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, + + if (cache->vaddr) { + intel_gt_flush_ggtt_writes(ggtt->vm.gt); +- io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr)); ++ io_mapping_unmap_local((void __force __iomem *) unmask_page(cache->vaddr)); + } else { + struct i915_vma *vma; + int err; +@@ -1194,8 +1194,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, + offset += page << PAGE_SHIFT; + } + +- vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, +- offset); ++ vaddr = (void __force *)io_mapping_map_local_wc(&ggtt->iomap, offset); + cache->page = page; + cache->vaddr = (unsigned long)vaddr; + +diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +index 0040b4765..3f4f85478 100644 +--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c ++++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +@@ -342,10 +342,9 @@ void intel_breadcrumbs_park(struct intel_breadcrumbs *b) + /* Kick the work once more to drain the signalers */ + irq_work_sync(&b->irq_work); + while (unlikely(READ_ONCE(b->irq_armed))) { +- local_irq_disable(); +- signal_irq_work(&b->irq_work); +- local_irq_enable(); ++ irq_work_queue(&b->irq_work); + cond_resched(); ++ irq_work_sync(&b->irq_work); + } + GEM_BUG_ON(!list_empty(&b->signalers)); + } +diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c +index f7b2e07e2..313d8a28e 100644 +--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c ++++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c +@@ -60,9 +60,10 @@ static int __engine_unpark(struct intel_wakeref *wf) + + static inline unsigned long __timeline_mark_lock(struct intel_context *ce) + { +- unsigned long flags; ++ unsigned long flags = 0; + +- local_irq_save(flags); ++ if (!force_irqthreads) ++ local_irq_save(flags); + mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); + + return flags; +@@ -72,7 +73,8 @@ static inline void __timeline_mark_unlock(struct intel_context *ce, + unsigned long flags) + { + mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); +- local_irq_restore(flags); ++ if (!force_irqthreads) ++ local_irq_restore(flags); + } + + #else +diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c +index 58276694c..88944c3b1 100644 +--- 
a/drivers/gpu/drm/i915/i915_gem.c ++++ b/drivers/gpu/drm/i915/i915_gem.c +@@ -355,22 +355,15 @@ gtt_user_read(struct io_mapping *mapping, + char __user *user_data, int length) + { + void __iomem *vaddr; +- unsigned long unwritten; ++ bool fail = false; + + /* We can use the cpu mem copy function because this is X86. */ +- vaddr = io_mapping_map_atomic_wc(mapping, base); +- unwritten = __copy_to_user_inatomic(user_data, +- (void __force *)vaddr + offset, +- length); +- io_mapping_unmap_atomic(vaddr); +- if (unwritten) { +- vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE); +- unwritten = copy_to_user(user_data, +- (void __force *)vaddr + offset, +- length); +- io_mapping_unmap(vaddr); +- } +- return unwritten; ++ vaddr = io_mapping_map_local_wc(mapping, base); ++ if (copy_to_user(user_data, (void __force *)vaddr + offset, length)) ++ fail = true; ++ io_mapping_unmap_local(vaddr); ++ ++ return fail; + } + + static int +@@ -539,21 +532,14 @@ ggtt_write(struct io_mapping *mapping, + char __user *user_data, int length) + { + void __iomem *vaddr; +- unsigned long unwritten; ++ bool fail = false; + + /* We can use the cpu mem copy function because this is X86. */ +- vaddr = io_mapping_map_atomic_wc(mapping, base); +- unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset, +- user_data, length); +- io_mapping_unmap_atomic(vaddr); +- if (unwritten) { +- vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE); +- unwritten = copy_from_user((void __force *)vaddr + offset, +- user_data, length); +- io_mapping_unmap(vaddr); +- } +- +- return unwritten; ++ vaddr = io_mapping_map_local_wc(mapping, base); ++ if (copy_from_user((void __force *)vaddr + offset, user_data, length)) ++ fail = true; ++ io_mapping_unmap_local(vaddr); ++ return fail; + } + + /** +diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c +index 759f523c6..7339a42ab 100644 +--- a/drivers/gpu/drm/i915/i915_irq.c ++++ b/drivers/gpu/drm/i915/i915_irq.c +@@ -847,6 +847,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, + spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); + + /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ ++ preempt_disable_rt(); + + /* Get optional system timestamp before query. */ + if (stime) +@@ -898,6 +899,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, + *etime = ktime_get(); + + /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. 
*/ ++ preempt_enable_rt(); + + spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); + +diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h +index a4addcc64..396b65986 100644 +--- a/drivers/gpu/drm/i915/i915_trace.h ++++ b/drivers/gpu/drm/i915/i915_trace.h +@@ -2,6 +2,10 @@ + #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) + #define _I915_TRACE_H_ + ++#ifdef CONFIG_PREEMPT_RT ++#define NOTRACE ++#endif ++ + #include + #include + #include +@@ -778,7 +782,7 @@ DEFINE_EVENT(i915_request, i915_request_add, + TP_ARGS(rq) + ); + +-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) ++#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) + DEFINE_EVENT(i915_request, i915_request_submit, + TP_PROTO(struct i915_request *rq), + TP_ARGS(rq) +diff --git a/drivers/gpu/drm/i915/selftests/i915_gem.c b/drivers/gpu/drm/i915/selftests/i915_gem.c +index 412e21604..432493183 100644 +--- a/drivers/gpu/drm/i915/selftests/i915_gem.c ++++ b/drivers/gpu/drm/i915/selftests/i915_gem.c +@@ -57,12 +57,12 @@ static void trash_stolen(struct drm_i915_private *i915) + + ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); + +- s = io_mapping_map_atomic_wc(&ggtt->iomap, slot); ++ s = io_mapping_map_local_wc(&ggtt->iomap, slot); + for (x = 0; x < PAGE_SIZE / sizeof(u32); x++) { + prng = next_pseudo_random32(prng); + iowrite32(prng, &s[x]); + } +- io_mapping_unmap_atomic(s); ++ io_mapping_unmap_local(s); + } + + ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); +diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +index 65e28c4cd..ca483285f 100644 +--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c ++++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +@@ -1201,9 +1201,9 @@ static int igt_ggtt_page(void *arg) + u64 offset = tmp.start + order[n] * PAGE_SIZE; + u32 __iomem *vaddr; + +- vaddr = io_mapping_map_atomic_wc(&ggtt->iomap, offset); ++ vaddr = io_mapping_map_local_wc(&ggtt->iomap, offset); + iowrite32(n, vaddr + n); +- io_mapping_unmap_atomic(vaddr); ++ io_mapping_unmap_local(vaddr); + } + intel_gt_flush_ggtt_writes(ggtt->vm.gt); + +@@ -1213,9 +1213,9 @@ static int igt_ggtt_page(void *arg) + u32 __iomem *vaddr; + u32 val; + +- vaddr = io_mapping_map_atomic_wc(&ggtt->iomap, offset); ++ vaddr = io_mapping_map_local_wc(&ggtt->iomap, offset); + val = ioread32(vaddr + n); +- io_mapping_unmap_atomic(vaddr); ++ io_mapping_unmap_local(vaddr); + + if (val != n) { + pr_err("insert page failed: found %d, expected %d\n", +diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h +index 6c5bbff12..411f91ee2 100644 +--- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h ++++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h +@@ -60,19 +60,19 @@ fbmem_fini(struct io_mapping *fb) + static inline u32 + fbmem_peek(struct io_mapping *fb, u32 off) + { +- u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK); ++ u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK); + u32 val = ioread32(p + (off & ~PAGE_MASK)); +- io_mapping_unmap_atomic(p); ++ io_mapping_unmap_local(p); + return val; + } + + static inline void + fbmem_poke(struct io_mapping *fb, u32 off, u32 val) + { +- u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK); ++ u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK); + iowrite32(val, p + (off & ~PAGE_MASK)); + wmb(); +- io_mapping_unmap_atomic(p); ++ io_mapping_unmap_local(p); + } + + static 
inline bool +diff --git a/drivers/gpu/drm/qxl/qxl_image.c b/drivers/gpu/drm/qxl/qxl_image.c +index 60ab7151b..93f92ccd4 100644 +--- a/drivers/gpu/drm/qxl/qxl_image.c ++++ b/drivers/gpu/drm/qxl/qxl_image.c +@@ -124,12 +124,12 @@ qxl_image_init_helper(struct qxl_device *qdev, + wrong (check the bitmaps are sent correctly + first) */ + +- ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, 0); ++ ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, 0); + chunk = ptr; + chunk->data_size = height * chunk_stride; + chunk->prev_chunk = 0; + chunk->next_chunk = 0; +- qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr); + + { + void *k_data, *i_data; +@@ -143,7 +143,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + i_data = (void *)data; + + while (remain > 0) { +- ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, page << PAGE_SHIFT); ++ ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, page << PAGE_SHIFT); + + if (page == 0) { + chunk = ptr; +@@ -157,7 +157,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + + memcpy(k_data, i_data, size); + +- qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr); + i_data += size; + remain -= size; + page++; +@@ -175,10 +175,10 @@ qxl_image_init_helper(struct qxl_device *qdev, + page_offset = offset_in_page(out_offset); + size = min((int)(PAGE_SIZE - page_offset), remain); + +- ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, page_base); ++ ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, page_base); + k_data = ptr + page_offset; + memcpy(k_data, i_data, size); +- qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr); + remain -= size; + i_data += size; + out_offset += size; +@@ -189,7 +189,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + qxl_bo_kunmap(chunk_bo); + + image_bo = dimage->bo; +- ptr = qxl_bo_kmap_atomic_page(qdev, image_bo, 0); ++ ptr = qxl_bo_kmap_local_page(qdev, image_bo, 0); + image = ptr; + + image->descriptor.id = 0; +@@ -212,7 +212,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + break; + default: + DRM_ERROR("unsupported image bit depth\n"); +- qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, image_bo, ptr); + return -EINVAL; + } + image->u.bitmap.flags = QXL_BITMAP_TOP_DOWN; +@@ -222,7 +222,7 @@ qxl_image_init_helper(struct qxl_device *qdev, + image->u.bitmap.palette = 0; + image->u.bitmap.data = qxl_bo_physical_address(qdev, chunk_bo, 0); + +- qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, image_bo, ptr); + + return 0; + } +diff --git a/drivers/gpu/drm/qxl/qxl_ioctl.c b/drivers/gpu/drm/qxl/qxl_ioctl.c +index 5cea6eea7..785023081 100644 +--- a/drivers/gpu/drm/qxl/qxl_ioctl.c ++++ b/drivers/gpu/drm/qxl/qxl_ioctl.c +@@ -89,11 +89,11 @@ apply_reloc(struct qxl_device *qdev, struct qxl_reloc_info *info) + { + void *reloc_page; + +- reloc_page = qxl_bo_kmap_atomic_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); ++ reloc_page = qxl_bo_kmap_local_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); + *(uint64_t *)(reloc_page + (info->dst_offset & ~PAGE_MASK)) = qxl_bo_physical_address(qdev, + info->src_bo, + info->src_offset); +- qxl_bo_kunmap_atomic_page(qdev, info->dst_bo, reloc_page); ++ qxl_bo_kunmap_local_page(qdev, info->dst_bo, reloc_page); + } + + static void +@@ -105,9 +105,9 @@ apply_surf_reloc(struct qxl_device *qdev, struct qxl_reloc_info *info) + if (info->src_bo && !info->src_bo->is_primary) + id = info->src_bo->surface_id; + +- reloc_page = 
qxl_bo_kmap_atomic_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); ++ reloc_page = qxl_bo_kmap_local_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); + *(uint32_t *)(reloc_page + (info->dst_offset & ~PAGE_MASK)) = id; +- qxl_bo_kunmap_atomic_page(qdev, info->dst_bo, reloc_page); ++ qxl_bo_kunmap_local_page(qdev, info->dst_bo, reloc_page); + } + + /* return holding the reference to this object */ +@@ -149,7 +149,6 @@ static int qxl_process_single_command(struct qxl_device *qdev, + struct qxl_bo *cmd_bo; + void *fb_cmd; + int i, ret, num_relocs; +- int unwritten; + + switch (cmd->type) { + case QXL_CMD_DRAW: +@@ -185,21 +184,21 @@ static int qxl_process_single_command(struct qxl_device *qdev, + goto out_free_reloc; + + /* TODO copy slow path code from i915 */ +- fb_cmd = qxl_bo_kmap_atomic_page(qdev, cmd_bo, (release->release_offset & PAGE_MASK)); +- unwritten = __copy_from_user_inatomic_nocache +- (fb_cmd + sizeof(union qxl_release_info) + (release->release_offset & ~PAGE_MASK), +- u64_to_user_ptr(cmd->command), cmd->command_size); ++ fb_cmd = qxl_bo_kmap_local_page(qdev, cmd_bo, (release->release_offset & PAGE_MASK)); + +- { ++ if (copy_from_user(fb_cmd + sizeof(union qxl_release_info) + ++ (release->release_offset & ~PAGE_MASK), ++ u64_to_user_ptr(cmd->command), cmd->command_size)) { ++ ret = -EFAULT; ++ } else { + struct qxl_drawable *draw = fb_cmd; + + draw->mm_time = qdev->rom->mm_clock; + } + +- qxl_bo_kunmap_atomic_page(qdev, cmd_bo, fb_cmd); +- if (unwritten) { +- DRM_ERROR("got unwritten %d\n", unwritten); +- ret = -EFAULT; ++ qxl_bo_kunmap_local_page(qdev, cmd_bo, fb_cmd); ++ if (ret) { ++ DRM_ERROR("copy from user failed %d\n", ret); + goto out_free_release; + } + +diff --git a/drivers/gpu/drm/qxl/qxl_object.c b/drivers/gpu/drm/qxl/qxl_object.c +index 544a9e4df..5ee5171d4 100644 +--- a/drivers/gpu/drm/qxl/qxl_object.c ++++ b/drivers/gpu/drm/qxl/qxl_object.c +@@ -173,8 +173,8 @@ int qxl_bo_kmap(struct qxl_bo *bo, void **ptr) + return 0; + } + +-void *qxl_bo_kmap_atomic_page(struct qxl_device *qdev, +- struct qxl_bo *bo, int page_offset) ++void *qxl_bo_kmap_local_page(struct qxl_device *qdev, ++ struct qxl_bo *bo, int page_offset) + { + unsigned long offset; + void *rptr; +@@ -189,7 +189,7 @@ void *qxl_bo_kmap_atomic_page(struct qxl_device *qdev, + goto fallback; + + offset = bo->tbo.mem.start << PAGE_SHIFT; +- return io_mapping_map_atomic_wc(map, offset + page_offset); ++ return io_mapping_map_local_wc(map, offset + page_offset); + fallback: + if (bo->kptr) { + rptr = bo->kptr + (page_offset * PAGE_SIZE); +@@ -215,14 +215,14 @@ void qxl_bo_kunmap(struct qxl_bo *bo) + ttm_bo_kunmap(&bo->kmap); + } + +-void qxl_bo_kunmap_atomic_page(struct qxl_device *qdev, +- struct qxl_bo *bo, void *pmap) ++void qxl_bo_kunmap_local_page(struct qxl_device *qdev, ++ struct qxl_bo *bo, void *pmap) + { + if ((bo->tbo.mem.mem_type != TTM_PL_VRAM) && + (bo->tbo.mem.mem_type != TTM_PL_PRIV)) + goto fallback; + +- io_mapping_unmap_atomic(pmap); ++ io_mapping_unmap_local(pmap); + return; + fallback: + qxl_bo_kunmap(bo); +diff --git a/drivers/gpu/drm/qxl/qxl_object.h b/drivers/gpu/drm/qxl/qxl_object.h +index 5762ea40d..6ae89b1b3 100644 +--- a/drivers/gpu/drm/qxl/qxl_object.h ++++ b/drivers/gpu/drm/qxl/qxl_object.h +@@ -89,8 +89,8 @@ extern int qxl_bo_create(struct qxl_device *qdev, + struct qxl_bo **bo_ptr); + extern int qxl_bo_kmap(struct qxl_bo *bo, void **ptr); + extern void qxl_bo_kunmap(struct qxl_bo *bo); +-void *qxl_bo_kmap_atomic_page(struct qxl_device *qdev, struct qxl_bo *bo, int 
page_offset); +-void qxl_bo_kunmap_atomic_page(struct qxl_device *qdev, struct qxl_bo *bo, void *map); ++void *qxl_bo_kmap_local_page(struct qxl_device *qdev, struct qxl_bo *bo, int page_offset); ++void qxl_bo_kunmap_local_page(struct qxl_device *qdev, struct qxl_bo *bo, void *map); + extern struct qxl_bo *qxl_bo_ref(struct qxl_bo *bo); + extern void qxl_bo_unref(struct qxl_bo **bo); + extern int qxl_bo_pin(struct qxl_bo *bo); +diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c +index b2a475a0c..b665a33b4 100644 +--- a/drivers/gpu/drm/qxl/qxl_release.c ++++ b/drivers/gpu/drm/qxl/qxl_release.c +@@ -414,7 +414,7 @@ union qxl_release_info *qxl_release_map(struct qxl_device *qdev, + union qxl_release_info *info; + struct qxl_bo *bo = release->release_bo; + +- ptr = qxl_bo_kmap_atomic_page(qdev, bo, release->release_offset & PAGE_MASK); ++ ptr = qxl_bo_kmap_local_page(qdev, bo, release->release_offset & PAGE_MASK); + if (!ptr) + return NULL; + info = ptr + (release->release_offset & ~PAGE_MASK); +@@ -429,7 +429,7 @@ void qxl_release_unmap(struct qxl_device *qdev, + void *ptr; + + ptr = ((void *)info) - (release->release_offset & ~PAGE_MASK); +- qxl_bo_kunmap_atomic_page(qdev, bo, ptr); ++ qxl_bo_kunmap_local_page(qdev, bo, ptr); + } + + void qxl_release_fence_buffer_objects(struct qxl_release *release) +diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c +index 07d23a1e6..add8e6044 100644 +--- a/drivers/gpu/drm/radeon/radeon_display.c ++++ b/drivers/gpu/drm/radeon/radeon_display.c +@@ -1828,6 +1828,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, + struct radeon_device *rdev = dev->dev_private; + + /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ ++ preempt_disable_rt(); + + /* Get optional system timestamp before query. */ + if (stime) +@@ -1920,6 +1921,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, + *etime = ktime_get(); + + /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ ++ preempt_enable_rt(); + + /* Decode into vertical and horizontal scanout position. */ + *vpos = position & 0x1fff; +diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c +index fb2a25f84..164b9a015 100644 +--- a/drivers/gpu/drm/ttm/ttm_bo_util.c ++++ b/drivers/gpu/drm/ttm/ttm_bo_util.c +@@ -181,13 +181,15 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, + return -ENOMEM; + + src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); +- dst = kmap_atomic_prot(d, prot); +- if (!dst) +- return -ENOMEM; ++ /* ++ * Ensure that a highmem page is mapped with the correct ++ * pgprot. For non highmem the mapping is already there. ++ */ ++ dst = kmap_local_page_prot(d, prot); + + memcpy_fromio(dst, src, PAGE_SIZE); + +- kunmap_atomic(dst); ++ kunmap_local(dst); + + return 0; + } +@@ -203,13 +205,15 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, + return -ENOMEM; + + dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); +- src = kmap_atomic_prot(s, prot); +- if (!src) +- return -ENOMEM; ++ /* ++ * Ensure that a highmem page is mapped with the correct ++ * pgprot. For non highmem the mapping is already there. 
++ */ ++ src = kmap_local_page_prot(s, prot); + + memcpy_toio(dst, src, PAGE_SIZE); + +- kunmap_atomic(src); ++ kunmap_local(src); + + return 0; + } +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +index e8d66182c..71dba228f 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +@@ -375,12 +375,12 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, + copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset); + + if (unmap_src) { +- kunmap_atomic(d->src_addr); ++ kunmap_local(d->src_addr); + d->src_addr = NULL; + } + + if (unmap_dst) { +- kunmap_atomic(d->dst_addr); ++ kunmap_local(d->dst_addr); + d->dst_addr = NULL; + } + +@@ -388,12 +388,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, + if (WARN_ON_ONCE(dst_page >= d->dst_num_pages)) + return -EINVAL; + +- d->dst_addr = +- kmap_atomic_prot(d->dst_pages[dst_page], +- d->dst_prot); +- if (!d->dst_addr) +- return -ENOMEM; +- ++ d->dst_addr = kmap_local_page_prot(d->dst_pages[dst_page], ++ d->dst_prot); + d->mapped_dst = dst_page; + } + +@@ -401,12 +397,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, + if (WARN_ON_ONCE(src_page >= d->src_num_pages)) + return -EINVAL; + +- d->src_addr = +- kmap_atomic_prot(d->src_pages[src_page], +- d->src_prot); +- if (!d->src_addr) +- return -ENOMEM; +- ++ d->src_addr = kmap_local_page_prot(d->src_pages[src_page], ++ d->src_prot); + d->mapped_src = src_page; + } + diff->do_cpy(diff, d->dst_addr + dst_page_offset, +@@ -436,8 +428,10 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, + * + * Performs a CPU blit from one buffer object to another avoiding a full + * bo vmap which may exhaust- or fragment vmalloc space. +- * On supported architectures (x86), we're using kmap_atomic which avoids +- * cross-processor TLB- and cache flushes and may, on non-HIGHMEM systems ++ * ++ * On supported architectures (x86), we're using kmap_local_prot() which ++ * avoids cross-processor TLB- and cache flushes. kmap_local_prot() will ++ * either map a highmem page with the proper pgprot on HIGHMEM=y systems or + * reference already set-up mappings. + * + * Neither of the buffer objects may be placed in PCI memory +@@ -500,9 +494,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, + } + out: + if (d.src_addr) +- kunmap_atomic(d.src_addr); ++ kunmap_local(d.src_addr); + if (d.dst_addr) +- kunmap_atomic(d.dst_addr); ++ kunmap_local(d.dst_addr); + + return ret; + } +diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h +index 7845fa5de..043e058bb 100644 +--- a/drivers/hv/hyperv_vmbus.h ++++ b/drivers/hv/hyperv_vmbus.h +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + #include "hv_trace.h" + +diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c +index 362da2a83..3dd429a5e 100644 +--- a/drivers/hv/vmbus_drv.c ++++ b/drivers/hv/vmbus_drv.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1307,6 +1308,8 @@ static void vmbus_isr(void) + void *page_addr = hv_cpu->synic_event_page; + struct hv_message *msg; + union hv_synic_event_flags *event; ++ struct pt_regs *regs = get_irq_regs(); ++ u64 ip = regs ? 
instruction_pointer(regs) : 0; + bool handled = false; + + if (unlikely(page_addr == NULL)) +@@ -1351,7 +1354,7 @@ static void vmbus_isr(void) + tasklet_schedule(&hv_cpu->msg_dpc); + } + +- add_interrupt_randomness(hv_get_vector(), 0); ++ add_interrupt_randomness(hv_get_vector(), 0, ip); + } + + /* +@@ -1359,7 +1362,8 @@ static void vmbus_isr(void) + * buffer and call into Hyper-V to transfer the data. + */ + static void hv_kmsg_dump(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + size_t bytes_written; + phys_addr_t panic_pa; +@@ -1374,7 +1378,7 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper, + * Write dump contents to the page. No need to synchronize; panic should + * be single-threaded. + */ +- kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE, ++ kmsg_dump_get_buffer(iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, + &bytes_written); + if (bytes_written) + hyperv_report_panic_msg(panic_pa, bytes_written); +diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig +index d45aba3e1..6e890131d 100644 +--- a/drivers/leds/trigger/Kconfig ++++ b/drivers/leds/trigger/Kconfig +@@ -64,6 +64,7 @@ config LEDS_TRIGGER_BACKLIGHT + + config LEDS_TRIGGER_CPU + bool "LED CPU Trigger" ++ depends on !PREEMPT_RT + help + This allows LEDs to be controlled by active CPUs. This shows + the active CPUs across an array of LEDs so you can see which +diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c +index c82953a32..061fea763 100644 +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -2217,8 +2217,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) + struct raid5_percpu *percpu; + unsigned long cpu; + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + percpu = per_cpu_ptr(conf->percpu, cpu); ++ spin_lock(&percpu->lock); + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { + ops_run_biofill(sh); + overlap_clear++; +@@ -2277,7 +2278,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) + if (test_and_clear_bit(R5_Overlap, &dev->flags)) + wake_up(&sh->raid_conf->wait_for_overlap); + } +- put_cpu(); ++ spin_unlock(&percpu->lock); ++ put_cpu_light(); + } + + static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) +@@ -7099,6 +7101,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) + __func__, cpu); + return -ENOMEM; + } ++ spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); + return 0; + } + +diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h +index 5c05acf20..665fe138a 100644 +--- a/drivers/md/raid5.h ++++ b/drivers/md/raid5.h +@@ -635,6 +635,7 @@ struct r5conf { + int recovery_disabled; + /* per cpu variables */ + struct raid5_percpu { ++ spinlock_t lock; /* Protection for -RT */ + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address +diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c +index 774970bfc..6bc2c728a 100644 +--- a/drivers/mtd/mtdoops.c ++++ b/drivers/mtd/mtdoops.c +@@ -267,7 +267,8 @@ static void find_next_position(struct mtdoops_context *cxt) + } + + static void mtdoops_do_dump(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + struct mtdoops_context *cxt = container_of(dumper, + struct mtdoops_context, dump); +@@ -276,7 +277,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper, + if 
(reason == KMSG_DUMP_OOPS && !dump_oops) + return; + +- kmsg_dump_get_buffer(dumper, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, ++ kmsg_dump_get_buffer(iter, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, + record_size - MTDOOPS_HEADER_SIZE, NULL); + + if (reason != KMSG_DUMP_OOPS) { +diff --git a/drivers/net/arcnet/arc-rimi.c b/drivers/net/arcnet/arc-rimi.c +index 98df38fe5..12d085405 100644 +--- a/drivers/net/arcnet/arc-rimi.c ++++ b/drivers/net/arcnet/arc-rimi.c +@@ -332,7 +332,7 @@ static int __init arc_rimi_init(void) + dev->irq = 9; + + if (arcrimi_probe(dev)) { +- free_netdev(dev); ++ free_arcdev(dev); + return -EIO; + } + +@@ -349,7 +349,7 @@ static void __exit arc_rimi_exit(void) + iounmap(lp->mem_start); + release_mem_region(dev->mem_start, dev->mem_end - dev->mem_start + 1); + free_irq(dev->irq, dev); +- free_netdev(dev); ++ free_arcdev(dev); + } + + #ifndef MODULE +diff --git a/drivers/net/arcnet/arcdevice.h b/drivers/net/arcnet/arcdevice.h +index 22a49c6d7..5d4a4c7ef 100644 +--- a/drivers/net/arcnet/arcdevice.h ++++ b/drivers/net/arcnet/arcdevice.h +@@ -298,6 +298,10 @@ struct arcnet_local { + + int excnak_pending; /* We just got an excesive nak interrupt */ + ++ /* RESET flag handling */ ++ int reset_in_progress; ++ struct work_struct reset_work; ++ + struct { + uint16_t sequence; /* sequence number (incs with each packet) */ + __be16 aborted_seq; +@@ -350,7 +354,9 @@ void arcnet_dump_skb(struct net_device *dev, struct sk_buff *skb, char *desc) + + void arcnet_unregister_proto(struct ArcProto *proto); + irqreturn_t arcnet_interrupt(int irq, void *dev_id); ++ + struct net_device *alloc_arcdev(const char *name); ++void free_arcdev(struct net_device *dev); + + int arcnet_open(struct net_device *dev); + int arcnet_close(struct net_device *dev); +diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c +index e04efc0a5..d76dd7d14 100644 +--- a/drivers/net/arcnet/arcnet.c ++++ b/drivers/net/arcnet/arcnet.c +@@ -387,10 +387,44 @@ static void arcnet_timer(struct timer_list *t) + struct arcnet_local *lp = from_timer(lp, t, timer); + struct net_device *dev = lp->dev; + +- if (!netif_carrier_ok(dev)) { ++ spin_lock_irq(&lp->lock); ++ ++ if (!lp->reset_in_progress && !netif_carrier_ok(dev)) { + netif_carrier_on(dev); + netdev_info(dev, "link up\n"); + } ++ ++ spin_unlock_irq(&lp->lock); ++} ++ ++static void reset_device_work(struct work_struct *work) ++{ ++ struct arcnet_local *lp; ++ struct net_device *dev; ++ ++ lp = container_of(work, struct arcnet_local, reset_work); ++ dev = lp->dev; ++ ++ /* Do not bring the network interface back up if an ifdown ++ * was already done. ++ */ ++ if (!netif_running(dev) || !lp->reset_in_progress) ++ return; ++ ++ rtnl_lock(); ++ ++ /* Do another check, in case of an ifdown that was triggered in ++ * the small race window between the exit condition above and ++ * acquiring RTNL. 
++ */ ++ if (!netif_running(dev) || !lp->reset_in_progress) ++ goto out; ++ ++ dev_close(dev); ++ dev_open(dev, NULL); ++ ++out: ++ rtnl_unlock(); + } + + static void arcnet_reply_tasklet(unsigned long data) +@@ -452,12 +486,25 @@ struct net_device *alloc_arcdev(const char *name) + lp->dev = dev; + spin_lock_init(&lp->lock); + timer_setup(&lp->timer, arcnet_timer, 0); ++ INIT_WORK(&lp->reset_work, reset_device_work); + } + + return dev; + } + EXPORT_SYMBOL(alloc_arcdev); + ++void free_arcdev(struct net_device *dev) ++{ ++ struct arcnet_local *lp = netdev_priv(dev); ++ ++ /* Do not cancel this at ->ndo_close(), as the workqueue itself ++ * indirectly calls the ifdown path through dev_close(). ++ */ ++ cancel_work_sync(&lp->reset_work); ++ free_netdev(dev); ++} ++EXPORT_SYMBOL(free_arcdev); ++ + /* Open/initialize the board. This is called sometime after booting when + * the 'ifconfig' program is run. + * +@@ -587,6 +634,10 @@ int arcnet_close(struct net_device *dev) + + /* shut down the card */ + lp->hw.close(dev); ++ ++ /* reset counters */ ++ lp->reset_in_progress = 0; ++ + module_put(lp->hw.owner); + return 0; + } +@@ -820,6 +871,9 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) + + spin_lock_irqsave(&lp->lock, flags); + ++ if (lp->reset_in_progress) ++ goto out; ++ + /* RESET flag was enabled - if device is not running, we must + * clear it right away (but nothing else). + */ +@@ -852,11 +906,14 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) + if (status & RESETflag) { + arc_printk(D_NORMAL, dev, "spurious reset (status=%Xh)\n", + status); +- arcnet_close(dev); +- arcnet_open(dev); ++ ++ lp->reset_in_progress = 1; ++ netif_stop_queue(dev); ++ netif_carrier_off(dev); ++ schedule_work(&lp->reset_work); + + /* get out of the interrupt handler! */ +- break; ++ goto out; + } + /* RX is inhibited - we must have received something. + * Prepare to receive into the next buffer. 
+@@ -1052,6 +1109,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) + udelay(1); + lp->hw.intmask(dev, lp->intmask); + ++out: + spin_unlock_irqrestore(&lp->lock, flags); + return retval; + } +diff --git a/drivers/net/arcnet/com20020-isa.c b/drivers/net/arcnet/com20020-isa.c +index f983c4ce6..be618e4b9 100644 +--- a/drivers/net/arcnet/com20020-isa.c ++++ b/drivers/net/arcnet/com20020-isa.c +@@ -169,7 +169,7 @@ static int __init com20020_init(void) + dev->irq = 9; + + if (com20020isa_probe(dev)) { +- free_netdev(dev); ++ free_arcdev(dev); + return -EIO; + } + +@@ -182,7 +182,7 @@ static void __exit com20020_exit(void) + unregister_netdev(my_dev); + free_irq(my_dev->irq, my_dev); + release_region(my_dev->base_addr, ARCNET_TOTAL_SIZE); +- free_netdev(my_dev); ++ free_arcdev(my_dev); + } + + #ifndef MODULE +diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c +index 9f44e2e45..b4f8798d8 100644 +--- a/drivers/net/arcnet/com20020-pci.c ++++ b/drivers/net/arcnet/com20020-pci.c +@@ -294,7 +294,7 @@ static void com20020pci_remove(struct pci_dev *pdev) + + unregister_netdev(dev); + free_irq(dev->irq, dev); +- free_netdev(dev); ++ free_arcdev(dev); + } + } + +diff --git a/drivers/net/arcnet/com20020_cs.c b/drivers/net/arcnet/com20020_cs.c +index cf607ffcf..9cc5eb6a8 100644 +--- a/drivers/net/arcnet/com20020_cs.c ++++ b/drivers/net/arcnet/com20020_cs.c +@@ -177,7 +177,7 @@ static void com20020_detach(struct pcmcia_device *link) + dev = info->dev; + if (dev) { + dev_dbg(&link->dev, "kfree...\n"); +- free_netdev(dev); ++ free_arcdev(dev); + } + dev_dbg(&link->dev, "kfree2...\n"); + kfree(info); +diff --git a/drivers/net/arcnet/com90io.c b/drivers/net/arcnet/com90io.c +index cf214b730..3856b447d 100644 +--- a/drivers/net/arcnet/com90io.c ++++ b/drivers/net/arcnet/com90io.c +@@ -396,7 +396,7 @@ static int __init com90io_init(void) + err = com90io_probe(dev); + + if (err) { +- free_netdev(dev); ++ free_arcdev(dev); + return err; + } + +@@ -419,7 +419,7 @@ static void __exit com90io_exit(void) + + free_irq(dev->irq, dev); + release_region(dev->base_addr, ARCNET_TOTAL_SIZE); +- free_netdev(dev); ++ free_arcdev(dev); + } + + module_init(com90io_init) +diff --git a/drivers/net/arcnet/com90xx.c b/drivers/net/arcnet/com90xx.c +index 3dc3d533c..d8dfb9ea0 100644 +--- a/drivers/net/arcnet/com90xx.c ++++ b/drivers/net/arcnet/com90xx.c +@@ -554,7 +554,7 @@ static int __init com90xx_found(int ioaddr, int airq, u_long shmem, + err_release_mem: + release_mem_region(dev->mem_start, dev->mem_end - dev->mem_start + 1); + err_free_dev: +- free_netdev(dev); ++ free_arcdev(dev); + return -EIO; + } + +@@ -672,7 +672,7 @@ static void __exit com90xx_exit(void) + release_region(dev->base_addr, ARCNET_TOTAL_SIZE); + release_mem_region(dev->mem_start, + dev->mem_end - dev->mem_start + 1); +- free_netdev(dev); ++ free_arcdev(dev); + } + } + +diff --git a/drivers/net/ethernet/chelsio/cxgb/common.h b/drivers/net/ethernet/chelsio/cxgb/common.h +index 647506064..0321be773 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/common.h ++++ b/drivers/net/ethernet/chelsio/cxgb/common.h +@@ -238,7 +238,6 @@ struct adapter { + int msg_enable; + u32 mmio_len; + +- struct work_struct ext_intr_handler_task; + struct adapter_params params; + + /* Terminator modules. 
*/ +@@ -257,6 +256,7 @@ struct adapter { + + /* guards async operations */ + spinlock_t async_lock ____cacheline_aligned; ++ u32 pending_thread_intr; + u32 slow_intr_mask; + int t1powersave; + }; +@@ -334,8 +334,7 @@ void t1_interrupts_enable(adapter_t *adapter); + void t1_interrupts_disable(adapter_t *adapter); + void t1_interrupts_clear(adapter_t *adapter); + int t1_elmer0_ext_intr_handler(adapter_t *adapter); +-void t1_elmer0_ext_intr(adapter_t *adapter); +-int t1_slow_intr_handler(adapter_t *adapter); ++irqreturn_t t1_slow_intr_handler(adapter_t *adapter); + + int t1_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc); + const struct board_info *t1_get_board_info(unsigned int board_id); +@@ -347,7 +346,6 @@ int t1_get_board_rev(adapter_t *adapter, const struct board_info *bi, + int t1_init_hw_modules(adapter_t *adapter); + int t1_init_sw_modules(adapter_t *adapter, const struct board_info *bi); + void t1_free_sw_modules(adapter_t *adapter); +-void t1_fatal_err(adapter_t *adapter); + void t1_link_changed(adapter_t *adapter, int port_id); + void t1_link_negotiated(adapter_t *adapter, int port_id, int link_stat, + int speed, int duplex, int pause); +diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +index 1311eac9e..c827273c4 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c ++++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +@@ -211,9 +211,10 @@ static int cxgb_up(struct adapter *adapter) + t1_interrupts_clear(adapter); + + adapter->params.has_msi = !disable_msi && !pci_enable_msi(adapter->pdev); +- err = request_irq(adapter->pdev->irq, t1_interrupt, +- adapter->params.has_msi ? 0 : IRQF_SHARED, +- adapter->name, adapter); ++ err = request_threaded_irq(adapter->pdev->irq, t1_interrupt, ++ t1_interrupt_thread, ++ adapter->params.has_msi ? 0 : IRQF_SHARED, ++ adapter->name, adapter); + if (err) { + if (adapter->params.has_msi) + pci_disable_msi(adapter->pdev); +@@ -924,51 +925,6 @@ static void mac_stats_task(struct work_struct *work) + spin_unlock(&adapter->work_lock); + } + +-/* +- * Processes elmer0 external interrupts in process context. +- */ +-static void ext_intr_task(struct work_struct *work) +-{ +- struct adapter *adapter = +- container_of(work, struct adapter, ext_intr_handler_task); +- +- t1_elmer0_ext_intr_handler(adapter); +- +- /* Now reenable external interrupts */ +- spin_lock_irq(&adapter->async_lock); +- adapter->slow_intr_mask |= F_PL_INTR_EXT; +- writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE); +- writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, +- adapter->regs + A_PL_ENABLE); +- spin_unlock_irq(&adapter->async_lock); +-} +- +-/* +- * Interrupt-context handler for elmer0 external interrupts. +- */ +-void t1_elmer0_ext_intr(struct adapter *adapter) +-{ +- /* +- * Schedule a task to handle external interrupts as we require +- * a process context. We disable EXT interrupts in the interim +- * and let the task reenable them when it's done. 
+- */ +- adapter->slow_intr_mask &= ~F_PL_INTR_EXT; +- writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, +- adapter->regs + A_PL_ENABLE); +- schedule_work(&adapter->ext_intr_handler_task); +-} +- +-void t1_fatal_err(struct adapter *adapter) +-{ +- if (adapter->flags & FULL_INIT_DONE) { +- t1_sge_stop(adapter->sge); +- t1_interrupts_disable(adapter); +- } +- pr_alert("%s: encountered fatal error, operation suspended\n", +- adapter->name); +-} +- + static const struct net_device_ops cxgb_netdev_ops = { + .ndo_open = cxgb_open, + .ndo_stop = cxgb_close, +@@ -1070,8 +1026,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) + spin_lock_init(&adapter->async_lock); + spin_lock_init(&adapter->mac_lock); + +- INIT_WORK(&adapter->ext_intr_handler_task, +- ext_intr_task); + INIT_DELAYED_WORK(&adapter->stats_update_task, + mac_stats_task); + +diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c +index 2d9c2b5a6..cda01f22c 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/sge.c ++++ b/drivers/net/ethernet/chelsio/cxgb/sge.c +@@ -940,10 +940,11 @@ void t1_sge_intr_clear(struct sge *sge) + /* + * SGE 'Error' interrupt handler + */ +-int t1_sge_intr_error_handler(struct sge *sge) ++bool t1_sge_intr_error_handler(struct sge *sge) + { + struct adapter *adapter = sge->adapter; + u32 cause = readl(adapter->regs + A_SG_INT_CAUSE); ++ bool wake = false; + + if (adapter->port[0].dev->hw_features & NETIF_F_TSO) + cause &= ~F_PACKET_TOO_BIG; +@@ -967,11 +968,14 @@ int t1_sge_intr_error_handler(struct sge *sge) + sge->stats.pkt_mismatch++; + pr_alert("%s: SGE packet mismatch\n", adapter->name); + } +- if (cause & SGE_INT_FATAL) +- t1_fatal_err(adapter); ++ if (cause & SGE_INT_FATAL) { ++ t1_interrupts_disable(adapter); ++ adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR; ++ wake = true; ++ } + + writel(cause, adapter->regs + A_SG_INT_CAUSE); +- return 0; ++ return wake; + } + + const struct sge_intr_counts *t1_sge_get_intr_counts(const struct sge *sge) +@@ -1619,11 +1623,46 @@ int t1_poll(struct napi_struct *napi, int budget) + return work_done; + } + ++irqreturn_t t1_interrupt_thread(int irq, void *data) ++{ ++ struct adapter *adapter = data; ++ u32 pending_thread_intr; ++ ++ spin_lock_irq(&adapter->async_lock); ++ pending_thread_intr = adapter->pending_thread_intr; ++ adapter->pending_thread_intr = 0; ++ spin_unlock_irq(&adapter->async_lock); ++ ++ if (!pending_thread_intr) ++ return IRQ_NONE; ++ ++ if (pending_thread_intr & F_PL_INTR_EXT) ++ t1_elmer0_ext_intr_handler(adapter); ++ ++ /* This error is fatal, interrupts remain off */ ++ if (pending_thread_intr & F_PL_INTR_SGE_ERR) { ++ pr_alert("%s: encountered fatal error, operation suspended\n", ++ adapter->name); ++ t1_sge_stop(adapter->sge); ++ return IRQ_HANDLED; ++ } ++ ++ spin_lock_irq(&adapter->async_lock); ++ adapter->slow_intr_mask |= F_PL_INTR_EXT; ++ ++ writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE); ++ writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, ++ adapter->regs + A_PL_ENABLE); ++ spin_unlock_irq(&adapter->async_lock); ++ ++ return IRQ_HANDLED; ++} ++ + irqreturn_t t1_interrupt(int irq, void *data) + { + struct adapter *adapter = data; + struct sge *sge = adapter->sge; +- int handled; ++ irqreturn_t handled; + + if (likely(responses_pending(adapter))) { + writel(F_PL_INTR_SGE_DATA, adapter->regs + A_PL_CAUSE); +@@ -1645,10 +1684,10 @@ irqreturn_t t1_interrupt(int irq, void *data) + handled = t1_slow_intr_handler(adapter); + spin_unlock(&adapter->async_lock); + +- if 
(!handled) ++ if (handled == IRQ_NONE) + sge->stats.unhandled_irqs++; + +- return IRQ_RETVAL(handled != 0); ++ return handled; + } + + /* +diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.h b/drivers/net/ethernet/chelsio/cxgb/sge.h +index a1ba591b3..716705b96 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/sge.h ++++ b/drivers/net/ethernet/chelsio/cxgb/sge.h +@@ -74,6 +74,7 @@ struct sge *t1_sge_create(struct adapter *, struct sge_params *); + int t1_sge_configure(struct sge *, struct sge_params *); + int t1_sge_set_coalesce_params(struct sge *, struct sge_params *); + void t1_sge_destroy(struct sge *); ++irqreturn_t t1_interrupt_thread(int irq, void *data); + irqreturn_t t1_interrupt(int irq, void *cookie); + int t1_poll(struct napi_struct *, int); + +@@ -81,7 +82,7 @@ netdev_tx_t t1_start_xmit(struct sk_buff *skb, struct net_device *dev); + void t1_vlan_mode(struct adapter *adapter, netdev_features_t features); + void t1_sge_start(struct sge *); + void t1_sge_stop(struct sge *); +-int t1_sge_intr_error_handler(struct sge *); ++bool t1_sge_intr_error_handler(struct sge *sge); + void t1_sge_intr_enable(struct sge *); + void t1_sge_intr_disable(struct sge *); + void t1_sge_intr_clear(struct sge *); +diff --git a/drivers/net/ethernet/chelsio/cxgb/subr.c b/drivers/net/ethernet/chelsio/cxgb/subr.c +index ea0f8741d..310add28f 100644 +--- a/drivers/net/ethernet/chelsio/cxgb/subr.c ++++ b/drivers/net/ethernet/chelsio/cxgb/subr.c +@@ -170,7 +170,7 @@ void t1_link_changed(adapter_t *adapter, int port_id) + t1_link_negotiated(adapter, port_id, link_ok, speed, duplex, fc); + } + +-static int t1_pci_intr_handler(adapter_t *adapter) ++static bool t1_pci_intr_handler(adapter_t *adapter) + { + u32 pcix_cause; + +@@ -179,9 +179,13 @@ static int t1_pci_intr_handler(adapter_t *adapter) + if (pcix_cause) { + pci_write_config_dword(adapter->pdev, A_PCICFG_INTR_CAUSE, + pcix_cause); +- t1_fatal_err(adapter); /* PCI errors are fatal */ ++ /* PCI errors are fatal */ ++ t1_interrupts_disable(adapter); ++ adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR; ++ pr_alert("%s: PCI error encountered.\n", adapter->name); ++ return true; + } +- return 0; ++ return false; + } + + #ifdef CONFIG_CHELSIO_T1_1G +@@ -210,13 +214,16 @@ static int fpga_phy_intr_handler(adapter_t *adapter) + /* + * Slow path interrupt handler for FPGAs. + */ +-static int fpga_slow_intr(adapter_t *adapter) ++static irqreturn_t fpga_slow_intr(adapter_t *adapter) + { + u32 cause = readl(adapter->regs + A_PL_CAUSE); ++ irqreturn_t ret = IRQ_NONE; + + cause &= ~F_PL_INTR_SGE_DATA; +- if (cause & F_PL_INTR_SGE_ERR) +- t1_sge_intr_error_handler(adapter->sge); ++ if (cause & F_PL_INTR_SGE_ERR) { ++ if (t1_sge_intr_error_handler(adapter->sge)) ++ ret = IRQ_WAKE_THREAD; ++ } + + if (cause & FPGA_PCIX_INTERRUPT_GMAC) + fpga_phy_intr_handler(adapter); +@@ -231,14 +238,19 @@ static int fpga_slow_intr(adapter_t *adapter) + /* Clear TP interrupt */ + writel(tp_cause, adapter->regs + FPGA_TP_ADDR_INTERRUPT_CAUSE); + } +- if (cause & FPGA_PCIX_INTERRUPT_PCIX) +- t1_pci_intr_handler(adapter); ++ if (cause & FPGA_PCIX_INTERRUPT_PCIX) { ++ if (t1_pci_intr_handler(adapter)) ++ ret = IRQ_WAKE_THREAD; ++ } + + /* Clear the interrupts just processed. */ + if (cause) + writel(cause, adapter->regs + A_PL_CAUSE); + +- return cause != 0; ++ if (ret != IRQ_NONE) ++ return ret; ++ ++ return cause == 0 ? IRQ_NONE : IRQ_HANDLED; + } + #endif + +@@ -842,31 +854,45 @@ void t1_interrupts_clear(adapter_t* adapter) + /* + * Slow path interrupt handler for ASICs. 
+ */ +-static int asic_slow_intr(adapter_t *adapter) ++static irqreturn_t asic_slow_intr(adapter_t *adapter) + { + u32 cause = readl(adapter->regs + A_PL_CAUSE); ++ irqreturn_t ret = IRQ_HANDLED; + + cause &= adapter->slow_intr_mask; + if (!cause) +- return 0; +- if (cause & F_PL_INTR_SGE_ERR) +- t1_sge_intr_error_handler(adapter->sge); ++ return IRQ_NONE; ++ if (cause & F_PL_INTR_SGE_ERR) { ++ if (t1_sge_intr_error_handler(adapter->sge)) ++ ret = IRQ_WAKE_THREAD; ++ } + if (cause & F_PL_INTR_TP) + t1_tp_intr_handler(adapter->tp); + if (cause & F_PL_INTR_ESPI) + t1_espi_intr_handler(adapter->espi); +- if (cause & F_PL_INTR_PCIX) +- t1_pci_intr_handler(adapter); +- if (cause & F_PL_INTR_EXT) +- t1_elmer0_ext_intr(adapter); ++ if (cause & F_PL_INTR_PCIX) { ++ if (t1_pci_intr_handler(adapter)) ++ ret = IRQ_WAKE_THREAD; ++ } ++ if (cause & F_PL_INTR_EXT) { ++ /* Wake the threaded interrupt to handle external interrupts as ++ * we require a process context. We disable EXT interrupts in ++ * the interim and let the thread reenable them when it's done. ++ */ ++ adapter->pending_thread_intr |= F_PL_INTR_EXT; ++ adapter->slow_intr_mask &= ~F_PL_INTR_EXT; ++ writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, ++ adapter->regs + A_PL_ENABLE); ++ ret = IRQ_WAKE_THREAD; ++ } + + /* Clear the interrupts just processed. */ + writel(cause, adapter->regs + A_PL_CAUSE); + readl(adapter->regs + A_PL_CAUSE); /* flush writes */ +- return 1; ++ return ret; + } + +-int t1_slow_intr_handler(adapter_t *adapter) ++irqreturn_t t1_slow_intr_handler(adapter_t *adapter) + { + #ifdef CONFIG_CHELSIO_T1_1G + if (!t1_is_asic(adapter)) +diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c +index e3a885891..df0eab479 100644 +--- a/drivers/net/ethernet/dlink/sundance.c ++++ b/drivers/net/ethernet/dlink/sundance.c +@@ -963,7 +963,7 @@ static void tx_timeout(struct net_device *dev, unsigned int txqueue) + unsigned long flag; + + netif_stop_queue(dev); +- tasklet_disable(&np->tx_tasklet); ++ tasklet_disable_in_atomic(&np->tx_tasklet); + iowrite16(0, ioaddr + IntrEnable); + printk(KERN_WARNING "%s: Transmit timed out, TxStatus %2.2x " + "TxFrameId %2.2x," +diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c +index 4185ca3dd..cf5c33d0f 100644 +--- a/drivers/net/ethernet/jme.c ++++ b/drivers/net/ethernet/jme.c +@@ -1265,9 +1265,9 @@ jme_stop_shutdown_timer(struct jme_adapter *jme) + jwrite32f(jme, JME_APMC, apmc); + } + +-static void jme_link_change_tasklet(struct tasklet_struct *t) ++static void jme_link_change_work(struct work_struct *work) + { +- struct jme_adapter *jme = from_tasklet(jme, t, linkch_task); ++ struct jme_adapter *jme = container_of(work, struct jme_adapter, linkch_task); + struct net_device *netdev = jme->dev; + int rc; + +@@ -1510,7 +1510,7 @@ jme_intr_msi(struct jme_adapter *jme, u32 intrstat) + * all other events are ignored + */ + jwrite32(jme, JME_IEVE, intrstat); +- tasklet_schedule(&jme->linkch_task); ++ schedule_work(&jme->linkch_task); + goto out_reenable; + } + +@@ -1832,7 +1832,6 @@ jme_open(struct net_device *netdev) + jme_clear_pm_disable_wol(jme); + JME_NAPI_ENABLE(jme); + +- tasklet_setup(&jme->linkch_task, jme_link_change_tasklet); + tasklet_setup(&jme->txclean_task, jme_tx_clean_tasklet); + tasklet_setup(&jme->rxclean_task, jme_rx_clean_tasklet); + tasklet_setup(&jme->rxempty_task, jme_rx_empty_tasklet); +@@ -1920,7 +1919,7 @@ jme_close(struct net_device *netdev) + + JME_NAPI_DISABLE(jme); + +- tasklet_kill(&jme->linkch_task); ++ 
cancel_work_sync(&jme->linkch_task); + tasklet_kill(&jme->txclean_task); + tasklet_kill(&jme->rxclean_task); + tasklet_kill(&jme->rxempty_task); +@@ -3039,6 +3038,7 @@ jme_init_one(struct pci_dev *pdev, + atomic_set(&jme->rx_empty, 1); + + tasklet_setup(&jme->pcc_task, jme_pcc_tasklet); ++ INIT_WORK(&jme->linkch_task, jme_link_change_work); + jme->dpi.cur = PCC_P1; + + jme->reg_ghc = 0; +diff --git a/drivers/net/ethernet/jme.h b/drivers/net/ethernet/jme.h +index a2c3b00d9..2af76329b 100644 +--- a/drivers/net/ethernet/jme.h ++++ b/drivers/net/ethernet/jme.h +@@ -411,7 +411,7 @@ struct jme_adapter { + struct tasklet_struct rxempty_task; + struct tasklet_struct rxclean_task; + struct tasklet_struct txclean_task; +- struct tasklet_struct linkch_task; ++ struct work_struct linkch_task; + struct tasklet_struct pcc_task; + unsigned long flags; + u32 reg_txcs; +diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c +index 71e2ada86..72e2e71aa 100644 +--- a/drivers/net/wireless/ath/ath9k/beacon.c ++++ b/drivers/net/wireless/ath/ath9k/beacon.c +@@ -251,7 +251,7 @@ void ath9k_beacon_ensure_primary_slot(struct ath_softc *sc) + int first_slot = ATH_BCBUF; + int slot; + +- tasklet_disable(&sc->bcon_tasklet); ++ tasklet_disable_in_atomic(&sc->bcon_tasklet); + + /* Find first taken slot. */ + for (slot = 0; slot < ATH_BCBUF; slot++) { +diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c +index a070e69bb..1fea850af 100644 +--- a/drivers/pci/controller/pci-hyperv.c ++++ b/drivers/pci/controller/pci-hyperv.c +@@ -1457,7 +1457,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) + * Prevents hv_pci_onchannelcallback() from running concurrently + * in the tasklet. + */ +- tasklet_disable(&channel->callback_event); ++ tasklet_disable_in_atomic(&channel->callback_event); + + /* + * Since this function is called with IRQ locks held, can't +diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c +index 0f9274960..dc97e4f1f 100644 +--- a/drivers/scsi/fcoe/fcoe.c ++++ b/drivers/scsi/fcoe/fcoe.c +@@ -1452,11 +1452,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, + static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) + { + struct fcoe_percpu_s *fps; +- int rc; ++ int rc, cpu = get_cpu_light(); + +- fps = &get_cpu_var(fcoe_percpu); ++ fps = &per_cpu(fcoe_percpu, cpu); + rc = fcoe_get_paged_crc_eof(skb, tlen, fps); +- put_cpu_var(fcoe_percpu); ++ put_cpu_light(); + + return rc; + } +@@ -1641,11 +1641,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, + return 0; + } + +- stats = per_cpu_ptr(lport->stats, get_cpu()); ++ stats = per_cpu_ptr(lport->stats, get_cpu_light()); + stats->InvalidCRCCount++; + if (stats->InvalidCRCCount < 5) + printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); +- put_cpu(); ++ put_cpu_light(); + return -EINVAL; + } + +@@ -1686,7 +1686,7 @@ static void fcoe_recv_frame(struct sk_buff *skb) + */ + hp = (struct fcoe_hdr *) skb_network_header(skb); + +- stats = per_cpu_ptr(lport->stats, get_cpu()); ++ stats = per_cpu_ptr(lport->stats, get_cpu_light()); + if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) { + if (stats->ErrorFrames < 5) + printk(KERN_WARNING "fcoe: FCoE version " +@@ -1718,13 +1718,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) + goto drop; + + if (!fcoe_filter_frames(lport, fp)) { +- put_cpu(); ++ put_cpu_light(); + fc_exch_recv(lport, fp); + return; + } + drop: + stats->ErrorFrames++; +- put_cpu(); ++ 
put_cpu_light(); + kfree_skb(skb); + } + +diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c +index 5ea426eff..0d6b9acc7 100644 +--- a/drivers/scsi/fcoe/fcoe_ctlr.c ++++ b/drivers/scsi/fcoe/fcoe_ctlr.c +@@ -828,7 +828,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) + + INIT_LIST_HEAD(&del_list); + +- stats = per_cpu_ptr(fip->lp->stats, get_cpu()); ++ stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); + + list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { + deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; +@@ -864,7 +864,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) + sel_time = fcf->time; + } + } +- put_cpu(); ++ put_cpu_light(); + + list_for_each_entry_safe(fcf, next, &del_list, list) { + /* Removes fcf from current list */ +diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c +index a50f1eef0..0b2acad7c 100644 +--- a/drivers/scsi/libfc/fc_exch.c ++++ b/drivers/scsi/libfc/fc_exch.c +@@ -826,10 +826,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, + } + memset(ep, 0, sizeof(*ep)); + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + pool = per_cpu_ptr(mp->pool, cpu); + spin_lock_bh(&pool->lock); +- put_cpu(); ++ put_cpu_light(); + + /* peek cache of free slot */ + if (pool->left != FC_XID_UNKNOWN) { +diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h +index 34aa2714f..42cd2baa7 100644 +--- a/drivers/tty/serial/8250/8250.h ++++ b/drivers/tty/serial/8250/8250.h +@@ -131,12 +131,55 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) + up->dl_write(up, value); + } + ++static inline void serial8250_set_IER(struct uart_8250_port *up, ++ unsigned char ier) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(&flags); ++ ++ serial_out(up, UART_IER, ier); ++ ++ if (is_console) ++ console_atomic_unlock(flags); ++} ++ ++static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int clearval = 0; ++ unsigned int prior; ++ unsigned int flags; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (up->capabilities & UART_CAP_UUE) ++ clearval = UART_IER_UUE; ++ ++ if (is_console) ++ console_atomic_lock(&flags); ++ ++ prior = serial_port_in(port, UART_IER); ++ serial_port_out(port, UART_IER, clearval); ++ ++ if (is_console) ++ console_atomic_unlock(flags); ++ ++ return prior; ++} ++ + static inline bool serial8250_set_THRI(struct uart_8250_port *up) + { + if (up->ier & UART_IER_THRI) + return false; + up->ier |= UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + return true; + } + +@@ -145,7 +188,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) + if (!(up->ier & UART_IER_THRI)) + return false; + up->ier &= ~UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + return true; + } + +diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c +index cae61d1eb..47dd23056 100644 +--- a/drivers/tty/serial/8250/8250_core.c ++++ b/drivers/tty/serial/8250/8250_core.c +@@ -274,10 +274,8 @@ static void serial8250_backup_timeout(struct timer_list *t) + * Must disable interrupts or else we risk racing with the interrupt + * based handler. 
+ */ +- if (up->port.irq) { +- ier = serial_in(up, UART_IER); +- serial_out(up, UART_IER, 0); +- } ++ if (up->port.irq) ++ ier = serial8250_clear_IER(up); + + iir = serial_in(up, UART_IIR); + +@@ -300,7 +298,7 @@ static void serial8250_backup_timeout(struct timer_list *t) + serial8250_tx_chars(up); + + if (up->port.irq) +- serial_out(up, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + spin_unlock_irqrestore(&up->port.lock, flags); + +@@ -578,6 +576,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) + + #ifdef CONFIG_SERIAL_8250_CONSOLE + ++static void univ8250_console_write_atomic(struct console *co, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_port *up = &serial8250_ports[co->index]; ++ ++ serial8250_console_write_atomic(up, s, count); ++} ++ + static void univ8250_console_write(struct console *co, const char *s, + unsigned int count) + { +@@ -671,6 +677,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, + + static struct console univ8250_console = { + .name = "ttyS", ++ .write_atomic = univ8250_console_write_atomic, + .write = univ8250_console_write, + .device = uart_console_device, + .setup = univ8250_console_setup, +diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c +index fbcc90c31..b33cb454c 100644 +--- a/drivers/tty/serial/8250/8250_fsl.c ++++ b/drivers/tty/serial/8250/8250_fsl.c +@@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port *port) + + /* Stop processing interrupts on input overrun */ + if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { ++ unsigned int ca_flags; + unsigned long delay; ++ bool is_console; + ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(&ca_flags); + up->ier = port->serial_in(port, UART_IER); ++ if (is_console) ++ console_atomic_unlock(ca_flags); ++ + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { + port->ops->stop_rx(port); + } else { +diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c +index 988bf6bcc..bcd26d672 100644 +--- a/drivers/tty/serial/8250/8250_ingenic.c ++++ b/drivers/tty/serial/8250/8250_ingenic.c +@@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", + + static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) + { ++ unsigned int flags; ++ bool is_console; + int ier; + + switch (offset) { +@@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) + * If we have enabled modem status IRQs we should enable + * modem mode. 
+ */ ++ is_console = uart_console(p); ++ if (is_console) ++ console_atomic_lock(&flags); + ier = p->serial_in(p, UART_IER); ++ if (is_console) ++ console_atomic_unlock(flags); + + if (ier & UART_IER_MSI) + value |= UART_MCR_MDCE | UART_MCR_FCM; +diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c +index fb65dc601..5bc734c70 100644 +--- a/drivers/tty/serial/8250/8250_mtk.c ++++ b/drivers/tty/serial/8250/8250_mtk.c +@@ -218,12 +218,37 @@ static void mtk8250_shutdown(struct uart_port *port) + + static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ unsigned int ier; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(&flags); ++ ++ ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier & (~mask)); ++ ++ if (is_console) ++ console_atomic_unlock(flags); + } + + static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ unsigned int ier; ++ ++ if (uart_console(port)) ++ console_atomic_lock(&flags); ++ ++ ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier | mask); ++ ++ if (uart_console(port)) ++ console_atomic_unlock(flags); + } + + static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) +diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c +index 7c07ebb37..a0a617caa 100644 +--- a/drivers/tty/serial/8250/8250_port.c ++++ b/drivers/tty/serial/8250/8250_port.c +@@ -762,7 +762,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) + serial_out(p, UART_EFR, UART_EFR_ECB); + serial_out(p, UART_LCR, 0); + } +- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); ++ serial8250_set_IER(p, sleep ? 
UART_IERX_SLEEP : 0); + if (p->capabilities & UART_CAP_EFR) { + serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); + serial_out(p, UART_EFR, efr); +@@ -1436,7 +1436,7 @@ static void serial8250_stop_rx(struct uart_port *port) + + up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); + up->port.read_status_mask &= ~UART_LSR_DR; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + serial8250_rpm_put(up); + } +@@ -1466,7 +1466,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) + serial8250_clear_and_reinit_fifos(p); + + p->ier |= UART_IER_RLSI | UART_IER_RDI; +- serial_port_out(&p->port, UART_IER, p->ier); ++ serial8250_set_IER(p, p->ier); + } + } + EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); +@@ -1694,7 +1694,7 @@ static void serial8250_disable_ms(struct uart_port *port) + mctrl_gpio_disable_ms(up->gpios); + + up->ier &= ~UART_IER_MSI; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + } + + static void serial8250_enable_ms(struct uart_port *port) +@@ -1710,7 +1710,7 @@ static void serial8250_enable_ms(struct uart_port *port) + up->ier |= UART_IER_MSI; + + serial8250_rpm_get(up); +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + serial8250_rpm_put(up); + } + +@@ -2130,14 +2130,7 @@ static void serial8250_put_poll_char(struct uart_port *port, + struct uart_8250_port *up = up_to_u8250p(port); + + serial8250_rpm_get(up); +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); ++ ier = serial8250_clear_IER(up); + + wait_for_xmitr(up, BOTH_EMPTY); + /* +@@ -2150,7 +2143,7 @@ static void serial8250_put_poll_char(struct uart_port *port, + * and restore the IER + */ + wait_for_xmitr(up, BOTH_EMPTY); +- serial_port_out(port, UART_IER, ier); ++ serial8250_set_IER(up, ier); + serial8250_rpm_put(up); + } + +@@ -2453,7 +2446,7 @@ void serial8250_do_shutdown(struct uart_port *port) + */ + spin_lock_irqsave(&port->lock, flags); + up->ier = 0; +- serial_port_out(port, UART_IER, 0); ++ serial8250_set_IER(up, 0); + spin_unlock_irqrestore(&port->lock, flags); + + synchronize_irq(port->irq); +@@ -2809,7 +2802,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, + if (up->capabilities & UART_CAP_RTOIE) + up->ier |= UART_IER_RTOIE; + +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + if (up->capabilities & UART_CAP_EFR) { + unsigned char efr = 0; +@@ -3275,7 +3268,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); + + #ifdef CONFIG_SERIAL_8250_CONSOLE + +-static void serial8250_console_putchar(struct uart_port *port, int ch) ++static void serial8250_console_putchar_locked(struct uart_port *port, int ch) + { + struct uart_8250_port *up = up_to_u8250p(port); + +@@ -3283,6 +3276,18 @@ static void serial8250_console_putchar(struct uart_port *port, int ch) + serial_port_out(port, UART_TX, ch); + } + ++static void serial8250_console_putchar(struct uart_port *port, int ch) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned int flags; ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ ++ console_atomic_lock(&flags); ++ serial8250_console_putchar_locked(port, ch); ++ console_atomic_unlock(flags); ++} ++ + /* + * Restore serial console when h/w power-off detected + */ +@@ -3304,6 +3309,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) + serial8250_out_MCR(up, 
UART_MCR_DTR | UART_MCR_RTS); + } + ++void serial8250_console_write_atomic(struct uart_8250_port *up, ++ const char *s, unsigned int count) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int flags; ++ unsigned int ier; ++ ++ console_atomic_lock(&flags); ++ ++ touch_nmi_watchdog(); ++ ++ ier = serial8250_clear_IER(up); ++ ++ if (atomic_fetch_inc(&up->console_printing)) { ++ uart_console_write(port, "\n", 1, ++ serial8250_console_putchar_locked); ++ } ++ uart_console_write(port, s, count, serial8250_console_putchar_locked); ++ atomic_dec(&up->console_printing); ++ ++ wait_for_xmitr(up, BOTH_EMPTY); ++ serial8250_set_IER(up, ier); ++ ++ console_atomic_unlock(flags); ++} ++ + /* + * Print a string to the serial port trying not to disturb + * any possible real use of the port... +@@ -3320,24 +3351,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; +- int locked = 1; + + touch_nmi_watchdog(); + +- if (oops_in_progress) +- locked = spin_trylock_irqsave(&port->lock, flags); +- else +- spin_lock_irqsave(&port->lock, flags); +- +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); ++ spin_lock_irqsave(&port->lock, flags); + +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); ++ ier = serial8250_clear_IER(up); + + /* check scratch reg to see if port powered off during system sleep */ + if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { +@@ -3351,7 +3370,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + mdelay(port->rs485.delay_rts_before_send); + } + ++ atomic_inc(&up->console_printing); + uart_console_write(port, s, count, serial8250_console_putchar); ++ atomic_dec(&up->console_printing); + + /* + * Finally, wait for transmitter to become empty +@@ -3364,8 +3385,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + if (em485->tx_stopped) + up->rs485_stop_tx(up); + } +- +- serial_port_out(port, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + /* + * The receive handling will happen properly because the +@@ -3377,8 +3397,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + if (up->msr_saved_flags) + serial8250_modem_status(up); + +- if (locked) +- spin_unlock_irqrestore(&port->lock, flags); ++ spin_unlock_irqrestore(&port->lock, flags); + } + + static unsigned int probe_baud(struct uart_port *port) +@@ -3398,6 +3417,7 @@ static unsigned int probe_baud(struct uart_port *port) + + int serial8250_console_setup(struct uart_port *port, char *options, bool probe) + { ++ struct uart_8250_port *up = up_to_u8250p(port); + int baud = 9600; + int bits = 8; + int parity = 'n'; +@@ -3407,6 +3427,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) + if (!port->iobase && !port->membase) + return -ENODEV; + ++ atomic_set(&up->console_printing, 0); ++ + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + else if (probe) +diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c +index f65461d64..550a0fd5d 100644 +--- a/drivers/tty/serial/amba-pl011.c ++++ b/drivers/tty/serial/amba-pl011.c +@@ -2304,18 +2304,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) + { + struct uart_amba_port *uap = amba_ports[co->index]; + unsigned int old_cr = 0, new_cr; +- unsigned long 
flags; ++ unsigned long flags = 0; + int locked = 1; + + clk_enable(uap->clk); + +- local_irq_save(flags); ++ /* ++ * local_irq_save(flags); ++ * ++ * This local_irq_save() is nonsense. If we come in via sysrq ++ * handling then interrupts are already disabled. Aside of ++ * that the port.sysrq check is racy on SMP regardless. ++ */ + if (uap->port.sysrq) + locked = 0; + else if (oops_in_progress) +- locked = spin_trylock(&uap->port.lock); ++ locked = spin_trylock_irqsave(&uap->port.lock, flags); + else +- spin_lock(&uap->port.lock); ++ spin_lock_irqsave(&uap->port.lock, flags); + + /* + * First save the CR then disable the interrupts +@@ -2341,8 +2347,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) + pl011_write(old_cr, uap, REG_CR); + + if (locked) +- spin_unlock(&uap->port.lock); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&uap->port.lock, flags); + + clk_disable(uap->clk); + } +diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c +index 84e815808..342005ed5 100644 +--- a/drivers/tty/serial/omap-serial.c ++++ b/drivers/tty/serial/omap-serial.c +@@ -1311,13 +1311,10 @@ serial_omap_console_write(struct console *co, const char *s, + + pm_runtime_get_sync(up->dev); + +- local_irq_save(flags); +- if (up->port.sysrq) +- locked = 0; +- else if (oops_in_progress) +- locked = spin_trylock(&up->port.lock); ++ if (up->port.sysrq || oops_in_progress) ++ locked = spin_trylock_irqsave(&up->port.lock, flags); + else +- spin_lock(&up->port.lock); ++ spin_lock_irqsave(&up->port.lock, flags); + + /* + * First save the IER then disable the interrupts +@@ -1346,8 +1343,7 @@ serial_omap_console_write(struct console *co, const char *s, + pm_runtime_mark_last_busy(up->dev); + pm_runtime_put_autosuspend(up->dev); + if (locked) +- spin_unlock(&up->port.lock); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&up->port.lock, flags); + } + + static int __init +diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c +index 713cfa72d..0fc473321 100644 +--- a/drivers/tty/tty_buffer.c ++++ b/drivers/tty/tty_buffer.c +@@ -172,9 +172,7 @@ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) + have queued and recycle that ? 
*/ + if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) + return NULL; +- printk_safe_enter(); + p = kmalloc(sizeof(struct tty_buffer) + 2 * size, GFP_ATOMIC); +- printk_safe_exit(); + if (p == NULL) + return NULL; + +diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c +index dae9a57d7..9a6a0ec4d 100644 +--- a/fs/afs/dir_silly.c ++++ b/fs/afs/dir_silly.c +@@ -239,7 +239,7 @@ int afs_silly_iput(struct dentry *dentry, struct inode *inode) + struct dentry *alias; + int ret; + +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); + +diff --git a/fs/aio.c b/fs/aio.c +index 5e5333d72..cc16ac777 100644 +--- a/fs/aio.c ++++ b/fs/aio.c +@@ -43,7 +43,6 @@ + #include + #include + +-#include + #include + #include + +@@ -1762,7 +1761,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + list_del_init(&req->wait.entry); + list_del(&iocb->ki_list); + iocb->ki_res.res = mangle_poll(mask); +- if (iocb->ki_eventfd && eventfd_signal_count()) { ++ if (iocb->ki_eventfd && !eventfd_signal_allowed()) { + iocb = NULL; + INIT_WORK(&req->work, aio_poll_put_work); + schedule_work(&req->work); +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index bcc6848bb..fabbf6cc4 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -17,7 +17,6 @@ + #include + #include + #include +-#include + #include + #include + #include +diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c +index 799be3a5d..d5165a7da 100644 +--- a/fs/cifs/readdir.c ++++ b/fs/cifs/readdir.c +@@ -81,7 +81,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, + struct inode *inode; + struct super_block *sb = parent->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); + +diff --git a/fs/dcache.c b/fs/dcache.c +index f5b78cc80..b2e0d1a07 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -2566,9 +2566,10 @@ EXPORT_SYMBOL(d_rehash); + static inline unsigned start_dir_add(struct inode *dir) + { + ++ preempt_disable_rt(); + for (;;) { +- unsigned n = dir->i_dir_seq; +- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) ++ unsigned n = dir->__i_dir_seq; ++ if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n) + return n; + cpu_relax(); + } +@@ -2576,26 +2577,30 @@ static inline unsigned start_dir_add(struct inode *dir) + + static inline void end_dir_add(struct inode *dir, unsigned n) + { +- smp_store_release(&dir->i_dir_seq, n + 2); ++ smp_store_release(&dir->__i_dir_seq, n + 2); ++ preempt_enable_rt(); + } + + static void d_wait_lookup(struct dentry *dentry) + { +- if (d_in_lookup(dentry)) { +- DECLARE_WAITQUEUE(wait, current); +- add_wait_queue(dentry->d_wait, &wait); +- do { +- set_current_state(TASK_UNINTERRUPTIBLE); +- spin_unlock(&dentry->d_lock); +- schedule(); +- spin_lock(&dentry->d_lock); +- } while (d_in_lookup(dentry)); +- } ++ struct swait_queue __wait; ++ ++ if (!d_in_lookup(dentry)) ++ return; ++ ++ INIT_LIST_HEAD(&__wait.task_list); ++ do { ++ prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE); ++ spin_unlock(&dentry->d_lock); ++ schedule(); ++ spin_lock(&dentry->d_lock); ++ } while (d_in_lookup(dentry)); ++ finish_swait(dentry->d_wait, &__wait); + } + + struct dentry *d_alloc_parallel(struct dentry *parent, + const struct qstr *name, +- wait_queue_head_t *wq) ++ struct swait_queue_head *wq) + { + unsigned int hash = name->hash; + struct 
hlist_bl_head *b = in_lookup_hash(parent, hash); +@@ -2609,7 +2614,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, + + retry: + rcu_read_lock(); +- seq = smp_load_acquire(&parent->d_inode->i_dir_seq); ++ seq = smp_load_acquire(&parent->d_inode->__i_dir_seq); + r_seq = read_seqbegin(&rename_lock); + dentry = __d_lookup_rcu(parent, name, &d_seq); + if (unlikely(dentry)) { +@@ -2637,7 +2642,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, + } + + hlist_bl_lock(b); +- if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { ++ if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) { + hlist_bl_unlock(b); + rcu_read_unlock(); + goto retry; +@@ -2710,7 +2715,7 @@ void __d_lookup_done(struct dentry *dentry) + hlist_bl_lock(b); + dentry->d_flags &= ~DCACHE_PAR_LOOKUP; + __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); +- wake_up_all(dentry->d_wait); ++ swake_up_all(dentry->d_wait); + dentry->d_wait = NULL; + hlist_bl_unlock(b); + INIT_HLIST_NODE(&dentry->d_u.d_alias); +diff --git a/fs/eventfd.c b/fs/eventfd.c +index df466ef81..9035ca60b 100644 +--- a/fs/eventfd.c ++++ b/fs/eventfd.c +@@ -25,8 +25,6 @@ + #include + #include + +-DEFINE_PER_CPU(int, eventfd_wake_count); +- + static DEFINE_IDA(eventfd_ida); + + struct eventfd_ctx { +@@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) + * Deadlock or stack overflow issues can happen if we recurse here + * through waitqueue wakeup handlers. If the caller users potentially + * nested waitqueues with custom wakeup handlers, then it should +- * check eventfd_signal_count() before calling this function. If +- * it returns true, the eventfd_signal() call should be deferred to a ++ * check eventfd_signal_allowed() before calling this function. If ++ * it returns false, the eventfd_signal() call should be deferred to a + * safe context. 
+ */ +- if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) ++ if (WARN_ON_ONCE(current->in_eventfd_signal)) + return 0; + + spin_lock_irqsave(&ctx->wqh.lock, flags); +- this_cpu_inc(eventfd_wake_count); ++ current->in_eventfd_signal = 1; + if (ULLONG_MAX - ctx->count < n) + n = ULLONG_MAX - ctx->count; + ctx->count += n; + if (waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, EPOLLIN); +- this_cpu_dec(eventfd_wake_count); ++ current->in_eventfd_signal = 0; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return n; +diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h +index 64aa552b2..7dae569da 100644 +--- a/fs/fscache/internal.h ++++ b/fs/fscache/internal.h +@@ -95,7 +95,6 @@ extern unsigned fscache_debug; + extern struct kobject *fscache_root; + extern struct workqueue_struct *fscache_object_wq; + extern struct workqueue_struct *fscache_op_wq; +-DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + + extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); + +diff --git a/fs/fscache/main.c b/fs/fscache/main.c +index 4207f98e4..85f8cf3a3 100644 +--- a/fs/fscache/main.c ++++ b/fs/fscache/main.c +@@ -41,8 +41,6 @@ struct kobject *fscache_root; + struct workqueue_struct *fscache_object_wq; + struct workqueue_struct *fscache_op_wq; + +-DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); +- + /* these values serve as lower bounds, will be adjusted in fscache_init() */ + static unsigned fscache_object_max_active = 4; + static unsigned fscache_op_max_active = 2; +@@ -138,7 +136,6 @@ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) + static int __init fscache_init(void) + { + unsigned int nr_cpus = num_possible_cpus(); +- unsigned int cpu; + int ret; + + fscache_object_max_active = +@@ -161,9 +158,6 @@ static int __init fscache_init(void) + if (!fscache_op_wq) + goto error_op_wq; + +- for_each_possible_cpu(cpu) +- init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); +- + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; +diff --git a/fs/fscache/object.c b/fs/fscache/object.c +index cb2146e02..fb9794dce 100644 +--- a/fs/fscache/object.c ++++ b/fs/fscache/object.c +@@ -807,6 +807,8 @@ void fscache_object_destroy(struct fscache_object *object) + } + EXPORT_SYMBOL(fscache_object_destroy); + ++static DECLARE_WAIT_QUEUE_HEAD(fscache_object_cong_wait); ++ + /* + * enqueue an object for metadata-type processing + */ +@@ -815,16 +817,12 @@ void fscache_enqueue_object(struct fscache_object *object) + _enter("{OBJ%x}", object->debug_id); + + if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { +- wait_queue_head_t *cong_wq = +- &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) +- wake_up(cong_wq); ++ wake_up(&fscache_object_cong_wait); + } else + fscache_put_object(object, fscache_obj_put_queue); +- +- put_cpu_var(fscache_object_cong_wait); + } + } + +@@ -842,16 +840,15 @@ void fscache_enqueue_object(struct fscache_object *object) + */ + bool fscache_object_sleep_till_congested(signed long *timeoutp) + { +- wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + +- add_wait_queue_exclusive(cong_wq, &wait); ++ add_wait_queue_exclusive(&fscache_object_cong_wait, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); +- finish_wait(cong_wq, &wait); ++ 
finish_wait(&fscache_object_cong_wait, &wait); + + return fscache_object_congested(); + } +diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c +index bc2678323..3176913fa 100644 +--- a/fs/fuse/readdir.c ++++ b/fs/fuse/readdir.c +@@ -158,7 +158,7 @@ static int fuse_direntplus_link(struct file *file, + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + if (!o->nodeid) { + /* +diff --git a/fs/inode.c b/fs/inode.c +index 82090bfad..96ddef6c6 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -158,7 +158,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) + inode->i_bdev = NULL; + inode->i_cdev = NULL; + inode->i_link = NULL; +- inode->i_dir_seq = 0; ++ inode->__i_dir_seq = 0; + inode->i_rdev = 0; + inode->dirtied_when = 0; + +diff --git a/fs/namei.c b/fs/namei.c +index 0782401c6..a3003d832 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -1526,7 +1526,7 @@ static struct dentry *__lookup_slow(const struct qstr *name, + { + struct dentry *dentry, *old; + struct inode *inode = dir->d_inode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + /* Don't go there if it's already dead */ + if (unlikely(IS_DEADDIR(inode))) +@@ -3021,7 +3021,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, + struct dentry *dentry; + int error, create_error = 0; + umode_t mode = op->mode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + if (unlikely(IS_DEADDIR(dir_inode))) + return ERR_PTR(-ENOENT); +diff --git a/fs/namespace.c b/fs/namespace.c +index 6e76f2a72..dbd1119a5 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -322,8 +323,11 @@ int __mnt_want_write(struct vfsmount *m) + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); +- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) +- cpu_relax(); ++ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { ++ preempt_enable(); ++ cpu_chill(); ++ preempt_disable(); ++ } + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. 
So we must not load that until +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index 2ad56ff47..26c63c5ac 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -484,7 +484,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, + unsigned long dir_verifier) + { + struct qstr filename = QSTR_INIT(entry->name, entry->len); +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + struct dentry *dentry; + struct dentry *alias; + struct inode *inode; +@@ -1670,7 +1670,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode) + { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + struct nfs_open_context *ctx; + struct dentry *res; + struct iattr attr = { .ia_valid = ATTR_OPEN }; +diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c +index b27ebdcce..f86c98a7e 100644 +--- a/fs/nfs/unlink.c ++++ b/fs/nfs/unlink.c +@@ -13,7 +13,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + +@@ -180,7 +180,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) + + data->cred = get_current_cred(); + data->res.dir_attr = &data->dir_attr; +- init_waitqueue_head(&data->wq); ++ init_swait_queue_head(&data->wq); + + status = -EBUSY; + spin_lock(&dentry->d_lock); +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 18a4588c3..decaa7768 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -384,9 +384,9 @@ static inline void task_context_switch_counts(struct seq_file *m, + static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) + { + seq_printf(m, "Cpus_allowed:\t%*pb\n", +- cpumask_pr_args(task->cpus_ptr)); ++ cpumask_pr_args(&task->cpus_mask)); + seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", +- cpumask_pr_args(task->cpus_ptr)); ++ cpumask_pr_args(&task->cpus_mask)); + } + + static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 2ba1313aa..fa6ae9bca 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -96,6 +96,7 @@ + #include + #include + #include ++#include + #include + #include + #include "internal.h" +@@ -2145,7 +2146,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, + + child = d_hash_and_lookup(dir, &qname); + if (!child) { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + goto end_instantiate; +diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c +index ffed75f83..15f837dc0 100644 +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -683,7 +683,7 @@ static bool proc_sys_fill_cache(struct file *file, + + child = d_lookup(dir, &qname); + if (!child) { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + return false; +diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c +index b1ebf7b61..b7e3a6bac 100644 +--- a/fs/pstore/platform.c ++++ b/fs/pstore/platform.c +@@ -383,7 +383,8 @@ void pstore_record_init(struct pstore_record *record, + * end of the buffer. + */ + static void pstore_dump(struct kmsg_dumper *dumper, +- enum kmsg_dump_reason reason) ++ enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter) + { + unsigned long total = 0; + const char *why; +@@ -435,7 +436,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, + dst_size -= header_size; + + /* Write dump contents. 
*/ +- if (!kmsg_dump_get_buffer(dumper, true, dst + header_size, ++ if (!kmsg_dump_get_buffer(iter, true, dst + header_size, + dst_size, &dump_size)) + break; + +diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild +index d1300c6e0..267f6dfb8 100644 +--- a/include/asm-generic/Kbuild ++++ b/include/asm-generic/Kbuild +@@ -30,7 +30,7 @@ mandatory-y += irq.h + mandatory-y += irq_regs.h + mandatory-y += irq_work.h + mandatory-y += kdebug.h +-mandatory-y += kmap_types.h ++mandatory-y += kmap_size.h + mandatory-y += kprobes.h + mandatory-y += linkage.h + mandatory-y += local.h +diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h +index d14214dfc..7317e8258 100644 +--- a/include/asm-generic/hardirq.h ++++ b/include/asm-generic/hardirq.h +@@ -7,9 +7,13 @@ + + typedef struct { + unsigned int __softirq_pending; ++#ifdef ARCH_WANTS_NMI_IRQSTAT ++ unsigned int __nmi_count; ++#endif + } ____cacheline_aligned irq_cpustat_t; + +-#include /* Standard mappings for irq_cpustat_t above */ ++DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); ++ + #include + + #ifndef ack_bad_irq +diff --git a/include/asm-generic/kmap_size.h b/include/asm-generic/kmap_size.h +new file mode 100644 +index 000000000..9d6c7786a +--- /dev/null ++++ b/include/asm-generic/kmap_size.h +@@ -0,0 +1,12 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _ASM_GENERIC_KMAP_SIZE_H ++#define _ASM_GENERIC_KMAP_SIZE_H ++ ++/* For debug this provides guard pages between the maps */ ++#ifdef CONFIG_DEBUG_HIGHMEM ++# define KM_MAX_IDX 33 ++#else ++# define KM_MAX_IDX 16 ++#endif ++ ++#endif +diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h +deleted file mode 100644 +index 9f95b7b63..000000000 +--- a/include/asm-generic/kmap_types.h ++++ /dev/null +@@ -1,11 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _ASM_GENERIC_KMAP_TYPES_H +-#define _ASM_GENERIC_KMAP_TYPES_H +- +-#ifdef __WITH_KM_FENCE +-# define KM_TYPE_NR 41 +-#else +-# define KM_TYPE_NR 20 +-#endif +- +-#endif +diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h +index b4d43a4af..ac255e889 100644 +--- a/include/asm-generic/preempt.h ++++ b/include/asm-generic/preempt.h +@@ -79,6 +79,9 @@ static __always_inline bool should_resched(int preempt_offset) + } + + #ifdef CONFIG_PREEMPTION ++#ifdef CONFIG_PREEMPT_RT ++extern void preempt_schedule_lock(void); ++#endif + extern asmlinkage void preempt_schedule(void); + #define __preempt_schedule() preempt_schedule() + extern asmlinkage void preempt_schedule_notrace(void); +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 23dfe7608..88e65db55 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -162,7 +162,7 @@ struct request { + */ + union { + struct hlist_node hash; /* merge hash */ +- struct list_head ipi_list; ++ struct llist_node ipi_list; + }; + + /* +diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h +index a19519f42..eed86eb0a 100644 +--- a/include/linux/bottom_half.h ++++ b/include/linux/bottom_half.h +@@ -4,7 +4,7 @@ + + #include + +-#ifdef CONFIG_TRACE_IRQFLAGS ++#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) + extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); + #else + static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) +@@ -32,4 +32,10 @@ static inline void local_bh_enable(void) + __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); + } + ++#ifdef CONFIG_PREEMPT_RT ++extern bool 
local_bh_blocked(void); ++#else ++static inline bool local_bh_blocked(void) { return false; } ++#endif ++ + #endif /* _LINUX_BH_H */ +diff --git a/include/linux/console.h b/include/linux/console.h +index bc2a749e6..027278792 100644 +--- a/include/linux/console.h ++++ b/include/linux/console.h +@@ -16,6 +16,7 @@ + + #include + #include ++#include + + struct vc_data; + struct console_font_op; +@@ -137,10 +138,12 @@ static inline int con_debug_leave(void) + #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ + #define CON_BRL (32) /* Used for a braille device */ + #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ ++#define CON_HANDOVER (128) /* Device was previously a boot console. */ + + struct console { + char name[16]; + void (*write)(struct console *, const char *, unsigned); ++ void (*write_atomic)(struct console *co, const char *s, unsigned int count); + int (*read)(struct console *, char *, unsigned); + struct tty_driver *(*device)(struct console *, int *); + void (*unblank)(void); +@@ -150,6 +153,11 @@ struct console { + short flags; + short index; + int cflag; ++#ifdef CONFIG_PRINTK ++ char sync_buf[CONSOLE_LOG_MAX]; ++#endif ++ atomic64_t printk_seq; ++ struct task_struct *thread; + uint ispeed; + uint ospeed; + void *data; +@@ -232,4 +240,7 @@ extern void console_init(void); + void dummycon_register_output_notifier(struct notifier_block *nb); + void dummycon_unregister_output_notifier(struct notifier_block *nb); + ++extern void console_atomic_lock(unsigned int *flags); ++extern void console_atomic_unlock(unsigned int flags); ++ + #endif /* _LINUX_CONSOLE_H */ +diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h +index b98b9eb7d..c986a9543 100644 +--- a/include/linux/cpuhotplug.h ++++ b/include/linux/cpuhotplug.h +@@ -155,6 +155,7 @@ enum cpuhp_state { + CPUHP_AP_ONLINE, + CPUHP_TEARDOWN_CPU, + CPUHP_AP_ONLINE_IDLE, ++ CPUHP_AP_SCHED_WAIT_EMPTY, + CPUHP_AP_SMPBOOT_THREADS, + CPUHP_AP_X86_VDSO_VMA_ONLINE, + CPUHP_AP_IRQ_AFFINITY_ONLINE, +diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h +index 0159986ac..c53364c42 100644 +--- a/include/linux/cpumask.h ++++ b/include/linux/cpumask.h +@@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p, + return cpumask_next_and(-1, src1p, src2p); + } + ++static inline int cpumask_any_distribute(const struct cpumask *srcp) ++{ ++ return cpumask_first(srcp); ++} ++ + #define for_each_cpu(cpu, mask) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) + #define for_each_cpu_not(cpu, mask) \ +@@ -252,6 +257,7 @@ int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu); + unsigned int cpumask_local_spread(unsigned int i, int node); + int cpumask_any_and_distribute(const struct cpumask *src1p, + const struct cpumask *src2p); ++int cpumask_any_distribute(const struct cpumask *srcp); + + /** + * for_each_cpu - iterate over every cpu in a mask +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index 4bb8b1759..c5821c04a 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -108,7 +108,7 @@ struct dentry { + + union { + struct list_head d_lru; /* LRU list */ +- wait_queue_head_t *d_wait; /* in-lookup ones only */ ++ struct swait_queue_head *d_wait; /* in-lookup ones only */ + }; + struct list_head d_child; /* child of parent list */ + struct list_head d_subdirs; /* our children */ +@@ -251,7 +251,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op + extern struct dentry * 
d_alloc(struct dentry *, const struct qstr *); + extern struct dentry * d_alloc_anon(struct super_block *); + extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, +- wait_queue_head_t *); ++ struct swait_queue_head *); + extern struct dentry * d_splice_alias(struct inode *, struct dentry *); + extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); + extern struct dentry * d_exact_alias(struct dentry *, struct inode *); +diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h +index edb5c186b..3f49e6516 100644 +--- a/include/linux/debug_locks.h ++++ b/include/linux/debug_locks.h +@@ -3,8 +3,7 @@ + #define __LINUX_DEBUG_LOCKING_H + + #include +-#include +-#include ++#include + + struct task_struct; + +diff --git a/include/linux/delay.h b/include/linux/delay.h +index e8607992c..cd24f34b4 100644 +--- a/include/linux/delay.h ++++ b/include/linux/delay.h +@@ -88,4 +88,10 @@ static inline void fsleep(unsigned long usecs) + msleep(DIV_ROUND_UP(usecs, 1000)); + } + ++#ifdef CONFIG_PREEMPT_RT ++extern void cpu_chill(void); ++#else ++# define cpu_chill() cpu_relax() ++#endif ++ + #endif /* defined(_LINUX_DELAY_H) */ +diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h +index d8e1c798d..d2aca09f7 100644 +--- a/include/linux/entry-common.h ++++ b/include/linux/entry-common.h +@@ -70,7 +70,7 @@ + + #define EXIT_TO_USER_MODE_WORK \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ +- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \ ++ _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | \ + ARCH_EXIT_TO_USER_MODE_WORK) + + /** +diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h +index dc4fd8a66..836b4c021 100644 +--- a/include/linux/eventfd.h ++++ b/include/linux/eventfd.h +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + /* + * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining +@@ -42,11 +43,9 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); + int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, + __u64 *cnt); + +-DECLARE_PER_CPU(int, eventfd_wake_count); +- +-static inline bool eventfd_signal_count(void) ++static inline bool eventfd_signal_allowed(void) + { +- return this_cpu_read(eventfd_wake_count); ++ return !current->in_eventfd_signal; + } + + #else /* CONFIG_EVENTFD */ +@@ -77,9 +76,9 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, + return -ENOSYS; + } + +-static inline bool eventfd_signal_count(void) ++static inline bool eventfd_signal_allowed(void) + { +- return false; ++ return true; + } + + #endif +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 18259e38d..b71d98518 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -719,7 +719,7 @@ struct inode { + struct block_device *i_bdev; + struct cdev *i_cdev; + char *i_link; +- unsigned i_dir_seq; ++ unsigned __i_dir_seq; + }; + + __u32 i_generation; +diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h +index 754f67ac4..76878b357 100644 +--- a/include/linux/hardirq.h ++++ b/include/linux/hardirq.h +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -32,9 +33,9 @@ static __always_inline void rcu_irq_enter_check_tick(void) + */ + #define __irq_enter() \ + do { \ +- account_irq_enter_time(current); \ + preempt_count_add(HARDIRQ_OFFSET); \ + lockdep_hardirq_enter(); \ ++ account_hardirq_enter(current); \ + } while (0) + + /* +@@ -62,8 +63,8 @@ void irq_enter_rcu(void); + */ + #define 
__irq_exit() \ + do { \ ++ account_hardirq_exit(current); \ + lockdep_hardirq_exit(); \ +- account_irq_exit_time(current); \ + preempt_count_sub(HARDIRQ_OFFSET); \ + } while (0) + +@@ -115,7 +116,6 @@ extern void rcu_nmi_exit(void); + do { \ + lockdep_off(); \ + arch_nmi_enter(); \ +- printk_nmi_enter(); \ + BUG_ON(in_nmi() == NMI_MASK); \ + __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ + } while (0) +@@ -134,7 +134,6 @@ extern void rcu_nmi_exit(void); + do { \ + BUG_ON(!in_nmi()); \ + __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ +- printk_nmi_exit(); \ + arch_nmi_exit(); \ + lockdep_on(); \ + } while (0) +diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h +new file mode 100644 +index 000000000..f9bc6acd3 +--- /dev/null ++++ b/include/linux/highmem-internal.h +@@ -0,0 +1,222 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _LINUX_HIGHMEM_INTERNAL_H ++#define _LINUX_HIGHMEM_INTERNAL_H ++ ++/* ++ * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. ++ */ ++#ifdef CONFIG_KMAP_LOCAL ++void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); ++void *__kmap_local_page_prot(struct page *page, pgprot_t prot); ++void kunmap_local_indexed(void *vaddr); ++void kmap_local_fork(struct task_struct *tsk); ++void __kmap_local_sched_out(void); ++void __kmap_local_sched_in(void); ++static inline void kmap_assert_nomap(void) ++{ ++ DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); ++} ++#else ++static inline void kmap_local_fork(struct task_struct *tsk) { } ++static inline void kmap_assert_nomap(void) { } ++#endif ++ ++#ifdef CONFIG_HIGHMEM ++#include ++ ++#ifndef ARCH_HAS_KMAP_FLUSH_TLB ++static inline void kmap_flush_tlb(unsigned long addr) { } ++#endif ++ ++#ifndef kmap_prot ++#define kmap_prot PAGE_KERNEL ++#endif ++ ++void *kmap_high(struct page *page); ++void kunmap_high(struct page *page); ++void __kmap_flush_unused(void); ++struct page *__kmap_to_page(void *addr); ++ ++static inline void *kmap(struct page *page) ++{ ++ void *addr; ++ ++ might_sleep(); ++ if (!PageHighMem(page)) ++ addr = page_address(page); ++ else ++ addr = kmap_high(page); ++ kmap_flush_tlb((unsigned long)addr); ++ return addr; ++} ++ ++static inline void kunmap(struct page *page) ++{ ++ might_sleep(); ++ if (!PageHighMem(page)) ++ return; ++ kunmap_high(page); ++} ++ ++static inline struct page *kmap_to_page(void *addr) ++{ ++ return __kmap_to_page(addr); ++} ++ ++static inline void kmap_flush_unused(void) ++{ ++ __kmap_flush_unused(); ++} ++ ++static inline void *kmap_local_page(struct page *page) ++{ ++ return __kmap_local_page_prot(page, kmap_prot); ++} ++ ++static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) ++{ ++ return __kmap_local_page_prot(page, prot); ++} ++ ++static inline void *kmap_local_pfn(unsigned long pfn) ++{ ++ return __kmap_local_pfn_prot(pfn, kmap_prot); ++} ++ ++static inline void __kunmap_local(void *vaddr) ++{ ++ kunmap_local_indexed(vaddr); ++} ++ ++static inline void *kmap_atomic(struct page *page) ++{ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_disable(); ++ else ++ preempt_disable(); ++ pagefault_disable(); ++ return __kmap_local_page_prot(page, kmap_prot); ++} ++ ++static inline void __kunmap_atomic(void *addr) ++{ ++ kunmap_local_indexed(addr); ++ pagefault_enable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_enable(); ++ else ++ preempt_enable(); ++} ++ ++unsigned int __nr_free_highpages(void); ++extern atomic_long_t _totalhigh_pages; ++ ++static inline unsigned int nr_free_highpages(void) ++{ ++ 
return __nr_free_highpages(); ++} ++ ++static inline unsigned long totalhigh_pages(void) ++{ ++ return (unsigned long)atomic_long_read(&_totalhigh_pages); ++} ++ ++static inline void totalhigh_pages_inc(void) ++{ ++ atomic_long_inc(&_totalhigh_pages); ++} ++ ++static inline void totalhigh_pages_add(long count) ++{ ++ atomic_long_add(count, &_totalhigh_pages); ++} ++ ++#else /* CONFIG_HIGHMEM */ ++ ++static inline struct page *kmap_to_page(void *addr) ++{ ++ return virt_to_page(addr); ++} ++ ++static inline void *kmap(struct page *page) ++{ ++ might_sleep(); ++ return page_address(page); ++} ++ ++static inline void kunmap_high(struct page *page) { } ++static inline void kmap_flush_unused(void) { } ++ ++static inline void kunmap(struct page *page) ++{ ++#ifdef ARCH_HAS_FLUSH_ON_KUNMAP ++ kunmap_flush_on_unmap(page_address(page)); ++#endif ++} ++ ++static inline void *kmap_local_page(struct page *page) ++{ ++ return page_address(page); ++} ++ ++static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) ++{ ++ return kmap_local_page(page); ++} ++ ++static inline void *kmap_local_pfn(unsigned long pfn) ++{ ++ return kmap_local_page(pfn_to_page(pfn)); ++} ++ ++static inline void __kunmap_local(void *addr) ++{ ++#ifdef ARCH_HAS_FLUSH_ON_KUNMAP ++ kunmap_flush_on_unmap(addr); ++#endif ++} ++ ++static inline void *kmap_atomic(struct page *page) ++{ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_disable(); ++ else ++ preempt_disable(); ++ pagefault_disable(); ++ return page_address(page); ++} ++ ++static inline void __kunmap_atomic(void *addr) ++{ ++#ifdef ARCH_HAS_FLUSH_ON_KUNMAP ++ kunmap_flush_on_unmap(addr); ++#endif ++ pagefault_enable(); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ migrate_enable(); ++ else ++ preempt_enable(); ++} ++ ++static inline unsigned int nr_free_highpages(void) { return 0; } ++static inline unsigned long totalhigh_pages(void) { return 0UL; } ++ ++#endif /* CONFIG_HIGHMEM */ ++ ++/* ++ * Prevent people trying to call kunmap_atomic() as if it were kunmap() ++ * kunmap_atomic() should get the return value of kmap_atomic, not the page. 
++ */ ++#define kunmap_atomic(__addr) \ ++do { \ ++ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ ++ __kunmap_atomic(__addr); \ ++} while (0) ++ ++#define kunmap_local(__addr) \ ++do { \ ++ BUILD_BUG_ON(__same_type((__addr), struct page *)); \ ++ __kunmap_local(__addr); \ ++} while (0) ++ ++#endif +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index 6b27af8fe..ec1edaf12 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -11,217 +11,137 @@ + + #include + +-#ifndef ARCH_HAS_FLUSH_ANON_PAGE +-static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) +-{ +-} +-#endif +- +-#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +-static inline void flush_kernel_dcache_page(struct page *page) +-{ +-} +-static inline void flush_kernel_vmap_range(void *vaddr, int size) +-{ +-} +-static inline void invalidate_kernel_vmap_range(void *vaddr, int size) +-{ +-} +-#endif +- +-#include +- +-#ifdef CONFIG_HIGHMEM +-extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +-extern void kunmap_atomic_high(void *kvaddr); +-#include +- +-#ifndef ARCH_HAS_KMAP_FLUSH_TLB +-static inline void kmap_flush_tlb(unsigned long addr) { } +-#endif +- +-#ifndef kmap_prot +-#define kmap_prot PAGE_KERNEL +-#endif +- +-void *kmap_high(struct page *page); +-static inline void *kmap(struct page *page) +-{ +- void *addr; +- +- might_sleep(); +- if (!PageHighMem(page)) +- addr = page_address(page); +- else +- addr = kmap_high(page); +- kmap_flush_tlb((unsigned long)addr); +- return addr; +-} +- +-void kunmap_high(struct page *page); +- +-static inline void kunmap(struct page *page) +-{ +- might_sleep(); +- if (!PageHighMem(page)) +- return; +- kunmap_high(page); +-} ++#include "highmem-internal.h" + +-/* +- * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because +- * no global lock is needed and because the kmap code must perform a global TLB +- * invalidation when the kmap pool wraps. ++/** ++ * kmap - Map a page for long term usage ++ * @page: Pointer to the page to be mapped ++ * ++ * Returns: The virtual address of the mapping + * +- * However when holding an atomic kmap it is not legal to sleep, so atomic +- * kmaps are appropriate for short, tight code paths only. ++ * Can only be invoked from preemptible task context because on 32bit ++ * systems with CONFIG_HIGHMEM enabled this function might sleep. + * +- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap +- * gives a more generic (and caching) interface. But kmap_atomic can +- * be used in IRQ contexts, so in some (very limited) cases we need +- * it. ++ * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area ++ * this returns the virtual address of the direct kernel mapping. ++ * ++ * The returned virtual address is globally visible and valid up to the ++ * point where it is unmapped via kunmap(). The pointer can be handed to ++ * other contexts. ++ * ++ * For highmem pages on 32bit systems this can be slow as the mapping space ++ * is limited and protected by a global lock. In case that there is no ++ * mapping slot available the function blocks until a slot is released via ++ * kunmap(). 
+ */ +-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +-{ +- preempt_disable(); +- pagefault_disable(); +- if (!PageHighMem(page)) +- return page_address(page); +- return kmap_atomic_high_prot(page, prot); +-} +-#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) ++static inline void *kmap(struct page *page); + +-/* declarations for linux/mm/highmem.c */ +-unsigned int nr_free_highpages(void); +-extern atomic_long_t _totalhigh_pages; +-static inline unsigned long totalhigh_pages(void) +-{ +- return (unsigned long)atomic_long_read(&_totalhigh_pages); +-} +- +-static inline void totalhigh_pages_inc(void) +-{ +- atomic_long_inc(&_totalhigh_pages); +-} +- +-static inline void totalhigh_pages_dec(void) +-{ +- atomic_long_dec(&_totalhigh_pages); +-} +- +-static inline void totalhigh_pages_add(long count) +-{ +- atomic_long_add(count, &_totalhigh_pages); +-} +- +-static inline void totalhigh_pages_set(long val) +-{ +- atomic_long_set(&_totalhigh_pages, val); +-} +- +-void kmap_flush_unused(void); +- +-struct page *kmap_to_page(void *addr); +- +-#else /* CONFIG_HIGHMEM */ ++/** ++ * kunmap - Unmap the virtual address mapped by kmap() ++ * @addr: Virtual address to be unmapped ++ * ++ * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of ++ * pages in the low memory area. ++ */ ++static inline void kunmap(struct page *page); + +-static inline unsigned int nr_free_highpages(void) { return 0; } ++/** ++ * kmap_to_page - Get the page for a kmap'ed address ++ * @addr: The address to look up ++ * ++ * Returns: The page which is mapped to @addr. ++ */ ++static inline struct page *kmap_to_page(void *addr); + +-static inline struct page *kmap_to_page(void *addr) +-{ +- return virt_to_page(addr); +-} ++/** ++ * kmap_flush_unused - Flush all unused kmap mappings in order to ++ * remove stray mappings ++ */ ++static inline void kmap_flush_unused(void); + +-static inline unsigned long totalhigh_pages(void) { return 0UL; } ++/** ++ * kmap_local_page - Map a page for temporary usage ++ * @page: Pointer to the page to be mapped ++ * ++ * Returns: The virtual address of the mapping ++ * ++ * Can be invoked from any context. ++ * ++ * Requires careful handling when nesting multiple mappings because the map ++ * management is stack based. The unmap has to be in the reverse order of ++ * the map operation: ++ * ++ * addr1 = kmap_local_page(page1); ++ * addr2 = kmap_local_page(page2); ++ * ... ++ * kunmap_local(addr2); ++ * kunmap_local(addr1); ++ * ++ * Unmapping addr1 before addr2 is invalid and causes malfunction. ++ * ++ * Contrary to kmap() mappings the mapping is only valid in the context of ++ * the caller and cannot be handed to other contexts. ++ * ++ * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the ++ * virtual address of the direct mapping. Only real highmem pages are ++ * temporarily mapped. ++ * ++ * While it is significantly faster than kmap() for the higmem case it ++ * comes with restrictions about the pointer validity. Only use when really ++ * necessary. ++ * ++ * On HIGHMEM enabled systems mapping a highmem page has the side effect of ++ * disabling migration in order to keep the virtual address stable across ++ * preemption. No caller of kmap_local_page() can rely on this side effect. 
++ */ ++static inline void *kmap_local_page(struct page *page); + +-static inline void *kmap(struct page *page) +-{ +- might_sleep(); +- return page_address(page); +-} ++/** ++ * kmap_atomic - Atomically map a page for temporary usage - Deprecated! ++ * @page: Pointer to the page to be mapped ++ * ++ * Returns: The virtual address of the mapping ++ * ++ * Effectively a wrapper around kmap_local_page() which disables pagefaults ++ * and preemption. ++ * ++ * Do not use in new code. Use kmap_local_page() instead. ++ */ ++static inline void *kmap_atomic(struct page *page); + +-static inline void kunmap_high(struct page *page) +-{ +-} ++/** ++ * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() ++ * @addr: Virtual address to be unmapped ++ * ++ * Counterpart to kmap_atomic(). ++ * ++ * Effectively a wrapper around kunmap_local() which additionally undoes ++ * the side effects of kmap_atomic(), i.e. reenabling pagefaults and ++ * preemption. ++ */ + +-static inline void kunmap(struct page *page) +-{ +-#ifdef ARCH_HAS_FLUSH_ON_KUNMAP +- kunmap_flush_on_unmap(page_address(page)); +-#endif +-} ++/* Highmem related interfaces for management code */ ++static inline unsigned int nr_free_highpages(void); ++static inline unsigned long totalhigh_pages(void); + +-static inline void *kmap_atomic(struct page *page) ++#ifndef ARCH_HAS_FLUSH_ANON_PAGE ++static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) + { +- preempt_disable(); +- pagefault_disable(); +- return page_address(page); + } +-#define kmap_atomic_prot(page, prot) kmap_atomic(page) +- +-static inline void kunmap_atomic_high(void *addr) +-{ +- /* +- * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() +- * handles re-enabling faults + preemption +- */ +-#ifdef ARCH_HAS_FLUSH_ON_KUNMAP +- kunmap_flush_on_unmap(addr); + #endif +-} +- +-#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) +- +-#define kmap_flush_unused() do {} while(0) +- +-#endif /* CONFIG_HIGHMEM */ + +-#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +- +-DECLARE_PER_CPU(int, __kmap_atomic_idx); +- +-static inline int kmap_atomic_idx_push(void) ++#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE ++static inline void flush_kernel_dcache_page(struct page *page) + { +- int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; +- +-#ifdef CONFIG_DEBUG_HIGHMEM +- WARN_ON_ONCE(in_irq() && !irqs_disabled()); +- BUG_ON(idx >= KM_TYPE_NR); +-#endif +- return idx; + } +- +-static inline int kmap_atomic_idx(void) ++static inline void flush_kernel_vmap_range(void *vaddr, int size) + { +- return __this_cpu_read(__kmap_atomic_idx) - 1; + } +- +-static inline void kmap_atomic_idx_pop(void) ++static inline void invalidate_kernel_vmap_range(void *vaddr, int size) + { +-#ifdef CONFIG_DEBUG_HIGHMEM +- int idx = __this_cpu_dec_return(__kmap_atomic_idx); +- +- BUG_ON(idx < 0); +-#else +- __this_cpu_dec(__kmap_atomic_idx); +-#endif + } +- + #endif + +-/* +- * Prevent people trying to call kunmap_atomic() as if it were kunmap() +- * kunmap_atomic() should get the return value of kmap_atomic, not the page. 
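/*
 * Editor's illustration (not part of the applied patch): a minimal sketch of
 * the stack-based kmap_local_page()/kunmap_local() interface documented
 * above.  The helper name copy_page_local() is hypothetical; the mapping
 * calls and the reverse-order unmap rule come from the kernel-doc above.
 */
#include <linux/highmem.h>
#include <linux/string.h>

static void copy_page_local(struct page *dst, struct page *src)
{
	void *vsrc, *vdst;

	vsrc = kmap_local_page(src);
	vdst = kmap_local_page(dst);
	memcpy(vdst, vsrc, PAGE_SIZE);
	/* Unmap in reverse order of mapping: last in, first out. */
	kunmap_local(vdst);
	kunmap_local(vsrc);
}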
+- */ +-#define kunmap_atomic(addr) \ +-do { \ +- BUILD_BUG_ON(__same_type((addr), struct page *)); \ +- kunmap_atomic_high(addr); \ +- pagefault_enable(); \ +- preempt_enable(); \ +-} while (0) +- +- + /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ + #ifndef clear_user_highpage + static inline void clear_user_highpage(struct page *page, unsigned long vaddr) +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 22240a8c3..fc162c252 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -542,7 +542,7 @@ struct softirq_action + asmlinkage void do_softirq(void); + asmlinkage void __do_softirq(void); + +-#ifdef __ARCH_HAS_DO_SOFTIRQ ++#if defined(__ARCH_HAS_DO_SOFTIRQ) && !defined(CONFIG_PREEMPT_RT) + void do_softirq_own_stack(void); + #else + static inline void do_softirq_own_stack(void) +@@ -637,26 +637,21 @@ enum + TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ + }; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) + static inline int tasklet_trylock(struct tasklet_struct *t) + { + return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); + } + +-static inline void tasklet_unlock(struct tasklet_struct *t) +-{ +- smp_mb__before_atomic(); +- clear_bit(TASKLET_STATE_RUN, &(t)->state); +-} ++void tasklet_unlock(struct tasklet_struct *t); ++void tasklet_unlock_wait(struct tasklet_struct *t); ++void tasklet_unlock_spin_wait(struct tasklet_struct *t); + +-static inline void tasklet_unlock_wait(struct tasklet_struct *t) +-{ +- while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } +-} + #else +-#define tasklet_trylock(t) 1 +-#define tasklet_unlock_wait(t) do { } while (0) +-#define tasklet_unlock(t) do { } while (0) ++static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } ++static inline void tasklet_unlock(struct tasklet_struct *t) { } ++static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } ++static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { } + #endif + + extern void __tasklet_schedule(struct tasklet_struct *t); +@@ -681,6 +676,17 @@ static inline void tasklet_disable_nosync(struct tasklet_struct *t) + smp_mb__after_atomic(); + } + ++/* ++ * Do not use in new code. Disabling tasklets from atomic contexts is ++ * error prone and should be avoided. 
++ */ ++static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) ++{ ++ tasklet_disable_nosync(t); ++ tasklet_unlock_spin_wait(t); ++ smp_mb(); ++} ++ + static inline void tasklet_disable(struct tasklet_struct *t) + { + tasklet_disable_nosync(t); +diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h +index c75e4d3d8..4bb8223f2 100644 +--- a/include/linux/io-mapping.h ++++ b/include/linux/io-mapping.h +@@ -60,22 +60,20 @@ io_mapping_fini(struct io_mapping *mapping) + iomap_free(mapping->base, mapping->size); + } + +-/* Atomic map/unmap */ ++/* Temporary mappings which are only valid in the current context */ + static inline void __iomem * +-io_mapping_map_atomic_wc(struct io_mapping *mapping, +- unsigned long offset) ++io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) + { + resource_size_t phys_addr; + + BUG_ON(offset >= mapping->size); + phys_addr = mapping->base + offset; +- return iomap_atomic_prot_pfn(PHYS_PFN(phys_addr), mapping->prot); ++ return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); + } + +-static inline void +-io_mapping_unmap_atomic(void __iomem *vaddr) ++static inline void io_mapping_unmap_local(void __iomem *vaddr) + { +- iounmap_atomic(vaddr); ++ kunmap_local_indexed((void __force *)vaddr); + } + + static inline void __iomem * +@@ -97,7 +95,7 @@ io_mapping_unmap(void __iomem *vaddr) + iounmap(vaddr); + } + +-#else ++#else /* HAVE_ATOMIC_IOMAP */ + + #include + +@@ -144,25 +142,19 @@ io_mapping_unmap(void __iomem *vaddr) + { + } + +-/* Atomic map/unmap */ ++/* Temporary mappings which are only valid in the current context */ + static inline void __iomem * +-io_mapping_map_atomic_wc(struct io_mapping *mapping, +- unsigned long offset) ++io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) + { +- preempt_disable(); +- pagefault_disable(); + return io_mapping_map_wc(mapping, offset, PAGE_SIZE); + } + +-static inline void +-io_mapping_unmap_atomic(void __iomem *vaddr) ++static inline void io_mapping_unmap_local(void __iomem *vaddr) + { + io_mapping_unmap(vaddr); +- pagefault_enable(); +- preempt_enable(); + } + +-#endif /* HAVE_ATOMIC_IOMAP */ ++#endif /* !HAVE_ATOMIC_IOMAP */ + + static inline struct io_mapping * + io_mapping_create_wc(resource_size_t base, +diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h +deleted file mode 100644 +index 6e8895cd4..000000000 +--- a/include/linux/irq_cpustat.h ++++ /dev/null +@@ -1,28 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef __irq_cpustat_h +-#define __irq_cpustat_h +- +-/* +- * Contains default mappings for irq_cpustat_t, used by almost every +- * architecture. Some arch (like s390) have per cpu hardware pages and +- * they define their own mappings for irq_stat. +- * +- * Keith Owens July 2000. +- */ +- +- +-/* +- * Simple wrappers reducing source bloat. Define all irq_stat fields +- * here, even ones that are arch dependent. That way we get common +- * definitions instead of differing sets for each arch. 
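/*
 * Editor's illustration (not part of the applied patch): a sketch of the
 * io_mapping_map_local_wc()/io_mapping_unmap_local() pair introduced above.
 * The helper name and the page-aligned @pg_offset argument are assumptions
 * for the example; as with kmap_local(), the mapping is only valid in the
 * calling context and must not be handed to other contexts.
 */
#include <linux/io.h>
#include <linux/io-mapping.h>

static void demo_iomap_write(struct io_mapping *mapping,
			     unsigned long pg_offset, u32 val)
{
	void __iomem *vaddr;

	vaddr = io_mapping_map_local_wc(mapping, pg_offset);
	writel(val, vaddr);
	io_mapping_unmap_local(vaddr);
}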
+- */ +- +-#ifndef __ARCH_IRQ_STAT +-DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); /* defined in asm/hardirq.h */ +-#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat.member, cpu)) +-#endif +- +-/* arch dependent irq_stat fields */ +-#define nmi_count(cpu) __IRQ_STAT((cpu), __nmi_count) /* i386 */ +- +-#endif /* __irq_cpustat_h */ +diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h +index ec2a47a81..255d2dfec 100644 +--- a/include/linux/irq_work.h ++++ b/include/linux/irq_work.h +@@ -3,6 +3,7 @@ + #define _LINUX_IRQ_WORK_H + + #include ++#include + + /* + * An entry can be in one of four states: +@@ -16,11 +17,13 @@ + struct irq_work { + struct __call_single_node node; + void (*func)(struct irq_work *); ++ struct rcuwait irqwait; + }; + + #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ + .node = { .u_flags = (_flags), }, \ + .func = (_func), \ ++ .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ + } + + #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) +@@ -30,10 +33,15 @@ struct irq_work { + #define DEFINE_IRQ_WORK(name, _f) \ + struct irq_work name = IRQ_WORK_INIT(_f) + ++#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) ++#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY) ++#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ) ++ + static inline + void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) + { + *work = IRQ_WORK_INIT(func); ++ rcuwait_init(&work->irqwait); + } + + static inline bool irq_work_is_pending(struct irq_work *work) +@@ -46,6 +54,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) + return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; + } + ++static inline bool irq_work_is_hard(struct irq_work *work) ++{ ++ return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; ++} ++ + bool irq_work_queue(struct irq_work *work); + bool irq_work_queue_on(struct irq_work *work, int cpu); + +diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h +index dc1b213ae..9bbcd8cba 100644 +--- a/include/linux/irqdesc.h ++++ b/include/linux/irqdesc.h +@@ -68,6 +68,7 @@ struct irq_desc { + unsigned int irqs_unhandled; + atomic_t threads_handled; + int threads_handled_last; ++ u64 random_ip; + raw_spinlock_t lock; + struct cpumask *percpu_enabled; + const struct cpumask *percpu_affinity; +diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h +index fef2d43a7..741aa2008 100644 +--- a/include/linux/irqflags.h ++++ b/include/linux/irqflags.h +@@ -71,14 +71,6 @@ do { \ + do { \ + __this_cpu_dec(hardirq_context); \ + } while (0) +-# define lockdep_softirq_enter() \ +-do { \ +- current->softirq_context++; \ +-} while (0) +-# define lockdep_softirq_exit() \ +-do { \ +- current->softirq_context--; \ +-} while (0) + + # define lockdep_hrtimer_enter(__hrtimer) \ + ({ \ +@@ -140,6 +132,21 @@ do { \ + # define lockdep_irq_work_exit(__work) do { } while (0) + #endif + ++#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) ++# define lockdep_softirq_enter() \ ++do { \ ++ current->softirq_context++; \ ++} while (0) ++# define lockdep_softirq_exit() \ ++do { \ ++ current->softirq_context--; \ ++} while (0) ++ ++#else ++# define lockdep_softirq_enter() do { } while (0) ++# define lockdep_softirq_exit() do { } while (0) ++#endif ++ + #if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) + extern void stop_critical_timings(void); +diff --git a/include/linux/kernel.h b/include/linux/kernel.h +index 78a0907f0..e6270bfa6 100644 +--- 
a/include/linux/kernel.h ++++ b/include/linux/kernel.h +@@ -220,6 +220,7 @@ static __always_inline void might_resched(void) + extern void ___might_sleep(const char *file, int line, int preempt_offset); + extern void __might_sleep(const char *file, int line, int preempt_offset); + extern void __cant_sleep(const char *file, int line, int preempt_offset); ++extern void __cant_migrate(const char *file, int line); + + /** + * might_sleep - annotation for functions that can sleep +@@ -235,6 +236,10 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); + */ + # define might_sleep() \ + do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) ++ ++# define might_sleep_no_state_check() \ ++ do { ___might_sleep(__FILE__, __LINE__, 0); } while (0) ++ + /** + * cant_sleep - annotation for functions that cannot sleep + * +@@ -243,6 +248,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); + # define cant_sleep() \ + do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) + # define sched_annotate_sleep() (current->task_state_change = 0) ++ ++/** ++ * cant_migrate - annotation for functions that cannot migrate ++ * ++ * Will print a stack trace if executed in code which is migratable ++ */ ++# define cant_migrate() \ ++ do { \ ++ if (IS_ENABLED(CONFIG_SMP)) \ ++ __cant_migrate(__FILE__, __LINE__); \ ++ } while (0) ++ + /** + * non_block_start - annotate the start of section where sleeping is prohibited + * +@@ -266,7 +283,9 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); + static inline void __might_sleep(const char *file, int line, + int preempt_offset) { } + # define might_sleep() do { might_resched(); } while (0) ++# define might_sleep_no_state_check() do { might_resched(); } while (0) + # define cant_sleep() do { } while (0) ++# define cant_migrate() do { } while (0) + # define sched_annotate_sleep() do { } while (0) + # define non_block_start() do { } while (0) + # define non_block_end() do { } while (0) +@@ -274,13 +293,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); + + #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) + +-#ifndef CONFIG_PREEMPT_RT +-# define cant_migrate() cant_sleep() +-#else +- /* Placeholder for now */ +-# define cant_migrate() do { } while (0) +-#endif +- + /** + * abs - return absolute value of an argument + * @x: the value. If it is unsigned type, it is converted to signed type first. 
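/*
 * Editor's illustration (not part of the applied patch): cant_migrate() as a
 * debug annotation for code that relies on staying on the current CPU.  The
 * per-CPU counter and helper names are hypothetical; the annotation only
 * emits a warning when CONFIG_DEBUG_ATOMIC_SLEEP and CONFIG_SMP are enabled.
 */
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, demo_hits);

static void demo_account_hit(void)
{
	/* Complain if the caller forgot to pin itself to this CPU. */
	cant_migrate();
	__this_cpu_inc(demo_hits);
}

static void demo_hit(void)
{
	migrate_disable();	/* pin this task to the current CPU */
	demo_account_hit();
	migrate_enable();
}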
+diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h +index 3378bcbe5..86673930c 100644 +--- a/include/linux/kmsg_dump.h ++++ b/include/linux/kmsg_dump.h +@@ -29,6 +29,18 @@ enum kmsg_dump_reason { + KMSG_DUMP_MAX + }; + ++/** ++ * struct kmsg_dumper_iter - iterator for kernel crash message dumper ++ * @active: Flag that specifies if this is currently dumping ++ * @cur_seq: Points to the oldest message to dump (private) ++ * @next_seq: Points after the newest message to dump (private) ++ */ ++struct kmsg_dumper_iter { ++ bool active; ++ u64 cur_seq; ++ u64 next_seq; ++}; ++ + /** + * struct kmsg_dumper - kernel crash message dumper structure + * @list: Entry in the dumper list (private) +@@ -39,33 +51,22 @@ enum kmsg_dump_reason { + */ + struct kmsg_dumper { + struct list_head list; +- void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); ++ void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason, ++ struct kmsg_dumper_iter *iter); + enum kmsg_dump_reason max_reason; +- bool active; + bool registered; +- +- /* private state of the kmsg iterator */ +- u32 cur_idx; +- u32 next_idx; +- u64 cur_seq; +- u64 next_seq; + }; + + #ifdef CONFIG_PRINTK + void kmsg_dump(enum kmsg_dump_reason reason); + +-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, +- char *line, size_t size, size_t *len); +- +-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, ++bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + char *line, size_t size, size_t *len); + +-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, +- char *buf, size_t size, size_t *len); +- +-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper); ++bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, ++ char *buf, size_t size, size_t *len_out); + +-void kmsg_dump_rewind(struct kmsg_dumper *dumper); ++void kmsg_dump_rewind(struct kmsg_dumper_iter *iter); + + int kmsg_dump_register(struct kmsg_dumper *dumper); + +@@ -77,30 +78,19 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason) + { + } + +-static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, +- bool syslog, const char *line, +- size_t size, size_t *len) +-{ +- return false; +-} +- +-static inline bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, ++static inline bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + const char *line, size_t size, size_t *len) + { + return false; + } + +-static inline bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, ++static inline bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + char *buf, size_t size, size_t *len) + { + return false; + } + +-static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +-{ +-} +- +-static inline void kmsg_dump_rewind(struct kmsg_dumper *dumper) ++static inline void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) + { + } + +diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h +index 3f02b8186..1b8ae0349 100644 +--- a/include/linux/local_lock_internal.h ++++ b/include/linux/local_lock_internal.h +@@ -7,13 +7,39 @@ + #include + + typedef struct { +-#ifdef CONFIG_DEBUG_LOCK_ALLOC ++#ifdef CONFIG_PREEMPT_RT ++ spinlock_t lock; ++ struct task_struct *owner; ++ int nestcnt; ++ ++#elif defined(CONFIG_DEBUG_LOCK_ALLOC) + struct lockdep_map dep_map; + struct task_struct *owner; + #endif + } local_lock_t; + +-#ifdef CONFIG_DEBUG_LOCK_ALLOC ++#ifdef CONFIG_PREEMPT_RT ++ 
++#define INIT_LOCAL_LOCK(lockname) { \ ++ __SPIN_LOCK_UNLOCKED((lockname).lock), \ ++ .owner = NULL, \ ++ .nestcnt = 0, \ ++ } ++ ++static inline void ___local_lock_init(local_lock_t *l) ++{ ++ l->owner = NULL; ++ l->nestcnt = 0; ++} ++ ++#define __local_lock_init(l) \ ++do { \ ++ spin_lock_init(&(l)->lock); \ ++ ___local_lock_init(l); \ ++} while (0) ++ ++#elif defined(CONFIG_DEBUG_LOCK_ALLOC) ++ + # define LOCAL_LOCK_DEBUG_INIT(lockname) \ + .dep_map = { \ + .name = #lockname, \ +@@ -21,7 +47,33 @@ typedef struct { + .lock_type = LD_LOCK_PERCPU, \ + }, \ + .owner = NULL, ++#endif ++ ++#ifdef CONFIG_PREEMPT_RT + ++static inline void local_lock_acquire(local_lock_t *l) ++{ ++ if (l->owner != current) { ++ spin_lock(&l->lock); ++ DEBUG_LOCKS_WARN_ON(l->owner); ++ DEBUG_LOCKS_WARN_ON(l->nestcnt); ++ l->owner = current; ++ } ++ l->nestcnt++; ++} ++ ++static inline void local_lock_release(local_lock_t *l) ++{ ++ DEBUG_LOCKS_WARN_ON(l->nestcnt == 0); ++ DEBUG_LOCKS_WARN_ON(l->owner != current); ++ if (--l->nestcnt) ++ return; ++ ++ l->owner = NULL; ++ spin_unlock(&l->lock); ++} ++ ++#elif defined(CONFIG_DEBUG_LOCK_ALLOC) + static inline void local_lock_acquire(local_lock_t *l) + { + lock_map_acquire(&l->dep_map); +@@ -47,6 +99,47 @@ static inline void local_lock_release(local_lock_t *l) { } + static inline void local_lock_debug_init(local_lock_t *l) { } + #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ + ++#ifdef CONFIG_PREEMPT_RT ++ ++#define __local_lock(lock) \ ++ do { \ ++ migrate_disable(); \ ++ local_lock_acquire(this_cpu_ptr(lock)); \ ++ } while (0) ++ ++#define __local_unlock(lock) \ ++ do { \ ++ local_lock_release(this_cpu_ptr(lock)); \ ++ migrate_enable(); \ ++ } while (0) ++ ++#define __local_lock_irq(lock) \ ++ do { \ ++ migrate_disable(); \ ++ local_lock_acquire(this_cpu_ptr(lock)); \ ++ } while (0) ++ ++#define __local_lock_irqsave(lock, flags) \ ++ do { \ ++ migrate_disable(); \ ++ flags = 0; \ ++ local_lock_acquire(this_cpu_ptr(lock)); \ ++ } while (0) ++ ++#define __local_unlock_irq(lock) \ ++ do { \ ++ local_lock_release(this_cpu_ptr(lock)); \ ++ migrate_enable(); \ ++ } while (0) ++ ++#define __local_unlock_irqrestore(lock, flags) \ ++ do { \ ++ local_lock_release(this_cpu_ptr(lock)); \ ++ migrate_enable(); \ ++ } while (0) ++ ++#else ++ + #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } + + #define __local_lock_init(lock) \ +@@ -66,6 +159,12 @@ do { \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + ++#define __local_unlock(lock) \ ++ do { \ ++ local_lock_release(this_cpu_ptr(lock)); \ ++ preempt_enable(); \ ++ } while (0) ++ + #define __local_lock_irq(lock) \ + do { \ + local_irq_disable(); \ +@@ -78,12 +177,6 @@ do { \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +-#define __local_unlock(lock) \ +- do { \ +- local_lock_release(this_cpu_ptr(lock)); \ +- preempt_enable(); \ +- } while (0) +- + #define __local_unlock_irq(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ +@@ -95,3 +188,5 @@ do { \ + local_lock_release(this_cpu_ptr(lock)); \ + local_irq_restore(flags); \ + } while (0) ++ ++#endif +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 1c22e294f..41aed4e91 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -577,6 +578,9 @@ struct mm_struct { + bool tlb_flush_batched; + #endif + struct uprobes_state uprobes_state; ++#ifdef CONFIG_PREEMPT_RT ++ struct rcu_head delayed_drop; 
++#endif + #ifdef CONFIG_HUGETLB_PAGE + atomic_long_t hugetlb_usage; + #endif +diff --git a/include/linux/mutex.h b/include/linux/mutex.h +index 4d671fba3..90923d300 100644 +--- a/include/linux/mutex.h ++++ b/include/linux/mutex.h +@@ -22,6 +22,20 @@ + + struct ww_acquire_ctx; + ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ ++ , .dep_map = { \ ++ .name = #lockname, \ ++ .wait_type_inner = LD_WAIT_SLEEP, \ ++ } ++#else ++# define __DEP_MAP_MUTEX_INITIALIZER(lockname) ++#endif ++ ++#ifdef CONFIG_PREEMPT_RT ++# include ++#else ++ + /* + * Simple, straightforward mutexes with strict semantics: + * +@@ -68,14 +82,6 @@ struct mutex { + struct ww_class; + struct ww_acquire_ctx; + +-struct ww_mutex { +- struct mutex base; +- struct ww_acquire_ctx *ctx; +-#ifdef CONFIG_DEBUG_MUTEXES +- struct ww_class *ww_class; +-#endif +-}; +- + /* + * This is the control structure for tasks blocked on mutex, + * which resides on the blocked task's kernel stack: +@@ -119,16 +125,6 @@ do { \ + __mutex_init((mutex), #mutex, &__key); \ + } while (0) + +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ +- , .dep_map = { \ +- .name = #lockname, \ +- .wait_type_inner = LD_WAIT_SLEEP, \ +- } +-#else +-# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +-#endif +- + #define __MUTEX_INITIALIZER(lockname) \ + { .owner = ATOMIC_LONG_INIT(0) \ + , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ +@@ -224,4 +220,6 @@ enum mutex_trylock_recursive_enum { + extern /* __deprecated */ __must_check enum mutex_trylock_recursive_enum + mutex_trylock_recursive(struct mutex *lock); + ++#endif /* !PREEMPT_RT */ ++ + #endif /* __LINUX_MUTEX_H */ +diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h +new file mode 100644 +index 000000000..f0b2e07cd +--- /dev/null ++++ b/include/linux/mutex_rt.h +@@ -0,0 +1,130 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_MUTEX_RT_H ++#define __LINUX_MUTEX_RT_H ++ ++#ifndef __LINUX_MUTEX_H ++#error "Please include mutex.h" ++#endif ++ ++#include ++ ++/* FIXME: Just for __lockfunc */ ++#include ++ ++struct mutex { ++ struct rt_mutex lock; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++#define __MUTEX_INITIALIZER(mutexname) \ ++ { \ ++ .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ ++ __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ ++ } ++ ++#define DEFINE_MUTEX(mutexname) \ ++ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) ++ ++extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key); ++extern void __lockfunc _mutex_lock(struct mutex *lock); ++extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass); ++extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); ++extern int __lockfunc _mutex_lock_killable(struct mutex *lock); ++extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); ++extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); ++extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); ++extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass); ++extern int __lockfunc _mutex_trylock(struct mutex *lock); ++extern void __lockfunc _mutex_unlock(struct mutex *lock); ++ ++#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) ++#define mutex_lock(l) _mutex_lock(l) ++#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) ++#define mutex_lock_killable(l) 
_mutex_lock_killable(l) ++#define mutex_trylock(l) _mutex_trylock(l) ++#define mutex_unlock(l) _mutex_unlock(l) ++#define mutex_lock_io(l) _mutex_lock_io_nested(l, 0); ++ ++#define __mutex_owner(l) ((l)->lock.owner) ++ ++#ifdef CONFIG_DEBUG_MUTEXES ++#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) ++#else ++static inline void mutex_destroy(struct mutex *lock) {} ++#endif ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) ++# define mutex_lock_interruptible_nested(l, s) \ ++ _mutex_lock_interruptible_nested(l, s) ++# define mutex_lock_killable_nested(l, s) \ ++ _mutex_lock_killable_nested(l, s) ++# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s) ++ ++# define mutex_lock_nest_lock(lock, nest_lock) \ ++do { \ ++ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ ++ _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ ++} while (0) ++ ++#else ++# define mutex_lock_nested(l, s) _mutex_lock(l) ++# define mutex_lock_interruptible_nested(l, s) \ ++ _mutex_lock_interruptible(l) ++# define mutex_lock_killable_nested(l, s) \ ++ _mutex_lock_killable(l) ++# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) ++# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s) ++#endif ++ ++# define mutex_init(mutex) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ rt_mutex_init(&(mutex)->lock); \ ++ __mutex_do_init((mutex), #mutex, &__key); \ ++} while (0) ++ ++# define __mutex_init(mutex, name, key) \ ++do { \ ++ rt_mutex_init(&(mutex)->lock); \ ++ __mutex_do_init((mutex), name, key); \ ++} while (0) ++ ++/** ++ * These values are chosen such that FAIL and SUCCESS match the ++ * values of the regular mutex_trylock(). ++ */ ++enum mutex_trylock_recursive_enum { ++ MUTEX_TRYLOCK_FAILED = 0, ++ MUTEX_TRYLOCK_SUCCESS = 1, ++ MUTEX_TRYLOCK_RECURSIVE, ++}; ++/** ++ * mutex_trylock_recursive - trylock variant that allows recursive locking ++ * @lock: mutex to be locked ++ * ++ * This function should not be used, _ever_. It is purely for hysterical GEM ++ * raisins, and once those are gone this will be removed. ++ * ++ * Returns: ++ * MUTEX_TRYLOCK_FAILED - trylock failed, ++ * MUTEX_TRYLOCK_SUCCESS - lock acquired, ++ * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. 
++ */ ++int __rt_mutex_owner_current(struct rt_mutex *lock); ++ ++static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum ++mutex_trylock_recursive(struct mutex *lock) ++{ ++ if (unlikely(__rt_mutex_owner_current(&lock->lock))) ++ return MUTEX_TRYLOCK_RECURSIVE; ++ ++ return mutex_trylock(lock); ++} ++ ++extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); ++ ++#endif +diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h +index 5491ad5f4..cd9e5b3f1 100644 +--- a/include/linux/nfs_xdr.h ++++ b/include/linux/nfs_xdr.h +@@ -1675,7 +1675,7 @@ struct nfs_unlinkdata { + struct nfs_removeargs args; + struct nfs_removeres res; + struct dentry *dentry; +- wait_queue_head_t wq; ++ struct swait_queue_head wq; + const struct cred *cred; + struct nfs_fattr dir_attr; + long timeout; +diff --git a/include/linux/notifier.h b/include/linux/notifier.h +index 2fb373a5c..723bc2df6 100644 +--- a/include/linux/notifier.h ++++ b/include/linux/notifier.h +@@ -58,7 +58,7 @@ struct notifier_block { + }; + + struct atomic_notifier_head { +- spinlock_t lock; ++ raw_spinlock_t lock; + struct notifier_block __rcu *head; + }; + +@@ -78,7 +78,7 @@ struct srcu_notifier_head { + }; + + #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ +- spin_lock_init(&(name)->lock); \ ++ raw_spin_lock_init(&(name)->lock); \ + (name)->head = NULL; \ + } while (0) + #define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \ +@@ -95,7 +95,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); + cleanup_srcu_struct(&(name)->srcu); + + #define ATOMIC_NOTIFIER_INIT(name) { \ +- .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ ++ .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ + .head = NULL } + #define BLOCKING_NOTIFIER_INIT(name) { \ + .rwsem = __RWSEM_INITIALIZER((name).rwsem), \ +diff --git a/include/linux/pid.h b/include/linux/pid.h +index 34afff2dc..514dd026c 100644 +--- a/include/linux/pid.h ++++ b/include/linux/pid.h +@@ -3,6 +3,7 @@ + #define _LINUX_PID_H + + #include ++#include + #include + #include + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 7d9c1c0e1..7b5b2ed55 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -77,31 +77,37 @@ + /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ + #include + ++#define nmi_count() (preempt_count() & NMI_MASK) + #define hardirq_count() (preempt_count() & HARDIRQ_MASK) +-#define softirq_count() (preempt_count() & SOFTIRQ_MASK) +-#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ +- | NMI_MASK)) ++#ifdef CONFIG_PREEMPT_RT ++# define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) ++#else ++# define softirq_count() (preempt_count() & SOFTIRQ_MASK) ++#endif ++#define irq_count() (nmi_count() | hardirq_count() | softirq_count()) + + /* +- * Are we doing bottom half or hardware interrupt processing? 
++ * Macros to retrieve the current execution context: + * +- * in_irq() - We're in (hard) IRQ context ++ * in_nmi() - We're in NMI context ++ * in_hardirq() - We're in hard IRQ context ++ * in_serving_softirq() - We're in softirq context ++ * in_task() - We're in task context ++ */ ++#define in_nmi() (nmi_count()) ++#define in_hardirq() (hardirq_count()) ++#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) ++#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq())) ++ ++/* ++ * The following macros are deprecated and should not be used in new code: ++ * in_irq() - Obsolete version of in_hardirq() + * in_softirq() - We have BH disabled, or are processing softirqs + * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled +- * in_serving_softirq() - We're in softirq context +- * in_nmi() - We're in NMI context +- * in_task() - We're in task context +- * +- * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really +- * should not be used in new code. + */ + #define in_irq() (hardirq_count()) + #define in_softirq() (softirq_count()) + #define in_interrupt() (irq_count()) +-#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) +-#define in_nmi() (preempt_count() & NMI_MASK) +-#define in_task() (!(preempt_count() & \ +- (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + + /* + * The preempt_count offset after preempt_disable(); +@@ -115,7 +121,11 @@ + /* + * The preempt_count offset after spin_lock() + */ ++#if !defined(CONFIG_PREEMPT_RT) + #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET ++#else ++#define PREEMPT_LOCK_OFFSET 0 ++#endif + + /* + * The preempt_count offset needed for things like: +@@ -164,6 +174,20 @@ extern void preempt_count_sub(int val); + #define preempt_count_inc() preempt_count_add(1) + #define preempt_count_dec() preempt_count_sub(1) + ++#ifdef CONFIG_PREEMPT_LAZY ++#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) ++#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) ++#define inc_preempt_lazy_count() add_preempt_lazy_count(1) ++#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) ++#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) ++#else ++#define add_preempt_lazy_count(val) do { } while (0) ++#define sub_preempt_lazy_count(val) do { } while (0) ++#define inc_preempt_lazy_count() do { } while (0) ++#define dec_preempt_lazy_count() do { } while (0) ++#define preempt_lazy_count() (0) ++#endif ++ + #ifdef CONFIG_PREEMPT_COUNT + + #define preempt_disable() \ +@@ -172,13 +196,25 @@ do { \ + barrier(); \ + } while (0) + ++#define preempt_lazy_disable() \ ++do { \ ++ inc_preempt_lazy_count(); \ ++ barrier(); \ ++} while (0) ++ + #define sched_preempt_enable_no_resched() \ + do { \ + barrier(); \ + preempt_count_dec(); \ + } while (0) + +-#define preempt_enable_no_resched() sched_preempt_enable_no_resched() ++#ifndef CONFIG_PREEMPT_RT ++# define preempt_enable_no_resched() sched_preempt_enable_no_resched() ++# define preempt_check_resched_rt() barrier(); ++#else ++# define preempt_enable_no_resched() preempt_enable() ++# define preempt_check_resched_rt() preempt_check_resched() ++#endif + + #define preemptible() (preempt_count() == 0 && !irqs_disabled()) + +@@ -203,6 +239,18 @@ do { \ + __preempt_schedule(); \ + } while (0) + ++/* ++ * open code preempt_check_resched() because it is not exported to modules and ++ * used by local_unlock() or bpf_enable_instrumentation(). 
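/*
 * Editor's illustration (not part of the applied patch): the reworked
 * context macros in use.  demo_context_name() is a hypothetical helper; it
 * relies only on the in_nmi()/in_hardirq()/in_serving_softirq()/in_task()
 * definitions added above.
 */
#include <linux/preempt.h>

static const char *demo_context_name(void)
{
	if (in_nmi())
		return "nmi";
	if (in_hardirq())
		return "hardirq";
	if (in_serving_softirq())
		return "softirq";
	return "task";	/* neither NMI, hard IRQ nor serving softirq */
}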
++ */ ++#define preempt_lazy_enable() \ ++do { \ ++ dec_preempt_lazy_count(); \ ++ barrier(); \ ++ if (should_resched(0)) \ ++ __preempt_schedule(); \ ++} while (0) ++ + #else /* !CONFIG_PREEMPTION */ + #define preempt_enable() \ + do { \ +@@ -210,6 +258,12 @@ do { \ + preempt_count_dec(); \ + } while (0) + ++#define preempt_lazy_enable() \ ++do { \ ++ dec_preempt_lazy_count(); \ ++ barrier(); \ ++} while (0) ++ + #define preempt_enable_notrace() \ + do { \ + barrier(); \ +@@ -248,8 +302,12 @@ do { \ + #define preempt_disable_notrace() barrier() + #define preempt_enable_no_resched_notrace() barrier() + #define preempt_enable_notrace() barrier() ++#define preempt_check_resched_rt() barrier() + #define preemptible() 0 + ++#define preempt_lazy_disable() barrier() ++#define preempt_lazy_enable() barrier() ++ + #endif /* CONFIG_PREEMPT_COUNT */ + + #ifdef MODULE +@@ -268,10 +326,22 @@ do { \ + } while (0) + #define preempt_fold_need_resched() \ + do { \ +- if (tif_need_resched()) \ ++ if (tif_need_resched_now()) \ + set_preempt_need_resched(); \ + } while (0) + ++#ifdef CONFIG_PREEMPT_RT ++# define preempt_disable_rt() preempt_disable() ++# define preempt_enable_rt() preempt_enable() ++# define preempt_disable_nort() barrier() ++# define preempt_enable_nort() barrier() ++#else ++# define preempt_disable_rt() barrier() ++# define preempt_enable_rt() barrier() ++# define preempt_disable_nort() preempt_disable() ++# define preempt_enable_nort() preempt_enable() ++#endif ++ + #ifdef CONFIG_PREEMPT_NOTIFIERS + + struct preempt_notifier; +@@ -322,34 +392,78 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, + + #endif + +-/** +- * migrate_disable - Prevent migration of the current task ++#ifdef CONFIG_SMP ++ ++/* ++ * Migrate-Disable and why it is undesired. + * +- * Maps to preempt_disable() which also disables preemption. Use +- * migrate_disable() to annotate that the intent is to prevent migration, +- * but not necessarily preemption. ++ * When a preempted task becomes elegible to run under the ideal model (IOW it ++ * becomes one of the M highest priority tasks), it might still have to wait ++ * for the preemptee's migrate_disable() section to complete. Thereby suffering ++ * a reduction in bandwidth in the exact duration of the migrate_disable() ++ * section. + * +- * Can be invoked nested like preempt_disable() and needs the corresponding +- * number of migrate_enable() invocations. +- */ +-static __always_inline void migrate_disable(void) +-{ +- preempt_disable(); +-} +- +-/** +- * migrate_enable - Allow migration of the current task ++ * Per this argument, the change from preempt_disable() to migrate_disable() ++ * gets us: ++ * ++ * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() ++ * it would have had to wait for the lower priority task. ++ * ++ * - a lower priority tasks; which under preempt_disable() could've instantly ++ * migrated away when another CPU becomes available, is now constrained ++ * by the ability to push the higher priority task away, which might itself be ++ * in a migrate_disable() section, reducing it's available bandwidth. ++ * ++ * IOW it trades latency / moves the interference term, but it stays in the ++ * system, and as long as it remains unbounded, the system is not fully ++ * deterministic. + * +- * Counterpart to migrate_disable(). + * +- * As migrate_disable() can be invoked nested, only the outermost invocation +- * reenables migration. ++ * The reason we have it anyway. 
++ * ++ * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a ++ * number of primitives into becoming preemptible, they would also allow ++ * migration. This turns out to break a bunch of per-cpu usage. To this end, ++ * all these primitives employ migirate_disable() to restore this implicit ++ * assumption. ++ * ++ * This is a 'temporary' work-around at best. The correct solution is getting ++ * rid of the above assumptions and reworking the code to employ explicit ++ * per-cpu locking or short preempt-disable regions. ++ * ++ * The end goal must be to get rid of migrate_disable(), alternatively we need ++ * a schedulability theory that does not depend on abritrary migration. ++ * ++ * ++ * Notes on the implementation. ++ * ++ * The implementation is particularly tricky since existing code patterns ++ * dictate neither migrate_disable() nor migrate_enable() is allowed to block. ++ * This means that it cannot use cpus_read_lock() to serialize against hotplug, ++ * nor can it easily migrate itself into a pending affinity mask change on ++ * migrate_enable(). ++ * ++ * ++ * Note: even non-work-conserving schedulers like semi-partitioned depends on ++ * migration, so migrate_disable() is not only a problem for ++ * work-conserving schedulers. + * +- * Currently mapped to preempt_enable(). + */ +-static __always_inline void migrate_enable(void) ++extern void migrate_disable(void); ++extern void migrate_enable(void); ++ ++#else ++ ++static inline void migrate_disable(void) + { +- preempt_enable(); ++ preempt_lazy_disable(); + } + ++static inline void migrate_enable(void) ++{ ++ preempt_lazy_enable(); ++} ++ ++#endif /* CONFIG_SMP */ ++ + #endif /* __LINUX_PREEMPT_H */ +diff --git a/include/linux/printk.h b/include/linux/printk.h +index 7d787f91d..9331b131b 100644 +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -46,6 +46,12 @@ static inline const char *printk_skip_headers(const char *buffer) + + #define CONSOLE_EXT_LOG_MAX 8192 + ++/* ++ * The maximum size of a record formatted for console printing ++ * (i.e. with the prefix prepended to every line). ++ */ ++#define CONSOLE_LOG_MAX 4096 ++ + /* printk's without a loglevel use this.. */ + #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT + +@@ -149,18 +155,6 @@ static inline __printf(1, 2) __cold + void early_printk(const char *s, ...) 
{ } + #endif + +-#ifdef CONFIG_PRINTK_NMI +-extern void printk_nmi_enter(void); +-extern void printk_nmi_exit(void); +-extern void printk_nmi_direct_enter(void); +-extern void printk_nmi_direct_exit(void); +-#else +-static inline void printk_nmi_enter(void) { } +-static inline void printk_nmi_exit(void) { } +-static inline void printk_nmi_direct_enter(void) { } +-static inline void printk_nmi_direct_exit(void) { } +-#endif /* PRINTK_NMI */ +- + #ifdef CONFIG_PRINTK + extern void printk_safe_enter(void); + extern void printk_safe_exit(void); +@@ -247,8 +241,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); + void dump_stack_print_info(const char *log_lvl); + void show_regs_print_info(const char *log_lvl); + extern asmlinkage void dump_stack(void) __cold; +-extern void printk_safe_flush(void); +-extern void printk_safe_flush_on_panic(void); + #if defined(CONFIG_X86) || defined(CONFIG_ARM64_PSEUDO_NMI) + extern void zap_locks(void); + #else +@@ -318,14 +310,6 @@ static inline void dump_stack(void) + { + } + +-static inline void printk_safe_flush(void) +-{ +-} +- +-static inline void printk_safe_flush_on_panic(void) +-{ +-} +- + static inline void zap_locks(void) + { + } +@@ -546,6 +530,8 @@ extern int kptr_restrict; + no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) + #endif + ++bool pr_flush(int timeout_ms, bool reset_on_progress); ++ + /* + * ratelimited messages with local ratelimit_state, + * no local ratelimit_state used in the !PRINTK case +diff --git a/include/linux/random.h b/include/linux/random.h +index f45b8be3e..0e41d0527 100644 +--- a/include/linux/random.h ++++ b/include/linux/random.h +@@ -35,7 +35,7 @@ static inline void add_latent_entropy(void) {} + + extern void add_input_randomness(unsigned int type, unsigned int code, + unsigned int value) __latent_entropy; +-extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; ++extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy; + + extern void get_random_bytes(void *buf, int nbytes); + extern int wait_for_random_bytes(void); +diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h +index d7db17996..c33b0e16d 100644 +--- a/include/linux/rbtree.h ++++ b/include/linux/rbtree.h +@@ -19,19 +19,9 @@ + + #include + #include ++#include + #include + +-struct rb_node { +- unsigned long __rb_parent_color; +- struct rb_node *rb_right; +- struct rb_node *rb_left; +-} __attribute__((aligned(sizeof(long)))); +- /* The alignment might seem pointless, but allegedly CRIS needs it */ +- +-struct rb_root { +- struct rb_node *rb_node; +-}; +- + #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) + + #define RB_ROOT (struct rb_root) { NULL, } +@@ -112,21 +102,6 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent + typeof(*pos), field); 1; }); \ + pos = n) + +-/* +- * Leftmost-cached rbtrees. +- * +- * We do not cache the rightmost node based on footprint +- * size vs number of potential users that could benefit +- * from O(1) rb_last(). Just not worth it, users that want +- * this feature can always implement the logic explicitly. +- * Furthermore, users that want to cache both pointers may +- * find it a bit asymmetric, but that's ok. 
+- */ +-struct rb_root_cached { +- struct rb_root rb_root; +- struct rb_node *rb_leftmost; +-}; +- + #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } + + /* Same as rb_first(), but O(1) */ +diff --git a/include/linux/rbtree_type.h b/include/linux/rbtree_type.h +new file mode 100644 +index 000000000..77a89dd2c +--- /dev/null ++++ b/include/linux/rbtree_type.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++#ifndef _LINUX_RBTREE_TYPE_H ++#define _LINUX_RBTREE_TYPE_H ++ ++struct rb_node { ++ unsigned long __rb_parent_color; ++ struct rb_node *rb_right; ++ struct rb_node *rb_left; ++} __attribute__((aligned(sizeof(long)))); ++/* The alignment might seem pointless, but allegedly CRIS needs it */ ++ ++struct rb_root { ++ struct rb_node *rb_node; ++}; ++ ++/* ++ * Leftmost-cached rbtrees. ++ * ++ * We do not cache the rightmost node based on footprint ++ * size vs number of potential users that could benefit ++ * from O(1) rb_last(). Just not worth it, users that want ++ * this feature can always implement the logic explicitly. ++ * Furthermore, users that want to cache both pointers may ++ * find it a bit asymmetric, but that's ok. ++ */ ++struct rb_root_cached { ++ struct rb_root rb_root; ++ struct rb_node *rb_leftmost; ++}; ++ ++#endif +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index 095b3b39b..1effcae06 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -54,6 +54,11 @@ void __rcu_read_unlock(void); + * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. + */ + #define rcu_preempt_depth() (current->rcu_read_lock_nesting) ++#ifndef CONFIG_PREEMPT_RT ++#define sched_rcu_preempt_depth() rcu_preempt_depth() ++#else ++static inline int sched_rcu_preempt_depth(void) { return 0; } ++#endif + + #else /* #ifdef CONFIG_PREEMPT_RCU */ + +@@ -79,6 +84,8 @@ static inline int rcu_preempt_depth(void) + return 0; + } + ++#define sched_rcu_preempt_depth() rcu_preempt_depth() ++ + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + + /* Internal to kernel */ +@@ -329,7 +336,8 @@ static inline void rcu_preempt_sleep_check(void) { } + #define rcu_sleep_check() \ + do { \ + rcu_preempt_sleep_check(); \ +- RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ ++ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ + "Illegal context switch in RCU-bh read-side critical section"); \ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ + "Illegal context switch in RCU-sched read-side critical section"); \ +diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index 6fd615a0e..b02009f53 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -14,11 +14,15 @@ + #define __LINUX_RT_MUTEX_H + + #include +-#include +-#include ++#include ++#include + + extern int max_lock_depth; /* for sysctl */ + ++#ifdef CONFIG_DEBUG_MUTEXES ++#include ++#endif ++ + /** + * The rt_mutex structure + * +@@ -31,12 +35,7 @@ struct rt_mutex { + raw_spinlock_t wait_lock; + struct rb_root_cached waiters; + struct task_struct *owner; +-#ifdef CONFIG_DEBUG_RT_MUTEXES + int save_state; +- const char *name, *file; +- int line; +- void *magic; +-#endif + #ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; + #endif +@@ -49,6 +48,7 @@ struct hrtimer_sleeper; + extern int rt_mutex_debug_check_no_locks_freed(const void *from, + unsigned long len); + extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task); ++ extern void rt_mutex_debug_task_free(struct task_struct 
*tsk); + #else + static inline int rt_mutex_debug_check_no_locks_freed(const void *from, + unsigned long len) +@@ -56,25 +56,15 @@ struct hrtimer_sleeper; + return 0; + } + # define rt_mutex_debug_check_no_locks_held(task) do { } while (0) ++# define rt_mutex_debug_task_free(t) do { } while (0) + #endif + +-#ifdef CONFIG_DEBUG_RT_MUTEXES +-# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ +- , .name = #mutexname, .file = __FILE__, .line = __LINE__ +- +-# define rt_mutex_init(mutex) \ ++#define rt_mutex_init(mutex) \ + do { \ + static struct lock_class_key __key; \ + __rt_mutex_init(mutex, __func__, &__key); \ + } while (0) + +- extern void rt_mutex_debug_task_free(struct task_struct *tsk); +-#else +-# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) +-# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL) +-# define rt_mutex_debug_task_free(t) do { } while (0) +-#endif +- + #ifdef CONFIG_DEBUG_LOCK_ALLOC + #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ + , .dep_map = { .name = #mutexname } +@@ -82,12 +72,19 @@ do { \ + #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) + #endif + +-#define __RT_MUTEX_INITIALIZER(mutexname) \ +- { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ ++#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ ++ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ + , .waiters = RB_ROOT_CACHED \ + , .owner = NULL \ +- __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ +- __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} ++ __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) ++ ++#define __RT_MUTEX_INITIALIZER(mutexname) \ ++ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ ++ , .save_state = 0 } ++ ++#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \ ++ { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ ++ , .save_state = 1 } + + #define DEFINE_RT_MUTEX(mutexname) \ + struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) +@@ -115,9 +112,6 @@ extern void rt_mutex_lock(struct rt_mutex *lock); + #endif + + extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); +-extern int rt_mutex_timed_lock(struct rt_mutex *lock, +- struct hrtimer_sleeper *timeout); +- + extern int rt_mutex_trylock(struct rt_mutex *lock); + + extern void rt_mutex_unlock(struct rt_mutex *lock); +diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h +new file mode 100644 +index 000000000..aafdb0a68 +--- /dev/null ++++ b/include/linux/rwlock_rt.h +@@ -0,0 +1,109 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_RWLOCK_RT_H ++#define __LINUX_RWLOCK_RT_H ++ ++#ifndef __LINUX_SPINLOCK_H ++#error Do not include directly. 
Use spinlock.h ++#endif ++ ++extern void __lockfunc rt_write_lock(rwlock_t *rwlock); ++extern void __lockfunc rt_read_lock(rwlock_t *rwlock); ++extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); ++extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); ++extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); ++extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); ++extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock); ++extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock); ++extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); ++ ++#define read_can_lock(rwlock) rt_read_can_lock(rwlock) ++#define write_can_lock(rwlock) rt_write_can_lock(rwlock) ++ ++#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) ++#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) ++ ++static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags) ++{ ++ *flags = 0; ++ return rt_write_trylock(lock); ++} ++ ++#define write_trylock_irqsave(lock, flags) \ ++ __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags))) ++ ++#define read_lock_irqsave(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ rt_read_lock(lock); \ ++ flags = 0; \ ++ } while (0) ++ ++#define write_lock_irqsave(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ rt_write_lock(lock); \ ++ flags = 0; \ ++ } while (0) ++ ++#define read_lock(lock) rt_read_lock(lock) ++ ++#define read_lock_bh(lock) \ ++ do { \ ++ local_bh_disable(); \ ++ rt_read_lock(lock); \ ++ } while (0) ++ ++#define read_lock_irq(lock) read_lock(lock) ++ ++#define write_lock(lock) rt_write_lock(lock) ++ ++#define write_lock_bh(lock) \ ++ do { \ ++ local_bh_disable(); \ ++ rt_write_lock(lock); \ ++ } while (0) ++ ++#define write_lock_irq(lock) write_lock(lock) ++ ++#define read_unlock(lock) rt_read_unlock(lock) ++ ++#define read_unlock_bh(lock) \ ++ do { \ ++ rt_read_unlock(lock); \ ++ local_bh_enable(); \ ++ } while (0) ++ ++#define read_unlock_irq(lock) read_unlock(lock) ++ ++#define write_unlock(lock) rt_write_unlock(lock) ++ ++#define write_unlock_bh(lock) \ ++ do { \ ++ rt_write_unlock(lock); \ ++ local_bh_enable(); \ ++ } while (0) ++ ++#define write_unlock_irq(lock) write_unlock(lock) ++ ++#define read_unlock_irqrestore(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ (void) flags; \ ++ rt_read_unlock(lock); \ ++ } while (0) ++ ++#define write_unlock_irqrestore(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ (void) flags; \ ++ rt_write_unlock(lock); \ ++ } while (0) ++ ++#define rwlock_init(rwl) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __rt_rwlock_init(rwl, #rwl, &__key); \ ++} while (0) ++ ++#endif +diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h +index 3bd03e180..0ad226b5d 100644 +--- a/include/linux/rwlock_types.h ++++ b/include/linux/rwlock_types.h +@@ -1,6 +1,10 @@ + #ifndef __LINUX_RWLOCK_TYPES_H + #define __LINUX_RWLOCK_TYPES_H + ++#if !defined(__LINUX_SPINLOCK_TYPES_H) ++# error "Do not include directly, include spinlock_types.h" ++#endif ++ + /* + * include/linux/rwlock_types.h - generic rwlock type definitions + * and initializers +diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h +new file mode 100644 +index 000000000..4762391d6 +--- /dev/null ++++ b/include/linux/rwlock_types_rt.h +@@ -0,0 +1,56 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_RWLOCK_TYPES_RT_H ++#define __LINUX_RWLOCK_TYPES_RT_H ++ 
++#ifndef __LINUX_SPINLOCK_TYPES_H ++#error "Do not include directly. Include spinlock_types.h instead" ++#endif ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } ++#else ++# define RW_DEP_MAP_INIT(lockname) ++#endif ++ ++typedef struct rt_rw_lock rwlock_t; ++ ++#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name) ++ ++#define DEFINE_RWLOCK(name) \ ++ rwlock_t name = __RW_LOCK_UNLOCKED(name) ++ ++/* ++ * A reader biased implementation primarily for CPU pinning. ++ * ++ * Can be selected as general replacement for the single reader RT rwlock ++ * variant ++ */ ++struct rt_rw_lock { ++ struct rt_mutex rtmutex; ++ atomic_t readers; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++#define READER_BIAS (1U << 31) ++#define WRITER_BIAS (1U << 30) ++ ++#define __RWLOCK_RT_INITIALIZER(name) \ ++{ \ ++ .readers = ATOMIC_INIT(READER_BIAS), \ ++ .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex), \ ++ RW_DEP_MAP_INIT(name) \ ++} ++ ++void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name, ++ struct lock_class_key *key); ++ ++#define rwlock_biased_rt_init(rwlock) \ ++ do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __rwlock_biased_rt_init((rwlock), #rwlock, &__key); \ ++ } while (0) ++ ++#endif +diff --git a/include/linux/rwsem-rt.h b/include/linux/rwsem-rt.h +new file mode 100644 +index 000000000..0ba8aae9a +--- /dev/null ++++ b/include/linux/rwsem-rt.h +@@ -0,0 +1,70 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef _LINUX_RWSEM_RT_H ++#define _LINUX_RWSEM_RT_H ++ ++#ifndef _LINUX_RWSEM_H ++#error "Include rwsem.h" ++#endif ++ ++#include ++#include ++ ++#define READER_BIAS (1U << 31) ++#define WRITER_BIAS (1U << 30) ++ ++struct rw_semaphore { ++ atomic_t readers; ++ struct rt_mutex rtmutex; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++#define __RWSEM_INITIALIZER(name) \ ++{ \ ++ .readers = ATOMIC_INIT(READER_BIAS), \ ++ .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \ ++ RW_DEP_MAP_INIT(name) \ ++} ++ ++#define DECLARE_RWSEM(lockname) \ ++ struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) ++ ++extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, ++ struct lock_class_key *key); ++ ++#define __init_rwsem(sem, name, key) \ ++do { \ ++ rt_mutex_init(&(sem)->rtmutex); \ ++ __rwsem_init((sem), (name), (key)); \ ++} while (0) ++ ++#define init_rwsem(sem) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __init_rwsem((sem), #sem, &__key); \ ++} while (0) ++ ++static inline int rwsem_is_locked(struct rw_semaphore *sem) ++{ ++ return atomic_read(&sem->readers) != READER_BIAS; ++} ++ ++static inline int rwsem_is_contended(struct rw_semaphore *sem) ++{ ++ return atomic_read(&sem->readers) > 0; ++} ++ ++extern void __down_read(struct rw_semaphore *sem); ++extern int __down_read_interruptible(struct rw_semaphore *sem); ++extern int __down_read_killable(struct rw_semaphore *sem); ++extern int __down_read_trylock(struct rw_semaphore *sem); ++extern void __down_write(struct rw_semaphore *sem); ++extern int __must_check __down_write_killable(struct rw_semaphore *sem); ++extern int __down_write_trylock(struct rw_semaphore *sem); ++extern void __up_read(struct rw_semaphore *sem); ++extern void __up_write(struct rw_semaphore *sem); ++extern void __downgrade_write(struct rw_semaphore *sem); ++ ++#endif +diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h +index 4c715be48..9323af8a9 100644 +--- 
a/include/linux/rwsem.h ++++ b/include/linux/rwsem.h +@@ -16,6 +16,11 @@ + #include + #include + #include ++ ++#ifdef CONFIG_PREEMPT_RT ++#include ++#else /* PREEMPT_RT */ ++ + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER + #include + #endif +@@ -119,6 +124,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) + return !list_empty(&sem->wait_list); + } + ++#endif /* !PREEMPT_RT */ ++ ++/* ++ * The functions below are the same for all rwsem implementations including ++ * the RT specific variant. ++ */ ++ + /* + * lock for reading + */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 47f462040..4f0333bbe 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -114,12 +115,8 @@ struct io_uring_task; + __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ + TASK_PARKED) + +-#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) +- + #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) + +-#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) +- + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + + /* +@@ -143,6 +140,9 @@ struct io_uring_task; + smp_store_mb(current->state, (state_value)); \ + } while (0) + ++#define __set_current_state_no_track(state_value) \ ++ current->state = (state_value); ++ + #define set_special_state(state_value) \ + do { \ + unsigned long flags; /* may shadow */ \ +@@ -196,6 +196,9 @@ struct io_uring_task; + #define set_current_state(state_value) \ + smp_store_mb(current->state, (state_value)) + ++#define __set_current_state_no_track(state_value) \ ++ __set_current_state(state_value) ++ + /* + * set_special_state() should be used for those states when the blocking task + * can not use the regular condition based wait-loop. In that case we must +@@ -679,6 +682,13 @@ struct wake_q_node { + struct task_struct_resvd { + }; + ++struct kmap_ctrl { ++#ifdef CONFIG_KMAP_LOCAL ++ int idx; ++ pte_t pteval[KM_MAX_IDX]; ++#endif ++}; ++ + struct task_struct { + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* +@@ -689,6 +699,8 @@ struct task_struct { + #endif + /* -1 unrunnable, 0 runnable, >0 stopped: */ + volatile long state; ++ /* saved state for "spinlock sleepers" */ ++ volatile long saved_state; + + /* + * This begins the randomizable portion of task_struct. Only +@@ -760,6 +772,11 @@ struct task_struct { + int nr_cpus_allowed; + const cpumask_t *cpus_ptr; + cpumask_t cpus_mask; ++ void *migration_pending; ++#ifdef CONFIG_SMP ++ unsigned short migration_disabled; ++#endif ++ unsigned short migration_flags; + + #ifdef CONFIG_PREEMPT_RCU + int rcu_read_lock_nesting; +@@ -865,6 +882,10 @@ struct task_struct { + /* Stalled due to lack of memory */ + unsigned in_memstall:1; + #endif ++#ifdef CONFIG_EVENTFD ++ /* Recursion prevention for eventfd_signal() */ ++ unsigned in_eventfd_signal:1; ++#endif + + unsigned long atomic_flags; /* Flags requiring atomic access. */ + +@@ -1006,11 +1027,16 @@ struct task_struct { + /* Signal handlers: */ + struct signal_struct *signal; + struct sighand_struct __rcu *sighand; ++ struct sigqueue *sigqueue_cache; + sigset_t blocked; + sigset_t real_blocked; + /* Restored if set_restore_sigmask() was used: */ + sigset_t saved_sigmask; + struct sigpending pending; ++#ifdef CONFIG_PREEMPT_RT ++ /* TODO: move me into ->restart_block ? 
*/ ++ struct kernel_siginfo forced_info; ++#endif + unsigned long sas_ss_sp; + size_t sas_ss_size; + unsigned int sas_ss_flags; +@@ -1037,6 +1063,7 @@ struct task_struct { + raw_spinlock_t pi_lock; + + struct wake_q_node wake_q; ++ struct wake_q_node wake_q_sleeper; + + #ifdef CONFIG_RT_MUTEXES + /* PI waiters blocked on a rt_mutex held by this task: */ +@@ -1064,6 +1091,9 @@ struct task_struct { + int softirq_context; + int irq_config; + #endif ++#ifdef CONFIG_PREEMPT_RT ++ int softirq_disable_cnt; ++#endif + + #ifdef CONFIG_LOCKDEP + # define MAX_LOCK_DEPTH 48UL +@@ -1349,6 +1379,7 @@ struct task_struct { + unsigned int sequential_io; + unsigned int sequential_io_avg; + #endif ++ struct kmap_ctrl kmap_ctrl; + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; + #endif +@@ -1813,6 +1844,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr); + + extern int wake_up_state(struct task_struct *tsk, unsigned int state); + extern int wake_up_process(struct task_struct *tsk); ++extern int wake_up_lock_sleeper(struct task_struct *tsk); + extern void wake_up_new_task(struct task_struct *tsk); + + #ifdef CONFIG_SMP +@@ -1910,6 +1942,89 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); ++} ++ ++static inline int need_resched_lazy(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline int need_resched_now(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED); ++} ++ ++#else ++static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } ++static inline int need_resched_lazy(void) { return 0; } ++ ++static inline int need_resched_now(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED); ++} ++ ++#endif ++ ++ ++static inline bool __task_is_stopped_or_traced(struct task_struct *task) ++{ ++ if (task->state & (__TASK_STOPPED | __TASK_TRACED)) ++ return true; ++#ifdef CONFIG_PREEMPT_RT ++ if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) ++ return true; ++#endif ++ return false; ++} ++ ++static inline bool task_is_stopped_or_traced(struct task_struct *task) ++{ ++ bool traced_stopped; ++ ++#ifdef CONFIG_PREEMPT_RT ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ traced_stopped = __task_is_stopped_or_traced(task); ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++#else ++ traced_stopped = __task_is_stopped_or_traced(task); ++#endif ++ return traced_stopped; ++} ++ ++static inline bool task_is_traced(struct task_struct *task) ++{ ++ bool traced = false; ++ ++ if (task->state & __TASK_TRACED) ++ return true; ++#ifdef CONFIG_PREEMPT_RT ++ /* in case the task is sleeping on tasklist_lock */ ++ raw_spin_lock_irq(&task->pi_lock); ++ if (task->state & __TASK_TRACED) ++ traced = true; ++ else if (task->saved_state & __TASK_TRACED) ++ traced = true; ++ raw_spin_unlock_irq(&task->pi_lock); ++#endif ++ return traced; ++} ++ + /* + * cond_resched() and cond_resched_lock(): latency reduction via + * explicit rescheduling in places that are safe. 
The return +diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h +index 9a62ffdd2..412cdaba3 100644 +--- a/include/linux/sched/hotplug.h ++++ b/include/linux/sched/hotplug.h +@@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu); + extern int sched_cpu_deactivate(unsigned int cpu); + + #ifdef CONFIG_HOTPLUG_CPU ++extern int sched_cpu_wait_empty(unsigned int cpu); + extern int sched_cpu_dying(unsigned int cpu); + #else ++# define sched_cpu_wait_empty NULL + # define sched_cpu_dying NULL + #endif + +diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h +index dc1f4dcd9..9796cc213 100644 +--- a/include/linux/sched/mm.h ++++ b/include/linux/sched/mm.h +@@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm) + __mmdrop(mm); + } + ++#ifdef CONFIG_PREEMPT_RT ++extern void __mmdrop_delayed(struct rcu_head *rhp); ++static inline void mmdrop_delayed(struct mm_struct *mm) ++{ ++ if (atomic_dec_and_test(&mm->mm_count)) ++ call_rcu(&mm->delayed_drop, __mmdrop_delayed); ++} ++#else ++# define mmdrop_delayed(mm) mmdrop(mm) ++#endif ++ + /** + * mmget() - Pin the address space associated with a &struct mm_struct. + * @mm: The address space to pin. +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c0..994c25640 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -39,20 +39,12 @@ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) + } + extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); + extern void rt_mutex_adjust_pi(struct task_struct *p); +-static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +-{ +- return tsk->pi_blocked_on != NULL; +-} + #else + static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) + { + return NULL; + } + # define rt_mutex_adjust_pi(p) do { } while (0) +-static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +-{ +- return false; +-} + #endif + + extern void normalize_rt_tasks(void); +diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h +index 26a2013ac..6e2dff721 100644 +--- a/include/linux/sched/wake_q.h ++++ b/include/linux/sched/wake_q.h +@@ -58,6 +58,17 @@ static inline bool wake_q_empty(struct wake_q_head *head) + + extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); + extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); +-extern void wake_up_q(struct wake_q_head *head); ++extern void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task); ++extern void __wake_up_q(struct wake_q_head *head, bool sleeper); ++ ++static inline void wake_up_q(struct wake_q_head *head) ++{ ++ __wake_up_q(head, false); ++} ++ ++static inline void wake_up_q_sleeper(struct wake_q_head *head) ++{ ++ __wake_up_q(head, true); ++} + + #endif /* _LINUX_SCHED_WAKE_Q_H */ +diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h +index 2b70f736b..68d756373 100644 +--- a/include/linux/serial_8250.h ++++ b/include/linux/serial_8250.h +@@ -7,6 +7,7 @@ + #ifndef _LINUX_SERIAL_8250_H + #define _LINUX_SERIAL_8250_H + ++#include + #include + #include + #include +@@ -125,6 +126,8 @@ struct uart_8250_port { + #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA + unsigned char msr_saved_flags; + ++ atomic_t console_printing; ++ + struct uart_8250_dma *dma; + const struct uart_8250_ops *ops; + +@@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); + void serial8250_set_defaults(struct uart_8250_port 
*up); + void serial8250_console_write(struct uart_8250_port *up, const char *s, + unsigned int count); ++void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, ++ unsigned int count); + int serial8250_console_setup(struct uart_port *port, char *options, bool probe); + int serial8250_console_exit(struct uart_port *port); + +diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h +index 93240799a..df2871ed8 100644 +--- a/include/linux/shmem_fs.h ++++ b/include/linux/shmem_fs.h +@@ -32,7 +32,7 @@ struct shmem_sb_info { + struct percpu_counter used_blocks; /* How many are allocated */ + unsigned long max_inodes; /* How many inodes are allowed */ + unsigned long free_inodes; /* How many are left for allocation */ +- spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ ++ raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + umode_t mode; /* Mount mode for root directory */ + unsigned char huge; /* Whether to try for hugepages */ + kuid_t uid; /* Mount uid for root directory */ +diff --git a/include/linux/signal.h b/include/linux/signal.h +index b256f9c65..ebf6c515a 100644 +--- a/include/linux/signal.h ++++ b/include/linux/signal.h +@@ -265,6 +265,7 @@ static inline void init_sigpending(struct sigpending *sig) + } + + extern void flush_sigqueue(struct sigpending *queue); ++extern void flush_task_sigqueue(struct task_struct *tsk); + + /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ + static inline int valid_signal(unsigned long sig) +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index 68efccc15..3cfa2988b 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -298,6 +298,7 @@ struct sk_buff_head { + + __u32 qlen; + spinlock_t lock; ++ raw_spinlock_t raw_lock; + }; + + struct sk_buff; +@@ -1913,6 +1914,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) + __skb_queue_head_init(list); + } + ++static inline void skb_queue_head_init_raw(struct sk_buff_head *list) ++{ ++ raw_spin_lock_init(&list->raw_lock); ++ __skb_queue_head_init(list); ++} ++ + static inline void skb_queue_head_init_class(struct sk_buff_head *list, + struct lock_class_key *class) + { +diff --git a/include/linux/smp.h b/include/linux/smp.h +index 84a0b4828..8348fa412 100644 +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -260,6 +260,9 @@ static inline int get_boot_cpu_id(void) + #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) + #define put_cpu() preempt_enable() + ++#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) ++#define put_cpu_light() migrate_enable() ++ + /* + * Callback to arch code if there's nosmp or maxcpus=0 on the + * boot command line: +diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h +index 79897841a..c3c70291b 100644 +--- a/include/linux/spinlock.h ++++ b/include/linux/spinlock.h +@@ -309,7 +309,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) + }) + + /* Include rwlock functions */ +-#include ++#ifdef CONFIG_PREEMPT_RT ++# include ++#else ++# include ++#endif + + /* + * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: +@@ -320,6 +324,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) + # include + #endif + ++#ifdef CONFIG_PREEMPT_RT ++# include ++#else /* PREEMPT_RT */ ++ + /* + * Map the spin_lock functions to the raw variants for PREEMPT_RT=n + */ +@@ -454,6 +462,8 @@ static __always_inline int spin_is_contended(spinlock_t *lock) + + 
#define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) + ++#endif /* !PREEMPT_RT */ ++ + /* + * Pull the atomic_t declaration: + * (asm-mips/atomic.h needs above definitions) +diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h +index 19a9be9d9..da38149f2 100644 +--- a/include/linux/spinlock_api_smp.h ++++ b/include/linux/spinlock_api_smp.h +@@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) + return 0; + } + +-#include ++#ifndef CONFIG_PREEMPT_RT ++# include ++#endif + + #endif /* __LINUX_SPINLOCK_API_SMP_H */ +diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h +new file mode 100644 +index 000000000..3085132ea +--- /dev/null ++++ b/include/linux/spinlock_rt.h +@@ -0,0 +1,155 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_SPINLOCK_RT_H ++#define __LINUX_SPINLOCK_RT_H ++ ++#ifndef __LINUX_SPINLOCK_H ++#error Do not include directly. Use spinlock.h ++#endif ++ ++#include ++ ++extern void ++__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key); ++ ++#define spin_lock_init(slock) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ rt_mutex_init(&(slock)->lock); \ ++ __rt_spin_lock_init(slock, #slock, &__key); \ ++} while (0) ++ ++extern void __lockfunc rt_spin_lock(spinlock_t *lock); ++extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); ++extern void __lockfunc rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock); ++extern void __lockfunc rt_spin_unlock(spinlock_t *lock); ++extern void __lockfunc rt_spin_lock_unlock(spinlock_t *lock); ++extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); ++extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock); ++extern int __lockfunc rt_spin_trylock(spinlock_t *lock); ++extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); ++ ++/* ++ * lockdep-less calls, for derived types like rwlock: ++ * (for trylock they can use rt_mutex_trylock() directly. ++ * Migrate disable handling must be done at the call site. 
++ */ ++extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); ++extern void __lockfunc __rt_spin_trylock(struct rt_mutex *lock); ++extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); ++ ++#define spin_lock(lock) rt_spin_lock(lock) ++ ++#define spin_lock_bh(lock) \ ++ do { \ ++ local_bh_disable(); \ ++ rt_spin_lock(lock); \ ++ } while (0) ++ ++#define spin_lock_irq(lock) spin_lock(lock) ++ ++#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) ++ ++#define spin_trylock(lock) \ ++({ \ ++ int __locked; \ ++ __locked = spin_do_trylock(lock); \ ++ __locked; \ ++}) ++ ++#ifdef CONFIG_LOCKDEP ++# define spin_lock_nested(lock, subclass) \ ++ do { \ ++ rt_spin_lock_nested(lock, subclass); \ ++ } while (0) ++ ++#define spin_lock_bh_nested(lock, subclass) \ ++ do { \ ++ local_bh_disable(); \ ++ rt_spin_lock_nested(lock, subclass); \ ++ } while (0) ++ ++# define spin_lock_nest_lock(lock, subclass) \ ++ do { \ ++ typecheck(struct lockdep_map *, &(subclass)->dep_map); \ ++ rt_spin_lock_nest_lock(lock, &(subclass)->dep_map); \ ++ } while (0) ++ ++# define spin_lock_irqsave_nested(lock, flags, subclass) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ flags = 0; \ ++ rt_spin_lock_nested(lock, subclass); \ ++ } while (0) ++#else ++# define spin_lock_nested(lock, subclass) spin_lock(((void)(subclass), (lock))) ++# define spin_lock_nest_lock(lock, subclass) spin_lock(((void)(subclass), (lock))) ++# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(((void)(subclass), (lock))) ++ ++# define spin_lock_irqsave_nested(lock, flags, subclass) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ flags = 0; \ ++ spin_lock(((void)(subclass), (lock))); \ ++ } while (0) ++#endif ++ ++#define spin_lock_irqsave(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ flags = 0; \ ++ spin_lock(lock); \ ++ } while (0) ++ ++#define spin_unlock(lock) rt_spin_unlock(lock) ++ ++#define spin_unlock_bh(lock) \ ++ do { \ ++ rt_spin_unlock(lock); \ ++ local_bh_enable(); \ ++ } while (0) ++ ++#define spin_unlock_irq(lock) spin_unlock(lock) ++ ++#define spin_unlock_irqrestore(lock, flags) \ ++ do { \ ++ typecheck(unsigned long, flags); \ ++ (void) flags; \ ++ spin_unlock(lock); \ ++ } while (0) ++ ++#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock)) ++#define spin_trylock_irq(lock) spin_trylock(lock) ++ ++#define spin_trylock_irqsave(lock, flags) \ ++({ \ ++ int __locked; \ ++ \ ++ typecheck(unsigned long, flags); \ ++ flags = 0; \ ++ __locked = spin_trylock(lock); \ ++ __locked; \ ++}) ++ ++#ifdef CONFIG_GENERIC_LOCKBREAK ++# define spin_is_contended(lock) ((lock)->break_lock) ++#else ++# define spin_is_contended(lock) (((void)(lock), 0)) ++#endif ++ ++static inline int spin_can_lock(spinlock_t *lock) ++{ ++ return !rt_mutex_is_locked(&lock->lock); ++} ++ ++static inline int spin_is_locked(spinlock_t *lock) ++{ ++ return rt_mutex_is_locked(&lock->lock); ++} ++ ++static inline void assert_spin_locked(spinlock_t *lock) ++{ ++ BUG_ON(!spin_is_locked(lock)); ++} ++ ++#endif +diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h +index b981caafe..8d896d3e1 100644 +--- a/include/linux/spinlock_types.h ++++ b/include/linux/spinlock_types.h +@@ -9,93 +9,15 @@ + * Released under the General Public License (GPL). 
+ */ + +-#if defined(CONFIG_SMP) +-# include +-#else +-# include +-#endif +- +-#include +- +-typedef struct raw_spinlock { +- arch_spinlock_t raw_lock; +-#ifdef CONFIG_DEBUG_SPINLOCK +- unsigned int magic, owner_cpu; +- void *owner; +-#endif +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +- struct lockdep_map dep_map; +-#endif +-} raw_spinlock_t; +- +-#define SPINLOCK_MAGIC 0xdead4ead +- +-#define SPINLOCK_OWNER_INIT ((void *)-1L) +- +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +-# define RAW_SPIN_DEP_MAP_INIT(lockname) \ +- .dep_map = { \ +- .name = #lockname, \ +- .wait_type_inner = LD_WAIT_SPIN, \ +- } +-# define SPIN_DEP_MAP_INIT(lockname) \ +- .dep_map = { \ +- .name = #lockname, \ +- .wait_type_inner = LD_WAIT_CONFIG, \ +- } +-#else +-# define RAW_SPIN_DEP_MAP_INIT(lockname) +-# define SPIN_DEP_MAP_INIT(lockname) +-#endif ++#include + +-#ifdef CONFIG_DEBUG_SPINLOCK +-# define SPIN_DEBUG_INIT(lockname) \ +- .magic = SPINLOCK_MAGIC, \ +- .owner_cpu = -1, \ +- .owner = SPINLOCK_OWNER_INIT, ++#ifndef CONFIG_PREEMPT_RT ++# include ++# include + #else +-# define SPIN_DEBUG_INIT(lockname) ++# include ++# include ++# include + #endif + +-#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ +- { \ +- .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ +- SPIN_DEBUG_INIT(lockname) \ +- RAW_SPIN_DEP_MAP_INIT(lockname) } +- +-#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ +- (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) +- +-#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) +- +-typedef struct spinlock { +- union { +- struct raw_spinlock rlock; +- +-#ifdef CONFIG_DEBUG_LOCK_ALLOC +-# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) +- struct { +- u8 __padding[LOCK_PADSIZE]; +- struct lockdep_map dep_map; +- }; +-#endif +- }; +-} spinlock_t; +- +-#define ___SPIN_LOCK_INITIALIZER(lockname) \ +- { \ +- .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ +- SPIN_DEBUG_INIT(lockname) \ +- SPIN_DEP_MAP_INIT(lockname) } +- +-#define __SPIN_LOCK_INITIALIZER(lockname) \ +- { { .rlock = ___SPIN_LOCK_INITIALIZER(lockname) } } +- +-#define __SPIN_LOCK_UNLOCKED(lockname) \ +- (spinlock_t) __SPIN_LOCK_INITIALIZER(lockname) +- +-#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) +- +-#include +- + #endif /* __LINUX_SPINLOCK_TYPES_H */ +diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h +new file mode 100644 +index 000000000..e4549f0dd +--- /dev/null ++++ b/include/linux/spinlock_types_nort.h +@@ -0,0 +1,39 @@ ++#ifndef __LINUX_SPINLOCK_TYPES_NORT_H ++#define __LINUX_SPINLOCK_TYPES_NORT_H ++ ++#ifndef __LINUX_SPINLOCK_TYPES_H ++#error "Do not include directly. 
Include spinlock_types.h instead" ++#endif ++ ++/* ++ * The non RT version maps spinlocks to raw_spinlocks ++ */ ++typedef struct spinlock { ++ union { ++ struct raw_spinlock rlock; ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) ++ struct { ++ u8 __padding[LOCK_PADSIZE]; ++ struct lockdep_map dep_map; ++ }; ++#endif ++ }; ++} spinlock_t; ++ ++#define ___SPIN_LOCK_INITIALIZER(lockname) \ ++{ \ ++ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ ++ SPIN_DEBUG_INIT(lockname) \ ++ SPIN_DEP_MAP_INIT(lockname) } ++ ++#define __SPIN_LOCK_INITIALIZER(lockname) \ ++ { { .rlock = ___SPIN_LOCK_INITIALIZER(lockname) } } ++ ++#define __SPIN_LOCK_UNLOCKED(lockname) \ ++ (spinlock_t) __SPIN_LOCK_INITIALIZER(lockname) ++ ++#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) ++ ++#endif +diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h +new file mode 100644 +index 000000000..1d4a180e9 +--- /dev/null ++++ b/include/linux/spinlock_types_raw.h +@@ -0,0 +1,65 @@ ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H ++#define __LINUX_SPINLOCK_TYPES_RAW_H ++ ++#include ++ ++#if defined(CONFIG_SMP) ++# include ++#else ++# include ++#endif ++ ++#include ++ ++typedef struct raw_spinlock { ++ arch_spinlock_t raw_lock; ++#ifdef CONFIG_DEBUG_SPINLOCK ++ unsigned int magic, owner_cpu; ++ void *owner; ++#endif ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++} raw_spinlock_t; ++ ++#define SPINLOCK_MAGIC 0xdead4ead ++ ++#define SPINLOCK_OWNER_INIT ((void *)-1L) ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++# define RAW_SPIN_DEP_MAP_INIT(lockname) \ ++ .dep_map = { \ ++ .name = #lockname, \ ++ .wait_type_inner = LD_WAIT_SPIN, \ ++ } ++# define SPIN_DEP_MAP_INIT(lockname) \ ++ .dep_map = { \ ++ .name = #lockname, \ ++ .wait_type_inner = LD_WAIT_CONFIG, \ ++ } ++#else ++# define RAW_SPIN_DEP_MAP_INIT(lockname) ++# define SPIN_DEP_MAP_INIT(lockname) ++#endif ++ ++#ifdef CONFIG_DEBUG_SPINLOCK ++# define SPIN_DEBUG_INIT(lockname) \ ++ .magic = SPINLOCK_MAGIC, \ ++ .owner_cpu = -1, \ ++ .owner = SPINLOCK_OWNER_INIT, ++#else ++# define SPIN_DEBUG_INIT(lockname) ++#endif ++ ++#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ ++{ \ ++ .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ ++ SPIN_DEBUG_INIT(lockname) \ ++ RAW_SPIN_DEP_MAP_INIT(lockname) } ++ ++#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ ++ (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) ++ ++#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) ++ ++#endif +diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h +new file mode 100644 +index 000000000..446da786e +--- /dev/null ++++ b/include/linux/spinlock_types_rt.h +@@ -0,0 +1,38 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#ifndef __LINUX_SPINLOCK_TYPES_RT_H ++#define __LINUX_SPINLOCK_TYPES_RT_H ++ ++#ifndef __LINUX_SPINLOCK_TYPES_H ++#error "Do not include directly. 
Include spinlock_types.h instead" ++#endif ++ ++#include ++ ++/* ++ * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field: ++ */ ++typedef struct spinlock { ++ struct rt_mutex lock; ++ unsigned int break_lock; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++} spinlock_t; ++ ++#define __RT_SPIN_INITIALIZER(name) \ ++ { \ ++ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ ++ .save_state = 1, \ ++ } ++/* ++.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock) ++*/ ++ ++#define __SPIN_LOCK_UNLOCKED(name) \ ++ { .lock = __RT_SPIN_INITIALIZER(name.lock), \ ++ SPIN_DEP_MAP_INIT(name) } ++ ++#define DEFINE_SPINLOCK(name) \ ++ spinlock_t name = __SPIN_LOCK_UNLOCKED(name) ++ ++#endif +diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h +index c09b6407a..d9b371fa1 100644 +--- a/include/linux/spinlock_types_up.h ++++ b/include/linux/spinlock_types_up.h +@@ -1,7 +1,7 @@ + #ifndef __LINUX_SPINLOCK_TYPES_UP_H + #define __LINUX_SPINLOCK_TYPES_UP_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H) + # error "please don't include this file directly" + #endif + +diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h +index 08ec8e2fd..8eac0050b 100644 +--- a/include/linux/stop_machine.h ++++ b/include/linux/stop_machine.h +@@ -25,6 +25,7 @@ typedef int (*cpu_stop_fn_t)(void *arg); + struct cpu_stop_work { + struct list_head list; /* cpu_stopper->works */ + cpu_stop_fn_t fn; ++ unsigned long caller; + void *arg; + struct cpu_stop_done *done; + KABI_RESERVE(1) +@@ -38,6 +39,8 @@ void stop_machine_park(int cpu); + void stop_machine_unpark(int cpu); + void stop_machine_yield(const struct cpumask *cpumask); + ++extern void print_stop_info(const char *log_lvl, struct task_struct *task); ++ + #else /* CONFIG_SMP */ + + #include +@@ -82,6 +85,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, + return false; + } + ++static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { } ++ + #endif /* CONFIG_SMP */ + + /* +diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h +index 19f76d87f..1c89c7d9f 100644 +--- a/include/linux/thread_info.h ++++ b/include/linux/thread_info.h +@@ -36,7 +36,17 @@ static inline long set_restart_fn(struct restart_block *restart, + + #define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) + +-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) ++#ifdef CONFIG_PREEMPT_LAZY ++#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ ++ test_thread_flag(TIF_NEED_RESCHED_LAZY)) ++#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) ++#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY)) ++ ++#else ++#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) ++#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) ++#define tif_need_resched_lazy() 0 ++#endif + + #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES + static inline int arch_within_stack_frames(const void * const stack, +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index 409385b25..4a0f567d6 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -67,6 +67,8 @@ struct trace_entry { + unsigned char flags; + unsigned char preempt_count; + int pid; ++ unsigned char migrate_disable; ++ unsigned char preempt_lazy_count; + }; + + #define TRACE_EVENT_TYPE_MAX \ +@@ -152,17 +154,66 @@ static inline void 
tracing_generic_entry_update(struct trace_entry *entry, + unsigned short type, + unsigned int trace_ctx) + { +- struct task_struct *tsk = current; +- + entry->preempt_count = trace_ctx & 0xff; +- entry->pid = (tsk) ? tsk->pid : 0; ++ entry->migrate_disable = (trace_ctx >> 8) & 0xff; ++ entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; ++ entry->pid = current->pid; + entry->type = type; +- entry->flags = trace_ctx >> 16; ++ entry->flags = trace_ctx >> 24; + } + +-unsigned int tracing_gen_ctx_flags(unsigned long irqflags); +-unsigned int tracing_gen_ctx(void); +-unsigned int tracing_gen_ctx_dec(void); ++unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); ++ ++enum trace_flag_type { ++ TRACE_FLAG_IRQS_OFF = 0x01, ++ TRACE_FLAG_IRQS_NOSUPPORT = 0x02, ++ TRACE_FLAG_NEED_RESCHED = 0x04, ++ TRACE_FLAG_HARDIRQ = 0x08, ++ TRACE_FLAG_SOFTIRQ = 0x10, ++ TRACE_FLAG_PREEMPT_RESCHED = 0x20, ++ TRACE_FLAG_NMI = 0x40, ++ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, ++}; ++ ++#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT ++static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) ++{ ++ unsigned int irq_status = irqs_disabled_flags(irqflags) ? ++ TRACE_FLAG_IRQS_OFF : 0; ++ return tracing_gen_ctx_irq_test(irq_status); ++} ++static inline unsigned int tracing_gen_ctx(void) ++{ ++ unsigned long irqflags; ++ ++ local_save_flags(irqflags); ++ return tracing_gen_ctx_flags(irqflags); ++} ++#else ++ ++static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) ++{ ++ return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++} ++static inline unsigned int tracing_gen_ctx(void) ++{ ++ return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++} ++#endif ++ ++static inline unsigned int tracing_gen_ctx_dec(void) ++{ ++ unsigned int trace_ctx; ++ ++ trace_ctx = tracing_gen_ctx(); ++ /* ++ * Subtract one from the preeption counter if preemption is enabled, ++ * see trace_event_buffer_reserve()for details. 
++ */ ++ if (IS_ENABLED(CONFIG_PREEMPTION)) ++ trace_ctx--; ++ return trace_ctx; ++} + + struct trace_event_file; + +diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h +index e81856c0b..66eb968a0 100644 +--- a/include/linux/u64_stats_sync.h ++++ b/include/linux/u64_stats_sync.h +@@ -66,7 +66,7 @@ + #include + + struct u64_stats_sync { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG==32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + seqcount_t seq; + #endif + }; +@@ -115,7 +115,7 @@ static inline void u64_stats_inc(u64_stats_t *p) + } + #endif + +-#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) + #else + static inline void u64_stats_init(struct u64_stats_sync *syncp) +@@ -125,15 +125,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) + + static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); + write_seqcount_begin(&syncp->seq); + #endif + } + + static inline void u64_stats_update_end(struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + write_seqcount_end(&syncp->seq); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); + #endif + } + +@@ -142,8 +146,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) + { + unsigned long flags = 0; + +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +- local_irq_save(flags); ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); ++ else ++ local_irq_save(flags); + write_seqcount_begin(&syncp->seq); + #endif + return flags; +@@ -153,15 +160,18 @@ static inline void + u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + write_seqcount_end(&syncp->seq); +- local_irq_restore(flags); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); ++ else ++ local_irq_restore(flags); + #endif + } + + static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + return read_seqcount_begin(&syncp->seq); + #else + return 0; +@@ -170,7 +180,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync * + + static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) + preempt_disable(); + #endif + return __u64_stats_fetch_begin(syncp); +@@ -179,7 +189,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy + static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + return read_seqcount_retry(&syncp->seq, start); + #else + return false; +@@ 
-189,7 +199,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) + preempt_enable(); + #endif + return __u64_stats_fetch_retry(syncp, start); +@@ -203,7 +213,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + */ + static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) ++ preempt_disable(); ++#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) + local_irq_disable(); + #endif + return __u64_stats_fetch_begin(syncp); +@@ -212,7 +224,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync + static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) ++ preempt_enable(); ++#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) + local_irq_enable(); + #endif + return __u64_stats_fetch_retry(syncp, start); +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 322dcbfcc..9a3a10ea3 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -63,7 +63,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); + */ + static inline void __count_vm_event(enum vm_event_item item) + { ++ preempt_disable_rt(); + raw_cpu_inc(vm_event_states.event[item]); ++ preempt_enable_rt(); + } + + static inline void count_vm_event(enum vm_event_item item) +@@ -73,7 +75,9 @@ static inline void count_vm_event(enum vm_event_item item) + + static inline void __count_vm_events(enum vm_event_item item, long delta) + { ++ preempt_disable_rt(); + raw_cpu_add(vm_event_states.event[item], delta); ++ preempt_enable_rt(); + } + + static inline void count_vm_events(enum vm_event_item item, long delta) +diff --git a/include/linux/vtime.h b/include/linux/vtime.h +index 2cdeca062..041d6524d 100644 +--- a/include/linux/vtime.h ++++ b/include/linux/vtime.h +@@ -83,36 +83,46 @@ static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } + #endif + + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +-extern void vtime_account_irq_enter(struct task_struct *tsk); +-static inline void vtime_account_irq_exit(struct task_struct *tsk) +-{ +- /* On hard|softirq exit we always account to hard|softirq cputime */ +- vtime_account_kernel(tsk); +-} ++extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); ++extern void vtime_account_softirq(struct task_struct *tsk); ++extern void vtime_account_hardirq(struct task_struct *tsk); + extern void vtime_flush(struct task_struct *tsk); + #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +-static inline void vtime_account_irq_enter(struct task_struct *tsk) { } +-static inline void vtime_account_irq_exit(struct task_struct *tsk) { } ++static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } ++static inline void vtime_account_softirq(struct task_struct *tsk) { } ++static inline void vtime_account_hardirq(struct task_struct *tsk) { } + static inline void vtime_flush(struct task_struct *tsk) { } + #endif + + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING +-extern void irqtime_account_irq(struct task_struct *tsk); ++extern void 
irqtime_account_irq(struct task_struct *tsk, unsigned int offset); + #else +-static inline void irqtime_account_irq(struct task_struct *tsk) { } ++static inline void irqtime_account_irq(struct task_struct *tsk, unsigned int offset) { } + #endif + +-static inline void account_irq_enter_time(struct task_struct *tsk) ++static inline void account_softirq_enter(struct task_struct *tsk) ++{ ++ vtime_account_irq(tsk, SOFTIRQ_OFFSET); ++ irqtime_account_irq(tsk, SOFTIRQ_OFFSET); ++} ++ ++static inline void account_softirq_exit(struct task_struct *tsk) ++{ ++ vtime_account_softirq(tsk); ++ irqtime_account_irq(tsk, 0); ++} ++ ++static inline void account_hardirq_enter(struct task_struct *tsk) + { +- vtime_account_irq_enter(tsk); +- irqtime_account_irq(tsk); ++ vtime_account_irq(tsk, HARDIRQ_OFFSET); ++ irqtime_account_irq(tsk, HARDIRQ_OFFSET); + } + +-static inline void account_irq_exit_time(struct task_struct *tsk) ++static inline void account_hardirq_exit(struct task_struct *tsk) + { +- vtime_account_irq_exit(tsk); +- irqtime_account_irq(tsk); ++ vtime_account_hardirq(tsk); ++ irqtime_account_irq(tsk, 0); + } + + #endif /* _LINUX_KERNEL_VTIME_H */ +diff --git a/include/linux/wait.h b/include/linux/wait.h +index 9b8b08331..33001b534 100644 +--- a/include/linux/wait.h ++++ b/include/linux/wait.h +@@ -10,6 +10,7 @@ + + #include + #include ++#include + + typedef struct wait_queue_entry wait_queue_entry_t; + +diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h +index 6ecf2a022..3145de598 100644 +--- a/include/linux/ww_mutex.h ++++ b/include/linux/ww_mutex.h +@@ -28,6 +28,14 @@ struct ww_class { + unsigned int is_wait_die; + }; + ++struct ww_mutex { ++ struct mutex base; ++ struct ww_acquire_ctx *ctx; ++#ifdef CONFIG_DEBUG_MUTEXES ++ struct ww_class *ww_class; ++#endif ++}; ++ + struct ww_acquire_ctx { + struct task_struct *task; + unsigned long stamp; +diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h +index 1424e02ce..163f8415e 100644 +--- a/include/net/gen_stats.h ++++ b/include/net/gen_stats.h +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + + /* Note: this used to be in include/uapi/linux/gen_stats.h */ + struct gnet_stats_basic_packed { +@@ -42,15 +43,15 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, + spinlock_t *lock, struct gnet_dump *d, + int padattr); + +-int gnet_stats_copy_basic(const seqcount_t *running, ++int gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b); +-void __gnet_stats_copy_basic(const seqcount_t *running, ++void __gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b); +-int gnet_stats_copy_basic_hw(const seqcount_t *running, ++int gnet_stats_copy_basic_hw(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b); +@@ -70,13 +71,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt); ++ net_seqlock_t *running, struct nlattr *opt); + void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); + int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, + struct net_rate_estimator __rcu **ptr, + spinlock_t 
*lock, +- seqcount_t *running, struct nlattr *opt); ++ net_seqlock_t *running, struct nlattr *opt); + bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); + bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, + struct gnet_stats_rate_est64 *sample); +diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h +new file mode 100644 +index 000000000..67710bace +--- /dev/null ++++ b/include/net/net_seq_lock.h +@@ -0,0 +1,15 @@ ++#ifndef __NET_NET_SEQ_LOCK_H__ ++#define __NET_NET_SEQ_LOCK_H__ ++ ++#ifdef CONFIG_PREEMPT_RT ++# define net_seqlock_t seqlock_t ++# define net_seq_begin(__r) read_seqbegin(__r) ++# define net_seq_retry(__r, __s) read_seqretry(__r, __s) ++ ++#else ++# define net_seqlock_t seqcount_t ++# define net_seq_begin(__r) read_seqcount_begin(__r) ++# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s) ++#endif ++ ++#endif +diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h +index 9144e0f09..464d14b2a 100644 +--- a/include/net/netns/xfrm.h ++++ b/include/net/netns/xfrm.h +@@ -74,7 +74,7 @@ struct netns_xfrm { + struct dst_ops xfrm6_dst_ops; + #endif + spinlock_t xfrm_state_lock; +- seqcount_t xfrm_state_hash_generation; ++ seqcount_spinlock_t xfrm_state_hash_generation; + seqcount_spinlock_t xfrm_policy_hash_generation; + + spinlock_t xfrm_policy_lock; +diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h +index 330094583..346eeb7fc 100644 +--- a/include/net/sch_generic.h ++++ b/include/net/sch_generic.h +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -102,7 +103,7 @@ struct Qdisc { + struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; + struct qdisc_skb_head q; + struct gnet_stats_basic_packed bstats; +- seqcount_t running; ++ net_seqlock_t running; + struct gnet_stats_queue qstats; + unsigned long state; + struct Qdisc *next_sched; +@@ -146,7 +147,11 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) + { + if (qdisc->flags & TCQ_F_NOLOCK) + return spin_is_locked(&qdisc->seqlock); ++#ifdef CONFIG_PREEMPT_RT ++ return spin_is_locked(&qdisc->running.lock) ? true : false; ++#else + return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; ++#endif + } + + static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) +@@ -207,17 +212,35 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) + } else if (qdisc_is_running(qdisc)) { + return false; + } ++#ifdef CONFIG_PREEMPT_RT ++ if (spin_trylock(&qdisc->running.lock)) { ++ seqcount_t *s = &qdisc->running.seqcount.seqcount; ++ /* ++ * Variant of write_seqcount_t_begin() telling lockdep that a ++ * trylock was attempted. ++ */ ++ raw_write_seqcount_t_begin(s); ++ seqcount_acquire(&s->dep_map, 0, 1, _RET_IP_); ++ return true; ++ } ++ return false; ++#else + /* Variant of write_seqcount_begin() telling lockdep a trylock + * was attempted. 
+ */ + raw_write_seqcount_begin(&qdisc->running); + seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); + return true; ++#endif + } + + static inline void qdisc_run_end(struct Qdisc *qdisc) + { ++#ifdef CONFIG_PREEMPT_RT ++ write_sequnlock(&qdisc->running); ++#else + write_seqcount_end(&qdisc->running); ++#endif + if (qdisc->flags & TCQ_F_NOLOCK) { + spin_unlock(&qdisc->seqlock); + +@@ -605,7 +628,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) + return qdisc_lock(root); + } + +-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) ++static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) + { + struct Qdisc *root = qdisc_root_sleeping(qdisc); + +diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h +index 028f49662..4231714b3 100644 +--- a/include/trace/events/sched.h ++++ b/include/trace/events/sched.h +@@ -705,6 +705,18 @@ DECLARE_TRACE(sched_update_nr_running_tp, + TP_PROTO(struct rq *rq, int change), + TP_ARGS(rq, change)); + ++DECLARE_TRACE(sched_migrate_disable_tp, ++ TP_PROTO(struct task_struct *p), ++ TP_ARGS(p)); ++ ++DECLARE_TRACE(sched_migrate_enable_tp, ++ TP_PROTO(struct task_struct *p), ++ TP_ARGS(p)); ++ ++DECLARE_TRACE(sched_migrate_pull_tp, ++ TP_PROTO(struct task_struct *p), ++ TP_ARGS(p)); ++ + DECLARE_EVENT_CLASS(psi_memstall_template, + + TP_PROTO(unsigned long function), +diff --git a/init/Kconfig b/init/Kconfig +index 27c5ed16f..848a2e5bd 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -861,7 +861,7 @@ config NUMA_BALANCING + bool "Memory placement aware NUMA scheduler" + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY +- depends on SMP && NUMA && MIGRATION ++ depends on SMP && NUMA && MIGRATION && !PREEMPT_RT + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -994,6 +994,7 @@ config CFS_BANDWIDTH + config RT_GROUP_SCHED + bool "Group scheduling for SCHED_RR/FIFO" + depends on CGROUP_SCHED ++ depends on !PREEMPT_RT + default n + help + This feature lets you explicitly allocate real CPU bandwidth +@@ -1956,6 +1957,7 @@ choice + + config SLAB + bool "SLAB" ++ depends on !PREEMPT_RT + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + The regular slab allocator that is established and known to work +@@ -1976,6 +1978,7 @@ config SLUB + config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" ++ depends on !PREEMPT_RT + help + SLOB replaces the stock allocator with a drastically simpler + allocator. 
SLOB is generally more space efficient but +@@ -2042,7 +2045,7 @@ config SHUFFLE_PAGE_ALLOCATOR + + config SLUB_CPU_PARTIAL + default y +- depends on SLUB && SMP ++ depends on SLUB && SMP && !PREEMPT_RT + bool "SLUB per cpu partial cache" + help + Per cpu partial caches accelerate objects allocation and freeing +diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks +index 3de8fd118..4198f0273 100644 +--- a/kernel/Kconfig.locks ++++ b/kernel/Kconfig.locks +@@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS + + config QUEUED_RWLOCKS + def_bool y if ARCH_USE_QUEUED_RWLOCKS +- depends on SMP ++ depends on SMP && !PREEMPT_RT + + config ARCH_HAS_MMIOWB + bool +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index 416017301..90837a6cb 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -1,5 +1,11 @@ + # SPDX-License-Identifier: GPL-2.0-only + ++config HAVE_PREEMPT_LAZY ++ bool ++ ++config PREEMPT_LAZY ++ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT ++ + choice + prompt "Preemption Model" + default PREEMPT_NONE +@@ -60,6 +66,7 @@ config PREEMPT_RT + bool "Fully Preemptible Kernel (Real-Time)" + depends on EXPERT && ARCH_SUPPORTS_RT + select PREEMPTION ++ select RT_MUTEXES + help + This option turns the kernel into a real-time kernel by replacing + various locking primitives (spinlocks, rwlocks, etc.) with +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index b7a936e5d..f80a8f91b 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -351,7 +351,7 @@ void cpuset_read_unlock(void) + percpu_up_read(&cpuset_rwsem); + } + +-static DEFINE_SPINLOCK(callback_lock); ++static DEFINE_RAW_SPINLOCK(callback_lock); + + static struct workqueue_struct *cpuset_migrate_mm_wq; + +@@ -1290,7 +1290,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, + * Newly added CPUs will be removed from effective_cpus and + * newly deleted ones will be added back to effective_cpus. 
+ */ +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + if (adding) { + cpumask_or(parent->subparts_cpus, + parent->subparts_cpus, tmp->addmask); +@@ -1312,7 +1312,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, + + if (cpuset->partition_root_state != new_prs) + cpuset->partition_root_state = new_prs; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + return cmd == partcmd_update; + } +@@ -1415,7 +1415,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) + continue; + rcu_read_unlock(); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { +@@ -1449,7 +1449,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) + if (new_prs != cp->partition_root_state) + cp->partition_root_state = new_prs; + +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + WARN_ON(!is_in_v2_mode() && + !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); +@@ -1577,7 +1577,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + return -EINVAL; + } + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + + /* +@@ -1587,7 +1587,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); + cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); + } +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + update_cpumasks_hier(cs, &tmp); + +@@ -1781,9 +1781,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) + continue; + rcu_read_unlock(); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cp->effective_mems = *new_mems; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + WARN_ON(!is_in_v2_mode() && + !nodes_equal(cp->mems_allowed, cp->effective_mems)); +@@ -1851,9 +1851,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, + if (retval < 0) + goto done; + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->mems_allowed = trialcs->mems_allowed; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + /* use trialcs->mems_allowed as a temp variable */ + update_nodemasks_hier(cs, &trialcs->mems_allowed); +@@ -1944,9 +1944,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) + || (is_spread_page(cs) != is_spread_page(trialcs))); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->flags = trialcs->flags; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) + rebuild_sched_domains_locked(); +@@ -2037,9 +2037,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) + rebuild_sched_domains_locked(); + out: + if (!err) { +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + } + + free_cpumasks(NULL, &tmpmask); +@@ -2457,7 +2457,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) + cpuset_filetype_t type = seq_cft(sf)->private; + 
int ret = 0; + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + + switch (type) { + case FILE_CPULIST: +@@ -2479,7 +2479,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) + ret = -EINVAL; + } + +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + return ret; + } + +@@ -2792,14 +2792,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) + + cpuset_inc(); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + if (is_in_v2_mode()) { + cpumask_copy(cs->effective_cpus, parent->effective_cpus); + cs->effective_mems = parent->effective_mems; + cs->use_parent_ecpus = true; + parent->child_ecpus_count++; + } +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) + goto out_unlock; +@@ -2826,12 +2826,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) + } + rcu_read_unlock(); + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + out_unlock: + percpu_up_write(&cpuset_rwsem); + put_online_cpus(); +@@ -2887,7 +2887,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) + static void cpuset_bind(struct cgroup_subsys_state *root_css) + { + percpu_down_write(&cpuset_rwsem); +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + + if (is_in_v2_mode()) { + cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); +@@ -2898,7 +2898,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) + top_cpuset.mems_allowed = top_cpuset.effective_mems; + } + +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + percpu_up_write(&cpuset_rwsem); + } + +@@ -2995,12 +2995,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, + { + bool is_empty; + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cpumask_copy(cs->cpus_allowed, new_cpus); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->mems_allowed = *new_mems; + cs->effective_mems = *new_mems; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + /* + * Don't call update_tasks_cpumask() if the cpuset becomes empty, +@@ -3037,10 +3037,10 @@ hotplug_update_tasks(struct cpuset *cs, + if (nodes_empty(*new_mems)) + *new_mems = parent_cs(cs)->effective_mems; + +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cpumask_copy(cs->effective_cpus, new_cpus); + cs->effective_mems = *new_mems; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + + if (cpus_updated) + update_tasks_cpumask(cs); +@@ -3107,10 +3107,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) + if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || + (parent->partition_root_state == PRS_ERROR))) { + if (cs->nr_subparts_cpus) { +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + compute_effective_cpumask(&new_cpus, cs, parent); + } + +@@ -3124,9 +3124,9 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) + 
cpumask_empty(&new_cpus)) { + update_parent_subparts_cpumask(cs, partcmd_disable, + NULL, tmp); +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + cs->partition_root_state = PRS_ERROR; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + } + cpuset_force_rebuild(); + } +@@ -3206,7 +3206,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) + + /* synchronize cpus_allowed to cpu_active_mask */ + if (cpus_updated) { +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + if (!on_dfl) + cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); + /* +@@ -3226,17 +3226,17 @@ static void cpuset_hotplug_workfn(struct work_struct *work) + } + } + cpumask_copy(top_cpuset.effective_cpus, &new_cpus); +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + /* we don't mess with cpumasks of tasks in top_cpuset */ + } + + /* synchronize mems_allowed to N_MEMORY */ + if (mems_updated) { +- spin_lock_irq(&callback_lock); ++ raw_spin_lock_irq(&callback_lock); + if (!on_dfl) + top_cpuset.mems_allowed = new_mems; + top_cpuset.effective_mems = new_mems; +- spin_unlock_irq(&callback_lock); ++ raw_spin_unlock_irq(&callback_lock); + update_tasks_nodemask(&top_cpuset); + } + +@@ -3337,11 +3337,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) + { + unsigned long flags; + +- spin_lock_irqsave(&callback_lock, flags); ++ raw_spin_lock_irqsave(&callback_lock, flags); + rcu_read_lock(); + guarantee_online_cpus(task_cs(tsk), pmask); + rcu_read_unlock(); +- spin_unlock_irqrestore(&callback_lock, flags); ++ raw_spin_unlock_irqrestore(&callback_lock, flags); + } + + /** +@@ -3402,11 +3402,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) + nodemask_t mask; + unsigned long flags; + +- spin_lock_irqsave(&callback_lock, flags); ++ raw_spin_lock_irqsave(&callback_lock, flags); + rcu_read_lock(); + guarantee_online_mems(task_cs(tsk), &mask); + rcu_read_unlock(); +- spin_unlock_irqrestore(&callback_lock, flags); ++ raw_spin_unlock_irqrestore(&callback_lock, flags); + + return mask; + } +@@ -3498,14 +3498,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) + return true; + + /* Not hardwall and node outside mems_allowed: scan up cpusets */ +- spin_lock_irqsave(&callback_lock, flags); ++ raw_spin_lock_irqsave(&callback_lock, flags); + + rcu_read_lock(); + cs = nearest_hardwall_ancestor(task_cs(current)); + allowed = node_isset(node, cs->mems_allowed); + rcu_read_unlock(); + +- spin_unlock_irqrestore(&callback_lock, flags); ++ raw_spin_unlock_irqrestore(&callback_lock, flags); + return allowed; + } + +diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c +index d2ae14d0b..7b3bea56d 100644 +--- a/kernel/cgroup/rstat.c ++++ b/kernel/cgroup/rstat.c +@@ -156,8 +156,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, + cpu); + struct cgroup *pos = NULL; ++ unsigned long flags; + +- raw_spin_lock(cpu_lock); ++ raw_spin_lock_irqsave(cpu_lock, flags); + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + struct cgroup_subsys_state *css; + +@@ -169,7 +170,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } +- raw_spin_unlock(cpu_lock); ++ raw_spin_unlock_irqrestore(cpu_lock, flags); + + /* if @may_sleep, play nice and yield if necessary */ + if (may_sleep && (need_resched() || +diff --git a/kernel/cpu.c 
b/kernel/cpu.c +index c06ced18f..10b6287af 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -1662,7 +1662,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { + .name = "ap:online", + }, + /* +- * Handled on controll processor until the plugged processor manages ++ * Handled on control processor until the plugged processor manages + * this itself. + */ + [CPUHP_TEARDOWN_CPU] = { +@@ -1671,6 +1671,13 @@ static struct cpuhp_step cpuhp_hp_states[] = { + .teardown.single = takedown_cpu, + .cant_stop = true, + }, ++ ++ [CPUHP_AP_SCHED_WAIT_EMPTY] = { ++ .name = "sched:waitempty", ++ .startup.single = NULL, ++ .teardown.single = sched_cpu_wait_empty, ++ }, ++ + /* Handle smpboot threads park/unpark */ + [CPUHP_AP_SMPBOOT_THREADS] = { + .name = "smpboot/threads:online", +diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c +index 930ac1b25..dbf1d126a 100644 +--- a/kernel/debug/kdb/kdb_main.c ++++ b/kernel/debug/kdb/kdb_main.c +@@ -2101,7 +2101,7 @@ static int kdb_dmesg(int argc, const char **argv) + int adjust = 0; + int n = 0; + int skip = 0; +- struct kmsg_dumper dumper = { .active = 1 }; ++ struct kmsg_dumper_iter iter = { .active = 1 }; + size_t len; + char buf[201]; + +@@ -2126,8 +2126,8 @@ static int kdb_dmesg(int argc, const char **argv) + kdb_set(2, setargs); + } + +- kmsg_dump_rewind_nolock(&dumper); +- while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) ++ kmsg_dump_rewind(&iter); ++ while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL)) + n++; + + if (lines < 0) { +@@ -2159,8 +2159,8 @@ static int kdb_dmesg(int argc, const char **argv) + if (skip >= n || skip < 0) + return 0; + +- kmsg_dump_rewind_nolock(&dumper); +- while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { ++ kmsg_dump_rewind(&iter); ++ while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) { + if (skip) { + skip--; + continue; +diff --git a/kernel/entry/common.c b/kernel/entry/common.c +index cea3957eb..790b0992e 100644 +--- a/kernel/entry/common.c ++++ b/kernel/entry/common.c +@@ -2,6 +2,7 @@ + + #include + #include ++#include + #include + #include + +@@ -148,9 +149,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + + local_irq_enable_exit_to_user(ti_work); + +- if (ti_work & _TIF_NEED_RESCHED) ++ if (ti_work & _TIF_NEED_RESCHED_MASK) + schedule(); + ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (unlikely(current->forced_info.si_signo)) { ++ struct task_struct *t = current; ++ force_sig_info(&t->forced_info); ++ t->forced_info.si_signo = 0; ++ } ++#endif ++ + if (ti_work & _TIF_UPROBE) + uprobe_notify_resume(regs); + +@@ -201,6 +210,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) + + /* Ensure that the address limit is intact and no locks are held */ + addr_limit_user_check(); ++ kmap_assert_nomap(); + lockdep_assert_irqs_disabled(); + lockdep_sys_exit(); + } +@@ -360,7 +370,7 @@ void irqentry_exit_cond_resched(void) + rcu_irq_exit_check_preempt(); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); +- if (need_resched()) ++ if (should_resched(0)) + preempt_schedule_irq(); + } + } +diff --git a/kernel/exit.c b/kernel/exit.c +index d13d67fc5..f5933bd07 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 
+ */ +- flush_sigqueue(&tsk->pending); ++ flush_task_sigqueue(tsk); + tsk->sighand = NULL; + spin_unlock(&sighand->siglock); + +diff --git a/kernel/fork.c b/kernel/fork.c +index 0fb86b65a..240e256f0 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -290,7 +291,7 @@ static inline void free_thread_stack(struct task_struct *tsk) + return; + } + +- vfree_atomic(tsk->stack); ++ vfree(tsk->stack); + return; + } + #endif +@@ -691,6 +692,19 @@ void __mmdrop(struct mm_struct *mm) + } + EXPORT_SYMBOL_GPL(__mmdrop); + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * RCU callback for delayed mm drop. Not strictly rcu, but we don't ++ * want another facility to make this work. ++ */ ++void __mmdrop_delayed(struct rcu_head *rhp) ++{ ++ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); ++ ++ __mmdrop(mm); ++} ++#endif ++ + static void mmdrop_async_fn(struct work_struct *work) + { + struct mm_struct *mm; +@@ -732,6 +746,15 @@ void __put_task_struct(struct task_struct *tsk) + WARN_ON(refcount_read(&tsk->usage)); + WARN_ON(tsk == current); + ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(tsk); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(tsk); ++ + io_uring_free(tsk); + cgroup_free(tsk); + task_numa_free(tsk, true); +@@ -929,10 +952,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) + tsk->splice_pipe = NULL; + tsk->task_frag.page = NULL; + tsk->wake_q.next = NULL; ++ tsk->wake_q_sleeper.next = NULL; + + account_kernel_stack(tsk, 1); + + kcov_task_init(tsk); ++ kmap_local_fork(tsk); + + #ifdef CONFIG_FAULT_INJECTION + tsk->fail_nth = 0; +@@ -2028,6 +2053,7 @@ static __latent_entropy struct task_struct *copy_process( + spin_lock_init(&p->alloc_lock); + + init_sigpending(&p->pending); ++ p->sigqueue_cache = NULL; + + p->utime = p->stime = p->gtime = 0; + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME +diff --git a/kernel/futex.c b/kernel/futex.c +index 98a6e1b80..b2b275bc1 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -1498,6 +1498,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ + struct task_struct *new_owner; + bool postunlock = false; + DEFINE_WAKE_Q(wake_q); ++ DEFINE_WAKE_Q(wake_sleeper_q); + int ret = 0; + + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); +@@ -1547,14 +1548,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ + * not fail. + */ + pi_state_update_owner(pi_state, new_owner); +- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); ++ postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q, ++ &wake_sleeper_q); + } + + out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + + if (postunlock) +- rt_mutex_postunlock(&wake_q); ++ rt_mutex_postunlock(&wake_q, &wake_sleeper_q); + + return ret; + } +@@ -2155,6 +2157,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, + */ + requeue_pi_wake_futex(this, &key2, hb2); + continue; ++ } else if (ret == -EAGAIN) { ++ /* ++ * Waiter was woken by timeout or ++ * signal and has set pi_blocked_on to ++ * PI_WAKEUP_INPROGRESS before we ++ * tried to enqueue it on the rtmutex. 
++ */ ++ this->pi_state = NULL; ++ put_pi_state(pi_state); ++ continue; + } else if (ret) { + /* + * rt_mutex_start_proxy_lock() detected a +@@ -2847,7 +2859,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, + goto no_block; + } + +- rt_mutex_init_waiter(&rt_waiter); ++ rt_mutex_init_waiter(&rt_waiter, false); + + /* + * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not +@@ -3172,7 +3184,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + { + struct hrtimer_sleeper timeout, *to; + struct rt_mutex_waiter rt_waiter; +- struct futex_hash_bucket *hb; ++ struct futex_hash_bucket *hb, *hb2; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; + int res, ret; +@@ -3193,7 +3205,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ +- rt_mutex_init_waiter(&rt_waiter); ++ rt_mutex_init_waiter(&rt_waiter, false); + + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); + if (unlikely(ret != 0)) +@@ -3224,20 +3236,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to); + +- spin_lock(&hb->lock); +- ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); +- spin_unlock(&hb->lock); +- if (ret) +- goto out; ++ /* ++ * On RT we must avoid races with requeue and trying to block ++ * on two mutexes (hb->lock and uaddr2's rtmutex) by ++ * serializing access to pi_blocked_on with pi_lock. ++ */ ++ raw_spin_lock_irq(¤t->pi_lock); ++ if (current->pi_blocked_on) { ++ /* ++ * We have been requeued or are in the process of ++ * being requeued. ++ */ ++ raw_spin_unlock_irq(¤t->pi_lock); ++ } else { ++ /* ++ * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS ++ * prevents a concurrent requeue from moving us to the ++ * uaddr2 rtmutex. After that we can safely acquire ++ * (and possibly block on) hb->lock. ++ */ ++ current->pi_blocked_on = PI_WAKEUP_INPROGRESS; ++ raw_spin_unlock_irq(¤t->pi_lock); ++ ++ spin_lock(&hb->lock); ++ ++ /* ++ * Clean up pi_blocked_on. We might leak it otherwise ++ * when we succeeded with the hb->lock in the fast ++ * path. ++ */ ++ raw_spin_lock_irq(¤t->pi_lock); ++ current->pi_blocked_on = NULL; ++ raw_spin_unlock_irq(¤t->pi_lock); ++ ++ ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); ++ spin_unlock(&hb->lock); ++ if (ret) ++ goto out; ++ } + + /* +- * In order for us to be here, we know our q.key == key2, and since +- * we took the hb->lock above, we also know that futex_requeue() has +- * completed and we no longer have to concern ourselves with a wakeup +- * race with the atomic proxy lock acquisition by the requeue code. The +- * futex_requeue dropped our key1 reference and incremented our key2 +- * reference count. ++ * In order to be here, we have either been requeued, are in ++ * the process of being requeued, or requeue successfully ++ * acquired uaddr2 on our behalf. If pi_blocked_on was ++ * non-null above, we may be racing with a requeue. Do not ++ * rely on q->lock_ptr to be hb2->lock until after blocking on ++ * hb->lock or hb2->lock. The futex_requeue dropped our key1 ++ * reference and incremented our key2 reference count. + */ ++ hb2 = hash_futex(&key2); + + /* Check if the requeue code acquired the second futex for us. 
*/ + if (!q.rt_waiter) { +@@ -3246,14 +3293,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { +- spin_lock(q.lock_ptr); ++ spin_lock(&hb2->lock); ++ BUG_ON(&hb2->lock != q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current); + /* + * Drop the reference to the pi state which + * the requeue_pi() code acquired for us. + */ + put_pi_state(q.pi_state); +- spin_unlock(q.lock_ptr); ++ spin_unlock(&hb2->lock); + /* + * Adjust the return value. It's either -EFAULT or + * success (1) but the caller expects 0 for success. +@@ -3272,7 +3320,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); + +- spin_lock(q.lock_ptr); ++ spin_lock(&hb2->lock); ++ BUG_ON(&hb2->lock != q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + +diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c +index 762a928e1..7929fcdb7 100644 +--- a/kernel/irq/handle.c ++++ b/kernel/irq/handle.c +@@ -192,10 +192,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) + { + irqreturn_t retval; + unsigned int flags = 0; ++ struct pt_regs *regs = get_irq_regs(); ++ u64 ip = regs ? instruction_pointer(regs) : 0; + + retval = __handle_irq_event_percpu(desc, &flags); + +- add_interrupt_randomness(desc->irq_data.irq, flags); ++#ifdef CONFIG_PREEMPT_RT ++ desc->random_ip = ip; ++#else ++ add_interrupt_randomness(desc->irq_data.irq, flags, ip); ++#endif + + if (!noirqdebug) + note_interrupt(desc, retval); +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index d3033e1f9..4f7885934 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -1260,6 +1260,8 @@ static int irq_thread(void *data) + irqreturn_t (*handler_fn)(struct irq_desc *desc, + struct irqaction *action); + ++ sched_set_fifo(current); ++ + if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; +@@ -1280,6 +1282,12 @@ static int irq_thread(void *data) + if (action_ret == IRQ_WAKE_THREAD) + irq_wake_secondary(desc, action); + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ migrate_disable(); ++ add_interrupt_randomness(action->irq, 0, ++ desc->random_ip ^ (unsigned long) action); ++ migrate_enable(); ++ } + wake_threads_waitq(desc); + } + +@@ -1425,8 +1433,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) + if (IS_ERR(t)) + return PTR_ERR(t); + +- sched_set_fifo(t); +- + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code +@@ -2823,7 +2829,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); + * This call sets the internal irqchip state of an interrupt, + * depending on the value of @which. + * +- * This function should be called with preemption disabled if the ++ * This function should be called with migration disabled if the + * interrupt controller has per-cpu registers. 
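/*
 * Illustrative sketch, not part of the applied diff: the manage.c hunk above
 * has irq_thread() raise itself to SCHED_FIFO via sched_set_fifo(current)
 * instead of the creator doing it in setup_irq_thread(). Below is a minimal
 * userspace analogue of a thread promoting itself; the priority value 50 is
 * an assumption (the kernel helper picks a mid-range FIFO priority), and
 * running it needs CAP_SYS_NICE or root. Build with: gcc demo.c -lpthread
 */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

static int promote_self_to_fifo(int prio)
{
	struct sched_param sp = { .sched_priority = prio };

	/* pthread_setschedparam() returns an errno value, 0 on success */
	return pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);
}

int main(void)
{
	int err = promote_self_to_fifo(50);

	if (err)
		fprintf(stderr, "SCHED_FIFO:50 failed: %s\n", strerror(err));
	else
		puts("now running as SCHED_FIFO:50");
	return err ? 1 : 0;
}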
+ */ + int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, +diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c +index f865e5f4d..dc7311dd7 100644 +--- a/kernel/irq/spurious.c ++++ b/kernel/irq/spurious.c +@@ -443,6 +443,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); + + static int __init irqfixup_setup(char *str) + { ++#ifdef CONFIG_PREEMPT_RT ++ pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT\n"); ++ return 1; ++#endif + irqfixup = 1; + printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); + printk(KERN_WARNING "This may impact system performance.\n"); +@@ -455,6 +459,10 @@ module_param(irqfixup, int, 0644); + + static int __init irqpoll_setup(char *str) + { ++#ifdef CONFIG_PREEMPT_RT ++ pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT\n"); ++ return 1; ++#endif + irqfixup = 2; + printk(KERN_WARNING "Misrouted IRQ fixup and polling support " + "enabled\n"); +diff --git a/kernel/irq_work.c b/kernel/irq_work.c +index fbff25adb..711bd5e87 100644 +--- a/kernel/irq_work.c ++++ b/kernel/irq_work.c +@@ -18,11 +18,37 @@ + #include + #include + #include ++#include ++#include + #include + + + static DEFINE_PER_CPU(struct llist_head, raised_list); + static DEFINE_PER_CPU(struct llist_head, lazy_list); ++static DEFINE_PER_CPU(struct task_struct *, irq_workd); ++ ++static void wake_irq_workd(void) ++{ ++ struct task_struct *tsk = __this_cpu_read(irq_workd); ++ ++ if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) ++ wake_up_process(tsk); ++} ++ ++#ifdef CONFIG_SMP ++static void irq_work_wake(struct irq_work *entry) ++{ ++ wake_irq_workd(); ++} ++ ++static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = ++ IRQ_WORK_INIT_HARD(irq_work_wake); ++#endif ++ ++static int irq_workd_should_run(unsigned int cpu) ++{ ++ return !llist_empty(this_cpu_ptr(&lazy_list)); ++} + + /* + * Claim the entry so that no one else will poke at it. +@@ -52,15 +78,30 @@ void __weak arch_irq_work_raise(void) + /* Enqueue on current CPU, work must already be claimed and preempt disabled */ + static void __irq_work_queue_local(struct irq_work *work) + { ++ struct llist_head *list; ++ bool rt_lazy_work = false; ++ bool lazy_work = false; ++ int work_flags; ++ ++ work_flags = atomic_read(&work->node.a_flags); ++ if (work_flags & IRQ_WORK_LAZY) ++ lazy_work = true; ++ else if (IS_ENABLED(CONFIG_PREEMPT_RT) && ++ !(work_flags & IRQ_WORK_HARD_IRQ)) ++ rt_lazy_work = true; ++ ++ if (lazy_work || rt_lazy_work) ++ list = this_cpu_ptr(&lazy_list); ++ else ++ list = this_cpu_ptr(&raised_list); ++ ++ if (!llist_add(&work->node.llist, list)) ++ return; ++ ++ + /* If the work is "lazy", handle it from next tick if any */ +- if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { +- if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && +- tick_nohz_tick_stopped()) +- arch_irq_work_raise(); +- } else { +- if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) +- arch_irq_work_raise(); +- } ++ if (!lazy_work || tick_nohz_tick_stopped()) ++ arch_irq_work_raise(); + } + + /* Enqueue the irq work @work on the current CPU */ +@@ -102,10 +143,28 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) + if (cpu != smp_processor_id()) { + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); ++ ++ /* ++ * On PREEMPT_RT the items which are not marked as ++ * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work ++ * item is used on the remote CPU to wake the thread. 
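/*
 * Illustrative sketch, not part of the applied diff: the reworked
 * __irq_work_queue_local() above routes lazy work, and on PREEMPT_RT any work
 * not flagged as hard-IRQ, to the per-CPU lazy list that the irq_work/%u
 * kthread (added later in this file's diff) drains in preemptible context;
 * everything else stays on the raised list and raises the arch hook. The flag
 * and list names below are stand-ins, not the kernel's.
 */
#include <stdbool.h>
#include <stdio.h>

enum fake_flags { WORK_LAZY = 1 << 0, WORK_HARD_IRQ = 1 << 1 };
enum fake_list  { RAISED_LIST, LAZY_LIST };

static enum fake_list pick_list(unsigned int flags, bool preempt_rt)
{
	if (flags & WORK_LAZY)
		return LAZY_LIST;
	if (preempt_rt && !(flags & WORK_HARD_IRQ))
		return LAZY_LIST;
	return RAISED_LIST;
}

static const char *name(enum fake_list l)
{
	return l == LAZY_LIST ? "lazy_list" : "raised_list";
}

int main(void)
{
	printf("RT,  plain work    -> %s\n", name(pick_list(0, true)));
	printf("RT,  hard-IRQ work -> %s\n", name(pick_list(WORK_HARD_IRQ, true)));
	printf("!RT, plain work    -> %s\n", name(pick_list(0, false)));
	printf("any, lazy work     -> %s\n", name(pick_list(WORK_LAZY, true)));
	return 0;
}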
++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && ++ !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { ++ ++ if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) ++ goto out; ++ ++ work = &per_cpu(irq_work_wakeup, cpu); ++ if (!irq_work_claim(work)) ++ goto out; ++ } ++ + __smp_call_single_queue(cpu, &work->node.llist); + } else { + __irq_work_queue_local(work); + } ++out: + preempt_enable(); + + return true; +@@ -120,9 +179,8 @@ bool irq_work_needs_cpu(void) + raised = this_cpu_ptr(&raised_list); + lazy = this_cpu_ptr(&lazy_list); + +- if (llist_empty(raised) || arch_irq_work_has_interrupt()) +- if (llist_empty(lazy)) +- return false; ++ if (llist_empty(raised) && llist_empty(lazy)) ++ return false; + + /* All work should have been flushed before going offline */ + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); +@@ -153,6 +211,10 @@ void irq_work_single(void *arg) + */ + flags &= ~IRQ_WORK_PENDING; + (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); ++ ++ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || ++ !arch_irq_work_has_interrupt()) ++ rcuwait_wake_up(&work->irqwait); + } + + static void irq_work_run_list(struct llist_head *list) +@@ -160,7 +222,12 @@ static void irq_work_run_list(struct llist_head *list) + struct irq_work *work, *tmp; + struct llist_node *llnode; + +- BUG_ON(!irqs_disabled()); ++ /* ++ * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed ++ * in a per-CPU thread in preemptible context. Only the items which are ++ * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. ++ */ ++ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); + + if (llist_empty(list)) + return; +@@ -177,7 +244,10 @@ static void irq_work_run_list(struct llist_head *list) + void irq_work_run(void) + { + irq_work_run_list(this_cpu_ptr(&raised_list)); +- irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ else ++ wake_irq_workd(); + } + EXPORT_SYMBOL_GPL(irq_work_run); + +@@ -187,7 +257,11 @@ void irq_work_tick(void) + + if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) + irq_work_run_list(raised); +- irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ else ++ wake_irq_workd(); + } + + /* +@@ -197,8 +271,42 @@ void irq_work_tick(void) + void irq_work_sync(struct irq_work *work) + { + lockdep_assert_irqs_enabled(); ++ might_sleep(); ++ ++ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || ++ !arch_irq_work_has_interrupt()) { ++ rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), ++ TASK_UNINTERRUPTIBLE); ++ return; ++ } + + while (irq_work_is_busy(work)) + cpu_relax(); + } + EXPORT_SYMBOL_GPL(irq_work_sync); ++ ++static void run_irq_workd(unsigned int cpu) ++{ ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++} ++ ++static void irq_workd_setup(unsigned int cpu) ++{ ++ sched_set_fifo_low(current); ++} ++ ++static struct smp_hotplug_thread irqwork_threads = { ++ .store = &irq_workd, ++ .setup = irq_workd_setup, ++ .thread_should_run = irq_workd_should_run, ++ .thread_fn = run_irq_workd, ++ .thread_comm = "irq_work/%u", ++}; ++ ++static __init int irq_work_init_threads(void) ++{ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); ++ return 0; ++} ++early_initcall(irq_work_init_threads); +\ No newline at end of file +diff --git a/kernel/kexec_core.c 
b/kernel/kexec_core.c +index b9a6f4658..c26219f34 100644 +--- a/kernel/kexec_core.c ++++ b/kernel/kexec_core.c +@@ -984,7 +984,6 @@ void crash_kexec(struct pt_regs *regs) + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + if (old_cpu == PANIC_CPU_INVALID) { + /* This is the 1st CPU which comes here, so go ahead. */ +- printk_safe_flush_on_panic(); + __crash_kexec(regs); + + /* +diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c +index 35859da8b..dfff31ed6 100644 +--- a/kernel/ksysfs.c ++++ b/kernel/ksysfs.c +@@ -138,6 +138,15 @@ KERNEL_ATTR_RO(vmcoreinfo); + + #endif /* CONFIG_CRASH_CORE */ + ++#if defined(CONFIG_PREEMPT_RT) ++static ssize_t realtime_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%d\n", 1); ++} ++KERNEL_ATTR_RO(realtime); ++#endif ++ + /* whether file capabilities are enabled */ + static ssize_t fscaps_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +@@ -228,6 +237,9 @@ static struct attribute * kernel_attrs[] = { + #ifndef CONFIG_TINY_RCU + &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, ++#endif ++#ifdef CONFIG_PREEMPT_RT ++ &realtime_attr.attr, + #endif + NULL + }; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 508fe5278..3ce6a31db 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -264,6 +264,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); + + static int kthread(void *_create) + { ++ static const struct sched_param param = { .sched_priority = 0 }; + /* Copy data: it's on kthread's stack */ + struct kthread_create_info *create = _create; + int (*threadfn)(void *data) = create->threadfn; +@@ -294,6 +295,13 @@ static int kthread(void *_create) + init_completion(&self->parked); + current->vfork_done = &self->exited; + ++ /* ++ * The new thread inherited kthreadd's priority and CPU mask. Reset ++ * back to default in case they have been changed. ++ */ ++ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); ++ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); ++ + /* OK, tell user we're spawned, wait for stop or wakeup */ + __set_current_state(TASK_UNINTERRUPTIBLE); + create->result = current; +@@ -391,7 +399,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + } + task = create->result; + if (!IS_ERR(task)) { +- static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; + + /* +@@ -400,13 +407,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); +- /* +- * root may have changed our (kthreadd's) priority or CPU mask. +- * The kernel thread should not inherit these properties. +- */ +- sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); +- set_cpus_allowed_ptr(task, +- housekeeping_cpumask(HK_FLAG_KTHREAD)); + } + kfree(create); + return task; +diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile +index 6d11cfb9b..c7fbf737e 100644 +--- a/kernel/locking/Makefile ++++ b/kernel/locking/Makefile +@@ -3,7 +3,7 @@ + # and is generally not a function of system call inputs. + KCOV_INSTRUMENT := n + +-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ++obj-y += semaphore.o rwsem.o percpu-rwsem.o + + # Avoid recursion lockdep -> KCSAN -> ... -> lockdep. 
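/*
 * Illustrative sketch, not part of the applied diff: the ksysfs.c hunk above
 * adds a /sys/kernel/realtime attribute (it always reads "1") that exists
 * only when CONFIG_PREEMPT_RT is enabled, so a script or test can identify an
 * RT kernel such as this raspberrypi-kernel-rt build by probing for the file.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/realtime", "r");
	int val = 0;

	if (!f) {
		puts("no /sys/kernel/realtime: not a PREEMPT_RT kernel");
		return 1;
	}
	if (fscanf(f, "%d", &val) != 1)
		val = 0;
	fclose(f);
	printf("PREEMPT_RT kernel detected, realtime=%d\n", val);
	return 0;
}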
+ KCSAN_SANITIZE_lockdep.o := n +@@ -15,19 +15,23 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) + CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) + endif + +-obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o + obj-$(CONFIG_LOCKDEP) += lockdep.o + ifeq ($(CONFIG_PROC_FS),y) + obj-$(CONFIG_LOCKDEP) += lockdep_proc.o + endif + obj-$(CONFIG_SMP) += spinlock.o +-obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o + obj-$(CONFIG_PROVE_LOCKING) += spinlock.o + obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o + obj-$(CONFIG_RT_MUTEXES) += rtmutex.o + obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o + obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o + obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o ++ifneq ($(CONFIG_PREEMPT_RT),y) ++obj-y += mutex.o ++obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o ++obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o ++endif ++obj-$(CONFIG_PREEMPT_RT) += mutex-rt.o rwsem-rt.o rwlock-rt.o + obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o + obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o + obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c +index af4b35450..787099636 100644 +--- a/kernel/locking/lockdep.c ++++ b/kernel/locking/lockdep.c +@@ -5410,6 +5410,7 @@ static noinstr void check_flags(unsigned long flags) + } + } + ++#ifndef CONFIG_PREEMPT_RT + /* + * We dont accurately track softirq state in e.g. + * hardirq contexts (such as on 4KSTACKS), so only +@@ -5424,6 +5425,7 @@ static noinstr void check_flags(unsigned long flags) + DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } + } ++#endif + + if (!debug_locks) + print_irqtrace_events(current); +diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c +new file mode 100644 +index 000000000..2b849e6b9 +--- /dev/null ++++ b/kernel/locking/mutex-rt.c +@@ -0,0 +1,224 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * Real-Time Preemption Support ++ * ++ * started by Ingo Molnar: ++ * ++ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar ++ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner ++ * ++ * historic credit for proving that Linux spinlocks can be implemented via ++ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow ++ * and others) who prototyped it on 2.4 and did lots of comparative ++ * research and analysis; TimeSys, for proving that you can implement a ++ * fully preemptible kernel via the use of IRQ threading and mutexes; ++ * Bill Huey for persuasively arguing on lkml that the mutex model is the ++ * right one; and to MontaVista, who ported pmutexes to 2.6. ++ * ++ * This code is a from-scratch implementation and is not based on pmutexes, ++ * but the idea of converting spinlocks to mutexes is used here too. ++ * ++ * lock debugging, locking tree, deadlock detection: ++ * ++ * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey ++ * Released under the General Public License (GPL). ++ * ++ * Includes portions of the generic R/W semaphore implementation from: ++ * ++ * Copyright (c) 2001 David Howells (dhowells@redhat.com). ++ * - Derived partially from idea by Andrea Arcangeli ++ * - Derived also from comments by Linus ++ * ++ * Pending ownership of locks and ownership stealing: ++ * ++ * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt ++ * ++ * (also by Steven Rostedt) ++ * - Converted single pi_lock to individual task locks. ++ * ++ * By Esben Nielsen: ++ * Doing priority inheritance with help of the scheduler. 
++ * ++ * Copyright (C) 2006, Timesys Corp., Thomas Gleixner ++ * - major rework based on Esben Nielsens initial patch ++ * - replaced thread_info references by task_struct refs ++ * - removed task->pending_owner dependency ++ * - BKL drop/reacquire for semaphore style locks to avoid deadlocks ++ * in the scheduler return path as discussed with Steven Rostedt ++ * ++ * Copyright (C) 2006, Kihon Technologies Inc. ++ * Steven Rostedt ++ * - debugged and patched Thomas Gleixner's rework. ++ * - added back the cmpxchg to the rework. ++ * - turned atomic require back on for SMP. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "rtmutex_common.h" ++ ++/* ++ * struct mutex functions ++ */ ++void __mutex_do_init(struct mutex *mutex, const char *name, ++ struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held lock: ++ */ ++ debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); ++ lockdep_init_map(&mutex->dep_map, name, key, 0); ++#endif ++ mutex->lock.save_state = 0; ++} ++EXPORT_SYMBOL(__mutex_do_init); ++ ++static int _mutex_lock_blk_flush(struct mutex *lock, int state) ++{ ++ /* ++ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too ++ * late if one of the callbacks needs to acquire a sleeping lock. ++ */ ++ if (blk_needs_flush_plug(current)) ++ blk_schedule_flush_plug(current); ++ return __rt_mutex_lock_state(&lock->lock, state); ++} ++ ++void __lockfunc _mutex_lock(struct mutex *lock) ++{ ++ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(_mutex_lock); ++ ++void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ ++ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); ++ __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); ++ ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL_GPL(_mutex_lock_io_nested); ++ ++int __lockfunc _mutex_lock_interruptible(struct mutex *lock) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_interruptible); ++ ++int __lockfunc _mutex_lock_killable(struct mutex *lock) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_killable); ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) ++{ ++ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); ++ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(_mutex_lock_nested); ++ ++void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) ++{ ++ mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_); ++ _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); ++} ++EXPORT_SYMBOL(_mutex_lock_nest_lock); ++ ++int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) ++{ ++ int ret; ++ ++ mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); ++ ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} 
++EXPORT_SYMBOL(_mutex_lock_interruptible_nested); ++ ++int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) ++{ ++ int ret; ++ ++ mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_lock_killable_nested); ++#endif ++ ++int __lockfunc _mutex_trylock(struct mutex *lock) ++{ ++ int ret = __rt_mutex_trylock(&lock->lock); ++ ++ if (ret) ++ mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ ++ return ret; ++} ++EXPORT_SYMBOL(_mutex_trylock); ++ ++void __lockfunc _mutex_unlock(struct mutex *lock) ++{ ++ mutex_release(&lock->dep_map, _RET_IP_); ++ __rt_mutex_unlock(&lock->lock); ++} ++EXPORT_SYMBOL(_mutex_unlock); ++ ++/** ++ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 ++ * @cnt: the atomic which we are to dec ++ * @lock: the mutex to return holding if we dec to 0 ++ * ++ * return true and hold lock if we dec to 0, return false otherwise ++ */ ++int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) ++{ ++ /* dec if we can't possibly hit 0 */ ++ if (atomic_add_unless(cnt, -1, 1)) ++ return 0; ++ /* we might hit 0, so take the lock */ ++ mutex_lock(lock); ++ if (!atomic_dec_and_test(cnt)) { ++ /* when we actually did the dec, we didn't hit 0 */ ++ mutex_unlock(lock); ++ return 0; ++ } ++ /* we hit 0, and we hold the lock */ ++ return 1; ++} ++EXPORT_SYMBOL(atomic_dec_and_mutex_lock); +diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c +index 36e69100e..fb1501003 100644 +--- a/kernel/locking/rtmutex-debug.c ++++ b/kernel/locking/rtmutex-debug.c +@@ -32,110 +32,12 @@ + + #include "rtmutex_common.h" + +-static void printk_task(struct task_struct *p) +-{ +- if (p) +- printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); +- else +- printk(""); +-} +- +-static void printk_lock(struct rt_mutex *lock, int print_owner) +-{ +- if (lock->name) +- printk(" [%p] {%s}\n", +- lock, lock->name); +- else +- printk(" [%p] {%s:%d}\n", +- lock, lock->file, lock->line); +- +- if (print_owner && rt_mutex_owner(lock)) { +- printk(".. ->owner: %p\n", lock->owner); +- printk(".. held by: "); +- printk_task(rt_mutex_owner(lock)); +- printk("\n"); +- } +-} +- + void rt_mutex_debug_task_free(struct task_struct *task) + { + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); + } + +-/* +- * We fill out the fields in the waiter to store the information about +- * the deadlock. We print when we return. act_waiter can be NULL in +- * case of a remove waiter operation. 
+- */ +-void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, +- struct rt_mutex_waiter *act_waiter, +- struct rt_mutex *lock) +-{ +- struct task_struct *task; +- +- if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) +- return; +- +- task = rt_mutex_owner(act_waiter->lock); +- if (task && task != current) { +- act_waiter->deadlock_task_pid = get_pid(task_pid(task)); +- act_waiter->deadlock_lock = lock; +- } +-} +- +-void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) +-{ +- struct task_struct *task; +- +- if (!waiter->deadlock_lock || !debug_locks) +- return; +- +- rcu_read_lock(); +- task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); +- if (!task) { +- rcu_read_unlock(); +- return; +- } +- +- if (!debug_locks_off()) { +- rcu_read_unlock(); +- return; +- } +- +- pr_warn("\n"); +- pr_warn("============================================\n"); +- pr_warn("WARNING: circular locking deadlock detected!\n"); +- pr_warn("%s\n", print_tainted()); +- pr_warn("--------------------------------------------\n"); +- printk("%s/%d is deadlocking current task %s/%d\n\n", +- task->comm, task_pid_nr(task), +- current->comm, task_pid_nr(current)); +- +- printk("\n1) %s/%d is trying to acquire this lock:\n", +- current->comm, task_pid_nr(current)); +- printk_lock(waiter->lock, 1); +- +- printk("\n2) %s/%d is blocked on this lock:\n", +- task->comm, task_pid_nr(task)); +- printk_lock(waiter->deadlock_lock, 1); +- +- debug_show_held_locks(current); +- debug_show_held_locks(task); +- +- printk("\n%s/%d's [blocked] stackdump:\n\n", +- task->comm, task_pid_nr(task)); +- show_stack(task, NULL, KERN_DEFAULT); +- printk("\n%s/%d's [current] stackdump:\n\n", +- current->comm, task_pid_nr(current)); +- dump_stack(); +- debug_show_all_locks(); +- rcu_read_unlock(); +- +- printk("[ turning off deadlock detection." +- "Please report this trace. 
]\n\n"); +-} +- + void debug_rt_mutex_lock(struct rt_mutex *lock) + { + } +@@ -158,12 +60,10 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) + void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) + { + memset(waiter, 0x11, sizeof(*waiter)); +- waiter->deadlock_task_pid = NULL; + } + + void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) + { +- put_pid(waiter->deadlock_task_pid); + memset(waiter, 0x22, sizeof(*waiter)); + } + +@@ -173,10 +73,8 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_cl + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); +- lock->name = name; + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_init_map(&lock->dep_map, name, key, 0); + #endif + } +- +diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h +index fc549713b..659e93e25 100644 +--- a/kernel/locking/rtmutex-debug.h ++++ b/kernel/locking/rtmutex-debug.h +@@ -18,20 +18,9 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock); + extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner); + extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); +-extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, +- struct rt_mutex_waiter *waiter, +- struct rt_mutex *lock); +-extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); +-# define debug_rt_mutex_reset_waiter(w) \ +- do { (w)->deadlock_lock = NULL; } while (0) + + static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + enum rtmutex_chainwalk walk) + { + return (waiter != NULL); + } +- +-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +-{ +- debug_rt_mutex_print_deadlock(w); +-} +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index f00dd928f..40539bc43 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -8,6 +8,11 @@ + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen ++ * Adaptive Spinlocks: ++ * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, ++ * and Peter Morreale, ++ * Adaptive Spinlocks simplification: ++ * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt + * + * See Documentation/locking/rt-mutex-design.rst for details. + */ +@@ -19,6 +24,7 @@ + #include + #include + #include ++#include + + #include "rtmutex_common.h" + +@@ -136,6 +142,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); + } + ++static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) ++{ ++ return waiter && waiter != PI_WAKEUP_INPROGRESS && ++ waiter != PI_REQUEUE_INPROGRESS; ++} ++ + /* + * We can speed up the acquire/release, if there's no debugging state to be + * set up. 
+@@ -227,7 +239,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -267,6 +279,27 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + return 1; + } + ++#define STEAL_NORMAL 0 ++#define STEAL_LATERAL 1 ++ ++static inline int ++rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode) ++{ ++ struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock); ++ ++ if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter)) ++ return 1; ++ ++ /* ++ * Note that RT tasks are excluded from lateral-steals ++ * to prevent the introduction of an unbounded latency. ++ */ ++ if (mode == STEAL_NORMAL || rt_task(waiter->task)) ++ return 0; ++ ++ return rt_mutex_waiter_equal(waiter, top_waiter); ++} ++ + static void + rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) + { +@@ -371,6 +404,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, + return debug_rt_mutex_detect_deadlock(waiter, chwalk); + } + ++static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter) ++{ ++ if (waiter->savestate) ++ wake_up_lock_sleeper(waiter->task); ++ else ++ wake_up_process(waiter->task); ++} ++ + /* + * Max number of times we'll walk the boosting chain: + */ +@@ -378,7 +419,8 @@ int max_lock_depth = 1024; + + static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) + { +- return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; ++ return rt_mutex_real_waiter(p->pi_blocked_on) ? ++ p->pi_blocked_on->lock : NULL; + } + + /* +@@ -514,7 +556,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * reached or the state of the chain has changed while we + * dropped the locks. + */ +- if (!waiter) ++ if (!rt_mutex_real_waiter(waiter)) + goto out_unlock_pi; + + /* +@@ -597,7 +639,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * walk, we detected a deadlock. + */ + if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { +- debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); + raw_spin_unlock(&lock->wait_lock); + ret = -EDEADLK; + goto out_unlock_pi; +@@ -694,13 +735,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * follow here. This is the end of the chain we are walking. + */ + if (!rt_mutex_owner(lock)) { ++ struct rt_mutex_waiter *lock_top_waiter; ++ + /* + * If the requeue [7] above changed the top waiter, + * then we need to wake the new top waiter up to try + * to get the lock. 
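/*
 * Illustrative sketch, not part of the applied diff: the rtmutex hunks above
 * (waiter comparison, lateral stealing, the chain walk) are the in-kernel
 * side of priority inheritance. Userspace reaches the same machinery through
 * the futex PI operations, most portably with a PTHREAD_PRIO_INHERIT mutex as
 * in this self-contained example (build with -lpthread).
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t lock;

	pthread_mutexattr_init(&attr);
	/* boost a low-priority owner while a higher-priority thread waits */
	if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT)) {
		puts("PI mutexes not supported here");
		return 1;
	}
	pthread_mutex_init(&lock, &attr);

	pthread_mutex_lock(&lock);
	puts("holding a priority-inheriting mutex");
	pthread_mutex_unlock(&lock);

	pthread_mutex_destroy(&lock);
	pthread_mutexattr_destroy(&attr);
	return 0;
}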
+ */ +- if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) +- wake_up_process(rt_mutex_top_waiter(lock)->task); ++ lock_top_waiter = rt_mutex_top_waiter(lock); ++ if (prerequeue_top_waiter != lock_top_waiter) ++ rt_mutex_wake_waiter(lock_top_waiter); + raw_spin_unlock_irq(&lock->wait_lock); + return 0; + } +@@ -801,9 +845,11 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * @task: The task which wants to acquire the lock + * @waiter: The waiter that is queued to the lock's wait tree if the + * callsite called task_blocked_on_lock(), otherwise NULL ++ * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL) + */ +-static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, +- struct rt_mutex_waiter *waiter) ++static int __try_to_take_rt_mutex(struct rt_mutex *lock, ++ struct task_struct *task, ++ struct rt_mutex_waiter *waiter, int mode) + { + lockdep_assert_held(&lock->wait_lock); + +@@ -839,12 +885,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + */ + if (waiter) { + /* +- * If waiter is not the highest priority waiter of +- * @lock, give up. ++ * If waiter is not the highest priority waiter of @lock, ++ * or its peer when lateral steal is allowed, give up. + */ +- if (waiter != rt_mutex_top_waiter(lock)) ++ if (!rt_mutex_steal(lock, waiter, mode)) + return 0; +- + /* + * We can acquire the lock. Remove the waiter from the + * lock waiters tree. +@@ -862,14 +907,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + */ + if (rt_mutex_has_waiters(lock)) { + /* +- * If @task->prio is greater than or equal to +- * the top waiter priority (kernel view), +- * @task lost. ++ * If @task->prio is greater than the top waiter ++ * priority (kernel view), or equal to it when a ++ * lateral steal is forbidden, @task lost. + */ +- if (!rt_mutex_waiter_less(task_to_waiter(task), +- rt_mutex_top_waiter(lock))) ++ if (!rt_mutex_steal(lock, task_to_waiter(task), mode)) + return 0; +- + /* + * The current top waiter stays enqueued. We + * don't have to change anything in the lock +@@ -916,6 +959,329 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + return 1; + } + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * preemptible spin_lock functions: ++ */ ++static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, ++ void (*slowfn)(struct rt_mutex *lock)) ++{ ++ might_sleep_no_state_check(); ++ ++ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) ++ return; ++ else ++ slowfn(lock); ++} ++ ++static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, ++ void (*slowfn)(struct rt_mutex *lock)) ++{ ++ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) ++ return; ++ else ++ slowfn(lock); ++} ++#ifdef CONFIG_SMP ++/* ++ * Note that owner is a speculative pointer and dereferencing relies ++ * on rcu_read_lock() and the check against the lock owner. ++ */ ++static int adaptive_wait(struct rt_mutex *lock, ++ struct task_struct *owner) ++{ ++ int res = 0; ++ ++ rcu_read_lock(); ++ for (;;) { ++ if (owner != rt_mutex_owner(lock)) ++ break; ++ /* ++ * Ensure that owner->on_cpu is dereferenced _after_ ++ * checking the above to be valid. 
++ */ ++ barrier(); ++ if (!owner->on_cpu) { ++ res = 1; ++ break; ++ } ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ return res; ++} ++#else ++static int adaptive_wait(struct rt_mutex *lock, ++ struct task_struct *orig_owner) ++{ ++ return 1; ++} ++#endif ++ ++static int task_blocks_on_rt_mutex(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ struct task_struct *task, ++ enum rtmutex_chainwalk chwalk); ++/* ++ * Slow path lock function spin_lock style: this variant is very ++ * careful not to miss any non-lock wakeups. ++ * ++ * We store the current state under p->pi_lock in p->saved_state and ++ * the try_to_wake_up() code handles this accordingly. ++ */ ++void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ unsigned long flags) ++{ ++ struct task_struct *lock_owner, *self = current; ++ struct rt_mutex_waiter *top_waiter; ++ int ret; ++ ++ if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) ++ return; ++ ++ BUG_ON(rt_mutex_owner(lock) == self); ++ ++ /* ++ * We save whatever state the task is in and we'll restore it ++ * after acquiring the lock taking real wakeups into account ++ * as well. We are serialized via pi_lock against wakeups. See ++ * try_to_wake_up(). ++ */ ++ raw_spin_lock(&self->pi_lock); ++ self->saved_state = self->state; ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock(&self->pi_lock); ++ ++ ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK); ++ BUG_ON(ret); ++ ++ for (;;) { ++ /* Try to acquire the lock again. */ ++ if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL)) ++ break; ++ ++ top_waiter = rt_mutex_top_waiter(lock); ++ lock_owner = rt_mutex_owner(lock); ++ ++ raw_spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ if (top_waiter != waiter || adaptive_wait(lock, lock_owner)) ++ preempt_schedule_lock(); ++ ++ raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ ++ raw_spin_lock(&self->pi_lock); ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock(&self->pi_lock); ++ } ++ ++ /* ++ * Restore the task state to current->saved_state. We set it ++ * to the original state above and the try_to_wake_up() code ++ * has possibly updated it when a real (non-rtmutex) wakeup ++ * happened while we were blocked. Clear saved_state so ++ * try_to_wakeup() does not get confused. ++ */ ++ raw_spin_lock(&self->pi_lock); ++ __set_current_state_no_track(self->saved_state); ++ self->saved_state = TASK_RUNNING; ++ raw_spin_unlock(&self->pi_lock); ++ ++ /* ++ * try_to_take_rt_mutex() sets the waiter bit ++ * unconditionally. 
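/*
 * Illustrative sketch, not part of the applied diff: adaptive_wait() above
 * keeps a waiter spinning only while the lock owner is still on a CPU and
 * otherwise tells it to block. A rough single-threaded userspace model of
 * that policy, with an atomic flag standing in for owner->on_cpu and
 * sched_yield() for cpu_relax():
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <sched.h>

static atomic_int lock_owner = 1;        /* stand-in for rt_mutex_owner() */
static atomic_bool owner_on_cpu = false; /* stand-in for owner->on_cpu */

/* return true if the waiter should block, false if it may retry the lock */
static bool adaptive_wait_model(int expected_owner)
{
	for (;;) {
		if (atomic_load(&lock_owner) != expected_owner)
			return false;  /* ownership changed: retry the lock */
		if (!atomic_load(&owner_on_cpu))
			return true;   /* owner sleeps: go to sleep too */
		sched_yield();         /* keep spinning politely */
	}
}

int main(void)
{
	/* owner 1 is not running, so the model says "block" immediately */
	printf("should block: %s\n", adaptive_wait_model(1) ? "yes" : "no");
	return 0;
}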
We might have to fix that up: ++ */ ++ fixup_rt_mutex_waiters(lock); ++ ++ BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock)); ++ BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry)); ++} ++ ++static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) ++{ ++ struct rt_mutex_waiter waiter; ++ unsigned long flags; ++ ++ rt_mutex_init_waiter(&waiter, true); ++ ++ raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ rt_spin_lock_slowlock_locked(lock, &waiter, flags); ++ raw_spin_unlock_irqrestore(&lock->wait_lock, flags); ++ debug_rt_mutex_free_waiter(&waiter); ++} ++ ++static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock, ++ struct wake_q_head *wake_q, ++ struct wake_q_head *wq_sleeper); ++/* ++ * Slow path to release a rt_mutex spin_lock style ++ */ ++void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) ++{ ++ unsigned long flags; ++ DEFINE_WAKE_Q(wake_q); ++ DEFINE_WAKE_Q(wake_sleeper_q); ++ bool postunlock; ++ ++ raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q); ++ raw_spin_unlock_irqrestore(&lock->wait_lock, flags); ++ ++ if (postunlock) ++ rt_mutex_postunlock(&wake_q, &wake_sleeper_q); ++} ++ ++void __lockfunc rt_spin_lock(spinlock_t *lock) ++{ ++ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); ++ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ rcu_read_lock(); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_spin_lock); ++ ++void __lockfunc __rt_spin_lock(struct rt_mutex *lock) ++{ ++ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); ++} ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) ++{ ++ spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ rcu_read_lock(); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_spin_lock_nested); ++ ++void __lockfunc rt_spin_lock_nest_lock(spinlock_t *lock, ++ struct lockdep_map *nest_lock) ++{ ++ spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); ++ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); ++ rcu_read_lock(); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_spin_lock_nest_lock); ++#endif ++ ++void __lockfunc rt_spin_unlock(spinlock_t *lock) ++{ ++ /* NOTE: we always pass in '1' for nested, for simplicity */ ++ spin_release(&lock->dep_map, _RET_IP_); ++ migrate_enable(); ++ rcu_read_unlock(); ++ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); ++} ++EXPORT_SYMBOL(rt_spin_unlock); ++ ++void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) ++{ ++ rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); ++} ++EXPORT_SYMBOL(__rt_spin_unlock); ++ ++/* ++ * Wait for the lock to get unlocked: instead of polling for an unlock ++ * (like raw spinlocks do), we lock and unlock, to force the kernel to ++ * schedule if there's contention: ++ */ ++void __lockfunc rt_spin_lock_unlock(spinlock_t *lock) ++{ ++ spin_lock(lock); ++ spin_unlock(lock); ++} ++EXPORT_SYMBOL(rt_spin_lock_unlock); ++ ++int __lockfunc rt_spin_trylock(spinlock_t *lock) ++{ ++ int ret; ++ ++ ret = __rt_mutex_trylock(&lock->lock); ++ if (ret) { ++ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ rcu_read_lock(); ++ migrate_disable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(rt_spin_trylock); ++ ++int __lockfunc rt_spin_trylock_bh(spinlock_t *lock) ++{ ++ int ret; ++ ++ local_bh_disable(); ++ ret = __rt_mutex_trylock(&lock->lock); ++ if (ret) { ++ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); ++ rcu_read_lock(); ++ 
migrate_disable(); ++ } else { ++ local_bh_enable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(rt_spin_trylock_bh); ++ ++void ++__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held lock: ++ */ ++ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++} ++EXPORT_SYMBOL(__rt_spin_lock_init); ++ ++#endif /* PREEMPT_RT */ ++ ++#ifdef CONFIG_PREEMPT_RT ++ static inline int __sched ++__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); ++ struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); ++ ++ if (!hold_ctx) ++ return 0; ++ ++ if (unlikely(ctx == hold_ctx)) ++ return -EALREADY; ++ ++ if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && ++ (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { ++#ifdef CONFIG_DEBUG_MUTEXES ++ DEBUG_LOCKS_WARN_ON(ctx->contending_lock); ++ ctx->contending_lock = ww; ++#endif ++ return -EDEADLK; ++ } ++ ++ return 0; ++} ++#else ++ static inline int __sched ++__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++ BUG(); ++ return 0; ++} ++ ++#endif ++ ++static inline int ++try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, ++ struct rt_mutex_waiter *waiter) ++{ ++ return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL); ++} ++ + /* + * Task blocks on lock. + * +@@ -948,6 +1314,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + return -EDEADLK; + + raw_spin_lock(&task->pi_lock); ++ /* ++ * In the case of futex requeue PI, this will be a proxy ++ * lock. The task will wake unaware that it is enqueueed on ++ * this lock. Avoid blocking on two locks and corrupting ++ * pi_blocked_on via the PI_WAKEUP_INPROGRESS ++ * flag. futex_wait_requeue_pi() sets this when it wakes up ++ * before requeue (due to a signal or timeout). Do not enqueue ++ * the task if PI_WAKEUP_INPROGRESS is set. ++ */ ++ if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) { ++ raw_spin_unlock(&task->pi_lock); ++ return -EAGAIN; ++ } ++ ++ BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)); ++ + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +@@ -971,7 +1353,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + rt_mutex_enqueue_pi(owner, waiter); + + rt_mutex_adjust_prio(owner); +- if (owner->pi_blocked_on) ++ if (rt_mutex_real_waiter(owner->pi_blocked_on)) + chain_walk = 1; + } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { + chain_walk = 1; +@@ -1013,6 +1395,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + * Called with lock->wait_lock held and interrupts disabled. 
+ */ + static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, ++ struct wake_q_head *wake_sleeper_q, + struct rt_mutex *lock) + { + struct rt_mutex_waiter *waiter; +@@ -1052,7 +1435,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, + * Pairs with preempt_enable() in rt_mutex_postunlock(); + */ + preempt_disable(); +- wake_q_add(wake_q, waiter->task); ++ if (waiter->savestate) ++ wake_q_add_sleeper(wake_sleeper_q, waiter->task); ++ else ++ wake_q_add(wake_q, waiter->task); + raw_spin_unlock(¤t->pi_lock); + } + +@@ -1067,7 +1453,7 @@ static void remove_waiter(struct rt_mutex *lock, + { + bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); + struct task_struct *owner = rt_mutex_owner(lock); +- struct rt_mutex *next_lock; ++ struct rt_mutex *next_lock = NULL; + + lockdep_assert_held(&lock->wait_lock); + +@@ -1093,7 +1479,8 @@ static void remove_waiter(struct rt_mutex *lock, + rt_mutex_adjust_prio(owner); + + /* Store the lock on which owner is blocked or NULL */ +- next_lock = task_blocked_on_lock(owner); ++ if (rt_mutex_real_waiter(owner->pi_blocked_on)) ++ next_lock = task_blocked_on_lock(owner); + + raw_spin_unlock(&owner->pi_lock); + +@@ -1129,26 +1516,28 @@ void rt_mutex_adjust_pi(struct task_struct *task) + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; +- if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { ++ if (!rt_mutex_real_waiter(waiter) || ++ rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + next_lock = waiter->lock; +- raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, + next_lock, NULL, task); + } + +-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) ++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) + { + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); + waiter->task = NULL; ++ waiter->savestate = savestate; + } + + /** +@@ -1164,7 +1553,8 @@ void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) + static int __sched + __rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, +- struct rt_mutex_waiter *waiter) ++ struct rt_mutex_waiter *waiter, ++ struct ww_acquire_ctx *ww_ctx) + { + int ret = 0; + +@@ -1173,24 +1563,23 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, + if (try_to_take_rt_mutex(lock, current, waiter)) + break; + +- /* +- * TASK_INTERRUPTIBLE checks for signals and +- * timeout. Ignored otherwise. +- */ +- if (likely(state == TASK_INTERRUPTIBLE)) { +- /* Signal pending? 
*/ +- if (signal_pending(current)) +- ret = -EINTR; +- if (timeout && !timeout->task) +- ret = -ETIMEDOUT; ++ if (timeout && !timeout->task) { ++ ret = -ETIMEDOUT; ++ break; ++ } ++ if (signal_pending_state(state, current)) { ++ ret = -EINTR; ++ break; ++ } ++ ++ if (ww_ctx && ww_ctx->acquired > 0) { ++ ret = __mutex_lock_check_stamp(lock, ww_ctx); + if (ret) + break; + } + + raw_spin_unlock_irq(&lock->wait_lock); + +- debug_rt_mutex_print_deadlock(waiter); +- + schedule(); + + raw_spin_lock_irq(&lock->wait_lock); +@@ -1211,43 +1600,110 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, + if (res != -EDEADLOCK || detect_deadlock) + return; + +- /* +- * Yell lowdly and stop the task right here. +- */ +- rt_mutex_print_deadlock(w); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + } + +-/* +- * Slow path lock function: +- */ +-static int __sched +-rt_mutex_slowlock(struct rt_mutex *lock, int state, +- struct hrtimer_sleeper *timeout, +- enum rtmutex_chainwalk chwalk) ++static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, ++ struct ww_acquire_ctx *ww_ctx) + { +- struct rt_mutex_waiter waiter; +- unsigned long flags; +- int ret = 0; ++#ifdef CONFIG_DEBUG_MUTEXES ++ /* ++ * If this WARN_ON triggers, you used ww_mutex_lock to acquire, ++ * but released with a normal mutex_unlock in this call. ++ * ++ * This should never happen, always use ww_mutex_unlock. ++ */ ++ DEBUG_LOCKS_WARN_ON(ww->ctx); ++ ++ /* ++ * Not quite done after calling ww_acquire_done() ? ++ */ ++ DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); + +- rt_mutex_init_waiter(&waiter); ++ if (ww_ctx->contending_lock) { ++ /* ++ * After -EDEADLK you tried to ++ * acquire a different ww_mutex? Bad! ++ */ ++ DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); ++ ++ /* ++ * You called ww_mutex_lock after receiving -EDEADLK, ++ * but 'forgot' to unlock everything else first? ++ */ ++ DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); ++ ww_ctx->contending_lock = NULL; ++ } + + /* +- * Technically we could use raw_spin_[un]lock_irq() here, but this can +- * be called in early boot if the cmpxchg() fast path is disabled +- * (debug, no architecture support). In this case we will acquire the +- * rtmutex with lock->wait_lock held. But we cannot unconditionally +- * enable interrupts in that early boot case. So we need to use the +- * irqsave/restore variants. ++ * Naughty, using a different class will lead to undefined behavior! + */ +- raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); ++#endif ++ ww_ctx->acquired++; ++} ++ ++#ifdef CONFIG_PREEMPT_RT ++static void ww_mutex_account_lock(struct rt_mutex *lock, ++ struct ww_acquire_ctx *ww_ctx) ++{ ++ struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); ++ struct rt_mutex_waiter *waiter, *n; ++ ++ /* ++ * This branch gets optimized out for the common case, ++ * and is only important for ww_mutex_lock. ++ */ ++ ww_mutex_lock_acquired(ww, ww_ctx); ++ ww->ctx = ww_ctx; ++ ++ /* ++ * Give any possible sleeping processes the chance to wake up, ++ * so they can recheck if they have to back off. 
++ */ ++ rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root, ++ tree_entry) { ++ /* XXX debug rt mutex waiter wakeup */ ++ ++ BUG_ON(waiter->lock != lock); ++ rt_mutex_wake_waiter(waiter); ++ } ++} ++ ++#else ++ ++static void ww_mutex_account_lock(struct rt_mutex *lock, ++ struct ww_acquire_ctx *ww_ctx) ++{ ++ BUG(); ++} ++#endif ++ ++int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, ++ struct hrtimer_sleeper *timeout, ++ enum rtmutex_chainwalk chwalk, ++ struct ww_acquire_ctx *ww_ctx, ++ struct rt_mutex_waiter *waiter) ++{ ++ int ret; ++ ++#ifdef CONFIG_PREEMPT_RT ++ if (ww_ctx) { ++ struct ww_mutex *ww; ++ ++ ww = container_of(lock, struct ww_mutex, base.lock); ++ if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) ++ return -EALREADY; ++ } ++#endif + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock, current, NULL)) { +- raw_spin_unlock_irqrestore(&lock->wait_lock, flags); ++ if (ww_ctx) ++ ww_mutex_account_lock(lock, ww_ctx); + return 0; + } + +@@ -1257,16 +1713,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, + if (unlikely(timeout)) + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + +- ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); ++ ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk); + +- if (likely(!ret)) ++ if (likely(!ret)) { + /* sleep on the mutex */ +- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); ++ ret = __rt_mutex_slowlock(lock, state, timeout, waiter, ++ ww_ctx); ++ } else if (ww_ctx) { ++ /* ww_mutex received EDEADLK, let it become EALREADY */ ++ ret = __mutex_lock_check_stamp(lock, ww_ctx); ++ BUG_ON(!ret); ++ } + + if (unlikely(ret)) { + __set_current_state(TASK_RUNNING); +- remove_waiter(lock, &waiter); +- rt_mutex_handle_deadlock(ret, chwalk, &waiter); ++ remove_waiter(lock, waiter); ++ /* ww_mutex wants to report EDEADLK/EALREADY, let it */ ++ if (!ww_ctx) ++ rt_mutex_handle_deadlock(ret, chwalk, waiter); ++ } else if (ww_ctx) { ++ ww_mutex_account_lock(lock, ww_ctx); + } + + /* +@@ -1274,6 +1740,36 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); ++ return ret; ++} ++ ++/* ++ * Slow path lock function: ++ */ ++static int __sched ++rt_mutex_slowlock(struct rt_mutex *lock, int state, ++ struct hrtimer_sleeper *timeout, ++ enum rtmutex_chainwalk chwalk, ++ struct ww_acquire_ctx *ww_ctx) ++{ ++ struct rt_mutex_waiter waiter; ++ unsigned long flags; ++ int ret = 0; ++ ++ rt_mutex_init_waiter(&waiter, false); ++ ++ /* ++ * Technically we could use raw_spin_[un]lock_irq() here, but this can ++ * be called in early boot if the cmpxchg() fast path is disabled ++ * (debug, no architecture support). In this case we will acquire the ++ * rtmutex with lock->wait_lock held. But we cannot unconditionally ++ * enable interrupts in that early boot case. So we need to use the ++ * irqsave/restore variants. ++ */ ++ raw_spin_lock_irqsave(&lock->wait_lock, flags); ++ ++ ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx, ++ &waiter); + + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + +@@ -1334,7 +1830,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) + * Return whether the current task needs to call rt_mutex_postunlock(). 
+ */ + static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, +- struct wake_q_head *wake_q) ++ struct wake_q_head *wake_q, ++ struct wake_q_head *wake_sleeper_q) + { + unsigned long flags; + +@@ -1388,7 +1885,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, + * + * Queue the next waiter for wakeup once we release the wait_lock. + */ +- mark_wakeup_next_waiter(wake_q, lock); ++ mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + return true; /* call rt_mutex_postunlock() */ +@@ -1402,29 +1899,16 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, + */ + static inline int + rt_mutex_fastlock(struct rt_mutex *lock, int state, ++ struct ww_acquire_ctx *ww_ctx, + int (*slowfn)(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, +- enum rtmutex_chainwalk chwalk)) ++ enum rtmutex_chainwalk chwalk, ++ struct ww_acquire_ctx *ww_ctx)) + { + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return 0; + +- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); +-} +- +-static inline int +-rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, +- struct hrtimer_sleeper *timeout, +- enum rtmutex_chainwalk chwalk, +- int (*slowfn)(struct rt_mutex *lock, int state, +- struct hrtimer_sleeper *timeout, +- enum rtmutex_chainwalk chwalk)) +-{ +- if (chwalk == RT_MUTEX_MIN_CHAINWALK && +- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) +- return 0; +- +- return slowfn(lock, state, timeout, chwalk); ++ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx); + } + + static inline int +@@ -1440,10 +1924,12 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, + /* + * Performs the wakeup of the top-waiter and re-enables preemption. 
+ */ +-void rt_mutex_postunlock(struct wake_q_head *wake_q) ++void rt_mutex_postunlock(struct wake_q_head *wake_q, ++ struct wake_q_head *wake_sleeper_q) + { + wake_up_q(wake_q); +- ++ wake_up_q_sleeper(wake_sleeper_q); ++ + /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ + preempt_enable(); + } +@@ -1451,23 +1937,46 @@ void rt_mutex_postunlock(struct wake_q_head *wake_q) + static inline void + rt_mutex_fastunlock(struct rt_mutex *lock, + bool (*slowfn)(struct rt_mutex *lock, +- struct wake_q_head *wqh)) ++ struct wake_q_head *wqh, ++ struct wake_q_head *wq_sleeper)) + { + DEFINE_WAKE_Q(wake_q); ++ DEFINE_WAKE_Q(wake_sleeper_q); + + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; + +- if (slowfn(lock, &wake_q)) +- rt_mutex_postunlock(&wake_q); ++ if (slowfn(lock, &wake_q, &wake_sleeper_q)) ++ rt_mutex_postunlock(&wake_q, &wake_sleeper_q); + } + +-static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) ++int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state) + { + might_sleep(); ++ return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock); ++} ++ ++/** ++ * rt_mutex_lock_state - lock a rt_mutex with a given state ++ * ++ * @lock: The rt_mutex to be locked ++ * @state: The state to set when blocking on the rt_mutex ++ */ ++static inline int __sched rt_mutex_lock_state(struct rt_mutex *lock, ++ unsigned int subclass, int state) ++{ ++ int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); +- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); ++ ret = __rt_mutex_lock_state(lock, state); ++ if (ret) ++ mutex_release(&lock->dep_map, _RET_IP_); ++ return ret; ++} ++ ++static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) ++{ ++ rt_mutex_lock_state(lock, subclass, TASK_UNINTERRUPTIBLE); + } + + #ifdef CONFIG_DEBUG_LOCK_ALLOC +@@ -1508,16 +2017,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); + */ + int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) + { +- int ret; +- +- might_sleep(); +- +- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); +- ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); +- if (ret) +- mutex_release(&lock->dep_map, _RET_IP_); +- +- return ret; ++ return rt_mutex_lock_state(lock, 0, TASK_INTERRUPTIBLE); + } + EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + +@@ -1534,36 +2034,17 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) + return __rt_mutex_slowtrylock(lock); + } + +-/** +- * rt_mutex_timed_lock - lock a rt_mutex interruptible +- * the timeout structure is provided +- * by the caller +- * +- * @lock: the rt_mutex to be locked +- * @timeout: timeout structure or NULL (no timeout) +- * +- * Returns: +- * 0 on success +- * -EINTR when interrupted by a signal +- * -ETIMEDOUT when the timeout expired +- */ +-int +-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) ++int __sched __rt_mutex_trylock(struct rt_mutex *lock) + { +- int ret; +- +- might_sleep(); +- +- mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); +- ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, +- RT_MUTEX_MIN_CHAINWALK, +- rt_mutex_slowlock); +- if (ret) +- mutex_release(&lock->dep_map, _RET_IP_); ++#ifdef CONFIG_PREEMPT_RT ++ if (WARN_ON_ONCE(in_irq() || in_nmi())) ++#else ++ if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) ++#endif ++ return 0; + +- return ret; ++ return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + } +-EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); + + /** + * rt_mutex_trylock 
- try to lock a rt_mutex +@@ -1580,10 +2061,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) + { + int ret; + +- if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) +- return 0; +- +- ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); ++ ret = __rt_mutex_trylock(lock); + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + +@@ -1591,6 +2069,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) + } + EXPORT_SYMBOL_GPL(rt_mutex_trylock); + ++void __sched __rt_mutex_unlock(struct rt_mutex *lock) ++{ ++ rt_mutex_fastunlock(lock, rt_mutex_slowunlock); ++} ++ + /** + * rt_mutex_unlock - unlock a rt_mutex + * +@@ -1599,16 +2082,13 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); + void __sched rt_mutex_unlock(struct rt_mutex *lock) + { + mutex_release(&lock->dep_map, _RET_IP_); +- rt_mutex_fastunlock(lock, rt_mutex_slowunlock); ++ __rt_mutex_unlock(lock); + } + EXPORT_SYMBOL_GPL(rt_mutex_unlock); + +-/** +- * Futex variant, that since futex variants do not use the fast-path, can be +- * simple and will not need to retry. +- */ +-bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, +- struct wake_q_head *wake_q) ++static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock, ++ struct wake_q_head *wake_q, ++ struct wake_q_head *wq_sleeper) + { + lockdep_assert_held(&lock->wait_lock); + +@@ -1625,23 +2105,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, + * avoid inversion prior to the wakeup. preempt_disable() + * therein pairs with rt_mutex_postunlock(). + */ +- mark_wakeup_next_waiter(wake_q, lock); ++ mark_wakeup_next_waiter(wake_q, wq_sleeper, lock); + + return true; /* call postunlock() */ + } + ++/** ++ * Futex variant, that since futex variants do not use the fast-path, can be ++ * simple and will not need to retry. ++ */ ++bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, ++ struct wake_q_head *wake_q, ++ struct wake_q_head *wq_sleeper) ++{ ++ return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper); ++} ++ + void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) + { + DEFINE_WAKE_Q(wake_q); ++ DEFINE_WAKE_Q(wake_sleeper_q); + unsigned long flags; + bool postunlock; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); +- postunlock = __rt_mutex_futex_unlock(lock, &wake_q); ++ postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (postunlock) +- rt_mutex_postunlock(&wake_q); ++ rt_mutex_postunlock(&wake_q, &wake_sleeper_q); + } + + /** +@@ -1655,9 +2147,6 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) + void rt_mutex_destroy(struct rt_mutex *lock) + { + WARN_ON(rt_mutex_is_locked(lock)); +-#ifdef CONFIG_DEBUG_RT_MUTEXES +- lock->magic = NULL; +-#endif + } + EXPORT_SYMBOL_GPL(rt_mutex_destroy); + +@@ -1680,7 +2169,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, + if (name && key) + debug_rt_mutex_init(lock, name, key); + } +-EXPORT_SYMBOL_GPL(__rt_mutex_init); ++EXPORT_SYMBOL(__rt_mutex_init); + + /** + * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a +@@ -1700,6 +2189,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner) + { + __rt_mutex_init(lock, NULL, NULL); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* ++ * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI is ++ * holding the ->wait_lock of the proxy_lock while unlocking a sleeping ++ * lock. 
++ */ ++ raw_spin_lock_init(&lock->wait_lock); ++#endif + debug_rt_mutex_proxy_lock(lock, proxy_owner); + rt_mutex_set_owner(lock, proxy_owner); + } +@@ -1722,6 +2219,26 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock) + rt_mutex_set_owner(lock, NULL); + } + ++static void fixup_rt_mutex_blocked(struct rt_mutex *lock) ++{ ++ struct task_struct *tsk = current; ++ /* ++ * RT has a problem here when the wait got interrupted by a timeout ++ * or a signal. task->pi_blocked_on is still set. The task must ++ * acquire the hash bucket lock when returning from this function. ++ * ++ * If the hash bucket lock is contended then the ++ * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in ++ * task_blocks_on_rt_mutex() will trigger. This can be avoided by ++ * clearing task->pi_blocked_on which removes the task from the ++ * boosting chain of the rtmutex. That's correct because the task ++ * is not longer blocked on it. ++ */ ++ raw_spin_lock(&tsk->pi_lock); ++ tsk->pi_blocked_on = NULL; ++ raw_spin_unlock(&tsk->pi_lock); ++} ++ + /** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take +@@ -1752,6 +2269,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + if (try_to_take_rt_mutex(lock, task, NULL)) + return 1; + ++#ifdef CONFIG_PREEMPT_RT ++ /* ++ * In PREEMPT_RT there's an added race. ++ * If the task, that we are about to requeue, times out, ++ * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue ++ * to skip this task. But right after the task sets ++ * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then ++ * block on the spin_lock(&hb->lock), which in RT is an rtmutex. ++ * This will replace the PI_WAKEUP_INPROGRESS with the actual ++ * lock that it blocks on. We *must not* place this task ++ * on this proxy lock in that case. ++ * ++ * To prevent this race, we first take the task's pi_lock ++ * and check if it has updated its pi_blocked_on. If it has, ++ * we assume that it woke up and we return -EAGAIN. ++ * Otherwise, we set the task's pi_blocked_on to ++ * PI_REQUEUE_INPROGRESS, so that if the task is waking up ++ * it will know that we are in the process of requeuing it. ++ */ ++ raw_spin_lock(&task->pi_lock); ++ if (task->pi_blocked_on) { ++ raw_spin_unlock(&task->pi_lock); ++ return -EAGAIN; ++ } ++ task->pi_blocked_on = PI_REQUEUE_INPROGRESS; ++ raw_spin_unlock(&task->pi_lock); ++#endif ++ + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, + RT_MUTEX_FULL_CHAINWALK); +@@ -1766,7 +2311,8 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + ret = 0; + } + +- debug_rt_mutex_print_deadlock(waiter); ++ if (ret) ++ fixup_rt_mutex_blocked(lock); + + return ret; + } +@@ -1851,12 +2397,15 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + raw_spin_lock_irq(&lock->wait_lock); + /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); +- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); ++ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL); + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. 
+ */ + fixup_rt_mutex_waiters(lock); ++ if (ret) ++ fixup_rt_mutex_blocked(lock); ++ + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +@@ -1918,3 +2467,97 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + + return cleanup; + } ++ ++static inline int ++ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH ++ unsigned int tmp; ++ ++ if (ctx->deadlock_inject_countdown-- == 0) { ++ tmp = ctx->deadlock_inject_interval; ++ if (tmp > UINT_MAX/4) ++ tmp = UINT_MAX; ++ else ++ tmp = tmp*2 + tmp + tmp/2; ++ ++ ctx->deadlock_inject_interval = tmp; ++ ctx->deadlock_inject_countdown = tmp; ++ ctx->contending_lock = lock; ++ ++ ww_mutex_unlock(lock); ++ ++ return -EDEADLK; ++ } ++#endif ++ ++ return 0; ++} ++ ++#ifdef CONFIG_PREEMPT_RT ++int __sched ++ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++ int ret; ++ ++ might_sleep(); ++ ++ mutex_acquire_nest(&lock->base.dep_map, 0, 0, ++ ctx ? &ctx->dep_map : NULL, _RET_IP_); ++ ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, ++ ctx); ++ if (ret) ++ mutex_release(&lock->base.dep_map, _RET_IP_); ++ else if (!ret && ctx && ctx->acquired > 1) ++ return ww_mutex_deadlock_injection(lock, ctx); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); ++ ++int __sched ++ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) ++{ ++ int ret; ++ ++ might_sleep(); ++ ++ mutex_acquire_nest(&lock->base.dep_map, 0, 0, ++ ctx ? &ctx->dep_map : NULL, _RET_IP_); ++ ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, ++ ctx); ++ if (ret) ++ mutex_release(&lock->base.dep_map, _RET_IP_); ++ else if (!ret && ctx && ctx->acquired > 1) ++ return ww_mutex_deadlock_injection(lock, ctx); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(ww_mutex_lock); ++ ++void __sched ww_mutex_unlock(struct ww_mutex *lock) ++{ ++ /* ++ * The unlocking fastpath is the 0->1 transition from 'locked' ++ * into 'unlocked' state: ++ */ ++ if (lock->ctx) { ++#ifdef CONFIG_DEBUG_MUTEXES ++ DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); ++#endif ++ if (lock->ctx->acquired > 0) ++ lock->ctx->acquired--; ++ lock->ctx = NULL; ++ } ++ ++ mutex_release(&lock->base.dep_map, _RET_IP_); ++ __rt_mutex_unlock(&lock->base.lock); ++} ++EXPORT_SYMBOL(ww_mutex_unlock); ++ ++int __rt_mutex_owner_current(struct rt_mutex *lock) ++{ ++ return rt_mutex_owner(lock) == current; ++} ++EXPORT_SYMBOL(__rt_mutex_owner_current); ++#endif +diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h +index 732f96abf..338ccd291 100644 +--- a/kernel/locking/rtmutex.h ++++ b/kernel/locking/rtmutex.h +@@ -19,15 +19,8 @@ + #define debug_rt_mutex_proxy_unlock(l) do { } while (0) + #define debug_rt_mutex_unlock(l) do { } while (0) + #define debug_rt_mutex_init(m, n, k) do { } while (0) +-#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) +-#define debug_rt_mutex_print_deadlock(w) do { } while (0) + #define debug_rt_mutex_reset_waiter(w) do { } while (0) + +-static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) +-{ +- WARN(1, "rtmutex deadlock detected\n"); +-} +- + static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, + enum rtmutex_chainwalk walk) + { +diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h +index ca6fb4890..248a7d915 100644 +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -15,6 +15,7 @@ + + #include + #include ++#include + + /* + * This is 
the control structure for tasks blocked on a rt_mutex, +@@ -29,12 +30,8 @@ struct rt_mutex_waiter { + struct rb_node pi_tree_entry; + struct task_struct *task; + struct rt_mutex *lock; +-#ifdef CONFIG_DEBUG_RT_MUTEXES +- unsigned long ip; +- struct pid *deadlock_task_pid; +- struct rt_mutex *deadlock_lock; +-#endif + int prio; ++ bool savestate; + u64 deadline; + }; + +@@ -130,11 +127,14 @@ enum rtmutex_chainwalk { + /* + * PI-futex support (proxy locking functions, etc.): + */ ++#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) ++#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) ++ + extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); + extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, + struct task_struct *proxy_owner); + extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); +-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); ++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savetate); + extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +@@ -152,9 +152,27 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l); + + extern void rt_mutex_futex_unlock(struct rt_mutex *lock); + extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, +- struct wake_q_head *wqh); +- +-extern void rt_mutex_postunlock(struct wake_q_head *wake_q); ++ struct wake_q_head *wqh, ++ struct wake_q_head *wq_sleeper); ++ ++extern void rt_mutex_postunlock(struct wake_q_head *wake_q, ++ struct wake_q_head *wake_sleeper_q); ++ ++/* RW semaphore special interface */ ++struct ww_acquire_ctx; ++ ++extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state); ++extern int __rt_mutex_trylock(struct rt_mutex *lock); ++extern void __rt_mutex_unlock(struct rt_mutex *lock); ++int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, ++ struct hrtimer_sleeper *timeout, ++ enum rtmutex_chainwalk chwalk, ++ struct ww_acquire_ctx *ww_ctx, ++ struct rt_mutex_waiter *waiter); ++void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ unsigned long flags); ++void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock); + + #ifdef CONFIG_DEBUG_RT_MUTEXES + # include "rtmutex-debug.h" +diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c +new file mode 100644 +index 000000000..3d2d1f14b +--- /dev/null ++++ b/kernel/locking/rwlock-rt.c +@@ -0,0 +1,334 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++ ++#include "rtmutex_common.h" ++#include ++ ++/* ++ * RT-specific reader/writer locks ++ * ++ * write_lock() ++ * 1) Lock lock->rtmutex ++ * 2) Remove the reader BIAS to force readers into the slow path ++ * 3) Wait until all readers have left the critical region ++ * 4) Mark it write locked ++ * ++ * write_unlock() ++ * 1) Remove the write locked marker ++ * 2) Set the reader BIAS so readers can use the fast path again ++ * 3) Unlock lock->rtmutex to release blocked readers ++ * ++ * read_lock() ++ * 1) Try fast path acquisition (reader BIAS is set) ++ * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag ++ * 3) If !writelocked, acquire it for read ++ * 4) If writelocked, block on lock->rtmutex ++ * 5) unlock lock->rtmutex, goto 1) ++ * ++ * read_unlock() ++ * 1) Try fast path release (reader count != 1) ++ * 2) Wake the writer waiting in write_lock()#3 ++ * ++ * read_lock()#3 has the consequence, that rw locks on RT are not writer ++ * fair, but writers, 
which should be avoided in RT tasks (think tasklist ++ * lock), are subject to the rtmutex priority/DL inheritance mechanism. ++ * ++ * It's possible to make the rw locks writer fair by keeping a list of ++ * active readers. A blocked writer would force all newly incoming readers ++ * to block on the rtmutex, but the rtmutex would have to be proxy locked ++ * for one reader after the other. We can't use multi-reader inheritance ++ * because there is no way to support that with ++ * SCHED_DEADLINE. Implementing the one by one reader boosting/handover ++ * mechanism is a major surgery for a very dubious value. ++ * ++ * The risk of writer starvation is there, but the pathological use cases ++ * which trigger it are not necessarily the typical RT workloads. ++ */ ++ ++void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name, ++ struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held semaphore: ++ */ ++ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++ atomic_set(&lock->readers, READER_BIAS); ++ rt_mutex_init(&lock->rtmutex); ++ lock->rtmutex.save_state = 1; ++} ++ ++static int __read_rt_trylock(struct rt_rw_lock *lock) ++{ ++ int r, old; ++ ++ /* ++ * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is ++ * set. ++ */ ++ for (r = atomic_read(&lock->readers); r < 0;) { ++ old = atomic_cmpxchg(&lock->readers, r, r + 1); ++ if (likely(old == r)) ++ return 1; ++ r = old; ++ } ++ return 0; ++} ++ ++static void __read_rt_lock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ struct rt_mutex_waiter waiter; ++ unsigned long flags; ++ ++ if (__read_rt_trylock(lock)) ++ return; ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ /* ++ * Allow readers as long as the writer has not completely ++ * acquired the semaphore for write. ++ */ ++ if (atomic_read(&lock->readers) != WRITER_BIAS) { ++ atomic_inc(&lock->readers); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return; ++ } ++ ++ /* ++ * Call into the slow lock path with the rtmutex->wait_lock ++ * held, so this can't result in the following race: ++ * ++ * Reader1 Reader2 Writer ++ * read_lock() ++ * write_lock() ++ * rtmutex_lock(m) ++ * swait() ++ * read_lock() ++ * unlock(m->wait_lock) ++ * read_unlock() ++ * swake() ++ * lock(m->wait_lock) ++ * lock->writelocked=true ++ * unlock(m->wait_lock) ++ * ++ * write_unlock() ++ * lock->writelocked=false ++ * rtmutex_unlock(m) ++ * read_lock() ++ * write_lock() ++ * rtmutex_lock(m) ++ * swait() ++ * rtmutex_lock(m) ++ * ++ * That would put Reader1 behind the writer waiting on ++ * Reader2 to call read_unlock() which might be unbound. ++ */ ++ rt_mutex_init_waiter(&waiter, true); ++ rt_spin_lock_slowlock_locked(m, &waiter, flags); ++ /* ++ * The slowlock() above is guaranteed to return with the rtmutex is ++ * now held, so there can't be a writer active. Increment the reader ++ * count and immediately drop the rtmutex again. ++ */ ++ atomic_inc(&lock->readers); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ rt_spin_lock_slowunlock(m); ++ ++ debug_rt_mutex_free_waiter(&waiter); ++} ++ ++static void __read_rt_unlock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ struct task_struct *tsk; ++ ++ /* ++ * sem->readers can only hit 0 when a writer is waiting for the ++ * active readers to leave the critical region. 
++ */ ++ if (!atomic_dec_and_test(&lock->readers)) ++ return; ++ ++ raw_spin_lock_irq(&m->wait_lock); ++ /* ++ * Wake the writer, i.e. the rtmutex owner. It might release the ++ * rtmutex concurrently in the fast path, but to clean up the rw ++ * lock it needs to acquire m->wait_lock. The worst case which can ++ * happen is a spurious wakeup. ++ */ ++ tsk = rt_mutex_owner(m); ++ if (tsk) ++ wake_up_process(tsk); ++ ++ raw_spin_unlock_irq(&m->wait_lock); ++} ++ ++static void __write_unlock_common(struct rt_rw_lock *lock, int bias, ++ unsigned long flags) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ ++ atomic_add(READER_BIAS - bias, &lock->readers); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ rt_spin_lock_slowunlock(m); ++} ++ ++static void __write_rt_lock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ struct task_struct *self = current; ++ unsigned long flags; ++ ++ /* Take the rtmutex as a first step */ ++ __rt_spin_lock(m); ++ ++ /* Force readers into slow path */ ++ atomic_sub(READER_BIAS, &lock->readers); ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ ++ raw_spin_lock(&self->pi_lock); ++ self->saved_state = self->state; ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock(&self->pi_lock); ++ ++ for (;;) { ++ /* Have all readers left the critical region? */ ++ if (!atomic_read(&lock->readers)) { ++ atomic_set(&lock->readers, WRITER_BIAS); ++ raw_spin_lock(&self->pi_lock); ++ __set_current_state_no_track(self->saved_state); ++ self->saved_state = TASK_RUNNING; ++ raw_spin_unlock(&self->pi_lock); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return; ++ } ++ ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ ++ if (atomic_read(&lock->readers) != 0) ++ preempt_schedule_lock(); ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ ++ raw_spin_lock(&self->pi_lock); ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock(&self->pi_lock); ++ } ++} ++ ++static int __write_rt_trylock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ unsigned long flags; ++ ++ if (!__rt_mutex_trylock(m)) ++ return 0; ++ ++ atomic_sub(READER_BIAS, &lock->readers); ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ if (!atomic_read(&lock->readers)) { ++ atomic_set(&lock->readers, WRITER_BIAS); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return 1; ++ } ++ __write_unlock_common(lock, 0, flags); ++ return 0; ++} ++ ++static void __write_rt_unlock(struct rt_rw_lock *lock) ++{ ++ struct rt_mutex *m = &lock->rtmutex; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ __write_unlock_common(lock, WRITER_BIAS, flags); ++} ++ ++int __lockfunc rt_read_can_lock(rwlock_t *rwlock) ++{ ++ return atomic_read(&rwlock->readers) < 0; ++} ++ ++int __lockfunc rt_write_can_lock(rwlock_t *rwlock) ++{ ++ return atomic_read(&rwlock->readers) == READER_BIAS; ++} ++ ++/* ++ * The common functions which get wrapped into the rwlock API. 
++ */ ++int __lockfunc rt_read_trylock(rwlock_t *rwlock) ++{ ++ int ret; ++ ++ ret = __read_rt_trylock(rwlock); ++ if (ret) { ++ rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); ++ rcu_read_lock(); ++ migrate_disable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(rt_read_trylock); ++ ++int __lockfunc rt_write_trylock(rwlock_t *rwlock) ++{ ++ int ret; ++ ++ ret = __write_rt_trylock(rwlock); ++ if (ret) { ++ rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); ++ rcu_read_lock(); ++ migrate_disable(); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(rt_write_trylock); ++ ++void __lockfunc rt_read_lock(rwlock_t *rwlock) ++{ ++ rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); ++ __read_rt_lock(rwlock); ++ rcu_read_lock(); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_read_lock); ++ ++void __lockfunc rt_write_lock(rwlock_t *rwlock) ++{ ++ rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); ++ __write_rt_lock(rwlock); ++ rcu_read_lock(); ++ migrate_disable(); ++} ++EXPORT_SYMBOL(rt_write_lock); ++ ++void __lockfunc rt_read_unlock(rwlock_t *rwlock) ++{ ++ rwlock_release(&rwlock->dep_map, _RET_IP_); ++ migrate_enable(); ++ rcu_read_unlock(); ++ __read_rt_unlock(rwlock); ++} ++EXPORT_SYMBOL(rt_read_unlock); ++ ++void __lockfunc rt_write_unlock(rwlock_t *rwlock) ++{ ++ rwlock_release(&rwlock->dep_map, _RET_IP_); ++ migrate_enable(); ++ rcu_read_unlock(); ++ __write_rt_unlock(rwlock); ++} ++EXPORT_SYMBOL(rt_write_unlock); ++ ++void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) ++{ ++ __rwlock_biased_rt_init(rwlock, name, key); ++} ++EXPORT_SYMBOL(__rt_rwlock_init); +diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c +new file mode 100644 +index 000000000..b61edc4dc +--- /dev/null ++++ b/kernel/locking/rwsem-rt.c +@@ -0,0 +1,317 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++ ++#include "rtmutex_common.h" ++ ++/* ++ * RT-specific reader/writer semaphores ++ * ++ * down_write() ++ * 1) Lock sem->rtmutex ++ * 2) Remove the reader BIAS to force readers into the slow path ++ * 3) Wait until all readers have left the critical region ++ * 4) Mark it write locked ++ * ++ * up_write() ++ * 1) Remove the write locked marker ++ * 2) Set the reader BIAS so readers can use the fast path again ++ * 3) Unlock sem->rtmutex to release blocked readers ++ * ++ * down_read() ++ * 1) Try fast path acquisition (reader BIAS is set) ++ * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag ++ * 3) If !writelocked, acquire it for read ++ * 4) If writelocked, block on sem->rtmutex ++ * 5) unlock sem->rtmutex, goto 1) ++ * ++ * up_read() ++ * 1) Try fast path release (reader count != 1) ++ * 2) Wake the writer waiting in down_write()#3 ++ * ++ * down_read()#3 has the consequence, that rw semaphores on RT are not writer ++ * fair, but writers, which should be avoided in RT tasks (think mmap_sem), ++ * are subject to the rtmutex priority/DL inheritance mechanism. ++ * ++ * It's possible to make the rw semaphores writer fair by keeping a list of ++ * active readers. A blocked writer would force all newly incoming readers to ++ * block on the rtmutex, but the rtmutex would have to be proxy locked for one ++ * reader after the other. We can't use multi-reader inheritance because there ++ * is no way to support that with SCHED_DEADLINE. Implementing the one by one ++ * reader boosting/handover mechanism is a major surgery for a very dubious ++ * value. 
++ * ++ * The risk of writer starvation is there, but the pathological use cases ++ * which trigger it are not necessarily the typical RT workloads. ++ */ ++ ++void __rwsem_init(struct rw_semaphore *sem, const char *name, ++ struct lock_class_key *key) ++{ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ /* ++ * Make sure we are not reinitializing a held semaphore: ++ */ ++ debug_check_no_locks_freed((void *)sem, sizeof(*sem)); ++ lockdep_init_map(&sem->dep_map, name, key, 0); ++#endif ++ atomic_set(&sem->readers, READER_BIAS); ++} ++EXPORT_SYMBOL(__rwsem_init); ++ ++int __down_read_trylock(struct rw_semaphore *sem) ++{ ++ int r, old; ++ ++ /* ++ * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is ++ * set. ++ */ ++ for (r = atomic_read(&sem->readers); r < 0;) { ++ old = atomic_cmpxchg(&sem->readers, r, r + 1); ++ if (likely(old == r)) ++ return 1; ++ r = old; ++ } ++ return 0; ++} ++ ++static int __sched __down_read_common(struct rw_semaphore *sem, int state) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ struct rt_mutex_waiter waiter; ++ int ret; ++ ++ if (__down_read_trylock(sem)) ++ return 0; ++ ++ /* ++ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too ++ * late if one of the callbacks needs to acquire a sleeping lock. ++ */ ++ if (blk_needs_flush_plug(current)) ++ blk_schedule_flush_plug(current); ++ ++ might_sleep(); ++ raw_spin_lock_irq(&m->wait_lock); ++ /* ++ * Allow readers as long as the writer has not completely ++ * acquired the semaphore for write. ++ */ ++ if (atomic_read(&sem->readers) != WRITER_BIAS) { ++ atomic_inc(&sem->readers); ++ raw_spin_unlock_irq(&m->wait_lock); ++ return 0; ++ } ++ ++ /* ++ * Call into the slow lock path with the rtmutex->wait_lock ++ * held, so this can't result in the following race: ++ * ++ * Reader1 Reader2 Writer ++ * down_read() ++ * down_write() ++ * rtmutex_lock(m) ++ * swait() ++ * down_read() ++ * unlock(m->wait_lock) ++ * up_read() ++ * swake() ++ * lock(m->wait_lock) ++ * sem->writelocked=true ++ * unlock(m->wait_lock) ++ * ++ * up_write() ++ * sem->writelocked=false ++ * rtmutex_unlock(m) ++ * down_read() ++ * down_write() ++ * rtmutex_lock(m) ++ * swait() ++ * rtmutex_lock(m) ++ * ++ * That would put Reader1 behind the writer waiting on ++ * Reader2 to call up_read() which might be unbound. ++ */ ++ rt_mutex_init_waiter(&waiter, false); ++ ret = rt_mutex_slowlock_locked(m, state, NULL, RT_MUTEX_MIN_CHAINWALK, ++ NULL, &waiter); ++ /* ++ * The slowlock() above is guaranteed to return with the rtmutex (for ++ * ret = 0) is now held, so there can't be a writer active. Increment ++ * the reader count and immediately drop the rtmutex again. ++ * For ret != 0 we don't hold the rtmutex and need unlock the wait_lock. ++ * We don't own the lock then. 
++ */ ++ if (!ret) ++ atomic_inc(&sem->readers); ++ raw_spin_unlock_irq(&m->wait_lock); ++ if (!ret) ++ __rt_mutex_unlock(m); ++ ++ debug_rt_mutex_free_waiter(&waiter); ++ return ret; ++} ++ ++void __down_read(struct rw_semaphore *sem) ++{ ++ int ret; ++ ++ ret = __down_read_common(sem, TASK_UNINTERRUPTIBLE); ++ WARN_ON_ONCE(ret); ++} ++ ++int __down_read_interruptible(struct rw_semaphore *sem) ++{ ++ int ret; ++ ++ ret = __down_read_common(sem, TASK_INTERRUPTIBLE); ++ if (likely(!ret)) ++ return ret; ++ WARN_ONCE(ret != -EINTR, "Unexpected state: %d\n", ret); ++ return -EINTR; ++} ++ ++int __down_read_killable(struct rw_semaphore *sem) ++{ ++ int ret; ++ ++ ret = __down_read_common(sem, TASK_KILLABLE); ++ if (likely(!ret)) ++ return ret; ++ WARN_ONCE(ret != -EINTR, "Unexpected state: %d\n", ret); ++ return -EINTR; ++} ++ ++void __up_read(struct rw_semaphore *sem) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ struct task_struct *tsk; ++ ++ /* ++ * sem->readers can only hit 0 when a writer is waiting for the ++ * active readers to leave the critical region. ++ */ ++ if (!atomic_dec_and_test(&sem->readers)) ++ return; ++ ++ raw_spin_lock_irq(&m->wait_lock); ++ /* ++ * Wake the writer, i.e. the rtmutex owner. It might release the ++ * rtmutex concurrently in the fast path (due to a signal), but to ++ * clean up the rwsem it needs to acquire m->wait_lock. The worst ++ * case which can happen is a spurious wakeup. ++ */ ++ tsk = rt_mutex_owner(m); ++ if (tsk) ++ wake_up_process(tsk); ++ ++ raw_spin_unlock_irq(&m->wait_lock); ++} ++ ++static void __up_write_unlock(struct rw_semaphore *sem, int bias, ++ unsigned long flags) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ ++ atomic_add(READER_BIAS - bias, &sem->readers); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ __rt_mutex_unlock(m); ++} ++ ++static int __sched __down_write_common(struct rw_semaphore *sem, int state) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ unsigned long flags; ++ ++ /* ++ * Flush blk before ->pi_blocked_on is set. At schedule() time it is too ++ * late if one of the callbacks needs to acquire a sleeping lock. ++ */ ++ if (blk_needs_flush_plug(current)) ++ blk_schedule_flush_plug(current); ++ ++ /* Take the rtmutex as a first step */ ++ if (__rt_mutex_lock_state(m, state)) ++ return -EINTR; ++ ++ /* Force readers into slow path */ ++ atomic_sub(READER_BIAS, &sem->readers); ++ might_sleep(); ++ ++ set_current_state(state); ++ for (;;) { ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ /* Have all readers left the critical region? 
*/ ++ if (!atomic_read(&sem->readers)) { ++ atomic_set(&sem->readers, WRITER_BIAS); ++ __set_current_state(TASK_RUNNING); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return 0; ++ } ++ ++ if (signal_pending_state(state, current)) { ++ __set_current_state(TASK_RUNNING); ++ __up_write_unlock(sem, 0, flags); ++ return -EINTR; ++ } ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ ++ if (atomic_read(&sem->readers) != 0) { ++ schedule(); ++ set_current_state(state); ++ } ++ } ++} ++ ++void __sched __down_write(struct rw_semaphore *sem) ++{ ++ __down_write_common(sem, TASK_UNINTERRUPTIBLE); ++} ++ ++int __sched __down_write_killable(struct rw_semaphore *sem) ++{ ++ return __down_write_common(sem, TASK_KILLABLE); ++} ++ ++int __down_write_trylock(struct rw_semaphore *sem) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ unsigned long flags; ++ ++ if (!__rt_mutex_trylock(m)) ++ return 0; ++ ++ atomic_sub(READER_BIAS, &sem->readers); ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ if (!atomic_read(&sem->readers)) { ++ atomic_set(&sem->readers, WRITER_BIAS); ++ raw_spin_unlock_irqrestore(&m->wait_lock, flags); ++ return 1; ++ } ++ __up_write_unlock(sem, 0, flags); ++ return 0; ++} ++ ++void __up_write(struct rw_semaphore *sem) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ __up_write_unlock(sem, WRITER_BIAS, flags); ++} ++ ++void __downgrade_write(struct rw_semaphore *sem) ++{ ++ struct rt_mutex *m = &sem->rtmutex; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&m->wait_lock, flags); ++ /* Release it and account current as reader */ ++ __up_write_unlock(sem, WRITER_BIAS - 1, flags); ++} +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index cc5cc889b..f7c909ef1 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -28,6 +28,7 @@ + #include + #include + ++#ifndef CONFIG_PREEMPT_RT + #include "lock_events.h" + + /* +@@ -1494,6 +1495,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) + if (tmp & RWSEM_FLAG_WAITERS) + rwsem_downgrade_wake(sem); + } ++#endif + + /* + * lock for reading +@@ -1657,7 +1659,9 @@ void down_read_non_owner(struct rw_semaphore *sem) + { + might_sleep(); + __down_read(sem); ++#ifndef CONFIG_PREEMPT_RT + __rwsem_set_reader_owned(sem, NULL); ++#endif + } + EXPORT_SYMBOL(down_read_non_owner); + +@@ -1686,7 +1690,9 @@ EXPORT_SYMBOL(down_write_killable_nested); + + void up_read_non_owner(struct rw_semaphore *sem) + { ++#ifndef CONFIG_PREEMPT_RT + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); ++#endif + __up_read(sem); + } + EXPORT_SYMBOL(up_read_non_owner); +diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c +index 0ff08380f..45445a2f1 100644 +--- a/kernel/locking/spinlock.c ++++ b/kernel/locking/spinlock.c +@@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ + * __[spin|read|write]_lock_bh() + */ + BUILD_LOCK_OPS(spin, raw_spinlock); ++ ++#ifndef CONFIG_PREEMPT_RT + BUILD_LOCK_OPS(read, rwlock); + BUILD_LOCK_OPS(write, rwlock); ++#endif + + #endif + +@@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) + EXPORT_SYMBOL(_raw_spin_unlock_bh); + #endif + ++#ifndef CONFIG_PREEMPT_RT ++ + #ifndef CONFIG_INLINE_READ_TRYLOCK + int __lockfunc _raw_read_trylock(rwlock_t *lock) + { +@@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) + EXPORT_SYMBOL(_raw_write_unlock_bh); + #endif + ++#endif /* !PREEMPT_RT */ ++ + #ifdef CONFIG_DEBUG_LOCK_ALLOC + + 
void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) +diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c +index b9d93087e..72e306e0e 100644 +--- a/kernel/locking/spinlock_debug.c ++++ b/kernel/locking/spinlock_debug.c +@@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + + EXPORT_SYMBOL(__raw_spin_lock_init); + ++#ifndef CONFIG_PREEMPT_RT + void __rwlock_init(rwlock_t *lock, const char *name, + struct lock_class_key *key) + { +@@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, + } + + EXPORT_SYMBOL(__rwlock_init); ++#endif + + static void spin_dump(raw_spinlock_t *lock, const char *msg) + { +@@ -139,6 +141,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) + arch_spin_unlock(&lock->raw_lock); + } + ++#ifndef CONFIG_PREEMPT_RT + static void rwlock_bug(rwlock_t *lock, const char *msg) + { + if (!debug_locks_off()) +@@ -228,3 +231,5 @@ void do_raw_write_unlock(rwlock_t *lock) + debug_write_unlock(lock); + arch_write_unlock(&lock->raw_lock); + } ++ ++#endif +diff --git a/kernel/notifier.c b/kernel/notifier.c +index 1b019cbca..c20782f07 100644 +--- a/kernel/notifier.c ++++ b/kernel/notifier.c +@@ -142,9 +142,9 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, + unsigned long flags; + int ret; + +- spin_lock_irqsave(&nh->lock, flags); ++ raw_spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_register(&nh->head, n); +- spin_unlock_irqrestore(&nh->lock, flags); ++ raw_spin_unlock_irqrestore(&nh->lock, flags); + return ret; + } + EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); +@@ -164,9 +164,9 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, + unsigned long flags; + int ret; + +- spin_lock_irqsave(&nh->lock, flags); ++ raw_spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_unregister(&nh->head, n); +- spin_unlock_irqrestore(&nh->lock, flags); ++ raw_spin_unlock_irqrestore(&nh->lock, flags); + synchronize_rcu(); + return ret; + } +@@ -182,9 +182,9 @@ int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, + * Musn't use RCU; because then the notifier list can + * change between the up and down traversal. + */ +- spin_lock_irqsave(&nh->lock, flags); ++ raw_spin_lock_irqsave(&nh->lock, flags); + ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); +- spin_unlock_irqrestore(&nh->lock, flags); ++ raw_spin_unlock_irqrestore(&nh->lock, flags); + + return ret; + } +diff --git a/kernel/panic.c b/kernel/panic.c +index d991c3b1b..fa3025e0c 100644 +--- a/kernel/panic.c ++++ b/kernel/panic.c +@@ -177,12 +177,28 @@ static void panic_print_sys_info(void) + void panic(const char *fmt, ...) + { + static char buf[1024]; ++ va_list args2; + va_list args; + long i, i_next = 0, len; + int state = 0; + int old_cpu, this_cpu; + bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + ++ console_verbose(); ++ pr_emerg("Kernel panic - not syncing:\n"); ++ va_start(args2, fmt); ++ va_copy(args, args2); ++ vprintk(fmt, args2); ++ va_end(args2); ++#ifdef CONFIG_DEBUG_BUGVERBOSE ++ /* ++ * Avoid nested stack-dumping if a panic occurs during oops processing ++ */ ++ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) ++ dump_stack(); ++#endif ++ pr_flush(1000, true); ++ + /* + * Disable local interrupts. This will prevent panic_smp_self_stop + * from deadlocking the first cpu that invokes the panic, since +@@ -213,24 +229,13 @@ void panic(const char *fmt, ...) 
+ if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) + panic_smp_self_stop(); + +- console_verbose(); + bust_spinlocks(1); +- va_start(args, fmt); + len = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + +- pr_emerg("Kernel panic - not syncing: %s\n", buf); +-#ifdef CONFIG_DEBUG_BUGVERBOSE +- /* +- * Avoid nested stack-dumping if a panic occurs during oops processing +- */ +- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) +- dump_stack(); +-#endif +- + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left +@@ -247,7 +252,6 @@ void panic(const char *fmt, ...) + * Bypass the panic_cpu check and call __crash_kexec directly. + */ + if (!_crash_kexec_post_notifiers) { +- printk_safe_flush_on_panic(); + __crash_kexec(NULL); + + /* +@@ -298,8 +302,6 @@ void panic(const char *fmt, ...) + */ + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + +- /* Call flush even twice. It tries harder with a single online CPU */ +- printk_safe_flush_on_panic(); + kmsg_dump(KMSG_DUMP_PANIC); + + /* +@@ -569,9 +571,11 @@ static u64 oops_id; + + static int init_oops_id(void) + { ++#ifndef CONFIG_PREEMPT_RT + if (!oops_id) + get_random_bytes(&oops_id, sizeof(oops_id)); + else ++#endif + oops_id++; + + return 0; +@@ -582,6 +586,7 @@ static void print_oops_end_marker(void) + { + init_oops_id(); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); ++ pr_flush(1000, true); + } + + /* +diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile +index eee3dc9b6..59cb24e25 100644 +--- a/kernel/printk/Makefile ++++ b/kernel/printk/Makefile +@@ -1,5 +1,4 @@ + # SPDX-License-Identifier: GPL-2.0-only + obj-y = printk.o +-obj-$(CONFIG_PRINTK) += printk_safe.o + obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o + obj-$(CONFIG_PRINTK) += printk_ringbuffer.o +diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h +index b1c155328..e69de29bb 100644 +--- a/kernel/printk/internal.h ++++ b/kernel/printk/internal.h +@@ -1,37 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0-or-later */ +-/* +- * internal.h - printk internal definitions +- */ +-#include +- +-#ifdef CONFIG_PRINTK +- +-#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff +-#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 +-#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 +- +-#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 +- +-extern raw_spinlock_t logbuf_lock; +- +-__printf(4, 0) +-int vprintk_store(int facility, int level, +- const struct dev_printk_info *dev_info, +- const char *fmt, va_list args); +- +-__printf(1, 0) int vprintk_default(const char *fmt, va_list args); +-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args); +- +-void printk_safe_init(void); +-bool printk_percpu_data_ready(void); +- +-void defer_console_output(void); +- +-#else +- +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } +- +-static inline void printk_safe_init(void) { } +-static inline bool printk_percpu_data_ready(void) { return false; } +-#endif /* CONFIG_PRINTK */ +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 43f8f2573..54a4b01a4 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -44,6 +44,9 @@ + #include + #include + #include ++#include ++#include ++#include + #include + #include + #include +@@ -58,7 +61,6 @@ + #include "printk_ringbuffer.h" + #include 
"console_cmdline.h" + #include "braille.h" +-#include "internal.h" + + int console_printk[4] = { + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ +@@ -93,12 +95,6 @@ EXPORT_SYMBOL_GPL(console_drivers); + */ + int __read_mostly suppress_printk; + +-#ifdef CONFIG_LOCKDEP +-static struct lockdep_map console_lock_dep_map = { +- .name = "console_lock" +-}; +-#endif +- + enum devkmsg_log_bits { + __DEVKMSG_LOG_BIT_ON = 0, + __DEVKMSG_LOG_BIT_OFF, +@@ -225,19 +221,7 @@ static int nr_ext_console_drivers; + + static int __down_trylock_console_sem(unsigned long ip) + { +- int lock_failed; +- unsigned long flags; +- +- /* +- * Here and in __up_console_sem() we need to be in safe mode, +- * because spindump/WARN/etc from under console ->lock will +- * deadlock in printk()->down_trylock_console_sem() otherwise. +- */ +- printk_safe_enter_irqsave(flags); +- lock_failed = down_trylock(&console_sem); +- printk_safe_exit_irqrestore(flags); +- +- if (lock_failed) ++ if (down_trylock(&console_sem)) + return 1; + mutex_acquire(&console_lock_dep_map, 0, 1, ip); + return 0; +@@ -246,13 +230,9 @@ static int __down_trylock_console_sem(unsigned long ip) + + static void __up_console_sem(unsigned long ip) + { +- unsigned long flags; +- + mutex_release(&console_lock_dep_map, ip); + +- printk_safe_enter_irqsave(flags); + up(&console_sem); +- printk_safe_exit_irqrestore(flags); + } + #define up_console_sem() __up_console_sem(_RET_IP_) + +@@ -266,11 +246,6 @@ static void __up_console_sem(unsigned long ip) + */ + static int console_locked, console_suspended; + +-/* +- * If exclusive_console is non-NULL then only this console is to be printed to. +- */ +-static struct console *exclusive_console; +- + /* + * Array of consoles built from command line options (console=) + */ +@@ -355,61 +330,43 @@ enum log_flags { + LOG_CONT = 8, /* text is a fragment of a continuation line */ + }; + +-/* +- * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken +- * within the scheduler's rq lock. It must be released before calling +- * console_unlock() or anything else that might wake up a process. +- */ +-DEFINE_RAW_SPINLOCK(logbuf_lock); ++#ifdef CONFIG_PRINTK ++/* syslog_lock protects syslog_* variables and write access to clear_seq. */ ++static DEFINE_SPINLOCK(syslog_lock); + +-/* +- * Helper macros to lock/unlock logbuf_lock and switch between +- * printk-safe/unsafe modes. +- */ +-#define logbuf_lock_irq() \ +- do { \ +- printk_safe_enter_irq(); \ +- raw_spin_lock(&logbuf_lock); \ +- } while (0) +- +-#define logbuf_unlock_irq() \ +- do { \ +- raw_spin_unlock(&logbuf_lock); \ +- printk_safe_exit_irq(); \ +- } while (0) +- +-#define logbuf_lock_irqsave(flags) \ +- do { \ +- printk_safe_enter_irqsave(flags); \ +- raw_spin_lock(&logbuf_lock); \ +- } while (0) +- +-#define logbuf_unlock_irqrestore(flags) \ +- do { \ +- raw_spin_unlock(&logbuf_lock); \ +- printk_safe_exit_irqrestore(flags); \ +- } while (0) ++/* Set to enable sync mode. Once set, it is never cleared. */ ++static bool sync_mode; + +-#ifdef CONFIG_PRINTK + DECLARE_WAIT_QUEUE_HEAD(log_wait); ++/* All 3 protected by @syslog_lock. 
*/ + /* the next printk record to read by syslog(READ) or /proc/kmsg */ + static u64 syslog_seq; + static size_t syslog_partial; + static bool syslog_time; + +-/* the next printk record to write to the console */ +-static u64 console_seq; +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; ++struct latched_seq { ++ seqcount_latch_t latch; ++ u64 val[2]; ++}; + +-/* the next printk record to read after the last 'clear' command */ +-static u64 clear_seq; ++/* ++ * The next printk record to read after the last 'clear' command. There are ++ * two copies (updated with seqcount_latch) so that reads can locklessly ++ * access a valid value. Writers are synchronized by @syslog_lock. ++ */ ++static struct latched_seq clear_seq = { ++ .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), ++ .val[0] = 0, ++ .val[1] = 0, ++}; + + #ifdef CONFIG_PRINTK_CALLER + #define PREFIX_MAX 48 + #else + #define PREFIX_MAX 32 + #endif ++ ++/* the maximum size allowed to be reserved for a record */ + #define LOG_LINE_MAX (1024 - PREFIX_MAX) + + #define LOG_LEVEL(v) ((v) & 0x07) +@@ -447,11 +404,36 @@ static struct printk_ringbuffer *prb = &printk_rb_static; + */ + static bool __printk_percpu_data_ready __read_mostly; + +-bool printk_percpu_data_ready(void) ++static bool printk_percpu_data_ready(void) + { + return __printk_percpu_data_ready; + } + ++/* Must be called under syslog_lock. */ ++static void latched_seq_write(struct latched_seq *ls, u64 val) ++{ ++ raw_write_seqcount_latch(&ls->latch); ++ ls->val[0] = val; ++ raw_write_seqcount_latch(&ls->latch); ++ ls->val[1] = val; ++} ++ ++/* Can be called from any context. */ ++static u64 latched_seq_read_nolock(struct latched_seq *ls) ++{ ++ unsigned int seq; ++ unsigned int idx; ++ u64 val; ++ ++ do { ++ seq = raw_read_seqcount_latch(&ls->latch); ++ idx = seq & 0x1; ++ val = ls->val[idx]; ++ } while (read_seqcount_latch_retry(&ls->latch, seq)); ++ ++ return val; ++} ++ + /* Return log buffer address */ + char *log_buf_addr_get(void) + { +@@ -493,52 +475,6 @@ static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) + *trunc_msg_len = 0; + } + +-/* insert record into the buffer, discard old ones, update heads */ +-static int log_store(u32 caller_id, int facility, int level, +- enum log_flags flags, u64 ts_nsec, +- const struct dev_printk_info *dev_info, +- const char *text, u16 text_len) +-{ +- struct prb_reserved_entry e; +- struct printk_record r; +- u16 trunc_msg_len = 0; +- +- prb_rec_init_wr(&r, text_len); +- +- if (!prb_reserve(&e, prb, &r)) { +- /* truncate the message if it is too long for empty buffer */ +- truncate_msg(&text_len, &trunc_msg_len); +- prb_rec_init_wr(&r, text_len + trunc_msg_len); +- /* survive when the log buffer is too small for trunc_msg */ +- if (!prb_reserve(&e, prb, &r)) +- return 0; +- } +- +- /* fill message */ +- memcpy(&r.text_buf[0], text, text_len); +- if (trunc_msg_len) +- memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); +- r.info->text_len = text_len + trunc_msg_len; +- r.info->facility = facility; +- r.info->level = level & 7; +- r.info->flags = flags & 0x1f; +- if (ts_nsec > 0) +- r.info->ts_nsec = ts_nsec; +- else +- r.info->ts_nsec = local_clock(); +- r.info->caller_id = caller_id; +- if (dev_info) +- memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); +- +- /* A message without a trailing newline can be continued. 
*/ +- if (!(flags & LOG_NEWLINE)) +- prb_commit(&e); +- else +- prb_final_commit(&e); +- +- return (text_len + trunc_msg_len); +-} +- + int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); + + static int syslog_action_restricted(int type) +@@ -667,7 +603,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, + + /* /dev/kmsg - userspace message inject/listen interface */ + struct devkmsg_user { +- u64 seq; ++ atomic64_t seq; + struct ratelimit_state rs; + struct mutex lock; + char buf[CONSOLE_EXT_LOG_MAX]; +@@ -768,27 +704,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + if (ret) + return ret; + +- logbuf_lock_irq(); +- if (!prb_read_valid(prb, user->seq, r)) { ++ if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; +- logbuf_unlock_irq(); + goto out; + } + +- logbuf_unlock_irq(); + ret = wait_event_interruptible(log_wait, +- prb_read_valid(prb, user->seq, r)); ++ prb_read_valid(prb, atomic64_read(&user->seq), r)); + if (ret) + goto out; +- logbuf_lock_irq(); + } + +- if (r->info->seq != user->seq) { ++ if (r->info->seq != atomic64_read(&user->seq)) { + /* our last seen message is gone, return error and reset */ +- user->seq = r->info->seq; ++ atomic64_set(&user->seq, r->info->seq); + ret = -EPIPE; +- logbuf_unlock_irq(); + goto out; + } + +@@ -797,8 +728,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, + &r->text_buf[0], r->info->text_len, + &r->info->dev_info); + +- user->seq = r->info->seq + 1; +- logbuf_unlock_irq(); ++ atomic64_set(&user->seq, r->info->seq + 1); + + if (len > count) { + ret = -EINVAL; +@@ -833,11 +763,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + if (offset) + return -ESPIPE; + +- logbuf_lock_irq(); + switch (whence) { + case SEEK_SET: + /* the first record */ +- user->seq = prb_first_valid_seq(prb); ++ atomic64_set(&user->seq, prb_first_valid_seq(prb)); + break; + case SEEK_DATA: + /* +@@ -845,16 +774,15 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) + * like issued by 'dmesg -c'. Reading /dev/kmsg itself + * changes no global state, and does not clear anything. 
+ */ +- user->seq = clear_seq; ++ atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); + break; + case SEEK_END: + /* after the last record */ +- user->seq = prb_next_seq(prb); ++ atomic64_set(&user->seq, prb_next_seq(prb)); + break; + default: + ret = -EINVAL; + } +- logbuf_unlock_irq(); + return ret; + } + +@@ -869,15 +797,13 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) + + poll_wait(file, &log_wait, wait); + +- logbuf_lock_irq(); +- if (prb_read_valid_info(prb, user->seq, &info, NULL)) { ++ if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { + /* return error when data has vanished underneath us */ +- if (info.seq != user->seq) ++ if (info.seq != atomic64_read(&user->seq)) + ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; + else + ret = EPOLLIN|EPOLLRDNORM; + } +- logbuf_unlock_irq(); + + return ret; + } +@@ -910,9 +836,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) + prb_rec_init_rd(&user->record, &user->info, + &user->text_buf[0], sizeof(user->text_buf)); + +- logbuf_lock_irq(); +- user->seq = prb_first_valid_seq(prb); +- logbuf_unlock_irq(); ++ atomic64_set(&user->seq, prb_first_valid_seq(prb)); + + file->private_data = user; + return 0; +@@ -1004,6 +928,9 @@ void log_buf_vmcoreinfo_setup(void) + + VMCOREINFO_SIZE(atomic_long_t); + VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); ++ ++ VMCOREINFO_STRUCT_SIZE(latched_seq); ++ VMCOREINFO_OFFSET(latched_seq, val); + } + #endif + +@@ -1075,9 +1002,6 @@ static inline void log_buf_add_cpu(void) {} + + static void __init set_percpu_data_ready(void) + { +- printk_safe_init(); +- /* Make sure we set this flag only after printk_safe() init is done */ +- barrier(); + __printk_percpu_data_ready = true; + } + +@@ -1117,7 +1041,6 @@ void __init setup_log_buf(int early) + struct printk_record r; + size_t new_descs_size; + size_t new_infos_size; +- unsigned long flags; + char *new_log_buf; + unsigned int free; + u64 seq; +@@ -1175,8 +1098,6 @@ void __init setup_log_buf(int early) + new_descs, ilog2(new_descs_count), + new_infos); + +- logbuf_lock_irqsave(flags); +- + log_buf_len = new_log_buf_len; + log_buf = new_log_buf; + new_log_buf_len = 0; +@@ -1192,8 +1113,6 @@ void __init setup_log_buf(int early) + */ + prb = &printk_rb_dynamic; + +- logbuf_unlock_irqrestore(flags); +- + if (seq != prb_next_seq(&printk_rb_static)) { + pr_err("dropped %llu messages\n", + prb_next_seq(&printk_rb_static) - seq); +@@ -1470,6 +1389,50 @@ static size_t get_record_print_text_size(struct printk_info *info, + return ((prefix_len * line_count) + info->text_len + 1); + } + ++/* ++ * Beginning with @start_seq, find the first record where it and all following ++ * records up to (but not including) @max_seq fit into @size. ++ * ++ * @max_seq is simply an upper bound and does not need to exist. If the caller ++ * does not require an upper bound, -1 can be used for @max_seq. ++ */ ++static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, ++ bool syslog, bool time) ++{ ++ struct printk_info info; ++ unsigned int line_count; ++ size_t len = 0; ++ u64 seq; ++ ++ /* Determine the size of the records up to @max_seq. */ ++ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { ++ if (info.seq >= max_seq) ++ break; ++ len += get_record_print_text_size(&info, line_count, syslog, time); ++ } ++ ++ /* ++ * Adjust the upper bound for the next loop to avoid subtracting ++ * lengths that were never added. 
++ */ ++ if (seq < max_seq) ++ max_seq = seq; ++ ++ /* ++ * Move first record forward until length fits into the buffer. Ignore ++ * newest messages that were not counted in the above cycle. Messages ++ * might appear and get lost in the meantime. This is a best effort ++ * that prevents an infinite loop that could occur with a retry. ++ */ ++ prb_for_each_info(start_seq, prb, seq, &info, &line_count) { ++ if (len <= size || info.seq >= max_seq) ++ break; ++ len -= get_record_print_text_size(&info, line_count, syslog, time); ++ } ++ ++ return seq; ++} ++ + static int syslog_print(char __user *buf, int size) + { + struct printk_info info; +@@ -1477,19 +1440,19 @@ static int syslog_print(char __user *buf, int size) + char *text; + int len = 0; + +- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + if (!text) + return -ENOMEM; + +- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); ++ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + + while (size > 0) { + size_t n; + size_t skip; + +- logbuf_lock_irq(); ++ spin_lock_irq(&syslog_lock); + if (!prb_read_valid(prb, syslog_seq, &r)) { +- logbuf_unlock_irq(); ++ spin_unlock_irq(&syslog_lock); + break; + } + if (r.info->seq != syslog_seq) { +@@ -1518,7 +1481,7 @@ static int syslog_print(char __user *buf, int size) + syslog_partial += n; + } else + n = 0; +- logbuf_unlock_irq(); ++ spin_unlock_irq(&syslog_lock); + + if (!n) + break; +@@ -1541,34 +1504,25 @@ static int syslog_print(char __user *buf, int size) + static int syslog_print_all(char __user *buf, int size, bool clear) + { + struct printk_info info; +- unsigned int line_count; + struct printk_record r; + char *text; + int len = 0; + u64 seq; + bool time; + +- text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); + if (!text) + return -ENOMEM; + + time = printk_time; +- logbuf_lock_irq(); + /* + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. 
+ */ +- prb_for_each_info(clear_seq, prb, seq, &info, &line_count) +- len += get_record_print_text_size(&info, line_count, true, time); +- +- /* move first record forward until length fits into the buffer */ +- prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { +- if (len <= size) +- break; +- len -= get_record_print_text_size(&info, line_count, true, time); +- } ++ seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, ++ size, true, time); + +- prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); ++ prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); + + len = 0; + prb_for_each_record(seq, prb, seq, &r) { +@@ -1581,20 +1535,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + break; + } + +- logbuf_unlock_irq(); + if (copy_to_user(buf + len, text, textlen)) + len = -EFAULT; + else + len += textlen; +- logbuf_lock_irq(); + + if (len < 0) + break; + } + +- if (clear) +- clear_seq = seq; +- logbuf_unlock_irq(); ++ if (clear) { ++ spin_lock_irq(&syslog_lock); ++ latched_seq_write(&clear_seq, seq); ++ spin_unlock_irq(&syslog_lock); ++ } + + kfree(text); + return len; +@@ -1602,9 +1556,21 @@ static int syslog_print_all(char __user *buf, int size, bool clear) + + static void syslog_clear(void) + { +- logbuf_lock_irq(); +- clear_seq = prb_next_seq(prb); +- logbuf_unlock_irq(); ++ spin_lock_irq(&syslog_lock); ++ latched_seq_write(&clear_seq, prb_next_seq(prb)); ++ spin_unlock_irq(&syslog_lock); ++} ++ ++/* Return a consistent copy of @syslog_seq. */ ++static u64 read_syslog_seq_irq(void) ++{ ++ u64 seq; ++ ++ spin_lock_irq(&syslog_lock); ++ seq = syslog_seq; ++ spin_unlock_irq(&syslog_lock); ++ ++ return seq; + } + + int do_syslog(int type, char __user *buf, int len, int source) +@@ -1630,8 +1596,9 @@ int do_syslog(int type, char __user *buf, int len, int source) + return 0; + if (!access_ok(buf, len)) + return -EFAULT; ++ + error = wait_event_interruptible(log_wait, +- prb_read_valid(prb, syslog_seq, NULL)); ++ prb_read_valid(prb, read_syslog_seq_irq(), NULL)); + if (error) + return error; + error = syslog_print(buf, len); +@@ -1679,10 +1646,10 @@ int do_syslog(int type, char __user *buf, int len, int source) + break; + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: +- logbuf_lock_irq(); ++ spin_lock_irq(&syslog_lock); + if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { + /* No unread messages. */ +- logbuf_unlock_irq(); ++ spin_unlock_irq(&syslog_lock); + return 0; + } + if (info.seq != syslog_seq) { +@@ -1710,7 +1677,7 @@ int do_syslog(int type, char __user *buf, int len, int source) + } + error -= syslog_partial; + } +- logbuf_unlock_irq(); ++ spin_unlock_irq(&syslog_lock); + break; + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: +@@ -1740,9 +1707,7 @@ static struct lockdep_map console_owner_dep_map = { + }; + #endif + +-static DEFINE_RAW_SPINLOCK(console_owner_lock); +-static struct task_struct *console_owner; +-static bool console_waiter; ++int printk_delay_msec __read_mostly; + + #if defined(CONFIG_X86) || defined(CONFIG_ARM64_PSEUDO_NMI) + void zap_locks(void) +@@ -1763,187 +1728,171 @@ void zap_locks(void) + } + #endif + +-/** +- * console_lock_spinning_enable - mark beginning of code where another +- * thread might safely busy wait +- * +- * This basically converts console_lock into a spinlock. This marks +- * the section where the console_lock owner can not sleep, because +- * there may be a waiter spinning (like a spinlock). 
Also it must be +- * ready to hand over the lock at the end of the section. +- */ +-static void console_lock_spinning_enable(void) ++static inline void printk_delay(int level) + { +- raw_spin_lock(&console_owner_lock); +- console_owner = current; +- raw_spin_unlock(&console_owner_lock); ++ boot_delay_msec(level); ++ ++ if (unlikely(printk_delay_msec)) { ++ int m = printk_delay_msec; + +- /* The waiter may spin on us after setting console_owner */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); ++ while (m--) { ++ mdelay(1); ++ touch_nmi_watchdog(); ++ } ++ } + } + +-/** +- * console_lock_spinning_disable_and_check - mark end of code where another +- * thread was able to busy wait and check if there is a waiter +- * +- * This is called at the end of the section where spinning is allowed. +- * It has two functions. First, it is a signal that it is no longer +- * safe to start busy waiting for the lock. Second, it checks if +- * there is a busy waiter and passes the lock rights to her. +- * +- * Important: Callers lose the lock if there was a busy waiter. +- * They must not touch items synchronized by console_lock +- * in this case. +- * +- * Return: 1 if the lock rights were passed, 0 otherwise. +- */ +-static int console_lock_spinning_disable_and_check(void) ++static bool kernel_sync_mode(void) + { +- int waiter; ++ return (oops_in_progress || sync_mode); ++} + +- raw_spin_lock(&console_owner_lock); +- waiter = READ_ONCE(console_waiter); +- console_owner = NULL; +- raw_spin_unlock(&console_owner_lock); ++static bool console_can_sync(struct console *con) ++{ ++ if (!(con->flags & CON_ENABLED)) ++ return false; ++ if (con->write_atomic && kernel_sync_mode()) ++ return true; ++ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) ++ return true; ++ if (con->write && (con->flags & CON_BOOT) && !con->thread) ++ return true; ++ return false; ++} + +- if (!waiter) { +- spin_release(&console_owner_dep_map, _THIS_IP_); +- return 0; +- } ++static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) ++{ ++ if (!(con->flags & CON_ENABLED)) ++ return false; ++ if (con->write_atomic && kernel_sync_mode()) ++ con->write_atomic(con, text, text_len); ++ else if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) ++ con->write_atomic(con, text, text_len); ++ else if (con->write && (con->flags & CON_BOOT) && !con->thread) ++ con->write(con, text, text_len); ++ else ++ return false; + +- /* The waiter is now free to continue */ +- WRITE_ONCE(console_waiter, false); ++ return true; ++} + +- spin_release(&console_owner_dep_map, _THIS_IP_); ++static bool have_atomic_console(void) ++{ ++ struct console *con; + +- /* +- * Hand off console_lock to waiter. The waiter will perform +- * the up(). After this, the waiter is the console_lock owner. +- */ +- mutex_release(&console_lock_dep_map, _THIS_IP_); +- return 1; ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ if (con->write_atomic) ++ return true; ++ } ++ return false; + } + +-/** +- * console_trylock_spinning - try to get console_lock by busy waiting +- * +- * This allows to busy wait for the console_lock when the current +- * owner is running in specially marked sections. It means that +- * the current owner is running and cannot reschedule until it +- * is ready to lose the lock. 
+- * +- * Return: 1 if we got the lock, 0 othrewise +- */ +-static int console_trylock_spinning(void) ++static bool print_sync(struct console *con, u64 *seq) + { +- struct task_struct *owner = NULL; +- bool waiter; +- bool spin = false; +- unsigned long flags; ++ struct printk_info info; ++ struct printk_record r; ++ size_t text_len; + +- if (console_trylock()) +- return 1; ++ prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf)); + +- printk_safe_enter_irqsave(flags); ++ if (!prb_read_valid(prb, *seq, &r)) ++ return false; + +- raw_spin_lock(&console_owner_lock); +- owner = READ_ONCE(console_owner); +- waiter = READ_ONCE(console_waiter); +- if (!waiter && owner && owner != current) { +- WRITE_ONCE(console_waiter, true); +- spin = true; +- } +- raw_spin_unlock(&console_owner_lock); ++ text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + +- /* +- * If there is an active printk() writing to the +- * consoles, instead of having it write our data too, +- * see if we can offload that load from the active +- * printer, and do some printing ourselves. +- * Go into a spin only if there isn't already a waiter +- * spinning, and there is an active printer, and +- * that active printer isn't us (recursive printk?). +- */ +- if (!spin) { +- printk_safe_exit_irqrestore(flags); +- return 0; +- } ++ if (!call_sync_console_driver(con, &con->sync_buf[0], text_len)) ++ return false; + +- /* We spin waiting for the owner to release us */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +- /* Owner will clear console_waiter on hand off */ +- while (READ_ONCE(console_waiter)) +- cpu_relax(); +- spin_release(&console_owner_dep_map, _THIS_IP_); ++ *seq = r.info->seq; + +- printk_safe_exit_irqrestore(flags); +- /* +- * The owner passed the console lock to us. +- * Since we did not spin on console lock, annotate +- * this as a trylock. Otherwise lockdep will +- * complain. +- */ +- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); ++ touch_softlockup_watchdog_sync(); ++ clocksource_touch_watchdog(); ++ rcu_cpu_stall_reset(); ++ touch_nmi_watchdog(); + +- return 1; ++ if (text_len) ++ printk_delay(r.info->level); ++ ++ return true; + } + +-/* +- * Call the console drivers, asking them to write out +- * log_buf[start] to log_buf[end - 1]. +- * The console_lock must be held. 
+- */ +-static void call_console_drivers(const char *ext_text, size_t ext_len, +- const char *text, size_t len) ++static void print_sync_until(struct console *con, u64 seq) + { +- static char dropped_text[64]; +- size_t dropped_len = 0; +- struct console *con; ++ unsigned int flags; ++ u64 printk_seq; ++ ++ console_atomic_lock(&flags); ++ for (;;) { ++ printk_seq = atomic64_read(&con->printk_seq); ++ if (printk_seq >= seq) ++ break; ++ if (!print_sync(con, &printk_seq)) ++ break; ++ atomic64_set(&con->printk_seq, printk_seq + 1); ++ } ++ console_atomic_unlock(flags); ++} + +- trace_console_rcuidle(text, len); ++#ifdef CONFIG_PRINTK_NMI ++#define NUM_RECURSION_CTX 2 ++#else ++#define NUM_RECURSION_CTX 1 ++#endif + +- if (!console_drivers) +- return; ++struct printk_recursion { ++ char count[NUM_RECURSION_CTX]; ++}; + +- if (console_dropped) { +- dropped_len = snprintf(dropped_text, sizeof(dropped_text), +- "** %lu printk messages dropped **\n", +- console_dropped); +- console_dropped = 0; +- } ++static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion); ++static char printk_recursion_count[NUM_RECURSION_CTX]; + +- for_each_console(con) { +- if (exclusive_console && con != exclusive_console) +- continue; +- if (!(con->flags & CON_ENABLED)) +- continue; +- if (!con->write) +- continue; +- if (!cpu_online(smp_processor_id()) && +- !(con->flags & CON_ANYTIME)) +- continue; +- if (con->flags & CON_EXTENDED) +- con->write(con, ext_text, ext_len); +- else { +- if (dropped_len) +- con->write(con, dropped_text, dropped_len); +- con->write(con, text, len); +- } ++static char *printk_recursion_counter(void) ++{ ++ struct printk_recursion *rec; ++ char *count; ++ ++ if (!printk_percpu_data_ready()) { ++ count = &printk_recursion_count[0]; ++ } else { ++ rec = this_cpu_ptr(&percpu_printk_recursion); ++ ++ count = &rec->count[0]; + } ++ ++#ifdef CONFIG_PRINTK_NMI ++ if (in_nmi()) ++ count++; ++#endif ++ ++ return count; + } + +-int printk_delay_msec __read_mostly; + +-static inline void printk_delay(void) ++static bool printk_enter_irqsave(unsigned long *flags) + { +- if (unlikely(printk_delay_msec)) { +- int m = printk_delay_msec; ++ char *count; + +- while (m--) { +- mdelay(1); +- touch_nmi_watchdog(); +- } ++ local_irq_save(*flags); ++ count = printk_recursion_counter(); ++ /* Only 1 level of recursion allowed. */ ++ if (*count > 1) { ++ local_irq_restore(*flags); ++ return false; + } ++ (*count)++; ++ ++ return true; ++} ++ ++static void printk_exit_irqrestore(unsigned long flags) ++{ ++ char *count; ++ ++ count = printk_recursion_counter(); ++ (*count)--; ++ local_irq_restore(flags); + } + + static inline u32 printk_caller_id(void) +@@ -1952,144 +1901,248 @@ static inline u32 printk_caller_id(void) + 0x80000000 + raw_smp_processor_id(); + } + +-static size_t log_output(int facility, int level, enum log_flags lflags, +- const struct dev_printk_info *dev_info, +- char *text, size_t text_len) ++/** ++ * parse_prefix - Parse level and control flags. ++ * ++ * @text: The terminated text message. ++ * @level: A pointer to the current level value, will be updated. ++ * @lflags: A pointer to the current log flags, will be updated. ++ * ++ * @level may be NULL if the caller is not interested in the parsed value. ++ * Otherwise the variable pointed to by @level must be set to ++ * LOGLEVEL_DEFAULT in order to be updated with the parsed value. ++ * ++ * @lflags may be NULL if the caller is not interested in the parsed value. 
++ * Otherwise the variable pointed to by @lflags will be OR'd with the parsed ++ * value. ++ * ++ * Return: The length of the parsed level and control flags. ++ */ ++static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) + { +- const u32 caller_id = printk_caller_id(); ++ u16 prefix_len = 0; ++ int kern_level; + +- if (lflags & LOG_CONT) { +- struct prb_reserved_entry e; +- struct printk_record r; ++ while (*text) { ++ kern_level = printk_get_level(text); ++ if (!kern_level) ++ break; + +- prb_rec_init_wr(&r, text_len); +- if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { +- memcpy(&r.text_buf[r.info->text_len], text, text_len); +- r.info->text_len += text_len; +- if (lflags & LOG_NEWLINE) { +- r.info->flags |= LOG_NEWLINE; +- prb_final_commit(&e); +- } else { +- prb_commit(&e); +- } +- return text_len; ++ switch (kern_level) { ++ case '0' ... '7': ++ if (level && *level == LOGLEVEL_DEFAULT) ++ *level = kern_level - '0'; ++ break; ++ case 'c': /* KERN_CONT */ ++ if (lflags) ++ *lflags |= LOG_CONT; + } ++ ++ prefix_len += 2; ++ text += 2; + } + +- /* Store it in the record log */ +- return log_store(caller_id, facility, level, lflags, 0, +- dev_info, text, text_len); ++ return prefix_len; + } + +-/* Must be called under logbuf_lock. */ +-int vprintk_store(int facility, int level, +- const struct dev_printk_info *dev_info, +- const char *fmt, va_list args) ++static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags, ++ const char *fmt, va_list args) + { +- static char textbuf[LOG_LINE_MAX]; +- char *text = textbuf; +- size_t text_len; +- enum log_flags lflags = 0; ++ u16 text_len; + +- /* +- * The printf needs to come first; we need the syslog +- * prefix which might be passed-in as a parameter. +- */ +- text_len = vscnprintf(text, sizeof(textbuf), fmt, args); ++ text_len = vscnprintf(text, size, fmt, args); + +- /* mark and strip a trailing newline */ +- if (text_len && text[text_len-1] == '\n') { ++ /* Mark and strip a trailing newline. */ ++ if (text_len && text[text_len - 1] == '\n') { + text_len--; +- lflags |= LOG_NEWLINE; ++ *lflags |= LOG_NEWLINE; + } + +- /* strip kernel syslog prefix and extract log level or control flags */ ++ /* Strip log level and control flags. */ + if (facility == 0) { +- int kern_level; +- +- while ((kern_level = printk_get_level(text)) != 0) { +- switch (kern_level) { +- case '0' ... 
'7': +- if (level == LOGLEVEL_DEFAULT) +- level = kern_level - '0'; +- break; +- case 'c': /* KERN_CONT */ +- lflags |= LOG_CONT; +- } ++ u16 prefix_len; + +- text_len -= 2; +- text += 2; ++ prefix_len = parse_prefix(text, NULL, NULL); ++ if (prefix_len) { ++ text_len -= prefix_len; ++ memmove(text, text + prefix_len, text_len); + } + } + +- if (level == LOGLEVEL_DEFAULT) +- level = default_message_loglevel; +- +- if (dev_info) +- lflags |= LOG_NEWLINE; +- +- return log_output(facility, level, lflags, dev_info, text, text_len); ++ return text_len; + } + +-asmlinkage int vprintk_emit(int facility, int level, +- const struct dev_printk_info *dev_info, +- const char *fmt, va_list args) ++__printf(4, 0) ++static int vprintk_store(int facility, int level, ++ const struct dev_printk_info *dev_info, ++ const char *fmt, va_list args) + { +- int printed_len; +- bool in_sched = false; +- unsigned long flags; ++ const u32 caller_id = printk_caller_id(); ++ struct prb_reserved_entry e; ++ enum log_flags lflags = 0; ++ bool final_commit = false; ++ struct printk_record r; ++ unsigned long irqflags; ++ u16 trunc_msg_len = 0; ++ char prefix_buf[8]; ++ u16 reserve_size; ++ va_list args2; ++ u16 text_len; ++ int ret = 0; ++ u64 ts_nsec; ++ u64 seq; + +- /* Suppress unimportant messages after panic happens */ +- if (unlikely(suppress_printk)) ++ /* ++ * Since the duration of printk() can vary depending on the message ++ * and state of the ringbuffer, grab the timestamp now so that it is ++ * close to the call of printk(). This provides a more deterministic ++ * timestamp with respect to the caller. ++ */ ++ ts_nsec = local_clock(); ++ ++ if (!printk_enter_irqsave(&irqflags)) + return 0; + +- if (level == LOGLEVEL_SCHED) { +- level = LOGLEVEL_DEFAULT; +- in_sched = true; ++ /* ++ * The sprintf needs to come first since the syslog prefix might be ++ * passed in as a parameter. An extra byte must be reserved so that ++ * later the vscnprintf() into the reserved buffer has room for the ++ * terminating '\0', which is not counted by vsnprintf(). ++ */ ++ va_copy(args2, args); ++ reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; ++ va_end(args2); ++ ++ if (reserve_size > LOG_LINE_MAX) ++ reserve_size = LOG_LINE_MAX; ++ ++ /* Extract log level or control flags. */ ++ if (facility == 0) ++ parse_prefix(&prefix_buf[0], &level, &lflags); ++ ++ if (level == LOGLEVEL_DEFAULT) ++ level = default_message_loglevel; ++ ++ if (dev_info) ++ lflags |= LOG_NEWLINE; ++ ++ if (lflags & LOG_CONT) { ++ prb_rec_init_wr(&r, reserve_size); ++ if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { ++ seq = r.info->seq; ++ text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, ++ facility, &lflags, fmt, args); ++ r.info->text_len += text_len; ++ ++ if (lflags & LOG_NEWLINE) { ++ r.info->flags |= LOG_NEWLINE; ++ prb_final_commit(&e); ++ final_commit = true; ++ } else { ++ prb_commit(&e); ++ } ++ ++ ret = text_len; ++ goto out; ++ } + } + +- boot_delay_msec(level); +- printk_delay(); ++ /* ++ * Explicitly initialize the record before every prb_reserve() call. ++ * prb_reserve_in_last() and prb_reserve() purposely invalidate the ++ * structure when they fail. 
++ */ ++ prb_rec_init_wr(&r, reserve_size); ++ if (!prb_reserve(&e, prb, &r)) { ++ /* truncate the message if it is too long for empty buffer */ ++ truncate_msg(&reserve_size, &trunc_msg_len); + +- /* This stops the holder of console_sem just where we want him */ +- logbuf_lock_irqsave(flags); +- printed_len = vprintk_store(facility, level, dev_info, fmt, args); +- logbuf_unlock_irqrestore(flags); ++ prb_rec_init_wr(&r, reserve_size + trunc_msg_len); ++ if (!prb_reserve(&e, prb, &r)) ++ goto out; ++ } + +- /* If called from the scheduler, we can not call up(). */ +- if (!in_sched) { +- /* +- * Disable preemption to avoid being preempted while holding +- * console_sem which would prevent anyone from printing to +- * console +- */ +- preempt_disable(); +- /* +- * Try to acquire and then immediately release the console +- * semaphore. The release will print out buffers and wake up +- * /dev/kmsg and syslog() users. +- */ +- if (console_trylock_spinning()) +- console_unlock(); +- preempt_enable(); ++ seq = r.info->seq; ++ ++ /* fill message */ ++ text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args); ++ if (trunc_msg_len) ++ memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); ++ r.info->text_len = text_len + trunc_msg_len; ++ r.info->facility = facility; ++ r.info->level = level & 7; ++ r.info->flags = lflags & 0x1f; ++ r.info->ts_nsec = ts_nsec; ++ r.info->caller_id = caller_id; ++ if (dev_info) ++ memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); ++ ++ /* A message without a trailing newline can be continued. */ ++ if (!(lflags & LOG_NEWLINE)) { ++ prb_commit(&e); ++ } else { ++ prb_final_commit(&e); ++ final_commit = true; + } + ++ ret = text_len + trunc_msg_len; ++out: ++ /* only the kernel may perform synchronous printing */ ++ if (facility == 0 && final_commit) { ++ struct console *con; ++ ++ for_each_console(con) { ++ if (console_can_sync(con)) ++ print_sync_until(con, seq + 1); ++ } ++ } ++ ++ printk_exit_irqrestore(irqflags); ++ return ret; ++} ++ ++asmlinkage int vprintk_emit(int facility, int level, ++ const struct dev_printk_info *dev_info, ++ const char *fmt, va_list args) ++{ ++ int printed_len; ++ ++ /* Suppress unimportant messages after panic happens */ ++ if (unlikely(suppress_printk)) ++ return 0; ++ ++ if (level == LOGLEVEL_SCHED) ++ level = LOGLEVEL_DEFAULT; ++ ++ printed_len = vprintk_store(facility, level, dev_info, fmt, args); ++ + wake_up_klogd(); + return printed_len; + } + EXPORT_SYMBOL(vprintk_emit); + +-asmlinkage int vprintk(const char *fmt, va_list args) ++__printf(1, 0) ++static int vprintk_default(const char *fmt, va_list args) + { +- return vprintk_func(fmt, args); ++ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); + } +-EXPORT_SYMBOL(vprintk); + +-int vprintk_default(const char *fmt, va_list args) ++__printf(1, 0) ++static int vprintk_func(const char *fmt, va_list args) + { +- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); ++#ifdef CONFIG_KGDB_KDB ++ /* Allow to pass printk() to kdb but avoid a recursion. */ ++ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) ++ return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); ++#endif ++ return vprintk_default(fmt, args); ++} ++ ++asmlinkage int vprintk(const char *fmt, va_list args) ++{ ++ return vprintk_func(fmt, args); + } +-EXPORT_SYMBOL_GPL(vprintk_default); ++EXPORT_SYMBOL(vprintk); + + /** + * printk - print a kernel message +@@ -2125,38 +2178,158 @@ asmlinkage __visible int printk(const char *fmt, ...) 
+ } + EXPORT_SYMBOL(printk); + +-#else /* CONFIG_PRINTK */ ++static int printk_kthread_func(void *data) ++{ ++ struct console *con = data; ++ unsigned long dropped = 0; ++ char *dropped_text = NULL; ++ struct printk_info info; ++ struct printk_record r; ++ char *ext_text = NULL; ++ size_t dropped_len; ++ int ret = -ENOMEM; ++ char *text = NULL; ++ char *write_text; ++ u64 printk_seq; ++ size_t len; ++ int error; ++ u64 seq; ++ ++ if (con->flags & CON_EXTENDED) { ++ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); ++ if (!ext_text) ++ goto out; ++ } ++ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ dropped_text = kmalloc(64, GFP_KERNEL); ++ if (!text || !dropped_text) ++ goto out; + +-#define LOG_LINE_MAX 0 +-#define PREFIX_MAX 0 +-#define printk_time false ++ if (con->flags & CON_EXTENDED) ++ write_text = ext_text; ++ else ++ write_text = text; + +-#define prb_read_valid(rb, seq, r) false +-#define prb_first_valid_seq(rb) 0 ++ seq = atomic64_read(&con->printk_seq); + +-static u64 syslog_seq; +-static u64 console_seq; +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; ++ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); ++ ++ for (;;) { ++ error = wait_event_interruptible(log_wait, ++ prb_read_valid(prb, seq, &r) || kthread_should_stop()); ++ ++ if (kthread_should_stop()) ++ break; ++ ++ if (error) ++ continue; ++ ++ if (seq != r.info->seq) { ++ dropped += r.info->seq - seq; ++ seq = r.info->seq; ++ } ++ ++ seq++; ++ ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ ++ if (suppress_message_printing(r.info->level)) ++ continue; ++ ++ if (con->flags & CON_EXTENDED) { ++ len = info_print_ext_header(ext_text, ++ CONSOLE_EXT_LOG_MAX, ++ r.info); ++ len += msg_print_ext_body(ext_text + len, ++ CONSOLE_EXT_LOG_MAX - len, ++ &r.text_buf[0], r.info->text_len, ++ &r.info->dev_info); ++ } else { ++ len = record_print_text(&r, ++ console_msg_format & MSG_FORMAT_SYSLOG, ++ printk_time); ++ } ++ ++ printk_seq = atomic64_read(&con->printk_seq); + +-static size_t record_print_text(const struct printk_record *r, +- bool syslog, bool time) ++ console_lock(); ++ console_may_schedule = 0; ++ ++ if (kernel_sync_mode() && con->write_atomic) { ++ console_unlock(); ++ break; ++ } ++ ++ if (!(con->flags & CON_EXTENDED) && dropped) { ++ dropped_len = snprintf(dropped_text, 64, ++ "** %lu printk messages dropped **\n", ++ dropped); ++ dropped = 0; ++ ++ con->write(con, dropped_text, dropped_len); ++ printk_delay(r.info->level); ++ } ++ ++ con->write(con, write_text, len); ++ if (len) ++ printk_delay(r.info->level); ++ ++ atomic64_cmpxchg_relaxed(&con->printk_seq, printk_seq, seq); ++ ++ console_unlock(); ++ } ++out: ++ kfree(dropped_text); ++ kfree(text); ++ kfree(ext_text); ++ pr_info("%sconsole [%s%d]: printing thread stopped\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return ret; ++} ++ ++/* Must be called within console_lock(). */ ++static void start_printk_kthread(struct console *con) + { +- return 0; ++ con->thread = kthread_run(printk_kthread_func, con, ++ "pr/%s%d", con->name, con->index); ++ if (IS_ERR(con->thread)) { ++ pr_err("%sconsole [%s%d]: unable to start printing thread\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return; ++ } ++ pr_info("%sconsole [%s%d]: printing thread started\n", ++ (con->flags & CON_BOOT) ? 
"boot" : "", ++ con->name, con->index); + } +-static ssize_t info_print_ext_header(char *buf, size_t size, +- struct printk_info *info) ++ ++/* protected by console_lock */ ++static bool kthreads_started; ++ ++/* Must be called within console_lock(). */ ++static void console_try_thread(struct console *con) + { +- return 0; ++ if (kthreads_started) { ++ start_printk_kthread(con); ++ return; ++ } ++ ++ /* ++ * The printing threads have not been started yet. If this console ++ * can print synchronously, print all unprinted messages. ++ */ ++ if (console_can_sync(con)) ++ print_sync_until(con, prb_next_seq(prb)); + } +-static ssize_t msg_print_ext_body(char *buf, size_t size, +- char *text, size_t text_len, +- struct dev_printk_info *dev_info) { return 0; } +-static void console_lock_spinning_enable(void) { } +-static int console_lock_spinning_disable_and_check(void) { return 0; } +-static void call_console_drivers(const char *ext_text, size_t ext_len, +- const char *text, size_t len) {} +-static bool suppress_message_printing(int level) { return false; } ++ ++#else /* CONFIG_PRINTK */ ++ ++#define prb_first_valid_seq(rb) 0 ++#define prb_next_seq(rb) 0 ++ ++#define console_try_thread(con) + + #endif /* CONFIG_PRINTK */ + +@@ -2401,34 +2574,6 @@ int is_console_locked(void) + } + EXPORT_SYMBOL(is_console_locked); + +-/* +- * Check if we have any console that is capable of printing while cpu is +- * booting or shutting down. Requires console_sem. +- */ +-static int have_callable_console(void) +-{ +- struct console *con; +- +- for_each_console(con) +- if ((con->flags & CON_ENABLED) && +- (con->flags & CON_ANYTIME)) +- return 1; +- +- return 0; +-} +- +-/* +- * Can we actually use the console at this time on this cpu? +- * +- * Console drivers may assume that per-cpu resources have been allocated. So +- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't +- * call them until this CPU is officially up. +- */ +-static inline int can_use_console(void) +-{ +- return cpu_online(raw_smp_processor_id()) || have_callable_console(); +-} +- + /** + * console_unlock - unlock the console system + * +@@ -2445,142 +2590,14 @@ static inline int can_use_console(void) + */ + void console_unlock(void) + { +- static char ext_text[CONSOLE_EXT_LOG_MAX]; +- static char text[LOG_LINE_MAX + PREFIX_MAX]; +- unsigned long flags; +- bool do_cond_resched, retry; +- struct printk_info info; +- struct printk_record r; +- + if (console_suspended) { + up_console_sem(); + return; + } + +- prb_rec_init_rd(&r, &info, text, sizeof(text)); +- +- /* +- * Console drivers are called with interrupts disabled, so +- * @console_may_schedule should be cleared before; however, we may +- * end up dumping a lot of lines, for example, if called from +- * console registration path, and should invoke cond_resched() +- * between lines if allowable. Not doing so can cause a very long +- * scheduling stall on a slow console leading to RCU stall and +- * softlockup warnings which exacerbate the issue with more +- * messages practically incapacitating the system. +- * +- * console_trylock() is not able to detect the preemptive +- * context reliably. Therefore the value must be stored before +- * and cleared after the "again" goto label. +- */ +- do_cond_resched = console_may_schedule; +-again: +- console_may_schedule = 0; +- +- /* +- * We released the console_sem lock, so we need to recheck if +- * cpu is online and (if not) is there at least one CON_ANYTIME +- * console. 
+- */ +- if (!can_use_console()) { +- console_locked = 0; +- up_console_sem(); +- return; +- } +- +- for (;;) { +- size_t ext_len = 0; +- size_t len; +- +- printk_safe_enter_irqsave(flags); +- raw_spin_lock(&logbuf_lock); +-skip: +- if (!prb_read_valid(prb, console_seq, &r)) +- break; +- +- if (console_seq != r.info->seq) { +- console_dropped += r.info->seq - console_seq; +- console_seq = r.info->seq; +- } +- +- if (suppress_message_printing(r.info->level)) { +- /* +- * Skip record we have buffered and already printed +- * directly to the console when we received it, and +- * record that has level above the console loglevel. +- */ +- console_seq++; +- goto skip; +- } +- +- /* Output to all consoles once old messages replayed. */ +- if (unlikely(exclusive_console && +- console_seq >= exclusive_console_stop_seq)) { +- exclusive_console = NULL; +- } +- +- /* +- * Handle extended console text first because later +- * record_print_text() will modify the record buffer in-place. +- */ +- if (nr_ext_console_drivers) { +- ext_len = info_print_ext_header(ext_text, +- sizeof(ext_text), +- r.info); +- ext_len += msg_print_ext_body(ext_text + ext_len, +- sizeof(ext_text) - ext_len, +- &r.text_buf[0], +- r.info->text_len, +- &r.info->dev_info); +- } +- len = record_print_text(&r, +- console_msg_format & MSG_FORMAT_SYSLOG, +- printk_time); +- console_seq++; +- raw_spin_unlock(&logbuf_lock); +- +- /* +- * While actively printing out messages, if another printk() +- * were to occur on another CPU, it may wait for this one to +- * finish. This task can not be preempted if there is a +- * waiter waiting to take over. +- */ +- console_lock_spinning_enable(); +- +- stop_critical_timings(); /* don't trace print latency */ +- call_console_drivers(ext_text, ext_len, text, len); +- start_critical_timings(); +- +- if (console_lock_spinning_disable_and_check()) { +- printk_safe_exit_irqrestore(flags); +- return; +- } +- +- printk_safe_exit_irqrestore(flags); +- +- if (do_cond_resched) +- cond_resched(); +- } +- + console_locked = 0; + +- raw_spin_unlock(&logbuf_lock); +- + up_console_sem(); +- +- /* +- * Someone could have filled up the buffer again, so re-check if there's +- * something to flush. In case we cannot trylock the console_sem again, +- * there's a new owner and the console_unlock() from them will do the +- * flush, no worries. +- */ +- raw_spin_lock(&logbuf_lock); +- retry = prb_read_valid(prb, console_seq, NULL); +- raw_spin_unlock(&logbuf_lock); +- printk_safe_exit_irqrestore(flags); +- +- if (retry && console_trylock()) +- goto again; + } + EXPORT_SYMBOL(console_unlock); + +@@ -2630,23 +2647,20 @@ void console_unblank(void) + */ + void console_flush_on_panic(enum con_flush_mode mode) + { +- /* +- * If someone else is holding the console lock, trylock will fail +- * and may_schedule may be set. Ignore and proceed to unlock so +- * that messages are flushed out. As this can be called from any +- * context and we don't want to get preempted while flushing, +- * ensure may_schedule is cleared. 
+- */ +- console_trylock(); ++ struct console *c; ++ u64 seq; ++ ++ if (!console_trylock()) ++ return; ++ + console_may_schedule = 0; + + if (mode == CONSOLE_REPLAY_ALL) { +- unsigned long flags; +- +- logbuf_lock_irqsave(flags); +- console_seq = prb_first_valid_seq(prb); +- logbuf_unlock_irqrestore(flags); ++ seq = prb_first_valid_seq(prb); ++ for_each_console(c) ++ atomic64_set(&c->printk_seq, seq); + } ++ + console_unlock(); + } + EXPORT_SYMBOL(console_flush_on_panic); +@@ -2782,7 +2796,6 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) + */ + void register_console(struct console *newcon) + { +- unsigned long flags; + struct console *bcon = NULL; + int err; + +@@ -2806,6 +2819,8 @@ void register_console(struct console *newcon) + } + } + ++ newcon->thread = NULL; ++ + if (console_drivers && console_drivers->flags & CON_BOOT) + bcon = console_drivers; + +@@ -2847,8 +2862,10 @@ void register_console(struct console *newcon) + * the real console are the same physical device, it's annoying to + * see the beginning boot messages twice + */ +- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) ++ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + newcon->flags &= ~CON_PRINTBUFFER; ++ newcon->flags |= CON_HANDOVER; ++ } + + /* + * Put this console in the list - keep the +@@ -2870,26 +2887,12 @@ void register_console(struct console *newcon) + if (newcon->flags & CON_EXTENDED) + nr_ext_console_drivers++; + +- if (newcon->flags & CON_PRINTBUFFER) { +- /* +- * console_unlock(); will print out the buffered messages +- * for us. +- */ +- logbuf_lock_irqsave(flags); +- /* +- * We're about to replay the log buffer. Only do this to the +- * just-registered console to avoid excessive message spam to +- * the already-registered consoles. +- * +- * Set exclusive_console with disabled interrupts to reduce +- * race window with eventual console_flush_on_panic() that +- * ignores console_lock. 
+- */ +- exclusive_console = newcon; +- exclusive_console_stop_seq = console_seq; +- console_seq = syslog_seq; +- logbuf_unlock_irqrestore(flags); +- } ++ if (newcon->flags & CON_PRINTBUFFER) ++ atomic64_set(&newcon->printk_seq, 0); ++ else ++ atomic64_set(&newcon->printk_seq, prb_next_seq(prb)); ++ ++ console_try_thread(newcon); + console_unlock(); + console_sysfs_notify(); + +@@ -2963,6 +2966,9 @@ int unregister_console(struct console *console) + console_unlock(); + console_sysfs_notify(); + ++ if (console->thread && !IS_ERR(console->thread)) ++ kthread_stop(console->thread); ++ + if (console->exit) + res = console->exit(console); + +@@ -3045,6 +3051,15 @@ static int __init printk_late_init(void) + unregister_console(con); + } + } ++ ++#ifdef CONFIG_PRINTK ++ console_lock(); ++ for_each_console(con) ++ start_printk_kthread(con); ++ kthreads_started = true; ++ console_unlock(); ++#endif ++ + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, + console_cpu_notify); + WARN_ON(ret < 0); +@@ -3060,7 +3075,6 @@ late_initcall(printk_late_init); + * Delayed printk version, for scheduler-internal messages: + */ + #define PRINTK_PENDING_WAKEUP 0x01 +-#define PRINTK_PENDING_OUTPUT 0x02 + + static DEFINE_PER_CPU(int, printk_pending); + +@@ -3068,14 +3082,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) + { + int pending = __this_cpu_xchg(printk_pending, 0); + +- if (pending & PRINTK_PENDING_OUTPUT) { +- /* If trylock fails, someone else is doing the printing */ +- if (console_trylock()) +- console_unlock(); +- } +- + if (pending & PRINTK_PENDING_WAKEUP) +- wake_up_interruptible(&log_wait); ++ wake_up_interruptible_all(&log_wait); + } + + static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = +@@ -3094,25 +3102,10 @@ void wake_up_klogd(void) + preempt_enable(); + } + +-void defer_console_output(void) ++__printf(1, 0) ++static int vprintk_deferred(const char *fmt, va_list args) + { +- if (!printk_percpu_data_ready()) +- return; +- +- preempt_disable(); +- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); +- irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); +- preempt_enable(); +-} +- +-int vprintk_deferred(const char *fmt, va_list args) +-{ +- int r; +- +- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); +- defer_console_output(); +- +- return r; ++ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); + } + + int printk_deferred(const char *fmt, ...) +@@ -3251,8 +3244,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); + */ + void kmsg_dump(enum kmsg_dump_reason reason) + { ++ struct kmsg_dumper_iter iter; + struct kmsg_dumper *dumper; +- unsigned long flags; ++ ++ if (!oops_in_progress) { ++ /* ++ * If atomic consoles are available, activate kernel sync mode ++ * to make sure any final messages are visible. The trailing ++ * printk message is important to flush any pending messages. ++ */ ++ if (have_atomic_console()) { ++ sync_mode = true; ++ pr_info("enabled sync mode\n"); ++ } ++ ++ /* ++ * Give the printing threads time to flush, allowing up to ++ * 1s of no printing forward progress before giving up. 
++ */ ++ pr_flush(1000, true); ++ } + + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { +@@ -3270,25 +3281,18 @@ void kmsg_dump(enum kmsg_dump_reason reason) + continue; + + /* initialize iterator with data about the stored records */ +- dumper->active = true; +- +- logbuf_lock_irqsave(flags); +- dumper->cur_seq = clear_seq; +- dumper->next_seq = prb_next_seq(prb); +- logbuf_unlock_irqrestore(flags); ++ iter.active = true; ++ kmsg_dump_rewind(&iter); + + /* invoke dumper which will iterate over records */ +- dumper->dump(dumper, reason); +- +- /* reset iterator */ +- dumper->active = false; ++ dumper->dump(dumper, reason, &iter); + } + rcu_read_unlock(); + } + + /** +- * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) +- * @dumper: registered kmsg dumper ++ * kmsg_dump_get_line - retrieve one kmsg log line ++ * @iter: kmsg dumper iterator + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer +@@ -3302,11 +3306,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) + * + * A return value of FALSE indicates that there are no more records to + * read. +- * +- * The function is similar to kmsg_dump_get_line(), but grabs no locks. + */ +-bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, +- char *line, size_t size, size_t *len) ++bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, ++ char *line, size_t size, size_t *len) + { + struct printk_info info; + unsigned int line_count; +@@ -3316,16 +3318,16 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + + prb_rec_init_rd(&r, &info, line, size); + +- if (!dumper->active) ++ if (!iter->active) + goto out; + + /* Read text or count text lines? */ + if (line) { +- if (!prb_read_valid(prb, dumper->cur_seq, &r)) ++ if (!prb_read_valid(prb, iter->cur_seq, &r)) + goto out; + l = record_print_text(&r, syslog, printk_time); + } else { +- if (!prb_read_valid_info(prb, dumper->cur_seq, ++ if (!prb_read_valid_info(prb, iter->cur_seq, + &info, &line_count)) { + goto out; + } +@@ -3334,48 +3336,18 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, + + } + +- dumper->cur_seq = r.info->seq + 1; ++ iter->cur_seq = r.info->seq + 1; + ret = true; + out: + if (len) + *len = l; + return ret; + } +- +-/** +- * kmsg_dump_get_line - retrieve one kmsg log line +- * @dumper: registered kmsg dumper +- * @syslog: include the "<4>" prefixes +- * @line: buffer to copy the line to +- * @size: maximum size of the buffer +- * @len: length of line placed into buffer +- * +- * Start at the beginning of the kmsg buffer, with the oldest kmsg +- * record, and copy one record into the provided buffer. +- * +- * Consecutive calls will return the next available record moving +- * towards the end of the buffer with the youngest messages. +- * +- * A return value of FALSE indicates that there are no more records to +- * read. 
+- */ +-bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, +- char *line, size_t size, size_t *len) +-{ +- unsigned long flags; +- bool ret; +- +- logbuf_lock_irqsave(flags); +- ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); +- logbuf_unlock_irqrestore(flags); +- +- return ret; +-} + EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + + /** + * kmsg_dump_get_buffer - copy kmsg log lines +- * @dumper: registered kmsg dumper ++ * @iter: kmsg dumper iterator + * @syslog: include the "<4>" prefixes + * @buf: buffer to copy the line to + * @size: maximum size of the buffer +@@ -3392,116 +3364,256 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + * A return value of FALSE indicates that there are no more records to + * read. + */ +-bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, +- char *buf, size_t size, size_t *len) ++bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, ++ char *buf, size_t size, size_t *len_out) + { + struct printk_info info; +- unsigned int line_count; + struct printk_record r; +- unsigned long flags; + u64 seq; + u64 next_seq; +- size_t l = 0; ++ size_t len = 0; + bool ret = false; + bool time = printk_time; + +- prb_rec_init_rd(&r, &info, buf, size); +- +- if (!dumper->active || !buf || !size) ++ if (!iter->active || !buf || !size) + goto out; + +- logbuf_lock_irqsave(flags); +- if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) { +- if (info.seq != dumper->cur_seq) { ++ if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { ++ if (info.seq != iter->cur_seq) { + /* messages are gone, move to first available one */ +- dumper->cur_seq = info.seq; ++ iter->cur_seq = info.seq; + } + } + + /* last entry */ +- if (dumper->cur_seq >= dumper->next_seq) { +- logbuf_unlock_irqrestore(flags); ++ if (iter->cur_seq >= iter->next_seq) + goto out; +- } + +- /* calculate length of entire buffer */ +- seq = dumper->cur_seq; +- while (prb_read_valid_info(prb, seq, &info, &line_count)) { +- if (r.info->seq >= dumper->next_seq) +- break; +- l += get_record_print_text_size(&info, line_count, syslog, time); +- seq = r.info->seq + 1; +- } +- +- /* move first record forward until length fits into the buffer */ +- seq = dumper->cur_seq; +- while (l >= size && prb_read_valid_info(prb, seq, +- &info, &line_count)) { +- if (r.info->seq >= dumper->next_seq) +- break; +- l -= get_record_print_text_size(&info, line_count, syslog, time); +- seq = r.info->seq + 1; +- } ++ /* ++ * Find first record that fits, including all following records, ++ * into the user-provided buffer for this dump. Pass in size-1 ++ * because this function (by way of record_print_text()) will ++ * not write more than size-1 bytes of text into @buf. ++ */ ++ seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, ++ size - 1, syslog, time); + +- /* last message in next interation */ ++ /* ++ * Next kmsg_dump_get_buffer() invocation will dump block of ++ * older records stored right before this one. 
++ */ + next_seq = seq; + +- /* actually read text into the buffer now */ +- l = 0; +- while (prb_read_valid(prb, seq, &r)) { +- if (r.info->seq >= dumper->next_seq) +- break; ++ prb_rec_init_rd(&r, &info, buf, size); + +- l += record_print_text(&r, syslog, time); ++ len = 0; ++ prb_for_each_record(seq, prb, seq, &r) { ++ if (r.info->seq >= iter->next_seq) ++ break; + +- /* adjust record to store to remaining buffer space */ +- prb_rec_init_rd(&r, &info, buf + l, size - l); ++ len += record_print_text(&r, syslog, time); + +- seq = r.info->seq + 1; ++ /* Adjust record to store to remaining buffer space. */ ++ prb_rec_init_rd(&r, &info, buf + len, size - len); + } + +- dumper->next_seq = next_seq; ++ iter->next_seq = next_seq; + ret = true; +- logbuf_unlock_irqrestore(flags); + out: +- if (len) +- *len = l; ++ if (len_out) ++ *len_out = len; + return ret; + } + EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); + + /** +- * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) +- * @dumper: registered kmsg dumper ++ * kmsg_dump_rewind - reset the iterator ++ * @iter: kmsg dumper iterator + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. ++ */ ++void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) ++{ ++ iter->cur_seq = latched_seq_read_nolock(&clear_seq); ++ iter->next_seq = prb_next_seq(prb); ++} ++EXPORT_SYMBOL_GPL(kmsg_dump_rewind); ++ ++#endif ++ ++struct prb_cpulock { ++ atomic_t owner; ++ unsigned long __percpu *irqflags; ++}; ++ ++#define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \ ++static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \ ++static struct prb_cpulock name = { \ ++ .owner = ATOMIC_INIT(-1), \ ++ .irqflags = &_##name##_percpu_irqflags, \ ++} ++ ++static bool __prb_trylock(struct prb_cpulock *cpu_lock, ++ unsigned int *cpu_store) ++{ ++ unsigned long *flags; ++ unsigned int cpu; ++ ++ cpu = get_cpu(); ++ ++ *cpu_store = atomic_read(&cpu_lock->owner); ++ /* memory barrier to ensure the current lock owner is visible */ ++ smp_rmb(); ++ if (*cpu_store == -1) { ++ flags = per_cpu_ptr(cpu_lock->irqflags, cpu); ++ local_irq_save(*flags); ++ if (atomic_try_cmpxchg_acquire(&cpu_lock->owner, ++ cpu_store, cpu)) { ++ return true; ++ } ++ local_irq_restore(*flags); ++ } else if (*cpu_store == cpu) { ++ return true; ++ } ++ ++ put_cpu(); ++ return false; ++} ++ ++/* ++ * prb_lock: Perform a processor-reentrant spin lock. ++ * @cpu_lock: A pointer to the lock object. ++ * @cpu_store: A "flags" pointer to store lock status information. ++ * ++ * If no processor has the lock, the calling processor takes the lock and ++ * becomes the owner. If the calling processor is already the owner of the ++ * lock, this function succeeds immediately. If lock is locked by another ++ * processor, this function spins until the calling processor becomes the ++ * owner. + * +- * The function is similar to kmsg_dump_rewind(), but grabs no locks. ++ * It is safe to call this function from any context and state. + */ +-void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) ++static void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store) + { +- dumper->cur_seq = clear_seq; +- dumper->next_seq = prb_next_seq(prb); ++ for (;;) { ++ if (__prb_trylock(cpu_lock, cpu_store)) ++ break; ++ cpu_relax(); ++ } + } + +-/** +- * kmsg_dump_rewind - reset the iterator +- * @dumper: registered kmsg dumper ++/* ++ * prb_unlock: Perform a processor-reentrant spin unlock. 
++ * @cpu_lock: A pointer to the lock object. ++ * @cpu_store: A "flags" object storing lock status information. + * +- * Reset the dumper's iterator so that kmsg_dump_get_line() and +- * kmsg_dump_get_buffer() can be called again and used multiple +- * times within the same dumper.dump() callback. ++ * Release the lock. The calling processor must be the owner of the lock. ++ * ++ * It is safe to call this function from any context and state. + */ +-void kmsg_dump_rewind(struct kmsg_dumper *dumper) ++static void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store) + { +- unsigned long flags; ++ unsigned long *flags; ++ unsigned int cpu; ++ ++ cpu = atomic_read(&cpu_lock->owner); ++ atomic_set_release(&cpu_lock->owner, cpu_store); ++ ++ if (cpu_store == -1) { ++ flags = per_cpu_ptr(cpu_lock->irqflags, cpu); ++ local_irq_restore(*flags); ++ } + +- logbuf_lock_irqsave(flags); +- kmsg_dump_rewind_nolock(dumper); +- logbuf_unlock_irqrestore(flags); ++ put_cpu(); + } +-EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +-#endif ++DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock); ++ ++void console_atomic_lock(unsigned int *flags) ++{ ++ prb_lock(&printk_cpulock, flags); ++} ++EXPORT_SYMBOL(console_atomic_lock); ++ ++void console_atomic_unlock(unsigned int flags) ++{ ++ prb_unlock(&printk_cpulock, flags); ++} ++EXPORT_SYMBOL(console_atomic_unlock); ++ ++static void pr_msleep(bool may_sleep, int ms) ++{ ++ if (may_sleep) { ++ msleep(ms); ++ } else { ++ while (ms--) ++ udelay(1000); ++ } ++} ++ ++/** ++ * pr_flush() - Wait for printing threads to catch up. ++ * ++ * @timeout_ms: The maximum time (in ms) to wait. ++ * @reset_on_progress: Reset the timeout if forward progress is seen. ++ * ++ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 ++ * represents infinite waiting. ++ * ++ * If @reset_on_progress is true, the timeout will be reset whenever any ++ * printer has been seen to make some forward progress. ++ * ++ * Context: Any context. ++ * Return: true if all enabled printers are caught up. ++ */ ++bool pr_flush(int timeout_ms, bool reset_on_progress) ++{ ++ int remaining = timeout_ms; ++ struct console *con; ++ u64 last_diff = 0; ++ bool may_sleep; ++ u64 printk_seq; ++ u64 diff; ++ u64 seq; ++ ++ may_sleep = (preemptible() && ++ !in_softirq() && ++ system_state >= SYSTEM_RUNNING); ++ ++ seq = prb_next_seq(prb); ++ ++ for (;;) { ++ diff = 0; ++ ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ printk_seq = atomic64_read(&con->printk_seq); ++ if (printk_seq < seq) ++ diff += seq - printk_seq; ++ } ++ ++ if (diff != last_diff && reset_on_progress) ++ remaining = timeout_ms; ++ ++ if (!diff || remaining == 0) ++ break; ++ ++ if (remaining < 0) { ++ pr_msleep(may_sleep, 100); ++ } else if (remaining < 100) { ++ pr_msleep(may_sleep, remaining); ++ remaining = 0; ++ } else { ++ pr_msleep(may_sleep, 100); ++ remaining -= 100; ++ } ++ ++ last_diff = diff; ++ } ++ ++ return (diff == 0); ++} ++EXPORT_SYMBOL(pr_flush); +diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c +index b774685cc..e69de29bb 100644 +--- a/kernel/printk/printk_safe.c ++++ b/kernel/printk/printk_safe.c +@@ -1,425 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0-or-later +-/* +- * printk_safe.c - Safe printk for printk-deadlock-prone contexts +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include "internal.h" +- +-/* +- * printk() could not take logbuf_lock in NMI context. 
Instead, +- * it uses an alternative implementation that temporary stores +- * the strings into a per-CPU buffer. The content of the buffer +- * is later flushed into the main ring buffer via IRQ work. +- * +- * The alternative implementation is chosen transparently +- * by examining current printk() context mask stored in @printk_context +- * per-CPU variable. +- * +- * The implementation allows to flush the strings also from another CPU. +- * There are situations when we want to make sure that all buffers +- * were handled or when IRQs are blocked. +- */ +- +-#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ +- sizeof(atomic_t) - \ +- sizeof(atomic_t) - \ +- sizeof(struct irq_work)) +- +-struct printk_safe_seq_buf { +- atomic_t len; /* length of written data */ +- atomic_t message_lost; +- struct irq_work work; /* IRQ work that flushes the buffer */ +- unsigned char buffer[SAFE_LOG_BUF_LEN]; +-}; +- +-static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); +-static DEFINE_PER_CPU(int, printk_context); +- +-static DEFINE_RAW_SPINLOCK(safe_read_lock); +- +-#ifdef CONFIG_PRINTK_NMI +-static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); +-#endif +- +-/* Get flushed in a more safe context. */ +-static void queue_flush_work(struct printk_safe_seq_buf *s) +-{ +- if (printk_percpu_data_ready()) +- irq_work_queue(&s->work); +-} +- +-/* +- * Add a message to per-CPU context-dependent buffer. NMI and printk-safe +- * have dedicated buffers, because otherwise printk-safe preempted by +- * NMI-printk would have overwritten the NMI messages. +- * +- * The messages are flushed from irq work (or from panic()), possibly, +- * from other CPU, concurrently with printk_safe_log_store(). Should this +- * happen, printk_safe_log_store() will notice the buffer->len mismatch +- * and repeat the write. +- */ +-static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, +- const char *fmt, va_list args) +-{ +- int add; +- size_t len; +- va_list ap; +- +-again: +- len = atomic_read(&s->len); +- +- /* The trailing '\0' is not counted into len. */ +- if (len >= sizeof(s->buffer) - 1) { +- atomic_inc(&s->message_lost); +- queue_flush_work(s); +- return 0; +- } +- +- /* +- * Make sure that all old data have been read before the buffer +- * was reset. This is not needed when we just append data. +- */ +- if (!len) +- smp_rmb(); +- +- va_copy(ap, args); +- add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap); +- va_end(ap); +- if (!add) +- return 0; +- +- /* +- * Do it once again if the buffer has been flushed in the meantime. +- * Note that atomic_cmpxchg() is an implicit memory barrier that +- * makes sure that the data were written before updating s->len. +- */ +- if (atomic_cmpxchg(&s->len, len, len + add) != len) +- goto again; +- +- queue_flush_work(s); +- return add; +-} +- +-static inline void printk_safe_flush_line(const char *text, int len) +-{ +- /* +- * Avoid any console drivers calls from here, because we may be +- * in NMI or printk_safe context (when in panic). The messages +- * must go only into the ring buffer at this stage. Consoles will +- * get explicitly called later when a crashdump is not generated. +- */ +- printk_deferred("%.*s", len, text); +-} +- +-/* printk part of the temporary buffer line by line */ +-static int printk_safe_flush_buffer(const char *start, size_t len) +-{ +- const char *c, *end; +- bool header; +- +- c = start; +- end = start + len; +- header = true; +- +- /* Print line by line. 
*/ +- while (c < end) { +- if (*c == '\n') { +- printk_safe_flush_line(start, c - start + 1); +- start = ++c; +- header = true; +- continue; +- } +- +- /* Handle continuous lines or missing new line. */ +- if ((c + 1 < end) && printk_get_level(c)) { +- if (header) { +- c = printk_skip_level(c); +- continue; +- } +- +- printk_safe_flush_line(start, c - start); +- start = c++; +- header = true; +- continue; +- } +- +- header = false; +- c++; +- } +- +- /* Check if there was a partial line. Ignore pure header. */ +- if (start < end && !header) { +- static const char newline[] = KERN_CONT "\n"; +- +- printk_safe_flush_line(start, end - start); +- printk_safe_flush_line(newline, strlen(newline)); +- } +- +- return len; +-} +- +-static void report_message_lost(struct printk_safe_seq_buf *s) +-{ +- int lost = atomic_xchg(&s->message_lost, 0); +- +- if (lost) +- printk_deferred("Lost %d message(s)!\n", lost); +-} +- +-/* +- * Flush data from the associated per-CPU buffer. The function +- * can be called either via IRQ work or independently. +- */ +-static void __printk_safe_flush(struct irq_work *work) +-{ +- struct printk_safe_seq_buf *s = +- container_of(work, struct printk_safe_seq_buf, work); +- unsigned long flags; +- size_t len; +- int i; +- +- /* +- * The lock has two functions. First, one reader has to flush all +- * available message to make the lockless synchronization with +- * writers easier. Second, we do not want to mix messages from +- * different CPUs. This is especially important when printing +- * a backtrace. +- */ +- raw_spin_lock_irqsave(&safe_read_lock, flags); +- +- i = 0; +-more: +- len = atomic_read(&s->len); +- +- /* +- * This is just a paranoid check that nobody has manipulated +- * the buffer an unexpected way. If we printed something then +- * @len must only increase. Also it should never overflow the +- * buffer size. +- */ +- if ((i && i >= len) || len > sizeof(s->buffer)) { +- const char *msg = "printk_safe_flush: internal error\n"; +- +- printk_safe_flush_line(msg, strlen(msg)); +- len = 0; +- } +- +- if (!len) +- goto out; /* Someone else has already flushed the buffer. */ +- +- /* Make sure that data has been written up to the @len */ +- smp_rmb(); +- i += printk_safe_flush_buffer(s->buffer + i, len - i); +- +- /* +- * Check that nothing has got added in the meantime and truncate +- * the buffer. Note that atomic_cmpxchg() is an implicit memory +- * barrier that makes sure that the data were copied before +- * updating s->len. +- */ +- if (atomic_cmpxchg(&s->len, len, 0) != len) +- goto more; +- +-out: +- report_message_lost(s); +- raw_spin_unlock_irqrestore(&safe_read_lock, flags); +-} +- +-/** +- * printk_safe_flush - flush all per-cpu nmi buffers. +- * +- * The buffers are flushed automatically via IRQ work. This function +- * is useful only when someone wants to be sure that all buffers have +- * been flushed at some point. +- */ +-void printk_safe_flush(void) +-{ +- int cpu; +- +- for_each_possible_cpu(cpu) { +-#ifdef CONFIG_PRINTK_NMI +- __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); +-#endif +- __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); +- } +-} +- +-/** +- * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system +- * goes down. +- * +- * Similar to printk_safe_flush() but it can be called even in NMI context when +- * the system goes down. It does the best effort to get NMI messages into +- * the main ring buffer. +- * +- * Note that it could try harder when there is only one CPU online. 
+- */ +-void printk_safe_flush_on_panic(void) +-{ +- /* +- * Make sure that we could access the main ring buffer. +- * Do not risk a double release when more CPUs are up. +- */ +- if (raw_spin_is_locked(&logbuf_lock)) { +- if (num_online_cpus() > 1) +- return; +- +- debug_locks_off(); +- raw_spin_lock_init(&logbuf_lock); +- } +- +- if (raw_spin_is_locked(&safe_read_lock)) { +- if (num_online_cpus() > 1) +- return; +- +- debug_locks_off(); +- raw_spin_lock_init(&safe_read_lock); +- } +- +- printk_safe_flush(); +-} +-EXPORT_SYMBOL_GPL(printk_safe_flush_on_panic); +- +-#ifdef CONFIG_PRINTK_NMI +-/* +- * Safe printk() for NMI context. It uses a per-CPU buffer to +- * store the message. NMIs are not nested, so there is always only +- * one writer running. But the buffer might get flushed from another +- * CPU, so we need to be careful. +- */ +-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) +-{ +- struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); +- +- return printk_safe_log_store(s, fmt, args); +-} +- +-void noinstr printk_nmi_enter(void) +-{ +- this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); +-} +- +-void noinstr printk_nmi_exit(void) +-{ +- this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); +-} +- +-/* +- * Marks a code that might produce many messages in NMI context +- * and the risk of losing them is more critical than eventual +- * reordering. +- * +- * It has effect only when called in NMI context. Then printk() +- * will try to store the messages into the main logbuf directly +- * and use the per-CPU buffers only as a fallback when the lock +- * is not available. +- */ +-void printk_nmi_direct_enter(void) +-{ +- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) +- this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK); +-} +- +-void printk_nmi_direct_exit(void) +-{ +- this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); +-} +- +-#else +- +-static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) +-{ +- return 0; +-} +- +-#endif /* CONFIG_PRINTK_NMI */ +- +-/* +- * Lock-less printk(), to avoid deadlocks should the printk() recurse +- * into itself. It uses a per-CPU buffer to store the message, just like +- * NMI. +- */ +-static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) +-{ +- struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); +- +- return printk_safe_log_store(s, fmt, args); +-} +- +-/* Can be preempted by NMI. */ +-void printk_safe_enter(void) +-{ +- this_cpu_inc(printk_context); +-} +-EXPORT_SYMBOL_GPL(printk_safe_enter); +- +-/* Can be preempted by NMI. */ +-void printk_safe_exit(void) +-{ +- this_cpu_dec(printk_context); +-} +-EXPORT_SYMBOL_GPL(printk_safe_exit); +- +-__printf(1, 0) int vprintk_func(const char *fmt, va_list args) +-{ +-#ifdef CONFIG_KGDB_KDB +- /* Allow to pass printk() to kdb but avoid a recursion. */ +- if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) +- return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); +-#endif +- +- /* +- * Try to use the main logbuf even in NMI. But avoid calling console +- * drivers that might have their own locks. +- */ +- if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) && +- raw_spin_trylock(&logbuf_lock)) { +- int len; +- +- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); +- raw_spin_unlock(&logbuf_lock); +- defer_console_output(); +- return len; +- } +- +- /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. 
*/ +- if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) +- return vprintk_nmi(fmt, args); +- +- /* Use extra buffer to prevent a recursion deadlock in safe mode. */ +- if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) +- return vprintk_safe(fmt, args); +- +- /* No obstacles. */ +- return vprintk_default(fmt, args); +-} +- +-void __init printk_safe_init(void) +-{ +- int cpu; +- +- for_each_possible_cpu(cpu) { +- struct printk_safe_seq_buf *s; +- +- s = &per_cpu(safe_print_seq, cpu); +- init_irq_work(&s->work, __printk_safe_flush); +- +-#ifdef CONFIG_PRINTK_NMI +- s = &per_cpu(nmi_print_seq, cpu); +- init_irq_work(&s->work, __printk_safe_flush); +-#endif +- } +- +- /* Flush pending messages that did not have scheduled IRQ works. */ +- printk_safe_flush(); +-} +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index e3210358b..3b531adf1 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -197,7 +197,14 @@ static bool ptrace_freeze_traced(struct task_struct *task) + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && + !__fatal_signal_pending(task)) { +- task->state = __TASK_TRACED; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (task->state & __TASK_TRACED) ++ task->state = __TASK_TRACED; ++ else ++ task->saved_state = __TASK_TRACED; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); +@@ -207,8 +214,8 @@ static bool ptrace_freeze_traced(struct task_struct *task) + + static void ptrace_unfreeze_traced(struct task_struct *task) + { +- if (task->state != __TASK_TRACED) +- return; ++ unsigned long flags; ++ bool frozen = true; + + WARN_ON(!task->ptrace || task->parent != current); + +@@ -217,12 +224,19 @@ static void ptrace_unfreeze_traced(struct task_struct *task) + * Recheck state under the lock to close this race. + */ + spin_lock_irq(&task->sighand->siglock); +- if (task->state == __TASK_TRACED) { +- if (__fatal_signal_pending(task)) +- wake_up_state(task, __TASK_TRACED); +- else +- task->state = TASK_TRACED; +- } ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (task->state == __TASK_TRACED) ++ task->state = TASK_TRACED; ++ else if (task->saved_state == __TASK_TRACED) ++ task->saved_state = TASK_TRACED; ++ else ++ frozen = false; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++ ++ if (frozen && __fatal_signal_pending(task)) ++ wake_up_state(task, __TASK_TRACED); ++ + spin_unlock_irq(&task->sighand->siglock); + } + +diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig +index 84dfa8dae..e222aa0a5 100644 +--- a/kernel/rcu/Kconfig ++++ b/kernel/rcu/Kconfig +@@ -189,8 +189,8 @@ config RCU_FAST_NO_HZ + + config RCU_BOOST + bool "Enable RCU priority boosting" +- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT +- default n ++ depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT ++ default y if PREEMPT_RT + help + This option boosts the priority of preempted RCU readers that + block the current preemptible RCU grace period for too long. +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index 310bcc79b..fb2288701 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -100,8 +100,10 @@ static struct rcu_state rcu_state = { + static bool dump_tree; + module_param(dump_tree, bool, 0444); + /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. 
*/ +-static bool use_softirq = true; ++static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT); ++#ifndef CONFIG_PREEMPT_RT + module_param(use_softirq, bool, 0444); ++#endif + /* Control rcu_node-tree auto-balancing at boot time. */ + static bool rcu_fanout_exact; + module_param(rcu_fanout_exact, bool, 0444); +diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c +index 849f0aa99..dd94a602a 100644 +--- a/kernel/rcu/update.c ++++ b/kernel/rcu/update.c +@@ -56,8 +56,10 @@ + #ifndef CONFIG_TINY_RCU + module_param(rcu_expedited, int, 0); + module_param(rcu_normal, int, 0); +-static int rcu_normal_after_boot; ++static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT); ++#ifndef CONFIG_PREEMPT_RT + module_param(rcu_normal_after_boot, int, 0); ++#endif + #endif /* #ifndef CONFIG_TINY_RCU */ + + #ifdef CONFIG_DEBUG_LOCK_ALLOC +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 56be8d1c7..f437b4026 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -64,7 +64,11 @@ const_debug unsigned int sysctl_sched_features = + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ ++#ifdef CONFIG_PREEMPT_RT ++const_debug unsigned int sysctl_sched_nr_migrate = 8; ++#else + const_debug unsigned int sysctl_sched_nr_migrate = 32; ++#endif + + /* + * period over which we measure -rt task CPU usage in us. +@@ -502,9 +506,15 @@ static bool set_nr_if_polling(struct task_struct *p) + #endif + #endif + +-static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task, ++ bool sleeper) + { +- struct wake_q_node *node = &task->wake_q; ++ struct wake_q_node *node; ++ ++ if (sleeper) ++ node = &task->wake_q_sleeper; ++ else ++ node = &task->wake_q; + + /* + * Atomically grab the task, if ->wake_q is !nil already it means +@@ -540,7 +550,13 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) + */ + void wake_q_add(struct wake_q_head *head, struct task_struct *task) + { +- if (__wake_q_add(head, task)) ++ if (__wake_q_add(head, task, false)) ++ get_task_struct(task); ++} ++ ++void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task, true)) + get_task_struct(task); + } + +@@ -563,28 +579,39 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) + */ + void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) + { +- if (!__wake_q_add(head, task)) ++ if (!__wake_q_add(head, task, false)) + put_task_struct(task); + } + +-void wake_up_q(struct wake_q_head *head) ++void __wake_up_q(struct wake_q_head *head, bool sleeper) + { + struct wake_q_node *node = head->first; + + while (node != WAKE_Q_TAIL) { + struct task_struct *task; + +- task = container_of(node, struct task_struct, wake_q); ++ if (sleeper) ++ task = container_of(node, struct task_struct, wake_q_sleeper); ++ else ++ task = container_of(node, struct task_struct, wake_q); ++ + BUG_ON(!task); + /* Task can safely be re-inserted now: */ + node = node->next; +- task->wake_q.next = NULL; + ++ if (sleeper) ++ task->wake_q_sleeper.next = NULL; ++ else ++ task->wake_q.next = NULL; + /* + * wake_up_process() executes a full barrier, which pairs with + * the queueing in wake_q_add() so as not to miss wakeups. 
+ */ +- wake_up_process(task); ++ if (sleeper) ++ wake_up_lock_sleeper(task); ++ else ++ wake_up_process(task); ++ + put_task_struct(task); + } + } +@@ -620,6 +647,48 @@ void resched_curr(struct rq *rq) + trace_sched_wake_idle_without_ipi(cpu); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++ ++static int tsk_is_polling(struct task_struct *p) ++{ ++#ifdef TIF_POLLING_NRFLAG ++ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); ++#else ++ return 0; ++#endif ++} ++ ++void resched_curr_lazy(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ if (!sched_feat(PREEMPT_LAZY)) { ++ resched_curr(rq); ++ return; ++ } ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ if (test_tsk_need_resched_lazy(curr)) ++ return; ++ ++ set_tsk_need_resched_lazy(curr); ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) ++ return; ++ ++ /* NEED_RESCHED_LAZY must be visible before we test polling */ ++ smp_mb(); ++ if (!tsk_is_polling(curr)) ++ smp_send_reschedule(cpu); ++} ++#endif ++ + void resched_cpu(int cpu) + { + struct rq *rq = cpu_rq(cpu); +@@ -1702,6 +1771,82 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) + + #ifdef CONFIG_SMP + ++static void ++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, ++ u32 flags); ++ ++static void migrate_disable_switch(struct rq *rq, struct task_struct *p) ++{ ++ if (likely(!p->migration_disabled)) ++ return; ++ ++ if (p->cpus_ptr != &p->cpus_mask) ++ return; ++ ++ /* ++ * Violates locking rules! see comment in __do_set_cpus_allowed(). ++ */ ++ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); ++} ++ ++void migrate_disable(void) ++{ ++ struct task_struct *p = current; ++ ++ if (p->migration_disabled) { ++ p->migration_disabled++; ++ return; ++ } ++ ++ trace_sched_migrate_disable_tp(p); ++ ++ preempt_disable(); ++ this_rq()->nr_pinned++; ++ p->migration_disabled = 1; ++ preempt_lazy_disable(); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(migrate_disable); ++ ++void migrate_enable(void) ++{ ++ struct task_struct *p = current; ++ ++ if (p->migration_disabled > 1) { ++ p->migration_disabled--; ++ return; ++ } ++ ++ /* ++ * Ensure stop_task runs either before or after this, and that ++ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). ++ */ ++ preempt_disable(); ++ if (p->cpus_ptr != &p->cpus_mask) ++ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); ++ /* ++ * Mustn't clear migration_disabled() until cpus_ptr points back at the ++ * regular cpus_mask, otherwise things that race (eg. ++ * select_fallback_rq) get confused. ++ */ ++ barrier(); ++ p->migration_disabled = 0; ++ this_rq()->nr_pinned--; ++ preempt_lazy_enable(); ++ preempt_enable(); ++ ++ trace_sched_migrate_enable_tp(p); ++} ++EXPORT_SYMBOL_GPL(migrate_enable); ++ ++static inline bool rq_has_pinned_tasks(struct rq *rq) ++{ ++ return rq->nr_pinned; ++} ++ + /* + * Per-CPU kthreads are allowed to run on !active && online CPUs, see + * __set_cpus_allowed_ptr() and select_fallback_rq(). 
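migrate_disable()/migrate_enable(), as introduced above, pin the current task to its CPU while leaving it fully preemptible. A hedged sketch of a hypothetical caller (not part of this patch) that only needs CPU stability, not exclusion:

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/smp.h>

/* Hypothetical reader of two per-CPU counters maintained elsewhere. */
static DEFINE_PER_CPU(u64, pkts);
static DEFINE_PER_CPU(u64, bytes);

static void report_local(void)
{
        u64 p, b;
        int cpu;

        migrate_disable();              /* pinned to one CPU, but still preemptible */
        cpu = smp_processor_id();
        p = this_cpu_read(pkts);        /* both samples come from the same CPU */
        b = this_cpu_read(bytes);
        migrate_enable();

        pr_info("CPU%d: %llu packets, %llu bytes\n", cpu, p, b);
}

Note that migrate_disable() does not exclude other tasks running on the same CPU, so data written concurrently still needs its own serialization; the pinning only guarantees that cpu, p and b all refer to the same CPU.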
+@@ -1711,7 +1856,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) + return false; + +- if (is_per_cpu_kthread(p)) ++ if (is_per_cpu_kthread(p) || is_migration_disabled(p)) + return cpu_online(cpu); + + return cpu_active(cpu); +@@ -1756,8 +1901,21 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, + } + + struct migration_arg { +- struct task_struct *task; +- int dest_cpu; ++ struct task_struct *task; ++ int dest_cpu; ++ struct set_affinity_pending *pending; ++}; ++ ++/* ++ * @refs: number of wait_for_completion() ++ * @stop_pending: is @stop_work in use ++ */ ++struct set_affinity_pending { ++ refcount_t refs; ++ unsigned int stop_pending; ++ struct completion done; ++ struct cpu_stop_work stop_work; ++ struct migration_arg arg; + }; + + /* +@@ -1790,15 +1948,17 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + static int migration_cpu_stop(void *data) + { + struct migration_arg *arg = data; ++ struct set_affinity_pending *pending = arg->pending; + struct task_struct *p = arg->task; + struct rq *rq = this_rq(); ++ bool complete = false; + struct rq_flags rf; + + /* + * The original target CPU might have gone down and we might + * be on another CPU but it doesn't matter. + */ +- local_irq_disable(); ++ local_irq_save(rf.flags); + /* + * We need to explicitly wake pending tasks before running + * __migrate_task() such that we will not miss enforcing cpus_ptr +@@ -1808,21 +1968,121 @@ static int migration_cpu_stop(void *data) + + raw_spin_lock(&p->pi_lock); + rq_lock(rq, &rf); ++ + /* + * If task_rq(p) != rq, it cannot be migrated here, because we're + * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because + * we're holding p->pi_lock. + */ + if (task_rq(p) == rq) { ++ if (is_migration_disabled(p)) ++ goto out; ++ ++ if (pending) { ++ if (p->migration_pending == pending) ++ p->migration_pending = NULL; ++ complete = true; ++ ++ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) ++ goto out; ++ } ++ + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, arg->dest_cpu); + else + p->wake_cpu = arg->dest_cpu; ++ ++ /* ++ * XXX __migrate_task() can fail, at which point we might end ++ * up running on a dodgy CPU, AFAICT this can only happen ++ * during CPU hotplug, at which point we'll get pushed out ++ * anyway, so it's probably not a big deal. ++ */ ++ ++ } else if (pending) { ++ /* ++ * This happens when we get migrated between migrate_enable()'s ++ * preempt_enable() and scheduling the stopper task. At that ++ * point we're a regular task again and not current anymore. ++ * ++ * A !PREEMPT kernel has a giant hole here, which makes it far ++ * more likely. ++ */ ++ ++ /* ++ * The task moved before the stopper got to run. We're holding ++ * ->pi_lock, so the allowed mask is stable - if it got ++ * somewhere allowed, we're done. ++ */ ++ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { ++ if (p->migration_pending == pending) ++ p->migration_pending = NULL; ++ complete = true; ++ goto out; ++ } ++ ++ /* ++ * When migrate_enable() hits a rq mis-match we can't reliably ++ * determine is_migration_disabled() and so have to chase after ++ * it. 
++ */ ++ WARN_ON_ONCE(!pending->stop_pending); ++ task_rq_unlock(rq, p, &rf); ++ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, ++ &pending->arg, &pending->stop_work); ++ return 0; + } +- rq_unlock(rq, &rf); +- raw_spin_unlock(&p->pi_lock); ++out: ++ if (pending) ++ pending->stop_pending = false; ++ task_rq_unlock(rq, p, &rf); ++ ++ if (complete) ++ complete_all(&pending->done); ++ ++ return 0; ++} ++ ++int push_cpu_stop(void *arg) ++{ ++ struct rq *lowest_rq = NULL, *rq = this_rq(); ++ struct task_struct *p = arg; ++ ++ raw_spin_lock_irq(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ if (task_rq(p) != rq) ++ goto out_unlock; ++ ++ if (is_migration_disabled(p)) { ++ p->migration_flags |= MDF_PUSH; ++ goto out_unlock; ++ } ++ ++ p->migration_flags &= ~MDF_PUSH; ++ ++ if (p->sched_class->find_lock_rq) ++ lowest_rq = p->sched_class->find_lock_rq(p, rq); ++ ++ if (!lowest_rq) ++ goto out_unlock; ++ ++ // XXX validate p is still the highest prio task ++ if (task_rq(p) == rq) { ++ deactivate_task(rq, p, 0); ++ set_task_cpu(p, lowest_rq->cpu); ++ activate_task(lowest_rq, p, 0); ++ resched_curr(lowest_rq); ++ } ++ ++ double_unlock_balance(rq, lowest_rq); ++ ++out_unlock: ++ rq->push_busy = false; ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irq(&p->pi_lock); + +- local_irq_enable(); ++ put_task_struct(p); + return 0; + } + +@@ -1830,18 +2090,39 @@ static int migration_cpu_stop(void *data) + * sched_class::set_cpus_allowed must do the below, but is not required to + * actually call this function. + */ +-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) + { ++ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { ++ p->cpus_ptr = new_mask; ++ return; ++ } ++ + cpumask_copy(&p->cpus_mask, new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); + } + +-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++static void ++__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) + { + struct rq *rq = task_rq(p); + bool queued, running; + +- lockdep_assert_held(&p->pi_lock); ++ /* ++ * This here violates the locking rules for affinity, since we're only ++ * supposed to change these variables while holding both rq->lock and ++ * p->pi_lock. ++ * ++ * HOWEVER, it magically works, because ttwu() is the only code that ++ * accesses these variables under p->pi_lock and only does so after ++ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() ++ * before finish_task(). ++ * ++ * XXX do further audits, this smells like something putrid. ++ */ ++ if (flags & SCA_MIGRATE_DISABLE) ++ SCHED_WARN_ON(!p->on_cpu); ++ else ++ lockdep_assert_held(&p->pi_lock); + + queued = task_on_rq_queued(p); + running = task_current(rq, p); +@@ -1857,7 +2138,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) + if (running) + put_prev_task(rq, p); + +- p->sched_class->set_cpus_allowed(p, new_mask); ++ p->sched_class->set_cpus_allowed(p, new_mask, flags); + + if (queued) + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); +@@ -1865,6 +2146,222 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) + set_next_task(rq, p); + } + ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask, 0); ++} ++ ++/* ++ * This function is wildly self concurrent; here be dragons. 
++ * ++ * ++ * When given a valid mask, __set_cpus_allowed_ptr() must block until the ++ * designated task is enqueued on an allowed CPU. If that task is currently ++ * running, we have to kick it out using the CPU stopper. ++ * ++ * Migrate-Disable comes along and tramples all over our nice sandcastle. ++ * Consider: ++ * ++ * Initial conditions: P0->cpus_mask = [0, 1] ++ * ++ * P0@CPU0 P1 ++ * ++ * migrate_disable(); ++ * ++ * set_cpus_allowed_ptr(P0, [1]); ++ * ++ * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes ++ * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). ++ * This means we need the following scheme: ++ * ++ * P0@CPU0 P1 ++ * ++ * migrate_disable(); ++ * ++ * set_cpus_allowed_ptr(P0, [1]); ++ * ++ * ++ * migrate_enable(); ++ * __set_cpus_allowed_ptr(); ++ * ++ * `--> ++ * ++ * Now the fun stuff: there may be several P1-like tasks, i.e. multiple ++ * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any ++ * task p are serialized by p->pi_lock, which we can leverage: the one that ++ * should come into effect at the end of the Migrate-Disable region is the last ++ * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), ++ * but we still need to properly signal those waiting tasks at the appropriate ++ * moment. ++ * ++ * This is implemented using struct set_affinity_pending. The first ++ * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will ++ * setup an instance of that struct and install it on the targeted task_struct. ++ * Any and all further callers will reuse that instance. Those then wait for ++ * a completion signaled at the tail of the CPU stopper callback (1), triggered ++ * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). ++ * ++ * ++ * (1) In the cases covered above. There is one more where the completion is ++ * signaled within affine_move_task() itself: when a subsequent affinity request ++ * cancels the need for an active migration. Consider: ++ * ++ * Initial conditions: P0->cpus_mask = [0, 1] ++ * ++ * P0@CPU0 P1 P2 ++ * ++ * migrate_disable(); ++ * ++ * set_cpus_allowed_ptr(P0, [1]); ++ * ++ * set_cpus_allowed_ptr(P0, [0, 1]); ++ * ++ * ++ * ++ * Note that the above is safe vs a concurrent migrate_enable(), as any ++ * pending affinity completion is preceded an uninstallion of ++ * p->migration_pending done with p->pi_lock held. ++ */ ++static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, ++ int dest_cpu, unsigned int flags) ++{ ++ struct set_affinity_pending my_pending = { }, *pending = NULL; ++ bool stop_pending, complete = false; ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { ++ struct task_struct *push_task = NULL; ++ ++ if ((flags & SCA_MIGRATE_ENABLE) && ++ (p->migration_flags & MDF_PUSH) && !rq->push_busy) { ++ rq->push_busy = true; ++ push_task = get_task_struct(p); ++ } ++ ++ /* ++ * If there are pending waiters, but no pending stop_work, ++ * then complete now. 
++ */ ++ pending = p->migration_pending; ++ if (pending && !pending->stop_pending) { ++ p->migration_pending = NULL; ++ complete = true; ++ } ++ ++ task_rq_unlock(rq, p, rf); ++ ++ if (push_task) { ++ stop_one_cpu_nowait(rq->cpu, push_cpu_stop, ++ p, &rq->push_work); ++ } ++ ++ if (complete) ++ complete_all(&pending->done); ++ ++ return 0; ++ } ++ ++ if (!(flags & SCA_MIGRATE_ENABLE)) { ++ /* serialized by p->pi_lock */ ++ if (!p->migration_pending) { ++ /* Install the request */ ++ refcount_set(&my_pending.refs, 1); ++ init_completion(&my_pending.done); ++ my_pending.arg = (struct migration_arg) { ++ .task = p, ++ .dest_cpu = dest_cpu, ++ .pending = &my_pending, ++ }; ++ ++ p->migration_pending = &my_pending; ++ } else { ++ pending = p->migration_pending; ++ refcount_inc(&pending->refs); ++ /* ++ * Affinity has changed, but we've already installed a ++ * pending. migration_cpu_stop() *must* see this, else ++ * we risk a completion of the pending despite having a ++ * task on a disallowed CPU. ++ * ++ * Serialized by p->pi_lock, so this is safe. ++ */ ++ pending->arg.dest_cpu = dest_cpu; ++ } ++ } ++ pending = p->migration_pending; ++ /* ++ * - !MIGRATE_ENABLE: ++ * we'll have installed a pending if there wasn't one already. ++ * ++ * - MIGRATE_ENABLE: ++ * we're here because the current CPU isn't matching anymore, ++ * the only way that can happen is because of a concurrent ++ * set_cpus_allowed_ptr() call, which should then still be ++ * pending completion. ++ * ++ * Either way, we really should have a @pending here. ++ */ ++ if (WARN_ON_ONCE(!pending)) { ++ task_rq_unlock(rq, p, rf); ++ return -EINVAL; ++ } ++ ++ if (task_running(rq, p) || p->state == TASK_WAKING) { ++ /* ++ * MIGRATE_ENABLE gets here because 'p == current', but for ++ * anything else we cannot do is_migration_disabled(), punt ++ * and have the stopper function handle it all race-free. ++ */ ++ stop_pending = pending->stop_pending; ++ if (!stop_pending) ++ pending->stop_pending = true; ++ ++ if (flags & SCA_MIGRATE_ENABLE) ++ p->migration_flags &= ~MDF_PUSH; ++ ++ task_rq_unlock(rq, p, rf); ++ ++ if (!stop_pending) { ++ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, ++ &pending->arg, &pending->stop_work); ++ } ++ ++ if (flags & SCA_MIGRATE_ENABLE) ++ return 0; ++ } else { ++ ++ if (!is_migration_disabled(p)) { ++ if (task_on_rq_queued(p)) ++ rq = move_queued_task(rq, rf, p, dest_cpu); ++ ++ if (!pending->stop_pending) { ++ p->migration_pending = NULL; ++ complete = true; ++ } ++ } ++ task_rq_unlock(rq, p, rf); ++ ++ if (complete) ++ complete_all(&pending->done); ++ } ++ ++ wait_for_completion(&pending->done); ++ ++ if (refcount_dec_and_test(&pending->refs)) ++ wake_up_var(&pending->refs); /* No UaF, just an address */ ++ ++ /* ++ * Block the original owner of &pending until all subsequent callers ++ * have seen the completion and decremented the refcount ++ */ ++ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); ++ ++ /* ARGH */ ++ WARN_ON_ONCE(my_pending.stop_pending); ++ ++ return 0; ++} ++ + /* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on +@@ -1875,7 +2372,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) + * call is not atomic; no spinlocks may be held. 
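The "here be dragons" comment above boils down to one shared pending request per task: every concurrent set_cpus_allowed_ptr() caller overwrites the destination, and a single completion releases all of them once the stopper has acted on the final value. A rough userspace analogue with pthreads follows; names are hypothetical, and the kernel object additionally carries a refcount because it lives on the first caller's stack, which is left out here.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Shared "pending affinity change": many requesters, one completion, the
 * last destination written before completion wins. */
struct pending {
        pthread_mutex_t lock;
        pthread_cond_t  done_cv;
        bool            done;
        int             dest_cpu;
};

static struct pending pending = {
        .lock    = PTHREAD_MUTEX_INITIALIZER,
        .done_cv = PTHREAD_COND_INITIALIZER,
};

/* Requester: overwrite the destination, then wait for the single completion. */
static void *request_move(void *arg)
{
        pthread_mutex_lock(&pending.lock);
        pending.dest_cpu = (int)(long)arg;      /* last writer wins */
        while (!pending.done)
                pthread_cond_wait(&pending.done_cv, &pending.lock);
        pthread_mutex_unlock(&pending.lock);
        return NULL;
}

/* "Stopper": act on the final destination, release every waiter at once. */
static void complete_move(void)
{
        pthread_mutex_lock(&pending.lock);
        printf("migrating to CPU %d\n", pending.dest_cpu);
        pending.done = true;
        pthread_cond_broadcast(&pending.done_cv);
        pthread_mutex_unlock(&pending.lock);
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, request_move, (void *)1L);
        pthread_create(&t2, NULL, request_move, (void *)2L);
        sleep(1);               /* let both requests land first (demo only) */
        complete_move();
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        return 0;
}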
+ */ + static int __set_cpus_allowed_ptr(struct task_struct *p, +- const struct cpumask *new_mask, bool check) ++ const struct cpumask *new_mask, ++ u32 flags) + { + const struct cpumask *cpu_valid_mask = cpu_active_mask; + unsigned int dest_cpu; +@@ -1886,9 +2384,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + +- if (p->flags & PF_KTHREAD) { ++ if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { + /* +- * Kernel threads are allowed on online && !active CPUs ++ * Kernel threads are allowed on online && !active CPUs. ++ * ++ * Specifically, migration_disabled() tasks must not fail the ++ * cpumask_any_and_distribute() pick below, esp. so on ++ * SCA_MIGRATE_ENABLE, otherwise we'll not call ++ * set_cpus_allowed_common() and actually reset p->cpus_ptr. + */ + cpu_valid_mask = cpu_online_mask; + } +@@ -1897,13 +2400,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + * Must re-check here, to close a race against __kthread_bind(), + * sched_setaffinity() is not guaranteed to observe the flag. + */ +- if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { + ret = -EINVAL; + goto out; + } + +- if (cpumask_equal(&p->cpus_mask, new_mask)) +- goto out; ++ if (!(flags & SCA_MIGRATE_ENABLE)) { ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ if (WARN_ON_ONCE(p == current && ++ is_migration_disabled(p) && ++ !cpumask_test_cpu(task_cpu(p), new_mask))) { ++ ret = -EBUSY; ++ goto out; ++ } ++ } + + /* + * Picking a ~random cpu helps in cases where we are changing affinity +@@ -1916,7 +2428,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + goto out; + } + +- do_set_cpus_allowed(p, new_mask); ++ __do_set_cpus_allowed(p, new_mask, flags); + + if (p->flags & PF_KTHREAD) { + /* +@@ -1928,23 +2440,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + p->nr_cpus_allowed != 1); + } + +- /* Can the task run on the task's current CPU? If so, we're done */ +- if (cpumask_test_cpu(task_cpu(p), new_mask)) +- goto out; ++ return affine_move_task(rq, p, &rf, dest_cpu, flags); + +- if (task_running(rq, p) || p->state == TASK_WAKING) { +- struct migration_arg arg = { p, dest_cpu }; +- /* Need help from migration thread: drop lock and wait. */ +- task_rq_unlock(rq, p, &rf); +- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); +- return 0; +- } else if (task_on_rq_queued(p)) { +- /* +- * OK, since we're going to drop the lock immediately +- * afterwards anyway. +- */ +- rq = move_queued_task(rq, &rf, p, dest_cpu); +- } + out: + task_rq_unlock(rq, p, &rf); + +@@ -1953,7 +2450,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, + + int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) + { +- return __set_cpus_allowed_ptr(p, new_mask, false); ++ return __set_cpus_allowed_ptr(p, new_mask, 0); + } + EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +@@ -1994,6 +2491,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) + * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
+ */ + WARN_ON_ONCE(!cpu_online(new_cpu)); ++ ++ WARN_ON_ONCE(is_migration_disabled(p)); + #endif + + trace_sched_migrate_task(p, new_cpu); +@@ -2126,6 +2625,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, + } + #endif /* CONFIG_NUMA_BALANCING */ + ++static bool check_task_state(struct task_struct *p, long match_state) ++{ ++ bool match = false; ++ ++ raw_spin_lock_irq(&p->pi_lock); ++ if (p->state == match_state || p->saved_state == match_state) ++ match = true; ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ return match; ++} ++ + /* + * wait_task_inactive - wait for a thread to unschedule. + * +@@ -2170,7 +2681,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) + * is actually now running somewhere else! + */ + while (task_running(rq, p)) { +- if (match_state && unlikely(p->state != match_state)) ++ if (match_state && !check_task_state(p, match_state)) + return 0; + cpu_relax(); + } +@@ -2185,7 +2696,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) + running = task_running(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; +- if (!match_state || p->state == match_state) ++ if (!match_state || p->state == match_state || ++ p->saved_state == match_state) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &rf); + +@@ -2219,7 +2731,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) + ktime_t to = NSEC_PER_SEC / HZ; + + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); + continue; + } + +@@ -2324,6 +2836,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) + } + fallthrough; + case possible: ++ /* ++ * XXX When called from select_task_rq() we only ++ * hold p->pi_lock and again violate locking order. ++ * ++ * More yuck to audit. ++ */ + do_set_cpus_allowed(p, cpu_possible_mask); + state = fail; + break; +@@ -2358,7 +2876,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) + { + lockdep_assert_held(&p->pi_lock); + +- if (p->nr_cpus_allowed > 1) ++ if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + else + cpu = cpumask_any(p->cpus_ptr); +@@ -2381,6 +2899,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) + + void sched_set_stop_task(int cpu, struct task_struct *stop) + { ++ static struct lock_class_key stop_pi_lock; + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + +@@ -2396,6 +2915,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; ++ ++ /* ++ * The PI code calls rt_mutex_setprio() with ->pi_lock held to ++ * adjust the effective priority of a task. As a result, ++ * rt_mutex_setprio() can trigger (RT) balancing operations, ++ * which can then trigger wakeups of the stop thread to push ++ * around the current task. ++ * ++ * The stop task itself will never be part of the PI-chain, it ++ * never blocks, therefore that ->pi_lock recursion is safe. ++ * Tell lockdep about this by placing the stop->pi_lock in its ++ * own class. 
++ */ ++ lockdep_set_class(&stop->pi_lock, &stop_pi_lock); + } + + cpu_rq(cpu)->stop = stop; +@@ -2409,15 +2942,23 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) + } + } + +-#else ++#else /* CONFIG_SMP */ + + static inline int __set_cpus_allowed_ptr(struct task_struct *p, +- const struct cpumask *new_mask, bool check) ++ const struct cpumask *new_mask, ++ u32 flags) + { + return set_cpus_allowed_ptr(p, new_mask); + } + +-#endif /* CONFIG_SMP */ ++static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } ++ ++static inline bool rq_has_pinned_tasks(struct rq *rq) ++{ ++ return false; ++} ++ ++#endif /* !CONFIG_SMP */ + + static void + ttwu_stat(struct task_struct *p, int cpu, int wake_flags) +@@ -2838,7 +3379,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + int cpu, success = 0; + + preempt_disable(); +- if (p == current) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && p == current) { + /* + * We're waking current, this means 'p->on_rq' and 'task_cpu(p) + * == smp_processor_id()'. Together this means we can special +@@ -2868,8 +3409,26 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + smp_mb__after_spinlock(); +- if (!(p->state & state)) ++ if (!(p->state & state)) { ++ /* ++ * The task might be running due to a spinlock sleeper ++ * wakeup. Check the saved state and set it to running ++ * if the wakeup condition is true. ++ */ ++ if (!(wake_flags & WF_LOCK_SLEEPER)) { ++ if (p->saved_state & state) { ++ p->saved_state = TASK_RUNNING; ++ success = 1; ++ } ++ } + goto unlock; ++ } ++ /* ++ * If this is a regular wakeup, then we can unconditionally ++ * clear the saved state of a "lock sleeper". ++ */ ++ if (!(wake_flags & WF_LOCK_SLEEPER)) ++ p->saved_state = TASK_RUNNING; + + trace_sched_waking(p); + +@@ -3058,6 +3617,18 @@ int wake_up_process(struct task_struct *p) + } + EXPORT_SYMBOL(wake_up_process); + ++/** ++ * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" ++ * @p: The process to be woken up. ++ * ++ * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate ++ * the nature of the wakeup. 
++ */ ++int wake_up_lock_sleeper(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER); ++} ++ + int wake_up_state(struct task_struct *p, unsigned int state) + { + return try_to_wake_up(p, state, 0); +@@ -3111,6 +3682,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + init_numa_balancing(clone_flags, p); + #ifdef CONFIG_SMP + p->wake_entry.u_flags = CSD_TYPE_TTWU; ++ p->migration_pending = NULL; + #endif + } + +@@ -3316,6 +3888,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->on_cpu = 0; + #endif + init_task_preempt_count(p); ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(p)->preempt_lazy_count = 0; ++#endif + #ifdef CONFIG_SMP + plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +@@ -3494,51 +4069,135 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, + __fire_sched_out_preempt_notifiers(curr, next); + } + +-#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ * ++ * See the ttwu() WF_ON_CPU case and its ordering comment. ++ */ ++ WRITE_ONCE(next->on_cpu, 1); ++#endif ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * This must be the very last reference to @prev from this CPU. After ++ * p->on_cpu is cleared, the task can be moved to a different CPU. We ++ * must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
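The try_to_wake_up() changes earlier in this patch split the task state in two on PREEMPT_RT: p->state is what the scheduler acts on while the task sits on a sleeping spinlock, and p->saved_state keeps the caller-visible sleep state so a regular wakeup is not lost. A loose editorial model of the two wakeup paths (hypothetical constants, no locking, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

#define TASK_RUNNING            0x0000
#define TASK_INTERRUPTIBLE      0x0001
#define TASK_UNINTERRUPTIBLE    0x0002

struct task {
        unsigned int state;             /* what the scheduler sees */
        unsigned int saved_state;       /* what the sleeping caller asked for */
};

/* Regular wakeup: if the scheduler-visible state does not match, the task may
 * be parked on a sleeping lock; record the wakeup in saved_state so it is not
 * lost, but do not make the task runnable yet. */
static bool wake_up_state_model(struct task *p, unsigned int state)
{
        if (!(p->state & state)) {
                if (p->saved_state & state) {
                        p->saved_state = TASK_RUNNING;
                        return true;    /* delivered; the lock release wakes the task */
                }
                return false;
        }
        p->saved_state = TASK_RUNNING;  /* regular wakeup clears stale saved state */
        p->state = TASK_RUNNING;
        return true;
}

/* Lock-sleeper wakeup (WF_LOCK_SLEEPER path): only ends the wait for the
 * sleeping lock itself; the saved, caller-visible state is left alone. */
static bool wake_up_lock_sleeper_model(struct task *p)
{
        if (!(p->state & TASK_UNINTERRUPTIBLE))
                return false;
        p->state = TASK_RUNNING;
        return true;
}

int main(void)
{
        struct task p = { .state = TASK_UNINTERRUPTIBLE,        /* parked on a sleeping lock */
                          .saved_state = TASK_INTERRUPTIBLE };  /* what the caller asked for */

        printf("signal-style wakeup delivered: %d (task still waits for the lock)\n",
               wake_up_state_model(&p, TASK_INTERRUPTIBLE));
        printf("lock released, sleeper woken: %d\n", wake_up_lock_sleeper_model(&p));
        return 0;
}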
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++#ifdef CONFIG_SMP ++ ++static void do_balance_callbacks(struct rq *rq, struct callback_head *head) ++{ ++ void (*func)(struct rq *rq); ++ struct callback_head *next; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ while (head) { ++ func = (void (*)(struct rq *))head->func; ++ next = head->next; ++ head->next = NULL; ++ head = next; ++ ++ func(rq); ++ } ++} ++ ++static inline struct callback_head *splice_balance_callbacks(struct rq *rq) ++{ ++ struct callback_head *head = rq->balance_callback; ++ ++ lockdep_assert_held(&rq->lock); ++ if (head) { ++ rq->balance_callback = NULL; ++ rq->balance_flags &= ~BALANCE_WORK; ++ } ++ ++ return head; ++} ++ ++static void __balance_callbacks(struct rq *rq) ++{ ++ do_balance_callbacks(rq, splice_balance_callbacks(rq)); ++} ++ ++static inline void balance_callbacks(struct rq *rq, struct callback_head *head) ++{ ++ unsigned long flags; ++ ++ if (unlikely(head)) { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ do_balance_callbacks(rq, head); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++} ++ ++static void balance_push(struct rq *rq); ++ ++static inline void balance_switch(struct rq *rq) ++{ ++ if (likely(!rq->balance_flags)) ++ return; ++ ++ if (rq->balance_flags & BALANCE_PUSH) { ++ balance_push(rq); ++ return; ++ } ++ ++ __balance_callbacks(rq); ++} ++ ++#else + +-static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++static inline void __balance_callbacks(struct rq *rq) + { + } + +-static inline void +-fire_sched_out_preempt_notifiers(struct task_struct *curr, +- struct task_struct *next) ++static inline struct callback_head *splice_balance_callbacks(struct rq *rq) + { ++ return NULL; + } + +-#endif /* CONFIG_PREEMPT_NOTIFIERS */ +- +-static inline void prepare_task(struct task_struct *next) ++static inline void balance_callbacks(struct rq *rq, struct callback_head *head) + { +-#ifdef CONFIG_SMP +- /* +- * Claim the task as running, we do this before switching to it +- * such that any running task will have this set. +- * +- * See the ttwu() WF_ON_CPU case and its ordering comment. +- */ +- WRITE_ONCE(next->on_cpu, 1); +-#endif + } + +-static inline void finish_task(struct task_struct *prev) ++static inline void balance_switch(struct rq *rq) + { +-#ifdef CONFIG_SMP +- /* +- * This must be the very last reference to @prev from this CPU. After +- * p->on_cpu is cleared, the task can be moved to a different CPU. We +- * must ensure this doesn't happen until the switch is completely +- * finished. +- * +- * In particular, the load of prev->state in finish_task_switch() must +- * happen before this. +- * +- * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
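The balance-callback rework above replaces "run callbacks whenever rq->lock drops" with an explicit splice: the list is detached while rq->lock is held and handed to the caller, which runs it later (re-acquiring the lock in balance_callbacks()). The core list handling reduces to the following standalone sketch with hypothetical names and the locking elided:

#include <stddef.h>
#include <stdio.h>

struct callback_head {
        struct callback_head *next;
        void (*func)(struct callback_head *);
};

/* Detach the whole list; in the kernel this happens under rq->lock so the
 * detached head can be carried across an unlock and run later. */
static struct callback_head *splice_callbacks(struct callback_head **list)
{
        struct callback_head *head = *list;

        *list = NULL;
        return head;
}

static void run_callbacks(struct callback_head *head)
{
        while (head) {
                struct callback_head *next = head->next;

                head->next = NULL;
                head->func(head);
                head = next;
        }
}

static void hello(struct callback_head *cb)
{
        (void)cb;
        puts("balance callback ran");
}

int main(void)
{
        struct callback_head cb = { .next = NULL, .func = hello };
        struct callback_head *list = &cb;

        run_callbacks(splice_callbacks(&list));
        return 0;
}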
+- */ +- smp_store_release(&prev->on_cpu, 0); +-#endif + } + ++#endif ++ + static inline void + prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) + { +@@ -3564,6 +4223,7 @@ static inline void finish_lock_switch(struct rq *rq) + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ balance_switch(rq); + raw_spin_unlock_irq(&rq->lock); + } + +@@ -3579,6 +4239,22 @@ static inline void finish_lock_switch(struct rq *rq) + # define finish_arch_post_lock_switch() do { } while (0) + #endif + ++static inline void kmap_local_sched_out(void) ++{ ++#ifdef CONFIG_KMAP_LOCAL ++ if (unlikely(current->kmap_ctrl.idx)) ++ __kmap_local_sched_out(); ++#endif ++} ++ ++static inline void kmap_local_sched_in(void) ++{ ++#ifdef CONFIG_KMAP_LOCAL ++ if (unlikely(current->kmap_ctrl.idx)) ++ __kmap_local_sched_in(); ++#endif ++} ++ + /** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch +@@ -3601,6 +4277,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, + perf_event_task_sched_out(prev, next); + rseq_preempt(prev); + fire_sched_out_preempt_notifiers(prev, next); ++ kmap_local_sched_out(); + prepare_task(next); + prepare_arch_switch(next); + } +@@ -3668,6 +4345,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) + finish_lock_switch(rq); + finish_arch_post_lock_switch(); + kcov_finish_switch(current); ++ kmap_local_sched_in(); + + fire_sched_in_preempt_notifiers(current); + /* +@@ -3682,66 +4360,24 @@ static struct rq *finish_task_switch(struct task_struct *prev) + * provided by mmdrop(), + * - a sync_core for SYNC_CORE. + */ ++ /* ++ * We use mmdrop_delayed() here so we don't have to do the ++ * full __mmdrop() when we are the last user. ++ */ + if (mm) { + membarrier_mm_sync_core_before_usermode(mm); +- mmdrop(mm); ++ mmdrop_delayed(mm); + } + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); + +- /* +- * Remove function-return probe instances associated with this +- * task and put them back on the free list. +- */ +- kprobe_flush_task(prev); +- +- /* Task is done with its stack. */ +- put_task_stack(prev); +- + put_task_struct_rcu_user(prev); + } + + return rq; + } + +-#ifdef CONFIG_SMP +- +-/* rq->lock is NOT held, but preemption is disabled */ +-static void __balance_callback(struct rq *rq) +-{ +- struct callback_head *head, *next; +- void (*func)(struct rq *rq); +- unsigned long flags; +- +- raw_spin_lock_irqsave(&rq->lock, flags); +- head = rq->balance_callback; +- rq->balance_callback = NULL; +- while (head) { +- func = (void (*)(struct rq *))head->func; +- next = head->next; +- head->next = NULL; +- head = next; +- +- func(rq); +- } +- raw_spin_unlock_irqrestore(&rq->lock, flags); +-} +- +-static inline void balance_callback(struct rq *rq) +-{ +- if (unlikely(rq->balance_callback)) +- __balance_callback(rq); +-} +- +-#else +- +-static inline void balance_callback(struct rq *rq) +-{ +-} +- +-#endif +- + /** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. +@@ -3761,7 +4397,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) + */ + + rq = finish_task_switch(prev); +- balance_callback(rq); + preempt_enable(); + + if (current->set_child_tid) +@@ -4456,7 +5091,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * + * WARNING: must be called with preemption disabled! 
+ */ +-static void __sched notrace __schedule(bool preempt) ++static void __sched notrace __schedule(bool preempt, bool spinning_lock) + { + struct task_struct *prev, *next; + unsigned long *switch_count; +@@ -4509,7 +5144,7 @@ static void __sched notrace __schedule(bool preempt) + * - ptrace_{,un}freeze_traced() can change ->state underneath us. + */ + prev_state = prev->state; +- if (!preempt && prev_state) { ++ if ((!preempt || spinning_lock) && prev_state) { + if (signal_pending_state(prev_state, prev)) { + prev->state = TASK_RUNNING; + } else { +@@ -4544,6 +5179,7 @@ static void __sched notrace __schedule(bool preempt) + + next = pick_next_task(rq, prev, &rf); + clear_tsk_need_resched(prev); ++ clear_tsk_need_resched_lazy(prev); + clear_preempt_need_resched(); + + if (likely(prev != next)) { +@@ -4569,6 +5205,7 @@ static void __sched notrace __schedule(bool preempt) + */ + ++*switch_count; + ++ migrate_disable_switch(rq, prev); + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + + trace_sched_switch(preempt, prev, next); +@@ -4577,10 +5214,11 @@ static void __sched notrace __schedule(bool preempt) + rq = context_switch(rq, prev, next, &rf); + } else { + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); +- rq_unlock_irq(rq, &rf); +- } + +- balance_callback(rq); ++ rq_unpin_lock(rq, &rf); ++ __balance_callbacks(rq); ++ raw_spin_unlock_irq(&rq->lock); ++ } + } + + void __noreturn do_task_dead(void) +@@ -4591,7 +5229,7 @@ void __noreturn do_task_dead(void) + /* Tell freezer to ignore us: */ + current->flags |= PF_NOFREEZE; + +- __schedule(false); ++ __schedule(false, false); + BUG(); + + /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ +@@ -4624,9 +5262,6 @@ static inline void sched_submit_work(struct task_struct *tsk) + preempt_enable_no_resched(); + } + +- if (tsk_is_pi_blocked(tsk)) +- return; +- + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. +@@ -4652,7 +5287,7 @@ asmlinkage __visible void __sched schedule(void) + sched_submit_work(tsk); + do { + preempt_disable(); +- __schedule(false); ++ __schedule(false, false); + sched_preempt_enable_no_resched(); + } while (need_resched()); + sched_update_worker(tsk); +@@ -4680,7 +5315,7 @@ void __sched schedule_idle(void) + */ + WARN_ON_ONCE(current->state); + do { +- __schedule(false); ++ __schedule(false, false); + } while (need_resched()); + } + +@@ -4733,7 +5368,7 @@ static void __sched notrace preempt_schedule_common(void) + */ + preempt_disable_notrace(); + preempt_latency_start(1); +- __schedule(true); ++ __schedule(true, false); + preempt_latency_stop(1); + preempt_enable_no_resched_notrace(); + +@@ -4744,6 +5379,30 @@ static void __sched notrace preempt_schedule_common(void) + } while (need_resched()); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++/* ++ * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is ++ * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as ++ * preempt_lazy_count counter >0. 
++ */ ++static __always_inline int preemptible_lazy(void) ++{ ++ if (test_thread_flag(TIF_NEED_RESCHED)) ++ return 1; ++ if (current_thread_info()->preempt_lazy_count) ++ return 0; ++ return 1; ++} ++ ++#else ++ ++static inline int preemptible_lazy(void) ++{ ++ return 1; ++} ++ ++#endif ++ + #ifdef CONFIG_PREEMPTION + /* + * This is the entry point to schedule() from in-kernel preemption +@@ -4758,11 +5417,26 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) + if (likely(!preemptible())) + return; + ++ if (!preemptible_lazy()) ++ return; + preempt_schedule_common(); + } + NOKPROBE_SYMBOL(preempt_schedule); + EXPORT_SYMBOL(preempt_schedule); + ++#ifdef CONFIG_PREEMPT_RT ++void __sched notrace preempt_schedule_lock(void) ++{ ++ do { ++ preempt_disable(); ++ __schedule(true, true); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++} ++NOKPROBE_SYMBOL(preempt_schedule_lock); ++EXPORT_SYMBOL(preempt_schedule_lock); ++#endif ++ + #ifdef CONFIG_PREEMPT_DYNAMIC + DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); + EXPORT_STATIC_CALL(preempt_schedule); +@@ -4790,6 +5464,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) + if (likely(!preemptible())) + return; + ++ if (!preemptible_lazy()) ++ return; ++ + do { + /* + * Because the function tracer can trace preempt_count_sub() +@@ -4812,7 +5489,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) + * an infinite recursion. + */ + prev_ctx = exception_enter(); +- __schedule(true); ++ __schedule(true, false); + exception_exit(prev_ctx); + + preempt_latency_stop(1); +@@ -5030,7 +5707,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) + do { + preempt_disable(); + local_irq_enable(); +- __schedule(true); ++ __schedule(true, false); + local_irq_disable(); + sched_preempt_enable_no_resched(); + } while (need_resched()); +@@ -5196,9 +5873,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) + out_unlock: + /* Avoid rq from going away on us: */ + preempt_disable(); +- __task_rq_unlock(rq, &rf); + +- balance_callback(rq); ++ rq_unpin_lock(rq, &rf); ++ __balance_callbacks(rq); ++ raw_spin_unlock(&rq->lock); ++ + preempt_enable(); + } + #else +@@ -5441,6 +6120,7 @@ static int __sched_setscheduler(struct task_struct *p, + int oldpolicy = -1, policy = attr->sched_policy; + int retval, oldprio, newprio, queued, running; + const struct sched_class *prev_class; ++ struct callback_head *head; + struct rq_flags rf; + int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; +@@ -5695,6 +6375,7 @@ static int __sched_setscheduler(struct task_struct *p, + + /* Avoid rq from going away on us: */ + preempt_disable(); ++ head = splice_balance_callbacks(rq); + task_rq_unlock(rq, p, &rf); + + if (pi) { +@@ -5703,7 +6384,7 @@ static int __sched_setscheduler(struct task_struct *p, + } + + /* Run balance callbacks after we've adjusted the PI chain: */ +- balance_callback(rq); ++ balance_callbacks(rq, head); + preempt_enable(); + + return 0; +@@ -6198,7 +6879,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) + } + #endif + again: +- retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); + + if (!retval) { + cpuset_cpus_allowed(p, cpus_allowed); +@@ -6784,7 +7465,7 @@ void __init init_idle(struct task_struct *idle, int cpu) + * + * And since this is boot we can forgo the serialization. 
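The PREEMPT_LAZY helpers added above defer preemption requests from fair-class tasks while preempt_lazy_count is non-zero, but always honour a hard TIF_NEED_RESCHED. A simplified standalone model of that decision follows; the real preemptible_lazy() is only reached once some reschedule request is already pending, so the flag names and the final lazy check here are editorial assumptions.

#include <stdbool.h>
#include <stdio.h>

struct thread_model {
        bool need_resched;              /* TIF_NEED_RESCHED, e.g. an RT task woke up */
        bool need_resched_lazy;         /* TIF_NEED_RESCHED_LAZY, fair-class request */
        unsigned int preempt_lazy_count;/* preempt_lazy_disable() nesting depth      */
};

static bool may_preempt(const struct thread_model *t)
{
        if (t->need_resched)
                return true;            /* hard request always wins */
        if (t->preempt_lazy_count)
                return false;           /* inside a lazy-disabled section: defer */
        return t->need_resched_lazy;    /* lazy request honoured outside it */
}

int main(void)
{
        struct thread_model t = { .need_resched_lazy = true, .preempt_lazy_count = 1 };

        printf("lazy request inside lazy-disabled section: %s\n",
               may_preempt(&t) ? "preempt" : "defer");
        t.preempt_lazy_count = 0;
        printf("same request outside it: %s\n",
               may_preempt(&t) ? "preempt" : "defer");
        return 0;
}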
+ */ +- set_cpus_allowed_common(idle, cpumask_of(cpu)); ++ set_cpus_allowed_common(idle, cpumask_of(cpu), 0); + #endif + /* + * We're having a chicken and egg problem, even though we are +@@ -6811,7 +7492,9 @@ void __init init_idle(struct task_struct *idle, int cpu) + + /* Set the preempt count _outside_ the spinlocks! */ + init_idle_preempt_count(idle, cpu); +- ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(idle)->preempt_lazy_count = 0; ++#endif + /* + * The idle tasks have their own, simple scheduling class: + */ +@@ -6916,6 +7599,7 @@ void sched_setnuma(struct task_struct *p, int nid) + #endif /* CONFIG_NUMA_BALANCING */ + + #ifdef CONFIG_HOTPLUG_CPU ++ + /* + * Ensure that the idle task is using init_mm right before its CPU goes + * offline. +@@ -6935,119 +7619,126 @@ void idle_task_exit(void) + /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ + } + +-/* +- * Since this CPU is going 'away' for a while, fold any nr_active delta +- * we might have. Assumes we're called after migrate_tasks() so that the +- * nr_active count is stable. We need to take the teardown thread which +- * is calling this into account, so we hand in adjust = 1 to the load +- * calculation. +- * +- * Also see the comment "Global load-average calculations". +- */ +-static void calc_load_migrate(struct rq *rq) ++static int __balance_push_cpu_stop(void *arg) + { +- long delta = calc_load_fold_active(rq, 1); +- if (delta) +- atomic_long_add(delta, &calc_load_tasks); +-} ++ struct task_struct *p = arg; ++ struct rq *rq = this_rq(); ++ struct rq_flags rf; ++ int cpu; + +-static struct task_struct *__pick_migrate_task(struct rq *rq) +-{ +- const struct sched_class *class; +- struct task_struct *next; ++ raw_spin_lock_irq(&p->pi_lock); ++ rq_lock(rq, &rf); + +- for_each_class(class) { +- next = class->pick_next_task(rq); +- if (next) { +- next->sched_class->put_prev_task(rq, next); +- return next; +- } ++ update_rq_clock(rq); ++ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) { ++ cpu = select_fallback_rq(rq->cpu, p); ++ rq = __migrate_task(rq, &rf, p, cpu); + } + +- /* The idle class should always have a runnable task */ +- BUG(); ++ rq_unlock(rq, &rf); ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ put_task_struct(p); ++ ++ return 0; + } + ++static DEFINE_PER_CPU(struct cpu_stop_work, push_work); ++ + /* +- * Migrate all tasks from the rq, sleeping tasks will be migrated by +- * try_to_wake_up()->select_task_rq(). +- * +- * Called with rq->lock held even though we'er in stop_machine() and +- * there's no concurrency possible, we hold the required locks anyway +- * because of lock validation efforts. ++ * Ensure we only run per-cpu kthreads once the CPU goes !active. + */ +-static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) ++static void balance_push(struct rq *rq) + { +- struct rq *rq = dead_rq; +- struct task_struct *next, *stop = rq->stop; +- struct rq_flags orf = *rf; +- int dest_cpu; ++ struct task_struct *push_task = rq->curr; ++ ++ lockdep_assert_held(&rq->lock); ++ SCHED_WARN_ON(rq->cpu != smp_processor_id()); + + /* +- * Fudge the rq selection such that the below task selection loop +- * doesn't get stuck on the currently eligible stop task. +- * +- * We're currently inside stop_machine() and the rq is either stuck +- * in the stop_machine_cpu_stop() loop, or we're executing this code, +- * either way we should never end up calling schedule() until we're +- * done here. 
++ * Both the cpu-hotplug and stop task are in this case and are ++ * required to complete the hotplug process. + */ +- rq->stop = NULL; ++ if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { ++ /* ++ * If this is the idle task on the outgoing CPU try to wake ++ * up the hotplug control thread which might wait for the ++ * last task to vanish. The rcuwait_active() check is ++ * accurate here because the waiter is pinned on this CPU ++ * and can't obviously be running in parallel. ++ * ++ * On RT kernels this also has to check whether there are ++ * pinned and scheduled out tasks on the runqueue. They ++ * need to leave the migrate disabled section first. ++ */ ++ if (!rq->nr_running && !rq_has_pinned_tasks(rq) && ++ rcuwait_active(&rq->hotplug_wait)) { ++ raw_spin_unlock(&rq->lock); ++ rcuwait_wake_up(&rq->hotplug_wait); ++ raw_spin_lock(&rq->lock); ++ } ++ return; ++ } + ++ get_task_struct(push_task); + /* +- * put_prev_task() and pick_next_task() sched +- * class method both need to have an up-to-date +- * value of rq->clock[_task] ++ * Temporarily drop rq->lock such that we can wake-up the stop task. ++ * Both preemption and IRQs are still disabled. + */ +- update_rq_clock(rq); ++ raw_spin_unlock(&rq->lock); ++ stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, ++ this_cpu_ptr(&push_work)); ++ /* ++ * At this point need_resched() is true and we'll take the loop in ++ * schedule(). The next pick is obviously going to be the stop task ++ * which is_per_cpu_kthread() and will push this task away. ++ */ ++ raw_spin_lock(&rq->lock); ++} + +- for (;;) { +- /* +- * There's this thread running, bail when that's the only +- * remaining thread: +- */ +- if (rq->nr_running == 1) +- break; ++static void balance_push_set(int cpu, bool on) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; + +- next = __pick_migrate_task(rq); ++ rq_lock_irqsave(rq, &rf); ++ if (on) ++ rq->balance_flags |= BALANCE_PUSH; ++ else ++ rq->balance_flags &= ~BALANCE_PUSH; ++ rq_unlock_irqrestore(rq, &rf); ++} + +- /* +- * Rules for changing task_struct::cpus_mask are holding +- * both pi_lock and rq->lock, such that holding either +- * stabilizes the mask. +- * +- * Drop rq->lock is not quite as disastrous as it usually is +- * because !cpu_active at this point, which means load-balance +- * will not interfere. Also, stop-machine. +- */ +- rq_unlock(rq, rf); +- raw_spin_lock(&next->pi_lock); +- rq_relock(rq, rf); ++/* ++ * Invoked from a CPUs hotplug control thread after the CPU has been marked ++ * inactive. All tasks which are not per CPU kernel threads are either ++ * pushed off this CPU now via balance_push() or placed on a different CPU ++ * during wakeup. Wait until the CPU is quiescent. ++ */ ++static void balance_hotplug_wait(void) ++{ ++ struct rq *rq = this_rq(); + +- /* +- * Since we're inside stop-machine, _nothing_ should have +- * changed the task, WARN if weird stuff happened, because in +- * that case the above rq->lock drop is a fail too. +- */ +- if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { +- raw_spin_unlock(&next->pi_lock); +- continue; +- } ++ rcuwait_wait_event(&rq->hotplug_wait, ++ rq->nr_running == 1 && !rq_has_pinned_tasks(rq), ++ TASK_UNINTERRUPTIBLE); ++} + +- /* Find suitable destination for @next, with force if needed. 
*/ +- dest_cpu = select_fallback_rq(dead_rq->cpu, next); +- rq = __migrate_task(rq, rf, next, dest_cpu); +- if (rq != dead_rq) { +- rq_unlock(rq, rf); +- rq = dead_rq; +- *rf = orf; +- rq_relock(rq, rf); +- } +- raw_spin_unlock(&next->pi_lock); +- } ++#else ++ ++static inline void balance_push(struct rq *rq) ++{ ++} + +- rq->stop = stop; ++static inline void balance_push_set(int cpu, bool on) ++{ ++} ++ ++static inline void balance_hotplug_wait(void) ++{ + } ++ + #endif /* CONFIG_HOTPLUG_CPU */ + + void set_rq_online(struct rq *rq) +@@ -7133,6 +7824,8 @@ int sched_cpu_activate(unsigned int cpu) + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + ++ balance_push_set(cpu, false); ++ + #ifdef CONFIG_SCHED_SMT + /* + * When going up, increment the number of cores with SMT present. +@@ -7168,6 +7861,8 @@ int sched_cpu_activate(unsigned int cpu) + + int sched_cpu_deactivate(unsigned int cpu) + { ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; + int ret; + + set_cpu_active(cpu, false); +@@ -7180,6 +7875,16 @@ int sched_cpu_deactivate(unsigned int cpu) + */ + synchronize_rcu(); + ++ balance_push_set(cpu, true); ++ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ update_rq_clock(rq); ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ rq_unlock_irqrestore(rq, &rf); ++ + #ifdef CONFIG_SCHED_SMT + /* + * When going down, decrement the number of cores with SMT present. +@@ -7193,6 +7898,7 @@ int sched_cpu_deactivate(unsigned int cpu) + + ret = cpuset_cpu_inactive(cpu); + if (ret) { ++ balance_push_set(cpu, false); + set_cpu_active(cpu, true); + return ret; + } +@@ -7216,6 +7922,41 @@ int sched_cpu_starting(unsigned int cpu) + } + + #ifdef CONFIG_HOTPLUG_CPU ++ ++/* ++ * Invoked immediately before the stopper thread is invoked to bring the ++ * CPU down completely. At this point all per CPU kthreads except the ++ * hotplug thread (current) and the stopper thread (inactive) have been ++ * either parked or have been unbound from the outgoing CPU. Ensure that ++ * any of those which might be on the way out are gone. ++ * ++ * If after this point a bound task is being woken on this CPU then the ++ * responsible hotplug callback has failed to do it's job. ++ * sched_cpu_dying() will catch it with the appropriate fireworks. ++ */ ++int sched_cpu_wait_empty(unsigned int cpu) ++{ ++ balance_hotplug_wait(); ++ return 0; ++} ++ ++/* ++ * Since this CPU is going 'away' for a while, fold any nr_active delta we ++ * might have. Called from the CPU stopper task after ensuring that the ++ * stopper is the last running task on the CPU, so nr_active count is ++ * stable. We need to take the teardown thread which is calling this into ++ * account, so we hand in adjust = 1 to the load calculation. ++ * ++ * Also see the comment "Global load-average calculations". 
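
The hotplug hunks above replace migrate_tasks() with a push model: balance_push() chases remaining tasks off the outgoing CPU and wakes the hotplug control thread once only the idle/stop tasks are left, while balance_hotplug_wait() blocks until that condition holds. The pthread sketch below is a userspace analogue of that rcuwait handshake, with made-up names and counts; it is not the kernel API.

/*
 * Userspace analogue (pthreads, hypothetical names) of the hotplug
 * quiescence handshake: the "hotplug" thread waits until only one
 * runnable task is left; each departing "task" wakes the waiter.
 * Models rcuwait_wait_event()/rcuwait_wake_up(), not the kernel API.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t quiesced = PTHREAD_COND_INITIALIZER;
static int nr_running = 4;              /* 3 migratable tasks + the waiter */

static void *task_leaves_cpu(void *arg)
{
        (void)arg;
        usleep(1000);                   /* pretend to finish or migrate away */
        pthread_mutex_lock(&lock);
        nr_running--;                   /* balance_push(): one task gone */
        pthread_cond_signal(&quiesced); /* rcuwait_wake_up() */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t[3];

        for (int i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, task_leaves_cpu, NULL);

        pthread_mutex_lock(&lock);
        while (nr_running != 1)         /* rcuwait_wait_event() condition */
                pthread_cond_wait(&quiesced, &lock);
        pthread_mutex_unlock(&lock);
        printf("CPU quiescent, hotplug may proceed\n");

        for (int i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        return 0;
}
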
++ */ ++static void calc_load_migrate(struct rq *rq) ++{ ++ long delta = calc_load_fold_active(rq, 1); ++ ++ if (delta) ++ atomic_long_add(delta, &calc_load_tasks); ++} ++ + int sched_cpu_dying(unsigned int cpu) + { + struct rq *rq = cpu_rq(cpu); +@@ -7225,12 +7966,7 @@ int sched_cpu_dying(unsigned int cpu) + sched_tick_stop(cpu); + + rq_lock_irqsave(rq, &rf); +- if (rq->rd) { +- BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); +- set_rq_offline(rq); +- } +- migrate_tasks(rq, &rf); +- BUG_ON(rq->nr_running != 1); ++ BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); + rq_unlock_irqrestore(rq, &rf); + + calc_load_migrate(rq); +@@ -7440,6 +8176,9 @@ void __init sched_init(void) + + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); + #endif ++#ifdef CONFIG_HOTPLUG_CPU ++ rcuwait_init(&rq->hotplug_wait); ++#endif + #endif /* CONFIG_SMP */ + hrtick_rq_init(rq); + atomic_set(&rq->nr_iowait, 0); +@@ -7480,7 +8219,7 @@ void __init sched_init(void) + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + static inline int preempt_count_equals(int preempt_offset) + { +- int nested = preempt_count() + rcu_preempt_depth(); ++ int nested = preempt_count() + sched_rcu_preempt_depth(); + + return (nested == preempt_offset); + } +@@ -7577,6 +8316,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset) + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); + } + EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++#ifdef CONFIG_SMP ++void __cant_migrate(const char *file, int line) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (is_migration_disabled(current)) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > 0) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); ++ pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), is_migration_disabled(current), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_migrate); ++#endif + #endif + + #ifdef CONFIG_MAGIC_SYSRQ +diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c +index 8cb06c8c7..ceb03d76c 100644 +--- a/kernel/sched/cpudeadline.c ++++ b/kernel/sched/cpudeadline.c +@@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, + const struct sched_dl_entity *dl_se = &p->dl; + + if (later_mask && +- cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { ++ cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { + unsigned long cap, max_cap = 0; + int cpu, max_cpu = -1; + +@@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, + + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); + +- if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && ++ if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && + dl_time_before(dl_se->deadline, cp->elements[0].dl)) { + if (later_mask) + cpumask_set_cpu(best_cpu, later_mask); +diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c +index 0033731a0..11c4df201 100644 +--- a/kernel/sched/cpupri.c ++++ b/kernel/sched/cpupri.c +@@ -73,11 +73,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, + if (skip) + return 0; + +- if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) ++ if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) + return 0; + + if (lowest_mask) { +- cpumask_and(lowest_mask, 
p->cpus_ptr, vec->mask); ++ cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); + + /* + * We have to ensure that we have at least one bit +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index ca0eef7d3..02a5aa60f 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -44,12 +44,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, + } + + /* +- * Called before incrementing preempt_count on {soft,}irq_enter ++ * Called after incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. + */ +-void irqtime_account_irq(struct task_struct *curr) ++void irqtime_account_irq(struct task_struct *curr, unsigned int offset) + { + struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); ++ unsigned int pc; + s64 delta; + int cpu; + +@@ -59,6 +60,7 @@ void irqtime_account_irq(struct task_struct *curr) + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; + irqtime->irq_start_time += delta; ++ pc = irq_count() - offset; + + /* + * We do not account for softirq time from ksoftirqd here. +@@ -66,12 +68,11 @@ void irqtime_account_irq(struct task_struct *curr) + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ +- if (hardirq_count()) ++ if (pc & HARDIRQ_MASK) + irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); +- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) ++ else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) + irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); + } +-EXPORT_SYMBOL_GPL(irqtime_account_irq); + + static u64 irqtime_tick_accounted(u64 maxtime) + { +@@ -418,24 +419,21 @@ void vtime_task_switch(struct task_struct *prev) + } + # endif + +-/* +- * Archs that account the whole time spent in the idle task +- * (outside irq) as idle time can rely on this and just implement +- * vtime_account_kernel() and vtime_account_idle(). Archs that +- * have other meaning of the idle time (s390 only includes the +- * time spent by the CPU when it's in low power mode) must override +- * vtime_account(). 
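
The cputime hunk above classifies the elapsed slice by masking the adjusted preempt count with HARDIRQ_MASK or SOFTIRQ_OFFSET instead of calling hardirq_count()/in_serving_softirq(). The stand-alone sketch below decodes a sample count the same way; the mask values follow the mainline include/linux/preempt.h layout and are reproduced here only for illustration.

#include <stdio.h>

#define PREEMPT_MASK    0x000000ffu     /* bits  0- 7: preempt disable depth */
#define SOFTIRQ_MASK    0x0000ff00u     /* bits  8-15: softirq count */
#define HARDIRQ_MASK    0x000f0000u     /* bits 16-19: hardirq count */
#define SOFTIRQ_OFFSET  0x00000100u
#define HARDIRQ_OFFSET  0x00010000u

static const char *classify(unsigned int pc)
{
        if (pc & HARDIRQ_MASK)
                return "CPUTIME_IRQ";           /* hard interrupt time */
        if (pc & SOFTIRQ_OFFSET)
                return "CPUTIME_SOFTIRQ";       /* softirq processing time */
        return "task time";
}

int main(void)
{
        /* e.g. one hardirq nested over a softirq, preempt depth 1 */
        unsigned int pc = HARDIRQ_OFFSET + SOFTIRQ_OFFSET + 1;

        printf("pc=%#x -> %s (preempt depth %u)\n",
               pc, classify(pc), pc & PREEMPT_MASK);
        return 0;
}
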
+- */ +-#ifndef __ARCH_HAS_VTIME_ACCOUNT +-void vtime_account_irq_enter(struct task_struct *tsk) ++void vtime_account_irq(struct task_struct *tsk, unsigned int offset) + { +- if (!in_interrupt() && is_idle_task(tsk)) ++ unsigned int pc = irq_count() - offset; ++ ++ if (pc & HARDIRQ_OFFSET) { ++ vtime_account_hardirq(tsk); ++ } else if (pc & SOFTIRQ_OFFSET) { ++ vtime_account_softirq(tsk); ++ } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && ++ is_idle_task(tsk)) { + vtime_account_idle(tsk); +- else ++ } else { + vtime_account_kernel(tsk); ++ } + } +-EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +-#endif /* __ARCH_HAS_VTIME_ACCOUNT */ + + void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index 8255267ce..5ab09ef74 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -565,7 +565,7 @@ static int push_dl_task(struct rq *rq); + + static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) + { +- return dl_task(prev); ++ return rq->online && dl_task(prev); + } + + static DEFINE_PER_CPU(struct callback_head, dl_push_head); +@@ -1919,7 +1919,7 @@ static void task_fork_dl(struct task_struct *p) + static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) + { + if (!task_running(rq, p) && +- cpumask_test_cpu(cpu, p->cpus_ptr)) ++ cpumask_test_cpu(cpu, &p->cpus_mask)) + return 1; + return 0; + } +@@ -2009,8 +2009,8 @@ static int find_later_rq(struct task_struct *task) + return this_cpu; + } + +- best_cpu = cpumask_first_and(later_mask, +- sched_domain_span(sd)); ++ best_cpu = cpumask_any_and_distribute(later_mask, ++ sched_domain_span(sd)); + /* + * Last chance: if a CPU being in both later_mask + * and current sd span is valid, that becomes our +@@ -2032,7 +2032,7 @@ static int find_later_rq(struct task_struct *task) + if (this_cpu != -1) + return this_cpu; + +- cpu = cpumask_any(later_mask); ++ cpu = cpumask_any_distribute(later_mask); + if (cpu < nr_cpu_ids) + return cpu; + +@@ -2097,7 +2097,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) + */ + next_task = pick_next_pushable_dl_task(rq); + if (unlikely(next_task != task || +- !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr))) { ++ !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask))) { + double_unlock_balance(rq, later_rq); + later_rq = NULL; + break; +@@ -2141,6 +2141,9 @@ static int push_dl_task(struct rq *rq) + return 0; + + retry: ++ if (is_migration_disabled(next_task)) ++ return 0; ++ + if (WARN_ON(next_task == rq->curr)) + return 0; + +@@ -2218,7 +2221,7 @@ static void push_dl_tasks(struct rq *rq) + static void pull_dl_task(struct rq *this_rq) + { + int this_cpu = this_rq->cpu, cpu; +- struct task_struct *p; ++ struct task_struct *p, *push_task; + bool resched = false; + struct rq *src_rq; + u64 dmin = LONG_MAX; +@@ -2248,6 +2251,7 @@ static void pull_dl_task(struct rq *this_rq) + continue; + + /* Might drop this_rq->lock */ ++ push_task = NULL; + double_lock_balance(this_rq, src_rq); + + /* +@@ -2279,17 +2283,28 @@ static void pull_dl_task(struct rq *this_rq) + src_rq->curr->dl.deadline)) + goto skip; + +- resched = true; +- +- deactivate_task(src_rq, p, 0); +- set_task_cpu(p, this_cpu); +- activate_task(this_rq, p, 0); +- dmin = p->dl.deadline; ++ if (is_migration_disabled(p)) { ++ trace_sched_migrate_pull_tp(p); ++ push_task = get_push_task(src_rq); ++ } else { ++ deactivate_task(src_rq, p, 0); ++ set_task_cpu(p, this_cpu); ++ 
activate_task(this_rq, p, 0); ++ dmin = p->dl.deadline; ++ resched = true; ++ } + + /* Is there any other task even earlier? */ + } + skip: + double_unlock_balance(this_rq, src_rq); ++ ++ if (push_task) { ++ raw_spin_unlock(&this_rq->lock); ++ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, ++ push_task, &src_rq->push_work); ++ raw_spin_lock(&this_rq->lock); ++ } + } + + if (resched) +@@ -2313,7 +2328,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) + } + + static void set_cpus_allowed_dl(struct task_struct *p, +- const struct cpumask *new_mask) ++ const struct cpumask *new_mask, ++ u32 flags) + { + struct root_domain *src_rd; + struct rq *rq; +@@ -2342,7 +2358,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, + raw_spin_unlock(&src_dl_b->lock); + } + +- set_cpus_allowed_common(p, new_mask); ++ set_cpus_allowed_common(p, new_mask, flags); + } + + /* Assumes rq->lock is held */ +@@ -2537,6 +2553,7 @@ const struct sched_class dl_sched_class + .rq_online = rq_online_dl, + .rq_offline = rq_offline_dl, + .task_woken = task_woken_dl, ++ .find_lock_rq = find_lock_later_rq, + #endif + + .task_tick = task_tick_dl, +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 9d5c78016..5a1024f23 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -4499,7 +4499,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + ideal_runtime = sched_slice(cfs_rq, curr); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. +@@ -4523,7 +4523,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + return; + + if (delta > ideal_runtime) +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + } + + static void +@@ -4666,7 +4666,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) + * validating it and just reschedule. + */ + if (queued) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + return; + } + /* +@@ -4803,7 +4803,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) + * hierarchy can be throttled + */ + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + } + + static __always_inline +@@ -5552,7 +5552,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) + + if (delta < 0) { + if (rq->curr == p) +- resched_curr(rq); ++ resched_curr_lazy(rq); + return; + } + hrtick_start(rq, delta); +@@ -7161,7 +7161,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + return; + + preempt: +- resched_curr(rq); ++ resched_curr_lazy(rq); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved +@@ -11579,7 +11579,7 @@ static void task_fork_fair(struct task_struct *p) + * 'current' within the tree based on its new key value. 
+ */ + swap(curr->vruntime, se->vruntime); +- resched_curr(rq); ++ resched_curr_lazy(rq); + } + + se->vruntime -= cfs_rq->min_vruntime; +@@ -11606,7 +11606,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) + */ + if (rq->curr == p) { + if (p->prio > oldprio) +- resched_curr(rq); ++ resched_curr_lazy(rq); + } else + check_preempt_curr(rq, p, 0); + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 97ed11bd2..0dade2e74 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -45,11 +45,19 @@ SCHED_FEAT(DOUBLE_TICK, false) + */ + SCHED_FEAT(NONTASK_CAPACITY, true) + ++#ifdef CONFIG_PREEMPT_RT ++SCHED_FEAT(TTWU_QUEUE, false) ++# ifdef CONFIG_PREEMPT_LAZY ++SCHED_FEAT(PREEMPT_LAZY, true) ++# endif ++#else ++ + /* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ + SCHED_FEAT(TTWU_QUEUE, true) ++#endif + + /* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index 59c3e2094..46b93fe56 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -270,7 +270,7 @@ static void pull_rt_task(struct rq *this_rq); + static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) + { + /* Try to pull RT tasks here if we lower this rq's prio */ +- return rq->rt.highest_prio.curr > prev->prio; ++ return rq->online && rq->rt.highest_prio.curr > prev->prio; + } + + static inline int rt_overloaded(struct rq *rq) +@@ -1665,7 +1665,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) + static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) + { + if (!task_running(rq, p) && +- cpumask_test_cpu(cpu, p->cpus_ptr)) ++ cpumask_test_cpu(cpu, &p->cpus_mask)) + return 1; + + return 0; +@@ -1759,8 +1759,8 @@ static int find_lowest_rq(struct task_struct *task) + return this_cpu; + } + +- best_cpu = cpumask_first_and(lowest_mask, +- sched_domain_span(sd)); ++ best_cpu = cpumask_any_and_distribute(lowest_mask, ++ sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) { + rcu_read_unlock(); + return best_cpu; +@@ -1777,7 +1777,7 @@ static int find_lowest_rq(struct task_struct *task) + if (this_cpu != -1) + return this_cpu; + +- cpu = cpumask_any(lowest_mask); ++ cpu = cpumask_any_distribute(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + +@@ -1838,7 +1838,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) + */ + struct task_struct *next_task = pick_next_pushable_task(rq); + if (unlikely(next_task != task || +- !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr))) { ++ !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask))) { + double_unlock_balance(rq, lowest_rq); + lowest_rq = NULL; + break; +@@ -1862,7 +1862,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) + * running task can migrate over to a CPU that is running a task + * of lesser priority. 
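
The fair.c and features.h hunks above route slice-expiry reschedules through resched_curr_lazy(), which under CONFIG_PREEMPT_LAZY sets a separate "lazy" flag instead of forcing an immediate preemption. The toy model below (plain C, hypothetical names) shows only the policy difference between the two flags, not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

struct thread_info {
        bool need_resched;      /* TIF_NEED_RESCHED: preempt right away */
        bool need_resched_lazy; /* TIF_NEED_RESCHED_LAZY: defer for fair tasks */
};

static void resched_curr(struct thread_info *ti)      { ti->need_resched = true; }
static void resched_curr_lazy(struct thread_info *ti) { ti->need_resched_lazy = true; }

/* Interrupt return path: only the hard flag preempts immediately. */
static bool irq_exit_should_preempt(const struct thread_info *ti)
{
        return ti->need_resched;
}

/* Explicit preemption point (e.g. return to user): both flags count. */
static bool preemption_point_should_schedule(const struct thread_info *ti)
{
        return ti->need_resched || ti->need_resched_lazy;
}

int main(void)
{
        struct thread_info ti = { false, false };

        resched_curr_lazy(&ti); /* fair task exceeded its slice */
        printf("irq exit preempts:       %d\n", irq_exit_should_preempt(&ti));
        printf("preemption point yields: %d\n", preemption_point_should_schedule(&ti));

        resched_curr(&ti);      /* RT task woke up: immediate */
        printf("irq exit preempts now:   %d\n", irq_exit_should_preempt(&ti));
        return 0;
}
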
+ */ +-static int push_rt_task(struct rq *rq) ++static int push_rt_task(struct rq *rq, bool pull) + { + struct task_struct *next_task; + struct rq *lowest_rq; +@@ -1876,6 +1876,39 @@ static int push_rt_task(struct rq *rq) + return 0; + + retry: ++ if (is_migration_disabled(next_task)) { ++ struct task_struct *push_task = NULL; ++ int cpu; ++ ++ if (!pull) ++ return 0; ++ ++ trace_sched_migrate_pull_tp(next_task); ++ ++ if (rq->push_busy) ++ return 0; ++ ++ cpu = find_lowest_rq(rq->curr); ++ if (cpu == -1 || cpu == rq->cpu) ++ return 0; ++ ++ /* ++ * Given we found a CPU with lower priority than @next_task, ++ * therefore it should be running. However we cannot migrate it ++ * to this other CPU, instead attempt to push the current ++ * running task on this CPU away. ++ */ ++ push_task = get_push_task(rq); ++ if (push_task) { ++ raw_spin_unlock(&rq->lock); ++ stop_one_cpu_nowait(rq->cpu, push_cpu_stop, ++ push_task, &rq->push_work); ++ raw_spin_lock(&rq->lock); ++ } ++ ++ return 0; ++ } ++ + if (WARN_ON(next_task == rq->curr)) + return 0; + +@@ -1930,12 +1963,10 @@ static int push_rt_task(struct rq *rq) + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, lowest_rq->cpu); + activate_task(lowest_rq, next_task, 0); +- ret = 1; +- + resched_curr(lowest_rq); ++ ret = 1; + + double_unlock_balance(rq, lowest_rq); +- + out: + put_task_struct(next_task); + +@@ -1945,7 +1976,7 @@ static int push_rt_task(struct rq *rq) + static void push_rt_tasks(struct rq *rq) + { + /* push_rt_task will return true if it moved an RT */ +- while (push_rt_task(rq)) ++ while (push_rt_task(rq, false)) + ; + } + +@@ -2098,7 +2129,8 @@ void rto_push_irq_work_func(struct irq_work *work) + */ + if (has_pushable_tasks(rq)) { + raw_spin_lock(&rq->lock); +- push_rt_tasks(rq); ++ while (push_rt_task(rq, true)) ++ ; + raw_spin_unlock(&rq->lock); + } + +@@ -2123,7 +2155,7 @@ static void pull_rt_task(struct rq *this_rq) + { + int this_cpu = this_rq->cpu, cpu; + bool resched = false; +- struct task_struct *p; ++ struct task_struct *p, *push_task; + struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); + +@@ -2170,6 +2202,7 @@ static void pull_rt_task(struct rq *this_rq) + * double_lock_balance, and another CPU could + * alter this_rq + */ ++ push_task = NULL; + double_lock_balance(this_rq, src_rq); + + /* +@@ -2197,11 +2230,15 @@ static void pull_rt_task(struct rq *this_rq) + if (p->prio < src_rq->curr->prio) + goto skip; + +- resched = true; +- +- deactivate_task(src_rq, p, 0); +- set_task_cpu(p, this_cpu); +- activate_task(this_rq, p, 0); ++ if (is_migration_disabled(p)) { ++ trace_sched_migrate_pull_tp(p); ++ push_task = get_push_task(src_rq); ++ } else { ++ deactivate_task(src_rq, p, 0); ++ set_task_cpu(p, this_cpu); ++ activate_task(this_rq, p, 0); ++ resched = true; ++ } + /* + * We continue with the search, just in + * case there's an even higher prio task +@@ -2211,6 +2248,13 @@ static void pull_rt_task(struct rq *this_rq) + } + skip: + double_unlock_balance(this_rq, src_rq); ++ ++ if (push_task) { ++ raw_spin_unlock(&this_rq->lock); ++ stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, ++ push_task, &src_rq->push_work); ++ raw_spin_lock(&this_rq->lock); ++ } + } + + if (resched) +@@ -2459,6 +2503,7 @@ const struct sched_class rt_sched_class + .rq_offline = rq_offline_rt, + .task_woken = task_woken_rt, + .switched_from = switched_from_rt, ++ .find_lock_rq = find_lock_lowest_rq, + #endif + + .task_tick = task_tick_rt, +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 0d40bb700..adace5cf0 
100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1012,6 +1012,7 @@ struct rq { + unsigned long cpu_capacity_orig; + + struct callback_head *balance_callback; ++ unsigned char balance_flags; + + unsigned char nohz_idle_balance; + unsigned char idle_balance; +@@ -1042,6 +1043,10 @@ struct rq { + + /* This is used to determine avg_idle's max value */ + u64 max_idle_balance_cost; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ struct rcuwait hotplug_wait; ++#endif + #endif /* CONFIG_SMP */ + + #ifdef CONFIG_IRQ_TIME_ACCOUNTING +@@ -1100,6 +1105,11 @@ struct rq { + struct cpuidle_state *idle_state; + #endif + ++#ifdef CONFIG_SMP ++ unsigned int nr_pinned; ++#endif ++ unsigned int push_busy; ++ struct cpu_stop_work push_work; + KABI_RESERVE(1) + KABI_RESERVE(2) + KABI_RESERVE(3) +@@ -1135,6 +1145,17 @@ static inline int cpu_of(struct rq *rq) + #endif + } + ++#define MDF_PUSH 0x01 ++ ++static inline bool is_migration_disabled(struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->migration_disabled; ++#else ++ return false; ++#endif ++} ++ + #ifdef CONFIG_QOS_SCHED + enum task_qos_level { + QOS_LEVEL_OFFLINE = -1, +@@ -1269,6 +1290,12 @@ struct rq_flags { + */ + unsigned int clock_update_flags; + #endif ++ ++#ifdef CONFIG_SMP ++ unsigned int nr_pinned; ++#endif ++ unsigned int push_busy; ++ struct cpu_stop_work push_work; + }; + + /* +@@ -1289,6 +1316,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; + #endif ++#ifdef CONFIG_SMP ++ SCHED_WARN_ON(rq->balance_callback); ++#endif + } + + static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +@@ -1454,6 +1484,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #ifdef CONFIG_SMP + ++#define BALANCE_WORK 0x01 ++#define BALANCE_PUSH 0x02 ++ + static inline void + queue_balance_callback(struct rq *rq, + struct callback_head *head, +@@ -1461,12 +1494,12 @@ queue_balance_callback(struct rq *rq, + { + lockdep_assert_held(&rq->lock); + +- if (unlikely(head->next)) ++ if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) + return; + + head->func = (void (*)(struct callback_head *))func; + head->next = rq->balance_callback; +- rq->balance_callback = head; ++ rq->balance_flags |= BALANCE_WORK; + } + + #define rcu_dereference_check_sched_domain(p) \ +@@ -1791,6 +1824,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) + #define WF_FORK 0x02 /* Child wakeup after fork */ + #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ + #define WF_ON_CPU 0x08 /* Wakee is on_cpu */ ++#define WF_LOCK_SLEEPER 0x10 /* Wakeup spinlock "sleeper" */ + + /* + * To aid in avoiding the subversion of "niceness" due to uneven distribution +@@ -1872,10 +1906,13 @@ struct sched_class { + void (*task_woken)(struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, +- const struct cpumask *newmask); ++ const struct cpumask *newmask, ++ u32 flags); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); ++ ++ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); + #endif + + void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); +@@ -1962,13 +1999,38 @@ static inline bool sched_fair_runnable(struct rq *rq) + extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + extern struct task_struct *pick_next_task_idle(struct rq *rq); + ++#define SCA_CHECK 0x01 
++#define SCA_MIGRATE_DISABLE 0x02 ++#define SCA_MIGRATE_ENABLE 0x04 ++ + #ifdef CONFIG_SMP + + extern void update_group_capacity(struct sched_domain *sd, int cpu); + + extern void trigger_load_balance(struct rq *rq); + +-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); ++ ++static inline struct task_struct *get_push_task(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (rq->push_busy) ++ return NULL; ++ ++ if (p->nr_cpus_allowed == 1) ++ return NULL; ++ ++ if (p->migration_disabled) ++ return NULL; ++ ++ rq->push_busy = true; ++ return get_task_struct(p); ++} ++ ++extern int push_cpu_stop(void *arg); + + #endif + +@@ -2012,6 +2074,15 @@ extern void reweight_task(struct task_struct *p, int prio); + extern void resched_curr(struct rq *rq); + extern void resched_cpu(int cpu); + ++#ifdef CONFIG_PREEMPT_LAZY ++extern void resched_curr_lazy(struct rq *rq); ++#else ++static inline void resched_curr_lazy(struct rq *rq) ++{ ++ resched_curr(rq); ++} ++#endif ++ + extern struct rt_bandwidth def_rt_bandwidth; + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +@@ -2374,7 +2445,6 @@ extern void nohz_balance_exit_idle(struct rq *rq); + static inline void nohz_balance_exit_idle(struct rq *rq) { } + #endif + +- + #ifdef CONFIG_SMP + static inline + void __dl_update(struct dl_bw *dl_b, s64 bw) +diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c +index e1c655f92..f230b1ac7 100644 +--- a/kernel/sched/swait.c ++++ b/kernel/sched/swait.c +@@ -64,6 +64,7 @@ void swake_up_all(struct swait_queue_head *q) + struct swait_queue *curr; + LIST_HEAD(tmp); + ++ WARN_ON(irqs_disabled()); + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 9b4e3b25d..9a62e1b59 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -529,6 +529,7 @@ static int init_rootdomain(struct root_domain *rd) + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); + init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); ++ atomic_or(IRQ_WORK_HARD_IRQ, &rd->rto_push_work.node.a_flags); + #endif + + init_dl_bw(&rd->dl_bw); +diff --git a/kernel/signal.c b/kernel/signal.c +index 54f86e0b9..28d34857e 100644 +--- a/kernel/signal.c ++++ b/kernel/signal.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -406,13 +407,30 @@ void task_join_group_stop(struct task_struct *task) + task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); + } + ++static inline struct sigqueue *get_task_cache(struct task_struct *t) ++{ ++ struct sigqueue *q = t->sigqueue_cache; ++ ++ if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) ++ return NULL; ++ return q; ++} ++ ++static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) ++{ ++ if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) ++ return 0; ++ return 1; ++} ++ + /* + * allocate a new signal queue record + * - this may be called without locks if and only if t == current, otherwise an + * appropriate lock must be held to stop the target task from exiting + */ + static struct sigqueue * +-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) ++__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, ++ int override_rlimit, int fromslab) + { + 
struct sigqueue *q = NULL; + struct user_struct *user; +@@ -434,7 +452,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi + rcu_read_unlock(); + + if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { +- q = kmem_cache_alloc(sigqueue_cachep, flags); ++ if (!fromslab) ++ q = get_task_cache(t); ++ if (!q) ++ q = kmem_cache_alloc(sigqueue_cachep, flags); + } else { + print_dropped_signal(sig); + } +@@ -451,6 +472,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi + return q; + } + ++static struct sigqueue * ++__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, ++ int override_rlimit) ++{ ++ return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); ++} ++ + static void __sigqueue_free(struct sigqueue *q) + { + if (q->flags & SIGQUEUE_PREALLOC) +@@ -460,6 +488,21 @@ static void __sigqueue_free(struct sigqueue *q) + kmem_cache_free(sigqueue_cachep, q); + } + ++static void sigqueue_free_current(struct sigqueue *q) ++{ ++ struct user_struct *up; ++ ++ if (q->flags & SIGQUEUE_PREALLOC) ++ return; ++ ++ up = q->user; ++ if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { ++ if (atomic_dec_and_test(&up->sigpending)) ++ free_uid(up); ++ } else ++ __sigqueue_free(q); ++} ++ + void flush_sigqueue(struct sigpending *queue) + { + struct sigqueue *q; +@@ -472,6 +515,21 @@ void flush_sigqueue(struct sigpending *queue) + } + } + ++/* ++ * Called from __exit_signal. Flush tsk->pending and ++ * tsk->sigqueue_cache ++ */ ++void flush_task_sigqueue(struct task_struct *tsk) ++{ ++ struct sigqueue *q; ++ ++ flush_sigqueue(&tsk->pending); ++ ++ q = get_task_cache(tsk); ++ if (q) ++ kmem_cache_free(sigqueue_cachep, q); ++} ++ + /* + * Flush all pending signals for this kthread. + */ +@@ -596,7 +654,7 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i + (info->si_code == SI_TIMER) && + (info->si_sys_private); + +- __sigqueue_free(first); ++ sigqueue_free_current(first); + } else { + /* + * Ok, it wasn't in the queue. This must be +@@ -633,6 +691,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in + bool resched_timer = false; + int signr; + ++ WARN_ON_ONCE(tsk != current); ++ + /* We only dequeue private signals from ourselves, we don't let + * signalfd steal them + */ +@@ -1319,6 +1379,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) + struct k_sigaction *action; + int sig = info->si_signo; + ++ /* ++ * On some archs, PREEMPT_RT has to delay sending a signal from a trap ++ * since it can not enable preemption, and the signal code's spin_locks ++ * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will ++ * send the signal on exit of the trap. 
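
get_task_cache()/put_task_cache() in the signal hunks above keep one sigqueue entry per task and claim or refill it with cmpxchg() so the RT fast path can skip the slab allocator. Below is a minimal userspace sketch of that single-slot cache using C11 atomics; the types and the main() scenario are invented for illustration.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct sigqueue { int sig; };

static _Atomic(struct sigqueue *) sigqueue_cache;       /* one slot per "task" */

static struct sigqueue *get_task_cache(void)
{
        struct sigqueue *q = atomic_load(&sigqueue_cache);

        /* Claim the cached entry; NULL means the slot was empty or raced. */
        if (!q || !atomic_compare_exchange_strong(&sigqueue_cache, &q, NULL))
                return NULL;
        return q;
}

static int put_task_cache(struct sigqueue *q)
{
        struct sigqueue *expected = NULL;

        /* 0: cached for reuse, 1: slot already full, caller must free. */
        return atomic_compare_exchange_strong(&sigqueue_cache, &expected, q) ? 0 : 1;
}

int main(void)
{
        struct sigqueue *q = malloc(sizeof(*q));

        if (put_task_cache(q))                  /* cache it instead of freeing */
                free(q);

        struct sigqueue *r = get_task_cache();  /* fast-path "allocation" */
        printf("reused cached entry: %s\n", r == q ? "yes" : "no");
        free(r);
        return 0;
}
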
++ */ ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (in_atomic()) { ++ struct task_struct *t = current; ++ ++ if (WARN_ON_ONCE(t->forced_info.si_signo)) ++ return 0; ++ ++ if (is_si_special(info)) { ++ WARN_ON_ONCE(info != SEND_SIG_PRIV); ++ t->forced_info.si_signo = info->si_signo; ++ t->forced_info.si_errno = 0; ++ t->forced_info.si_code = SI_KERNEL; ++ t->forced_info.si_pid = 0; ++ t->forced_info.si_uid = 0; ++ } else { ++ t->forced_info = *info; ++ } ++ ++ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); ++ return 0; ++ } ++#endif + spin_lock_irqsave(&t->sighand->siglock, flags); + action = &t->sighand->action[sig-1]; + ignored = action->sa.sa_handler == SIG_IGN; +@@ -1812,7 +1900,8 @@ EXPORT_SYMBOL(kill_pid); + */ + struct sigqueue *sigqueue_alloc(void) + { +- struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); ++ /* Preallocated sigqueue objects always from the slabcache ! */ ++ struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); + + if (q) + q->flags |= SIGQUEUE_PREALLOC; +@@ -2198,16 +2287,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t + if (gstop_done && ptrace_reparented(current)) + do_notify_parent_cldstop(current, false, why); + +- /* +- * Don't want to allow preemption here, because +- * sys_ptrace() needs this task to be inactive. +- * +- * XXX: implement read_unlock_no_resched(). +- */ +- preempt_disable(); + read_unlock(&tasklist_lock); + cgroup_enter_frozen(); +- preempt_enable_no_resched(); + freezable_schedule(); + cgroup_leave_frozen(true); + } else { +diff --git a/kernel/smp.c b/kernel/smp.c +index b04ab01eb..31269d781 100644 +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -480,8 +480,18 @@ void flush_smp_call_function_from_idle(void) + + local_irq_save(flags); + flush_smp_call_function_queue(true); +- if (local_softirq_pending()) +- do_softirq(); ++ ++ if (local_softirq_pending()) { ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ do_softirq(); ++ } else { ++ struct task_struct *ksoftirqd = this_cpu_ksoftirqd(); ++ ++ if (ksoftirqd && ksoftirqd->state != TASK_RUNNING) ++ wake_up_process(ksoftirqd); ++ } ++ } + + local_irq_restore(flags); + } +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 09229ad82..c9adc5c46 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -25,6 +26,7 @@ + #include + #include + #include ++#include + + #define CREATE_TRACE_POINTS + #include +@@ -92,27 +94,212 @@ static bool ksoftirqd_running(unsigned long pending) + !__kthread_should_park(tsk); + } + ++#ifdef CONFIG_TRACE_IRQFLAGS ++DEFINE_PER_CPU(int, hardirqs_enabled); ++DEFINE_PER_CPU(int, hardirq_context); ++EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); ++EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); ++#endif ++ + /* +- * preempt_count and SOFTIRQ_OFFSET usage: +- * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving +- * softirq processing. +- * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) ++ * SOFTIRQ_OFFSET usage: ++ * ++ * On !RT kernels 'count' is the preempt counter, on RT kernels this applies ++ * to a per CPU counter and to task::softirqs_disabled_cnt. ++ * ++ * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq ++ * processing. ++ * ++ * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) + * on local_bh_disable or local_bh_enable. 
++ * + * This lets us distinguish between whether we are currently processing + * softirq and whether we just have bh disabled. + */ ++#ifdef CONFIG_PREEMPT_RT + + /* +- * This one is for softirq.c-internal use, +- * where hardirqs are disabled legitimately: ++ * RT accounts for BH disabled sections in task::softirqs_disabled_cnt and ++ * also in per CPU softirq_ctrl::cnt. This is necessary to allow tasks in a ++ * softirq disabled section to be preempted. ++ * ++ * The per task counter is used for softirq_count(), in_softirq() and ++ * in_serving_softirqs() because these counts are only valid when the task ++ * holding softirq_ctrl::lock is running. ++ * ++ * The per CPU counter prevents pointless wakeups of ksoftirqd in case that ++ * the task which is in a softirq disabled section is preempted or blocks. + */ +-#ifdef CONFIG_TRACE_IRQFLAGS ++struct softirq_ctrl { ++ local_lock_t lock; ++ int cnt; ++}; + +-DEFINE_PER_CPU(int, hardirqs_enabled); +-DEFINE_PER_CPU(int, hardirq_context); +-EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +-EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); ++static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { ++ .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), ++}; ++ ++/** ++ * local_bh_blocked() - Check for idle whether BH processing is blocked ++ * ++ * Returns false if the per CPU softirq::cnt is 0 otherwise true. ++ * ++ * This is invoked from the idle task to guard against false positive ++ * softirq pending warnings, which would happen when the task which holds ++ * softirq_ctrl::lock was the only running task on the CPU and blocks on ++ * some other lock. ++ */ ++bool local_bh_blocked(void) ++{ ++ return __this_cpu_read(softirq_ctrl.cnt) != 0; ++} ++ ++void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) ++{ ++ unsigned long flags; ++ int newcnt; ++ ++ WARN_ON_ONCE(in_hardirq()); ++ ++ /* First entry of a task into a BH disabled section? */ ++ if (!current->softirq_disable_cnt) { ++ if (preemptible()) { ++ local_lock(&softirq_ctrl.lock); ++ /* Required to meet the RCU bottomhalf requirements. */ ++ rcu_read_lock(); ++ } else { ++ DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt)); ++ } ++ } ++ ++ /* ++ * Track the per CPU softirq disabled state. On RT this is per CPU ++ * state to allow preemption of bottom half disabled sections. ++ */ ++ newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt); ++ /* ++ * Reflect the result in the task state to prevent recursion on the ++ * local lock and to make softirq_count() & al work. 
++ */ ++ current->softirq_disable_cnt = newcnt; ++ ++ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { ++ raw_local_irq_save(flags); ++ lockdep_softirqs_off(ip); ++ raw_local_irq_restore(flags); ++ } ++} ++EXPORT_SYMBOL(__local_bh_disable_ip); ++ ++static void __local_bh_enable(unsigned int cnt, bool unlock) ++{ ++ unsigned long flags; ++ int newcnt; ++ ++ DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != ++ this_cpu_read(softirq_ctrl.cnt)); ++ ++ if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) { ++ raw_local_irq_save(flags); ++ lockdep_softirqs_on(_RET_IP_); ++ raw_local_irq_restore(flags); ++ } ++ ++ newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt); ++ current->softirq_disable_cnt = newcnt; ++ ++ if (!newcnt && unlock) { ++ rcu_read_unlock(); ++ local_unlock(&softirq_ctrl.lock); ++ } ++} ++ ++void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) ++{ ++ bool preempt_on = preemptible(); ++ unsigned long flags; ++ u32 pending; ++ int curcnt; ++ ++ WARN_ON_ONCE(in_irq()); ++ lockdep_assert_irqs_enabled(); ++ ++ local_irq_save(flags); ++ curcnt = __this_cpu_read(softirq_ctrl.cnt); ++ ++ /* ++ * If this is not reenabling soft interrupts, no point in trying to ++ * run pending ones. ++ */ ++ if (curcnt != cnt) ++ goto out; ++ ++ pending = local_softirq_pending(); ++ if (!pending || ksoftirqd_running(pending)) ++ goto out; ++ ++ /* ++ * If this was called from non preemptible context, wake up the ++ * softirq daemon. ++ */ ++ if (!preempt_on) { ++ wakeup_softirqd(); ++ goto out; ++ } ++ ++ /* ++ * Adjust softirq count to SOFTIRQ_OFFSET which makes ++ * in_serving_softirq() become true. ++ */ ++ cnt = SOFTIRQ_OFFSET; ++ __local_bh_enable(cnt, false); ++ __do_softirq(); ++ ++out: ++ __local_bh_enable(cnt, preempt_on); ++ local_irq_restore(flags); ++} ++EXPORT_SYMBOL(__local_bh_enable_ip); ++ ++/* ++ * Invoked from ksoftirqd_run() outside of the interrupt disabled section ++ * to acquire the per CPU local lock for reentrancy protection. 
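
On RT, the softirq hunks above make BH-disabled sections preemptible: the outermost local_bh_disable() takes the per-CPU softirq_ctrl local lock and the nesting depth is mirrored in task::softirq_disable_cnt. The pthread analogue below captures just that "lock on first entry, unlock on last exit" nesting; it stands in for the local-lock semantics and is not the kernel code.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t softirq_ctrl_lock = PTHREAD_MUTEX_INITIALIZER;
static __thread int softirq_disable_cnt;        /* task::softirq_disable_cnt */

static void local_bh_disable(void)
{
        if (softirq_disable_cnt++ == 0)         /* outermost entry only */
                pthread_mutex_lock(&softirq_ctrl_lock);
}

static void local_bh_enable(void)
{
        if (--softirq_disable_cnt == 0)         /* final exit only */
                pthread_mutex_unlock(&softirq_ctrl_lock);
}

static void *worker(void *arg)
{
        local_bh_disable();
        local_bh_disable();                     /* nesting is allowed */
        printf("thread %ld: nested depth %d\n", (long)arg, softirq_disable_cnt);
        local_bh_enable();
        local_bh_enable();
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, worker, (void *)1L);
        pthread_create(&b, NULL, worker, (void *)2L);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}
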
++ */ ++static inline void ksoftirqd_run_begin(void) ++{ ++ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); ++ local_irq_disable(); ++} ++ ++/* Counterpart to ksoftirqd_run_begin() */ ++static inline void ksoftirqd_run_end(void) ++{ ++ __local_bh_enable(SOFTIRQ_OFFSET, true); ++ WARN_ON_ONCE(in_interrupt()); ++ local_irq_enable(); ++} ++ ++static inline void softirq_handle_begin(void) { } ++static inline void softirq_handle_end(void) { } + ++static inline bool should_wake_ksoftirqd(void) ++{ ++ return !this_cpu_read(softirq_ctrl.cnt); ++} ++ ++static inline void invoke_softirq(void) ++{ ++ if (should_wake_ksoftirqd()) ++ wakeup_softirqd(); ++} ++ ++#else /* CONFIG_PREEMPT_RT */ ++ ++/* ++ * This one is for softirq.c-internal use, where hardirqs are disabled ++ * legitimately: ++ */ ++#ifdef CONFIG_TRACE_IRQFLAGS + void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) + { + unsigned long flags; +@@ -203,6 +390,78 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) + } + EXPORT_SYMBOL(__local_bh_enable_ip); + ++static inline void softirq_handle_begin(void) ++{ ++ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); ++} ++ ++static inline void softirq_handle_end(void) ++{ ++ __local_bh_enable(SOFTIRQ_OFFSET); ++ WARN_ON_ONCE(in_interrupt()); ++} ++ ++static inline void ksoftirqd_run_begin(void) ++{ ++ local_irq_disable(); ++} ++ ++static inline void ksoftirqd_run_end(void) ++{ ++ local_irq_enable(); ++} ++ ++static inline bool should_wake_ksoftirqd(void) ++{ ++ return true; ++} ++ ++static inline void invoke_softirq(void) ++{ ++ if (ksoftirqd_running(local_softirq_pending())) ++ return; ++ ++ if (!force_irqthreads) { ++#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK ++ /* ++ * We can safely execute softirq on the current stack if ++ * it is the irq stack, because it should be near empty ++ * at this stage. ++ */ ++ __do_softirq(); ++#else ++ /* ++ * Otherwise, irq_exit() is called on the task stack that can ++ * be potentially deep already. So call softirq in its own stack ++ * to prevent from any overrun. ++ */ ++ do_softirq_own_stack(); ++#endif ++ } else { ++ wakeup_softirqd(); ++ } ++} ++ ++asmlinkage __visible void do_softirq(void) ++{ ++ __u32 pending; ++ unsigned long flags; ++ ++ if (in_interrupt()) ++ return; ++ ++ local_irq_save(flags); ++ ++ pending = local_softirq_pending(); ++ ++ if (pending && !ksoftirqd_running(pending)) ++ do_softirq_own_stack(); ++ ++ local_irq_restore(flags); ++} ++ ++#endif /* !CONFIG_PREEMPT_RT */ ++ + /* + * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, + * but break the loop if need_resched() is set or after 2 ms. 
+@@ -270,10 +529,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + current->flags &= ~PF_MEMALLOC; + + pending = local_softirq_pending(); +- account_irq_enter_time(current); + +- __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); ++ softirq_handle_begin(); + in_hardirq = lockdep_softirq_start(); ++ account_softirq_enter(current); + + restart: + /* Reset the pending bitmask before enabling irqs */ +@@ -307,8 +566,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + pending >>= softirq_bit; + } + +- if (__this_cpu_read(ksoftirqd) == current) ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && ++ __this_cpu_read(ksoftirqd) == current) + rcu_softirq_qs(); ++ + local_irq_disable(); + + pending = local_softirq_pending(); +@@ -320,46 +581,23 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + wakeup_softirqd(); + } + ++ account_softirq_exit(current); + lockdep_softirq_end(in_hardirq); +- account_irq_exit_time(current); +- __local_bh_enable(SOFTIRQ_OFFSET); +- WARN_ON_ONCE(in_interrupt()); ++ softirq_handle_end(); + current_restore_flags(old_flags, PF_MEMALLOC); + } + +-asmlinkage __visible void do_softirq(void) +-{ +- __u32 pending; +- unsigned long flags; +- +- if (in_interrupt()) +- return; +- +- local_irq_save(flags); +- +- pending = local_softirq_pending(); +- +- if (pending && !ksoftirqd_running(pending)) +- do_softirq_own_stack(); +- +- local_irq_restore(flags); +-} +- + /** + * irq_enter_rcu - Enter an interrupt context with RCU watching + */ + void irq_enter_rcu(void) + { +- if (is_idle_task(current) && !in_interrupt()) { +- /* +- * Prevent raise_softirq from needlessly waking up ksoftirqd +- * here, as softirq will be serviced on return from interrupt. +- */ +- local_bh_disable(); ++ __irq_enter_raw(); ++ ++ if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)) + tick_irq_enter(); +- _local_bh_enable(); +- } +- __irq_enter(); ++ ++ account_hardirq_enter(current); + } + + /** +@@ -371,32 +609,6 @@ void irq_enter(void) + irq_enter_rcu(); + } + +-static inline void invoke_softirq(void) +-{ +- if (ksoftirqd_running(local_softirq_pending())) +- return; +- +- if (!force_irqthreads) { +-#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK +- /* +- * We can safely execute softirq on the current stack if +- * it is the irq stack, because it should be near empty +- * at this stage. +- */ +- __do_softirq(); +-#else +- /* +- * Otherwise, irq_exit() is called on the task stack that can +- * be potentially deep already. So call softirq in its own stack +- * to prevent from any overrun. +- */ +- do_softirq_own_stack(); +-#endif +- } else { +- wakeup_softirqd(); +- } +-} +- + static inline void tick_irq_exit(void) + { + #ifdef CONFIG_NO_HZ_COMMON +@@ -417,7 +629,7 @@ static inline void __irq_exit_rcu(void) + #else + lockdep_assert_irqs_disabled(); + #endif +- account_irq_exit_time(current); ++ account_hardirq_exit(current); + preempt_count_sub(HARDIRQ_OFFSET); + if (!in_interrupt() && local_softirq_pending()) + invoke_softirq(); +@@ -466,7 +678,7 @@ inline void raise_softirq_irqoff(unsigned int nr) + * Otherwise we wake up ksoftirqd to make sure we + * schedule the softirq soon. 
+ */ +- if (!in_interrupt()) ++ if (!in_interrupt() && should_wake_ksoftirqd()) + wakeup_softirqd(); + } + +@@ -532,6 +744,16 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) + } + EXPORT_SYMBOL(__tasklet_hi_schedule); + ++static inline bool tasklet_clear_sched(struct tasklet_struct *t) ++{ ++ if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) { ++ wake_up_var(&t->state); ++ return true; ++ } ++ ++ return false; ++} ++ + static void tasklet_action_common(struct softirq_action *a, + struct tasklet_head *tl_head, + unsigned int softirq_nr) +@@ -551,8 +773,7 @@ static void tasklet_action_common(struct softirq_action *a, + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { +- if (!test_and_clear_bit(TASKLET_STATE_SCHED, +- &t->state)) ++ if (!tasklet_clear_sched(t)) + BUG(); + if (t->use_callback) + t->callback(t); +@@ -607,21 +828,62 @@ void tasklet_init(struct tasklet_struct *t, + } + EXPORT_SYMBOL(tasklet_init); + ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) ++/* ++ * Do not use in new code. Waiting for tasklets from atomic contexts is ++ * error prone and should be avoided. ++ */ ++void tasklet_unlock_spin_wait(struct tasklet_struct *t) ++{ ++ while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ /* ++ * Prevent a live lock when current preempted soft ++ * interrupt processing or prevents ksoftirqd from ++ * running. If the tasklet runs on a different CPU ++ * then this has no effect other than doing the BH ++ * disable/enable dance for nothing. ++ */ ++ local_bh_disable(); ++ local_bh_enable(); ++ } else { ++ cpu_relax(); ++ } ++ } ++} ++EXPORT_SYMBOL(tasklet_unlock_spin_wait); ++#endif ++ + void tasklet_kill(struct tasklet_struct *t) + { + if (in_interrupt()) + pr_notice("Attempt to kill tasklet from interrupt\n"); + +- while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { +- do { +- yield(); +- } while (test_bit(TASKLET_STATE_SCHED, &t->state)); +- } ++ while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) ++ wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state)); ++ + tasklet_unlock_wait(t); +- clear_bit(TASKLET_STATE_SCHED, &t->state); ++ tasklet_clear_sched(t); + } + EXPORT_SYMBOL(tasklet_kill); + ++#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) ++void tasklet_unlock(struct tasklet_struct *t) ++{ ++ smp_mb__before_atomic(); ++ clear_bit(TASKLET_STATE_RUN, &t->state); ++ smp_mb__after_atomic(); ++ wake_up_var(&t->state); ++} ++EXPORT_SYMBOL_GPL(tasklet_unlock); ++ ++void tasklet_unlock_wait(struct tasklet_struct *t) ++{ ++ wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state)); ++} ++EXPORT_SYMBOL_GPL(tasklet_unlock_wait); ++#endif ++ + void __init softirq_init(void) + { + int cpu; +@@ -644,18 +906,18 @@ static int ksoftirqd_should_run(unsigned int cpu) + + static void run_ksoftirqd(unsigned int cpu) + { +- local_irq_disable(); ++ ksoftirqd_run_begin(); + if (local_softirq_pending()) { + /* + * We can safely run softirq on inline stack, as we are not deep + * in the task stack here. 
+ */ + __do_softirq(); +- local_irq_enable(); ++ ksoftirqd_run_end(); + cond_resched(); + return; + } +- local_irq_enable(); ++ ksoftirqd_run_end(); + } + + #ifdef CONFIG_HOTPLUG_CPU +diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c +index dd5aeddbe..8bf1fb832 100644 +--- a/kernel/stop_machine.c ++++ b/kernel/stop_machine.c +@@ -47,11 +47,27 @@ struct cpu_stopper { + struct list_head works; /* list of pending works */ + + struct cpu_stop_work stop_work; /* for stop_cpus */ ++ unsigned long caller; ++ cpu_stop_fn_t fn; + }; + + static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); + static bool stop_machine_initialized = false; + ++void print_stop_info(const char *log_lvl, struct task_struct *task) ++{ ++ /* ++ * If @task is a stopper task, it cannot migrate and task_cpu() is ++ * stable. ++ */ ++ struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); ++ ++ if (task != stopper->thread) ++ return; ++ ++ printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); ++} ++ + /* static data for stop_cpus */ + static DEFINE_MUTEX(stop_cpus_mutex); + static bool stop_cpus_in_progress; +@@ -128,7 +144,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) + int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) + { + struct cpu_stop_done done; +- struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; ++ struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; + + cpu_stop_init_done(&done, 1); + if (!cpu_stop_queue_work(cpu, &work)) +@@ -344,7 +360,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * + work1 = work2 = (struct cpu_stop_work){ + .fn = multi_cpu_stop, + .arg = &msdata, +- .done = &done ++ .done = &done, ++ .caller = _RET_IP_, + }; + + cpu_stop_init_done(&done, 2); +@@ -380,7 +397,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * + bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf) + { +- *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; ++ *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; + return cpu_stop_queue_work(cpu, work_buf); + } + +@@ -500,6 +517,8 @@ static void cpu_stopper_thread(unsigned int cpu) + int ret; + + /* cpu stop callbacks must not sleep, make in_atomic() == T */ ++ stopper->caller = work->caller; ++ stopper->fn = fn; + preempt_count_inc(); + ret = fn(arg); + if (done) { +@@ -508,6 +527,8 @@ static void cpu_stopper_thread(unsigned int cpu) + cpu_stop_signal_done(done); + } + preempt_count_dec(); ++ stopper->fn = NULL; ++ stopper->caller = 0; + WARN_ONCE(preempt_count(), + "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); + goto repeat; +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 4ef90718c..6eb443234 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2052,6 +2052,36 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, + } + #endif + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * Sleep for 1 ms in hope whoever holds what we want will let it go. 
++ */ ++void cpu_chill(void) ++{ ++ unsigned int freeze_flag = current->flags & PF_NOFREEZE; ++ struct task_struct *self = current; ++ ktime_t chill_time; ++ ++ raw_spin_lock_irq(&self->pi_lock); ++ self->saved_state = self->state; ++ __set_current_state_no_track(TASK_UNINTERRUPTIBLE); ++ raw_spin_unlock_irq(&self->pi_lock); ++ ++ chill_time = ktime_set(0, NSEC_PER_MSEC); ++ ++ current->flags |= PF_NOFREEZE; ++ schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD); ++ if (!freeze_flag) ++ current->flags &= ~PF_NOFREEZE; ++ ++ raw_spin_lock_irq(&self->pi_lock); ++ __set_current_state_no_track(self->saved_state); ++ self->saved_state = TASK_RUNNING; ++ raw_spin_unlock_irq(&self->pi_lock); ++} ++EXPORT_SYMBOL(cpu_chill); ++#endif ++ + /* + * Functions related to boot-time initialization: + */ +diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c +index 2d7899700..e4e09ad9e 100644 +--- a/kernel/time/tick-sched.c ++++ b/kernel/time/tick-sched.c +@@ -990,7 +990,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) + if (unlikely(local_softirq_pending())) { + static int ratelimit; + +- if (ratelimit < 10 && ++ if (ratelimit < 10 && !local_bh_blocked() && + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { + pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", + (unsigned int) local_softirq_pending()); +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 351420c23..2a9e0b89d 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1287,7 +1287,7 @@ static void del_timer_wait_running(struct timer_list *timer) + u32 tf; + + tf = READ_ONCE(timer->flags); +- if (!(tf & TIMER_MIGRATING)) { ++ if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { + struct timer_base *base = get_timer_base(tf); + + /* +@@ -1371,6 +1371,13 @@ int del_timer_sync(struct timer_list *timer) + */ + WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); + ++ /* ++ * Must be able to sleep on PREEMPT_RT because of the slowpath in ++ * del_timer_wait_running(). 
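
cpu_chill() above exists so RT code that would otherwise spin on a resource held by a preempted task can sleep for roughly a millisecond and retry. The userspace sketch below shows the intended retry pattern with a try-lock loop; nanosleep() stands in for the HRTIMER_MODE_REL_HARD sleep and all names are illustrative.

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t resource = PTHREAD_MUTEX_INITIALIZER;

static void cpu_chill(void)
{
        struct timespec one_ms = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };

        nanosleep(&one_ms, NULL);       /* give the holder a chance to run */
}

static void grab_resource_with_chill(void)
{
        while (pthread_mutex_trylock(&resource) != 0)
                cpu_chill();            /* back off instead of busy-spinning */

        /* ... use the resource ... */
        pthread_mutex_unlock(&resource);
}

int main(void)
{
        grab_resource_with_chill();
        printf("resource acquired after chilling as needed\n");
        return 0;
}
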
++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) ++ lockdep_assert_preemption_enabled(); ++ + do { + ret = try_to_del_timer_sync(timer); + +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 0f3d391b5..15ad561bc 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2590,60 +2590,43 @@ enum print_line_t trace_handle_return(struct trace_seq *s) + } + EXPORT_SYMBOL_GPL(trace_handle_return); + +-unsigned int tracing_gen_ctx_flags(unsigned long irqflags) ++static unsigned short migration_disable_value(void) + { +- unsigned int trace_flags = 0; ++#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) ++ return current->migration_disabled; ++#else ++ return 0; ++#endif ++} ++ ++unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) ++{ ++ unsigned int trace_flags = irqs_status; + unsigned int pc; + + pc = preempt_count(); + +-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +- if (irqs_disabled_flags(irqflags)) +- trace_flags |= TRACE_FLAG_IRQS_OFF; +-#else +- trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT; +-#endif +- + if (pc & NMI_MASK) + trace_flags |= TRACE_FLAG_NMI; + if (pc & HARDIRQ_MASK) + trace_flags |= TRACE_FLAG_HARDIRQ; +- +- if (pc & SOFTIRQ_OFFSET) ++ if (in_serving_softirq()) + trace_flags |= TRACE_FLAG_SOFTIRQ; + + if (tif_need_resched()) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; +- return (trace_flags << 16) | (pc & 0xff); +-} + +-unsigned int tracing_gen_ctx(void) +-{ +- unsigned long irqflags; +- +-#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +- local_save_flags(irqflags); +-#else +- irqflags = 0; ++#ifdef CONFIG_PREEMPT_LAZY ++ if (need_resched_lazy()) ++ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; + #endif +- return tracing_gen_ctx_flags(irqflags); +-} + +-unsigned int tracing_gen_ctx_dec(void) +-{ +- unsigned int trace_ctx; +- +- trace_ctx = tracing_gen_ctx(); +- +- /* +- * Subtract one from the preeption counter if preemption is enabled, +- * see trace_event_buffer_reserve()for details. 
+- */ +- if (IS_ENABLED(CONFIG_PREEMPTION)) +- trace_ctx--; +- return trace_ctx; ++ return (pc & 0xff) | ++ (migration_disable_value() & 0xff) << 8 | ++ (preempt_lazy_count() & 0xff) << 16 | ++ (trace_flags << 24); + } + + struct ring_buffer_event * +@@ -3839,14 +3822,17 @@ unsigned long trace_total_entries(struct trace_array *tr) + + static void print_lat_help_header(struct seq_file *m) + { +- seq_puts(m, "# _------=> CPU# \n" +- "# / _-----=> irqs-off \n" +- "# | / _----=> need-resched \n" +- "# || / _---=> hardirq/softirq \n" +- "# ||| / _--=> preempt-depth \n" +- "# |||| / delay \n" +- "# cmd pid ||||| time | caller \n" +- "# \\ / ||||| \\ | / \n"); ++ seq_puts(m, "# _--------=> CPU# \n" ++ "# / _-------=> irqs-off \n" ++ "# | / _------=> need-resched \n" ++ "# || / _-----=> need-resched-lazy\n" ++ "# ||| / _----=> hardirq/softirq \n" ++ "# |||| / _---=> preempt-depth \n" ++ "# ||||| / _--=> preempt-lazy-depth\n" ++ "# |||||| / _-=> migrate-disable \n" ++ "# ||||||| / delay \n" ++ "# cmd pid |||||||| time | caller \n" ++ "# \\ / |||||||| \\ | / \n"); + } + + static void print_event_info(struct array_buffer *buf, struct seq_file *m) +@@ -3880,13 +3866,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file + + print_event_info(buf, m); + +- seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); +- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); +- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); +- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); +- seq_printf(m, "# %.*s||| / delay\n", prec, space); +- seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); +- seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); ++ seq_printf(m, "# %.*s _-------=> irqs-off\n", prec, space); ++ seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space); ++ seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space); ++ seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space); ++ seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space); ++ seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space); ++ seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space); ++ seq_printf(m, "# %.*s|||||| / delay\n", prec, space); ++ seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID "); ++ seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); + } + + void +@@ -9422,7 +9411,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) + tracing_off(); + + local_irq_save(flags); +- printk_nmi_direct_enter(); + + /* Simulate the iterator */ + trace_init_global_iter(&iter); +@@ -9502,7 +9490,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) + atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); + } + atomic_dec(&dump_running); +- printk_nmi_direct_exit(); + local_irq_restore(flags); + } + EXPORT_SYMBOL_GPL(ftrace_dump); +diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h +index c0596e250..44943a9be 100644 +--- a/kernel/trace/trace.h ++++ b/kernel/trace/trace.h +@@ -141,25 +141,6 @@ struct kretprobe_trace_entry_head { + unsigned long ret_ip; + }; + +-/* +- * trace_flag_type is an enumeration that holds different +- * states when a trace occurs. 
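
tracing_gen_ctx_irq_test() above packs the preempt count, the migrate-disable depth, the preempt-lazy depth and the trace flags into one 32-bit context word (bits 0-7, 8-15, 16-23 and 24-31 respectively). The small program below packs and unpacks a made-up sample the same way, purely as an illustration of the layout.

#include <stdio.h>

static unsigned int pack_trace_ctx(unsigned int pc, unsigned int migrate,
                                   unsigned int lazy, unsigned int flags)
{
        return (pc & 0xff) |
               (migrate & 0xff) << 8 |
               (lazy & 0xff) << 16 |
               (flags << 24);
}

int main(void)
{
        /* e.g. preempt depth 2, migrate-disable 1, lazy 0, IRQS_OFF flag */
        unsigned int ctx = pack_trace_ctx(2, 1, 0, 0x01);

        printf("ctx=%#010x: preempt=%u migrate=%u lazy=%u flags=%#x\n",
               ctx, ctx & 0xff, (ctx >> 8) & 0xff,
               (ctx >> 16) & 0xff, ctx >> 24);
        return 0;
}
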
These are: +- * IRQS_OFF - interrupts were disabled +- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags +- * NEED_RESCHED - reschedule is requested +- * HARDIRQ - inside an interrupt handler +- * SOFTIRQ - inside a softirq handler +- */ +-enum trace_flag_type { +- TRACE_FLAG_IRQS_OFF = 0x01, +- TRACE_FLAG_IRQS_NOSUPPORT = 0x02, +- TRACE_FLAG_NEED_RESCHED = 0x04, +- TRACE_FLAG_HARDIRQ = 0x08, +- TRACE_FLAG_SOFTIRQ = 0x10, +- TRACE_FLAG_PREEMPT_RESCHED = 0x20, +- TRACE_FLAG_NMI = 0x40, +-}; +- + #define TRACE_BUF_SIZE 1024 + + struct trace_array; +diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c +index f4b11f609..a2abc0b40 100644 +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -183,6 +183,8 @@ static int trace_define_common_fields(void) + __common_field(unsigned char, flags); + __common_field(unsigned char, preempt_count); + __common_field(int, pid); ++ __common_field(unsigned char, migrate_disable); ++ __common_field(unsigned char, preempt_lazy_count); + + return ret; + } +diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c +index 7042544c5..c711eb334 100644 +--- a/kernel/trace/trace_output.c ++++ b/kernel/trace/trace_output.c +@@ -441,6 +441,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + { + char hardsoft_irq; + char need_resched; ++ char need_resched_lazy; + char irqs_off; + int hardirq; + int softirq; +@@ -471,6 +472,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + break; + } + ++ need_resched_lazy = ++ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; ++ + hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : +@@ -479,14 +483,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + softirq ? 's' : + '.' ; + +- trace_seq_printf(s, "%c%c%c", +- irqs_off, need_resched, hardsoft_irq); ++ trace_seq_printf(s, "%c%c%c%c", ++ irqs_off, need_resched, need_resched_lazy, ++ hardsoft_irq); + + if (entry->preempt_count) + trace_seq_printf(s, "%x", entry->preempt_count); + else + trace_seq_putc(s, '.'); + ++ if (entry->preempt_lazy_count) ++ trace_seq_printf(s, "%x", entry->preempt_lazy_count); ++ else ++ trace_seq_putc(s, '.'); ++ ++ if (entry->migrate_disable) ++ trace_seq_printf(s, "%x", entry->migrate_disable); ++ else ++ trace_seq_putc(s, '.'); ++ + return !trace_seq_has_overflowed(s); + } + +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index 6bef482a1..855f2d8c9 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -4933,6 +4933,10 @@ static void unbind_workers(int cpu) + pool->flags |= POOL_DISASSOCIATED; + + raw_spin_unlock_irq(&pool->lock); ++ ++ for_each_pool_worker(worker, pool) ++ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); ++ + mutex_unlock(&wq_pool_attach_mutex); + + /* +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index f906df9db..c52d39d10 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1366,7 +1366,7 @@ config DEBUG_ATOMIC_SLEEP + + config DEBUG_LOCKING_API_SELFTESTS + bool "Locking API boot-time self-tests" +- depends on DEBUG_KERNEL ++ depends on DEBUG_KERNEL && !PREEMPT_RT + help + Say Y here if you want the kernel to run a short self-test during + bootup. 
The self-test checks whether common types of locking bugs +diff --git a/lib/bug.c b/lib/bug.c +index 4ab398a2d..9c681f29e 100644 +--- a/lib/bug.c ++++ b/lib/bug.c +@@ -202,6 +202,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) + else + pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", + (void *)bugaddr); ++ pr_flush(1000, true); + + return BUG_TRAP_TYPE_BUG; + } +diff --git a/lib/cpumask.c b/lib/cpumask.c +index fb22fb266..c3c76b833 100644 +--- a/lib/cpumask.c ++++ b/lib/cpumask.c +@@ -261,3 +261,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p, + return next; + } + EXPORT_SYMBOL(cpumask_any_and_distribute); ++ ++int cpumask_any_distribute(const struct cpumask *srcp) ++{ ++ int next, prev; ++ ++ /* NOTE: our first selection will skip 0. */ ++ prev = __this_cpu_read(distribute_cpu_mask_prev); ++ ++ next = cpumask_next(prev, srcp); ++ if (next >= nr_cpu_ids) ++ next = cpumask_first(srcp); ++ ++ if (next < nr_cpu_ids) ++ __this_cpu_write(distribute_cpu_mask_prev, next); ++ ++ return next; ++} ++EXPORT_SYMBOL(cpumask_any_distribute); +diff --git a/lib/debugobjects.c b/lib/debugobjects.c +index 9e14ae023..083882a3c 100644 +--- a/lib/debugobjects.c ++++ b/lib/debugobjects.c +@@ -557,7 +557,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack + struct debug_obj *obj; + unsigned long flags; + +- fill_pool(); ++#ifdef CONFIG_PREEMPT_RT ++ if (preempt_count() == 0 && !irqs_disabled()) ++#endif ++ fill_pool(); + + db = get_bucket((unsigned long) addr); + +diff --git a/lib/dump_stack.c b/lib/dump_stack.c +index a00ee6eed..f5a33b6f7 100644 +--- a/lib/dump_stack.c ++++ b/lib/dump_stack.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + + static char dump_stack_arch_desc_str[128]; + +@@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl) + log_lvl, dump_stack_arch_desc_str); + + print_worker_info(log_lvl, current); ++ print_stop_info(log_lvl, current); + } + + /** +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 2f17b488d..7557bf7ec 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -37,6 +37,7 @@ void irq_poll_sched(struct irq_poll *iop) + list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); + raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(irq_poll_sched); + +@@ -72,6 +73,7 @@ void irq_poll_complete(struct irq_poll *iop) + local_irq_save(flags); + __irq_poll_complete(iop); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(irq_poll_complete); + +@@ -96,6 +98,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h) + } + + local_irq_enable(); ++ preempt_check_resched_rt(); + + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new +@@ -133,6 +136,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h) + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + + local_irq_enable(); ++ preempt_check_resched_rt(); + } + + /** +@@ -196,6 +200,7 @@ static int irq_poll_cpu_dead(unsigned int cpu) + this_cpu_ptr(&blk_cpu_iopoll)); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_enable(); ++ preempt_check_resched_rt(); + + return 0; + } +diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c +index 76c52b0b7..98c376b02 100644 +--- a/lib/locking-selftest.c ++++ b/lib/locking-selftest.c +@@ -787,6 +787,8 @@ GENERATE_TESTCASE(init_held_rtmutex); + #include "locking-selftest-spin-hardirq.h" + 
GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin) + ++#ifndef CONFIG_PREEMPT_RT ++ + #include "locking-selftest-rlock-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) + +@@ -802,9 +804,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) + ++#endif ++ + #undef E1 + #undef E2 + ++#ifndef CONFIG_PREEMPT_RT + /* + * Enabling hardirqs with a softirq-safe lock held: + */ +@@ -837,6 +842,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) + #undef E1 + #undef E2 + ++#endif ++ + /* + * Enabling irqs with an irq-safe lock held: + */ +@@ -860,6 +867,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) + #include "locking-selftest-spin-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin) + ++#ifndef CONFIG_PREEMPT_RT ++ + #include "locking-selftest-rlock-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) + +@@ -875,6 +884,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) + ++#endif ++ + #undef E1 + #undef E2 + +@@ -906,6 +917,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) + #include "locking-selftest-spin-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin) + ++#ifndef CONFIG_PREEMPT_RT ++ + #include "locking-selftest-rlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) + +@@ -921,6 +934,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) + ++#endif ++ + #undef E1 + #undef E2 + #undef E3 +@@ -954,6 +969,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) + #include "locking-selftest-spin-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin) + ++#ifndef CONFIG_PREEMPT_RT ++ + #include "locking-selftest-rlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) + +@@ -969,10 +986,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) + ++#endif ++ + #undef E1 + #undef E2 + #undef E3 + ++#ifndef CONFIG_PREEMPT_RT ++ + /* + * read-lock / write-lock irq inversion. + * +@@ -1162,6 +1183,11 @@ GENERATE_PERMUTATIONS_3_EVENTS(W1W2_R2R3_R3W1) + #undef E1 + #undef E2 + #undef E3 ++ ++#endif ++ ++#ifndef CONFIG_PREEMPT_RT ++ + /* + * read-lock / write-lock recursion that is actually safe. + */ +@@ -1208,6 +1234,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) + #undef E2 + #undef E3 + ++#endif ++ + /* + * read-lock / write-lock recursion that is unsafe. 
+ */ +@@ -2456,6 +2484,7 @@ void locking_selftest(void) + + printk(" --------------------------------------------------------------------------\n"); + ++#ifndef CONFIG_PREEMPT_RT + /* + * irq-context testcases: + */ +@@ -2470,6 +2499,28 @@ void locking_selftest(void) + DO_TESTCASE_6x2x2RW("irq read-recursion #2", irq_read_recursion2); + DO_TESTCASE_6x2x2RW("irq read-recursion #3", irq_read_recursion3); + ++#else ++ /* On -rt, we only do hardirq context test for raw spinlock */ ++ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12); ++ DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21); ++ ++ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12); ++ DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21); ++ ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321); ++ ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312); ++ DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321); ++#endif + ww_tests(); + + force_read_lock_recursive = 0; +diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c +index 8abe1870d..b09a490f5 100644 +--- a/lib/nmi_backtrace.c ++++ b/lib/nmi_backtrace.c +@@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, + touch_softlockup_watchdog(); + } + +- /* +- * Force flush any remote buffers that might be stuck in IRQ context +- * and therefore could not run their irq_work. 
+- */ +- printk_safe_flush(); +- + clear_bit_unlock(0, &backtrace_flag); + put_cpu(); + } +diff --git a/lib/scatterlist.c b/lib/scatterlist.c +index a59778946..907f59045 100644 +--- a/lib/scatterlist.c ++++ b/lib/scatterlist.c +@@ -892,7 +892,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) + flush_kernel_dcache_page(miter->page); + + if (miter->__flags & SG_MITER_ATOMIC) { +- WARN_ON_ONCE(preemptible()); ++ WARN_ON_ONCE(!pagefault_disabled()); + kunmap_atomic(miter->addr); + } else + kunmap(miter->page); +diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c +index 525222e4f..1c1dbd300 100644 +--- a/lib/smp_processor_id.c ++++ b/lib/smp_processor_id.c +@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) + if (current->nr_cpus_allowed == 1) + goto out; + ++#ifdef CONFIG_SMP ++ if (current->migration_disabled) ++ goto out; ++#endif ++ + /* + * It is valid to assume CPU-locality during early bootup: + */ +diff --git a/lib/test_lockup.c b/lib/test_lockup.c +index f1a020bcc..864554e76 100644 +--- a/lib/test_lockup.c ++++ b/lib/test_lockup.c +@@ -480,6 +480,21 @@ static int __init test_lockup_init(void) + return -EINVAL; + + #ifdef CONFIG_DEBUG_SPINLOCK ++#ifdef CONFIG_PREEMPT_RT ++ if (test_magic(lock_spinlock_ptr, ++ offsetof(spinlock_t, lock.wait_lock.magic), ++ SPINLOCK_MAGIC) || ++ test_magic(lock_rwlock_ptr, ++ offsetof(rwlock_t, rtmutex.wait_lock.magic), ++ SPINLOCK_MAGIC) || ++ test_magic(lock_mutex_ptr, ++ offsetof(struct mutex, lock.wait_lock.magic), ++ SPINLOCK_MAGIC) || ++ test_magic(lock_rwsem_ptr, ++ offsetof(struct rw_semaphore, rtmutex.wait_lock.magic), ++ SPINLOCK_MAGIC)) ++ return -EINVAL; ++#else + if (test_magic(lock_spinlock_ptr, + offsetof(spinlock_t, rlock.magic), + SPINLOCK_MAGIC) || +@@ -493,6 +508,7 @@ static int __init test_lockup_init(void) + offsetof(struct rw_semaphore, wait_lock.magic), + SPINLOCK_MAGIC)) + return -EINVAL; ++#endif + #endif + + if ((wait_state != TASK_RUNNING || +diff --git a/mm/Kconfig b/mm/Kconfig +index 4475bd9f8..9d225b5c2 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -404,7 +404,7 @@ config NOMMU_INITIAL_TRIM_EXCESS + + config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" +- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE ++ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT + select COMPACTION + select XARRAY_MULTI + help +@@ -971,4 +971,7 @@ config MEMORY_RELIABLE + + source "mm/damon/Kconfig" + ++config KMAP_LOCAL ++ bool ++ + endmenu +diff --git a/mm/highmem.c b/mm/highmem.c +index efe38ab47..16f3ecd4a 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -31,10 +31,6 @@ + #include + #include + +-#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +-DEFINE_PER_CPU(int, __kmap_atomic_idx); +-#endif +- + /* + * Virtual_count is not a pure "count". 
+ * 0 means that it is not mapped, and has not been mapped +@@ -108,9 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) + atomic_long_t _totalhigh_pages __read_mostly; + EXPORT_SYMBOL(_totalhigh_pages); + +-EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); +- +-unsigned int nr_free_highpages (void) ++unsigned int __nr_free_highpages (void) + { + struct zone *zone; + unsigned int pages = 0; +@@ -147,7 +141,7 @@ pte_t * pkmap_page_table; + do { spin_unlock(&kmap_lock); (void)(flags); } while (0) + #endif + +-struct page *kmap_to_page(void *vaddr) ++struct page *__kmap_to_page(void *vaddr) + { + unsigned long addr = (unsigned long)vaddr; + +@@ -158,7 +152,7 @@ struct page *kmap_to_page(void *vaddr) + + return virt_to_page(addr); + } +-EXPORT_SYMBOL(kmap_to_page); ++EXPORT_SYMBOL(__kmap_to_page); + + static void flush_all_zero_pkmaps(void) + { +@@ -200,10 +194,7 @@ static void flush_all_zero_pkmaps(void) + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); + } + +-/** +- * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings +- */ +-void kmap_flush_unused(void) ++void __kmap_flush_unused(void) + { + lock_kmap(); + flush_all_zero_pkmaps(); +@@ -367,7 +358,6 @@ void kunmap_high(struct page *page) + if (need_wakeup) + wake_up(pkmap_map_wait); + } +- + EXPORT_SYMBOL(kunmap_high); + + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +@@ -428,7 +418,249 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, + } + EXPORT_SYMBOL(zero_user_segments); + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +-#endif /* CONFIG_HIGHMEM */ ++#endif /* CONFIG_HIGHMEM */ ++ ++#ifdef CONFIG_KMAP_LOCAL ++ ++#include ++ ++/* ++ * With DEBUG_HIGHMEM the stack depth is doubled and every second ++ * slot is unused which acts as a guard page ++ */ ++#ifdef CONFIG_DEBUG_HIGHMEM ++# define KM_INCR 2 ++#else ++# define KM_INCR 1 ++#endif ++ ++static inline int kmap_local_idx_push(void) ++{ ++ WARN_ON_ONCE(in_irq() && !irqs_disabled()); ++ current->kmap_ctrl.idx += KM_INCR; ++ BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); ++ return current->kmap_ctrl.idx - 1; ++} ++ ++static inline int kmap_local_idx(void) ++{ ++ return current->kmap_ctrl.idx - 1; ++} ++ ++static inline void kmap_local_idx_pop(void) ++{ ++ current->kmap_ctrl.idx -= KM_INCR; ++ BUG_ON(current->kmap_ctrl.idx < 0); ++} ++ ++#ifndef arch_kmap_local_post_map ++# define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) ++#endif ++ ++#ifndef arch_kmap_local_pre_unmap ++# define arch_kmap_local_pre_unmap(vaddr) do { } while (0) ++#endif ++ ++#ifndef arch_kmap_local_post_unmap ++# define arch_kmap_local_post_unmap(vaddr) do { } while (0) ++#endif ++ ++#ifndef arch_kmap_local_map_idx ++#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) ++#endif ++ ++#ifndef arch_kmap_local_unmap_idx ++#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) ++#endif ++ ++#ifndef arch_kmap_local_high_get ++static inline void *arch_kmap_local_high_get(struct page *page) ++{ ++ return NULL; ++} ++#endif ++ ++/* Unmap a local mapping which was obtained by kmap_high_get() */ ++static inline bool kmap_high_unmap_local(unsigned long vaddr) ++{ ++#ifdef ARCH_NEEDS_KMAP_HIGH_GET ++ if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { ++ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); ++ return true; ++ } ++#endif ++ return false; ++} ++ ++static inline int kmap_local_calc_idx(int idx) ++{ ++ return idx + KM_MAX_IDX * smp_processor_id(); ++} ++ ++static pte_t 
*__kmap_pte; ++ ++static pte_t *kmap_get_pte(void) ++{ ++ if (!__kmap_pte) ++ __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); ++ return __kmap_pte; ++} ++ ++void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) ++{ ++ pte_t pteval, *kmap_pte = kmap_get_pte(); ++ unsigned long vaddr; ++ int idx; ++ ++ /* ++ * Disable migration so resulting virtual address is stable ++ * accross preemption. ++ */ ++ migrate_disable(); ++ preempt_disable(); ++ idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); ++ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); ++ BUG_ON(!pte_none(*(kmap_pte - idx))); ++ pteval = pfn_pte(pfn, prot); ++ set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); ++ arch_kmap_local_post_map(vaddr, pteval); ++ current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; ++ preempt_enable(); ++ ++ return (void *)vaddr; ++} ++EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); ++ ++void *__kmap_local_page_prot(struct page *page, pgprot_t prot) ++{ ++ void *kmap; ++ ++ if (!PageHighMem(page)) ++ return page_address(page); ++ ++ /* Try kmap_high_get() if architecture has it enabled */ ++ kmap = arch_kmap_local_high_get(page); ++ if (kmap) ++ return kmap; ++ ++ return __kmap_local_pfn_prot(page_to_pfn(page), prot); ++} ++EXPORT_SYMBOL(__kmap_local_page_prot); ++ ++void kunmap_local_indexed(void *vaddr) ++{ ++ unsigned long addr = (unsigned long) vaddr & PAGE_MASK; ++ pte_t *kmap_pte = kmap_get_pte(); ++ int idx; ++ ++ if (addr < __fix_to_virt(FIX_KMAP_END) || ++ addr > __fix_to_virt(FIX_KMAP_BEGIN)) { ++ /* ++ * Handle mappings which were obtained by kmap_high_get() ++ * first as the virtual address of such mappings is below ++ * PAGE_OFFSET. Warn for all other addresses which are in ++ * the user space part of the virtual address space. ++ */ ++ if (!kmap_high_unmap_local(addr)) ++ WARN_ON_ONCE(addr < PAGE_OFFSET); ++ return; ++ } ++ ++ preempt_disable(); ++ idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); ++ WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); ++ ++ arch_kmap_local_pre_unmap(addr); ++ pte_clear(&init_mm, addr, kmap_pte - idx); ++ arch_kmap_local_post_unmap(addr); ++ current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); ++ kmap_local_idx_pop(); ++ preempt_enable(); ++ migrate_enable(); ++} ++EXPORT_SYMBOL(kunmap_local_indexed); ++ ++/* ++ * Invoked before switch_to(). This is safe even when during or after ++ * clearing the maps an interrupt which needs a kmap_local happens because ++ * the task::kmap_ctrl.idx is not modified by the unmapping code so a ++ * nested kmap_local will use the next unused index and restore the index ++ * on unmap. The already cleared kmaps of the outgoing task are irrelevant ++ * because the interrupt context does not know about them. The same applies ++ * when scheduling back in for an interrupt which happens before the ++ * restore is complete. ++ */ ++void __kmap_local_sched_out(void) ++{ ++ struct task_struct *tsk = current; ++ pte_t *kmap_pte = kmap_get_pte(); ++ int i; ++ ++ /* Clear kmaps */ ++ for (i = 0; i < tsk->kmap_ctrl.idx; i++) { ++ pte_t pteval = tsk->kmap_ctrl.pteval[i]; ++ unsigned long addr; ++ int idx; ++ ++ /* With debug all even slots are unmapped and act as guard */ ++ if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { ++ WARN_ON_ONCE(!pte_none(pteval)); ++ continue; ++ } ++ if (WARN_ON_ONCE(pte_none(pteval))) ++ continue; ++ ++ /* ++ * This is a horrible hack for XTENSA to calculate the ++ * coloured PTE index. 
Uses the PFN encoded into the pteval ++ * and the map index calculation because the actual mapped ++ * virtual address is not stored in task::kmap_ctrl. ++ * For any sane architecture this is optimized out. ++ */ ++ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); ++ ++ addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); ++ arch_kmap_local_pre_unmap(addr); ++ pte_clear(&init_mm, addr, kmap_pte - idx); ++ arch_kmap_local_post_unmap(addr); ++ } ++} ++ ++void __kmap_local_sched_in(void) ++{ ++ struct task_struct *tsk = current; ++ pte_t *kmap_pte = kmap_get_pte(); ++ int i; ++ ++ /* Restore kmaps */ ++ for (i = 0; i < tsk->kmap_ctrl.idx; i++) { ++ pte_t pteval = tsk->kmap_ctrl.pteval[i]; ++ unsigned long addr; ++ int idx; ++ ++ /* With debug all even slots are unmapped and act as guard */ ++ if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { ++ WARN_ON_ONCE(!pte_none(pteval)); ++ continue; ++ } ++ if (WARN_ON_ONCE(pte_none(pteval))) ++ continue; ++ ++ /* See comment in __kmap_local_sched_out() */ ++ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); ++ addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); ++ set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); ++ arch_kmap_local_post_map(addr, pteval); ++ } ++} ++ ++void kmap_local_fork(struct task_struct *tsk) ++{ ++ if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) ++ memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); ++} ++ ++#endif + + #if defined(HASHED_PAGE_VIRTUAL) + +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 7061f9283..124feb170 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -67,6 +67,7 @@ + #include + #include + #include "slab.h" ++#include + + #include + +@@ -101,6 +102,13 @@ static bool cgroup_memory_kswapd = false; + DEFINE_STATIC_KEY_FALSE(memcg_kswapd_key); + EXPORT_SYMBOL(memcg_kswapd_key); + ++struct event_lock { ++ local_lock_t l; ++}; ++static DEFINE_PER_CPU(struct event_lock, event_lock) = { ++ .l = INIT_LOCAL_LOCK(l), ++}; ++ + /* Whether legacy memory+swap accounting is active */ + static bool do_memsw_account(void) + { +@@ -743,6 +751,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg; + ++ preempt_disable_rt(); + /* Update memcg */ + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + +@@ -750,6 +759,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg); ++ preempt_enable_rt(); + } + + /** +@@ -2159,6 +2169,7 @@ void unlock_page_memcg(struct page *page) + EXPORT_SYMBOL(unlock_page_memcg); + + struct memcg_stock_pcp { ++ local_lock_t lock; + struct mem_cgroup *cached; /* this never be root cgroup */ + unsigned int nr_pages; + +@@ -2210,7 +2221,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + if (nr_pages > MEMCG_CHARGE_BATCH) + return ret; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { +@@ -2218,7 +2229,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + ret = true; + } + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + + return ret; + } +@@ -2253,14 +2264,14 @@ static void drain_local_stock(struct work_struct *dummy) + * The only protection from memory hotplug vs. 
drain_stock races is + * that we always operate on local CPU stock here with IRQ disabled + */ +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + drain_obj_stock(stock); + drain_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + } + + /* +@@ -2272,7 +2283,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + struct memcg_stock_pcp *stock; + unsigned long flags; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (stock->cached != memcg) { /* reset if necessary */ +@@ -2285,7 +2296,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + if (stock->nr_pages > MEMCG_CHARGE_BATCH) + drain_stock(stock); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + } + + /* +@@ -2305,7 +2316,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + * as well as workers from this path always operate on the local + * per-cpu data. CPU up doesn't touch memcg_stock at all. + */ +- curcpu = get_cpu(); ++ curcpu = get_cpu_light(); + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + struct mem_cgroup *memcg; +@@ -2328,7 +2339,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + schedule_work_on(cpu, &stock->work); + } + } +- put_cpu(); ++ put_cpu_light(); + mutex_unlock(&percpu_charge_mutex); + } + +@@ -3089,7 +3100,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + unsigned long flags; + bool ret = false; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { +@@ -3097,7 +3108,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + ret = true; + } + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + + return ret; + } +@@ -3153,7 +3164,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + struct memcg_stock_pcp *stock; + unsigned long flags; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (stock->cached_objcg != objcg) { /* reset if necessary */ +@@ -3167,7 +3178,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + if (stock->nr_bytes > PAGE_SIZE) + drain_obj_stock(stock); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.lock, flags); + } + + int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +@@ -6039,12 +6050,12 @@ static int mem_cgroup_move_account(struct page *page, + + ret = 0; + +- local_irq_disable(); ++ local_lock_irq(&event_lock.l); + mem_cgroup_charge_statistics(to, page, nr_pages); + memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); + memcg_check_events(from, page); +- local_irq_enable(); ++ local_unlock_irq(&event_lock.l); + out_unlock: + unlock_page(page); + out: +@@ -7016,10 +7027,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) + css_get(&memcg->css); + commit_charge(page, memcg); + +- local_irq_disable(); ++ local_lock_irq(&event_lock.l); + mem_cgroup_charge_statistics(memcg, page, nr_pages); + memcg_check_events(memcg, page); +- 
local_irq_enable(); ++ local_unlock_irq(&event_lock.l); + + /* + * Cgroup1's unified memory+swap counter has been charged with the +@@ -7075,11 +7086,11 @@ static void uncharge_batch(const struct uncharge_gather *ug) + memcg_oom_recover(ug->memcg); + } + +- local_irq_save(flags); ++ local_lock_irqsave(&event_lock.l, flags); + __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); + memcg_check_events(ug->memcg, ug->dummy_page); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&event_lock.l, flags); + + /* drop reference from uncharge_page */ + css_put(&ug->memcg->css); +@@ -7251,10 +7262,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) + css_get(&memcg->css); + commit_charge(newpage, memcg); + +- local_irq_save(flags); ++ local_lock_irqsave(&event_lock.l, flags); + mem_cgroup_charge_statistics(memcg, newpage, nr_pages); + memcg_check_events(memcg, newpage); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&event_lock.l, flags); + } + + DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); +@@ -7391,9 +7402,13 @@ static int __init mem_cgroup_init(void) + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, + memcg_hotplug_cpu_dead); + +- for_each_possible_cpu(cpu) +- INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, +- drain_local_stock); ++ for_each_possible_cpu(cpu) { ++ struct memcg_stock_pcp *stock; ++ ++ stock = per_cpu_ptr(&memcg_stock, cpu); ++ INIT_WORK(&stock->work, drain_local_stock); ++ local_lock_init(&stock->lock); ++ } + + for_each_node(node) { + struct mem_cgroup_tree_per_node *rtpn; +@@ -7444,6 +7459,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + struct mem_cgroup *memcg, *swap_memcg; + unsigned int nr_entries; + unsigned short oldid; ++ unsigned long flags; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); +@@ -7489,9 +7505,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + * important here to have the interrupts disabled because it is the + * only synchronisation we have for updating the per-CPU variables. + */ ++ local_lock_irqsave(&event_lock.l, flags); ++#ifndef CONFIG_PREEMPT_RT + VM_BUG_ON(!irqs_disabled()); ++#endif + mem_cgroup_charge_statistics(memcg, page, -nr_entries); + memcg_check_events(memcg, page); ++ local_unlock_irqrestore(&event_lock.l, flags); + + css_put(&memcg->css); + } +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 04d75394e..233d356bd 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -61,6 +61,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -389,6 +390,13 @@ EXPORT_SYMBOL(nr_node_ids); + EXPORT_SYMBOL(nr_online_nodes); + #endif + ++struct pa_lock { ++ local_lock_t l; ++}; ++static DEFINE_PER_CPU(struct pa_lock, pa_lock) = { ++ .l = INIT_LOCAL_LOCK(l), ++}; ++ + int page_group_by_mobility_disabled __read_mostly; + + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +@@ -1333,7 +1341,7 @@ static inline void prefetch_buddy(struct page *page) + } + + /* +- * Frees a number of pages from the PCP lists ++ * Frees a number of pages which have been collected from the pcp lists. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free. + * +@@ -1343,15 +1351,56 @@ static inline void prefetch_buddy(struct page *page) + * And clear the zone's pages_scanned counter, to hold off the "all pages are + * pinned" detection logic. 
+ */ +-static void free_pcppages_bulk(struct zone *zone, int count, +- struct per_cpu_pages *pcp) ++static void free_pcppages_bulk(struct zone *zone, struct list_head *head, ++ bool zone_retry) ++{ ++ bool isolated_pageblocks; ++ struct page *page, *tmp; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&zone->lock, flags); ++ isolated_pageblocks = has_isolate_pageblock(zone); ++ ++ /* ++ * Use safe version since after __free_one_page(), ++ * page->lru.next will not point to original list. ++ */ ++ list_for_each_entry_safe(page, tmp, head, lru) { ++ int mt = get_pcppage_migratetype(page); ++ ++ if (page_zone(page) != zone) { ++ /* ++ * free_unref_page_list() sorts pages by zone. If we end ++ * up with pages from a different NUMA nodes belonging ++ * to the same ZONE index then we need to redo with the ++ * correct ZONE pointer. Skip the page for now, redo it ++ * on the next iteration. ++ */ ++ WARN_ON_ONCE(zone_retry == false); ++ if (zone_retry) ++ continue; ++ } ++ ++ /* MIGRATE_ISOLATE page should not go to pcplists */ ++ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); ++ /* Pageblock could have been isolated meanwhile */ ++ if (unlikely(isolated_pageblocks)) ++ mt = get_pageblock_migratetype(page); ++ ++ list_del(&page->lru); ++ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); ++ trace_mm_page_pcpu_drain(page, 0, mt); ++ } ++ spin_unlock_irqrestore(&zone->lock, flags); ++} ++ ++static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp, ++ struct list_head *dst) + { + int migratetype = 0; + int batch_free = 0; + int prefetch_nr = READ_ONCE(pcp->batch); +- bool isolated_pageblocks; +- struct page *page, *tmp; +- LIST_HEAD(head); ++ struct page *page; + + /* + * Ensure proper count is passed which otherwise would stuck in the +@@ -1388,7 +1437,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, + if (bulkfree_pcp_prepare(page)) + continue; + +- list_add_tail(&page->lru, &head); ++ list_add_tail(&page->lru, dst); + + /* + * We are going to put the page back to the global +@@ -1405,26 +1454,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + } + } while (--count && --batch_free && !list_empty(list)); + } +- +- spin_lock(&zone->lock); +- isolated_pageblocks = has_isolate_pageblock(zone); +- +- /* +- * Use safe version since after __free_one_page(), +- * page->lru.next will not point to original list. 
+- */ +- list_for_each_entry_safe(page, tmp, &head, lru) { +- int mt = get_pcppage_migratetype(page); +- /* MIGRATE_ISOLATE page should not go to pcplists */ +- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); +- /* Pageblock could have been isolated meanwhile */ +- if (unlikely(isolated_pageblocks)) +- mt = get_pageblock_migratetype(page); +- +- __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); +- trace_mm_page_pcpu_drain(page, 0, mt); +- } +- spin_unlock(&zone->lock); + } + + static void free_one_page(struct zone *zone, +@@ -1526,11 +1555,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, + return; + + migratetype = get_pfnblock_migratetype(page, pfn); +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + __count_vm_events(PGFREE, 1 << order); + free_one_page(page_zone(page), page, pfn, order, migratetype, + fpi_flags); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + } + + void __free_pages_core(struct page *page, unsigned int order) +@@ -2941,13 +2970,18 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) + { + unsigned long flags; + int to_drain, batch; ++ LIST_HEAD(dst); + +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + batch = READ_ONCE(pcp->batch); + to_drain = min(pcp->count, batch); + if (to_drain > 0) +- free_pcppages_bulk(zone, to_drain, pcp); +- local_irq_restore(flags); ++ isolate_pcp_pages(to_drain, pcp, &dst); ++ ++ local_unlock_irqrestore(&pa_lock.l, flags); ++ ++ if (to_drain > 0) ++ free_pcppages_bulk(zone, &dst, false); + } + #endif + +@@ -2963,14 +2997,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) + unsigned long flags; + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; ++ LIST_HEAD(dst); ++ int count; + +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + pset = per_cpu_ptr(zone->pageset, cpu); + + pcp = &pset->pcp; +- if (pcp->count) +- free_pcppages_bulk(zone, pcp->count, pcp); +- local_irq_restore(flags); ++ count = pcp->count; ++ if (count) ++ isolate_pcp_pages(count, pcp, &dst); ++ ++ local_unlock_irqrestore(&pa_lock.l, flags); ++ ++ if (count) ++ free_pcppages_bulk(zone, &dst, false); + } + + /* +@@ -3018,9 +3059,9 @@ static void drain_local_pages_wq(struct work_struct *work) + * cpu which is allright but we also have to make sure to not move to + * a different one. 
+ */ +- preempt_disable(); ++ migrate_disable(); + drain_local_pages(drain->zone); +- preempt_enable(); ++ migrate_enable(); + } + + /* +@@ -3190,7 +3231,8 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) + return true; + } + +-static void free_unref_page_commit(struct page *page, unsigned long pfn) ++static void free_unref_page_commit(struct page *page, unsigned long pfn, ++ struct list_head *dst) + { + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; +@@ -3218,8 +3260,11 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list_add(&page->lru, &pcp->lists[migratetype]); + pcp->count++; +- if (pcp->count >= READ_ONCE(pcp->high)) +- free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); ++ if (pcp->count >= READ_ONCE(pcp->high)) { ++ unsigned long batch = READ_ONCE(pcp->batch); ++ ++ isolate_pcp_pages(batch, pcp, dst); ++ } + } + + /* +@@ -3229,6 +3274,8 @@ void free_unref_page(struct page *page) + { + unsigned long flags; + unsigned long pfn = page_to_pfn(page); ++ struct zone *zone = page_zone(page); ++ LIST_HEAD(dst); + + /* Free dynamic hugetlb page */ + if (free_page_to_dhugetlb_pool(page)) +@@ -3237,9 +3284,11 @@ void free_unref_page(struct page *page) + if (!free_unref_page_prepare(page, pfn)) + return; + +- local_irq_save(flags); +- free_unref_page_commit(page, pfn); +- local_irq_restore(flags); ++ local_lock_irqsave(&pa_lock.l, flags); ++ free_unref_page_commit(page, pfn, &dst); ++ local_unlock_irqrestore(&pa_lock.l, flags); ++ if (!list_empty(&dst)) ++ free_pcppages_bulk(zone, &dst, false); + } + + /* +@@ -3250,6 +3299,11 @@ void free_unref_page_list(struct list_head *list) + struct page *page, *next; + unsigned long flags, pfn; + int batch_count = 0; ++ struct list_head dsts[__MAX_NR_ZONES]; ++ int i; ++ ++ for (i = 0; i < __MAX_NR_ZONES; i++) ++ INIT_LIST_HEAD(&dsts[i]); + + /* Free dynamic hugetlb page list */ + free_page_list_to_dhugetlb_pool(list); +@@ -3262,25 +3316,42 @@ void free_unref_page_list(struct list_head *list) + set_page_private(page, pfn); + } + +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + list_for_each_entry_safe(page, next, list, lru) { + unsigned long pfn = page_private(page); ++ enum zone_type type; + + set_page_private(page, 0); + trace_mm_page_free_batched(page); +- free_unref_page_commit(page, pfn); ++ type = page_zonenum(page); ++ free_unref_page_commit(page, pfn, &dsts[type]); + + /* + * Guard against excessive IRQ disabled times when we get + * a large list of pages to free. 
+ */ + if (++batch_count == SWAP_CLUSTER_MAX) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + batch_count = 0; +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + } + } +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); ++ ++ for (i = 0; i < __MAX_NR_ZONES; ) { ++ struct page *page; ++ struct zone *zone; ++ ++ if (list_empty(&dsts[i])) { ++ i++; ++ continue; ++ } ++ ++ page = list_first_entry(&dsts[i], struct page, lru); ++ zone = page_zone(page); ++ ++ free_pcppages_bulk(zone, &dsts[i], true); ++ } + } + + /* +@@ -3437,7 +3508,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, + struct page *page; + unsigned long flags; + +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); +@@ -3445,7 +3516,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); + zone_statistics(preferred_zone, zone); + } +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + return page; + } + +@@ -3479,7 +3550,8 @@ struct page *rmqueue(struct zone *preferred_zone, + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); +- spin_lock_irqsave(&zone->lock, flags); ++ local_lock_irqsave(&pa_lock.l, flags); ++ spin_lock(&zone->lock); + + do { + page = NULL; +@@ -3505,7 +3577,7 @@ struct page *rmqueue(struct zone *preferred_zone, + + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone); +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + + out: + /* Separate test+clear to avoid unnecessary atomics */ +@@ -3518,7 +3590,7 @@ struct page *rmqueue(struct zone *preferred_zone, + return page; + + failed: +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + return NULL; + } + +@@ -9066,7 +9138,7 @@ void zone_pcp_reset(struct zone *zone) + struct per_cpu_pageset *pset; + + /* avoid races with drain_pages() */ +- local_irq_save(flags); ++ local_lock_irqsave(&pa_lock.l, flags); + if (zone->pageset != &boot_pageset) { + for_each_online_cpu(cpu) { + pset = per_cpu_ptr(zone->pageset, cpu); +@@ -9075,7 +9147,7 @@ void zone_pcp_reset(struct zone *zone) + free_percpu(zone->pageset); + zone->pageset = &boot_pageset; + } +- local_irq_restore(flags); ++ local_unlock_irqrestore(&pa_lock.l, flags); + } + + #ifdef CONFIG_MEMORY_HOTREMOVE +diff --git a/mm/shmem.c b/mm/shmem.c +index 9df016296..d2333b15e 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -307,10 +307,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) + ino_t ino; + + if (!(sb->s_flags & SB_KERNMOUNT)) { +- spin_lock(&sbinfo->stat_lock); ++ raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->max_inodes) { + if (!sbinfo->free_inodes) { +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; +@@ -333,7 +333,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) + } + *inop = ino; + } +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + } else if (inop) { + /* + * __shmem_file_setup, one of our callers, is lock-free: it +@@ -348,13 +348,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) + * to worry about things like glibc 
compatibility. + */ + ino_t *next_ino; ++ + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); + ino = *next_ino; + if (unlikely(ino % SHMEM_INO_BATCH == 0)) { +- spin_lock(&sbinfo->stat_lock); ++ raw_spin_lock(&sbinfo->stat_lock); + ino = sbinfo->next_ino; + sbinfo->next_ino += SHMEM_INO_BATCH; +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + if (unlikely(is_zero_ino(ino))) + ino++; + } +@@ -370,9 +371,9 @@ static void shmem_free_inode(struct super_block *sb) + { + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + if (sbinfo->max_inodes) { +- spin_lock(&sbinfo->stat_lock); ++ raw_spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + } + } + +@@ -1469,10 +1470,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) + { + struct mempolicy *mpol = NULL; + if (sbinfo->mpol) { +- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ ++ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + mpol = sbinfo->mpol; + mpol_get(mpol); +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + } + return mpol; + } +@@ -3546,9 +3547,10 @@ static int shmem_reconfigure(struct fs_context *fc) + struct shmem_options *ctx = fc->fs_private; + struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); + unsigned long inodes; ++ struct mempolicy *mpol = NULL; + const char *err; + +- spin_lock(&sbinfo->stat_lock); ++ raw_spin_lock(&sbinfo->stat_lock); + inodes = sbinfo->max_inodes - sbinfo->free_inodes; + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { + if (!sbinfo->max_blocks) { +@@ -3593,14 +3595,15 @@ static int shmem_reconfigure(struct fs_context *fc) + * Preserve previous mempolicy unless mpol remount option was specified. 
+ */ + if (ctx->mpol) { +- mpol_put(sbinfo->mpol); ++ mpol = sbinfo->mpol; + sbinfo->mpol = ctx->mpol; /* transfers initial ref */ + ctx->mpol = NULL; + } +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); ++ mpol_put(mpol); + return 0; + out: +- spin_unlock(&sbinfo->stat_lock); ++ raw_spin_unlock(&sbinfo->stat_lock); + return invalfc(fc, "%s", err); + } + +@@ -3717,7 +3720,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) + sbinfo->mpol = ctx->mpol; + ctx->mpol = NULL; + +- spin_lock_init(&sbinfo->stat_lock); ++ raw_spin_lock_init(&sbinfo->stat_lock); + if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) + goto failed; + spin_lock_init(&sbinfo->shrinklist_lock); +diff --git a/mm/slab.c b/mm/slab.c +index ae84578f3..a65a5f169 100644 +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -234,7 +234,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) + parent->shared = NULL; + parent->alien = NULL; + parent->colour_next = 0; +- spin_lock_init(&parent->list_lock); ++ raw_spin_lock_init(&parent->list_lock); + parent->free_objects = 0; + parent->free_touched = 0; + } +@@ -559,9 +559,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, + page_node = page_to_nid(page); + n = get_node(cachep, page_node); + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + slabs_destroy(cachep, &list); + } +@@ -699,7 +699,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, + struct kmem_cache_node *n = get_node(cachep, node); + + if (ac->avail) { +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + /* + * Stuff objects into the remote nodes shared array first. 
+ * That way we could avoid the overhead of putting the objects +@@ -710,7 +710,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, + + free_block(cachep, ac->entry, ac->avail, node, list); + ac->avail = 0; +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + } + } + +@@ -783,9 +783,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, + slabs_destroy(cachep, &list); + } else { + n = get_node(cachep, page_node); +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); + } + return 1; +@@ -826,10 +826,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) + */ + n = get_node(cachep, node); + if (n) { +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + + cachep->num; +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + return 0; + } +@@ -908,7 +908,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, + goto fail; + + n = get_node(cachep, node); +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + if (n->shared && force_change) { + free_block(cachep, n->shared->entry, + n->shared->avail, node, &list); +@@ -926,7 +926,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, + new_alien = NULL; + } + +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + + /* +@@ -965,7 +965,7 @@ static void cpuup_canceled(long cpu) + if (!n) + continue; + +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + + /* Free limit for this kmem_cache_node */ + n->free_limit -= cachep->batchcount; +@@ -976,7 +976,7 @@ static void cpuup_canceled(long cpu) + nc->avail = 0; + + if (!cpumask_empty(mask)) { +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + goto free_slab; + } + +@@ -990,7 +990,7 @@ static void cpuup_canceled(long cpu) + alien = n->alien; + n->alien = NULL; + +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + kfree(shared); + if (alien) { +@@ -1174,7 +1174,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node * + /* + * Do not assume that spinlocks can be initialized via memcpy: + */ +- spin_lock_init(&ptr->list_lock); ++ raw_spin_lock_init(&ptr->list_lock); + + MAKE_ALL_LISTS(cachep, ptr, nodeid); + cachep->node[nodeid] = ptr; +@@ -1345,11 +1345,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) + for_each_kmem_cache_node(cachep, node, n) { + unsigned long total_slabs, free_slabs, free_objs; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + total_slabs = n->total_slabs; + free_slabs = n->free_slabs; + free_objs = n->free_objects; +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", + node, total_slabs - free_slabs, total_slabs, +@@ -2106,7 +2106,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) + { + #ifdef CONFIG_SMP + check_irq_off(); +- assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); ++ assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); + #endif + } + +@@ -2114,7 +2114,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) + { + 
#ifdef CONFIG_SMP + check_irq_off(); +- assert_spin_locked(&get_node(cachep, node)->list_lock); ++ assert_raw_spin_locked(&get_node(cachep, node)->list_lock); + #endif + } + +@@ -2154,9 +2154,9 @@ static void do_drain(void *arg) + check_irq_off(); + ac = cpu_cache_get(cachep); + n = get_node(cachep, node); +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + ac->avail = 0; + slabs_destroy(cachep, &list); + } +@@ -2174,9 +2174,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) + drain_alien_cache(cachep, n->alien); + + for_each_kmem_cache_node(cachep, node, n) { +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, n->shared, node, true, &list); +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); + } +@@ -2198,10 +2198,10 @@ static int drain_freelist(struct kmem_cache *cache, + nr_freed = 0; + while (nr_freed < tofree && !list_empty(&n->slabs_free)) { + +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + p = n->slabs_free.prev; + if (p == &n->slabs_free) { +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + goto out; + } + +@@ -2214,7 +2214,7 @@ static int drain_freelist(struct kmem_cache *cache, + * to the cache. + */ + n->free_objects -= cache->num; +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + slab_destroy(cache, page); + nr_freed++; + } +@@ -2650,7 +2650,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) + INIT_LIST_HEAD(&page->slab_list); + n = get_node(cachep, page_to_nid(page)); + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + n->total_slabs++; + if (!page->active) { + list_add_tail(&page->slab_list, &n->slabs_free); +@@ -2660,7 +2660,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) + + STATS_INC_GROWN(cachep); + n->free_objects += cachep->num - page->active; +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + fixup_objfreelist_debug(cachep, &list); + } +@@ -2826,7 +2826,7 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) + { + struct page *page; + +- assert_spin_locked(&n->list_lock); ++ assert_raw_spin_locked(&n->list_lock); + page = list_first_entry_or_null(&n->slabs_partial, struct page, + slab_list); + if (!page) { +@@ -2853,10 +2853,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + page = get_first_slab(n, true); + if (!page) { +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + return NULL; + } + +@@ -2865,7 +2865,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + + fixup_slab_list(cachep, n, page, &list); + +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + return obj; +@@ -2924,7 +2924,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) + if (!n->free_objects && (!shared || !shared->avail)) + goto direct_grow; + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + shared = READ_ONCE(n->shared); + + /* See if we can refill from the shared array */ +@@ -2948,7 +2948,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) + must_grow: + 
n->free_objects -= ac->avail; + alloc_done: +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + direct_grow: +@@ -3173,7 +3173,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + BUG_ON(!n); + + check_irq_off(); +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + page = get_first_slab(n, false); + if (!page) + goto must_grow; +@@ -3191,12 +3191,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + + fixup_slab_list(cachep, n, page, &list); + +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + return obj; + + must_grow: +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + if (page) { + /* This slab isn't counted yet so don't update free_objects */ +@@ -3384,7 +3384,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) + + check_irq_off(); + n = get_node(cachep, node); +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + if (n->shared) { + struct array_cache *shared_array = n->shared; + int max = shared_array->limit - shared_array->avail; +@@ -3413,7 +3413,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) + STATS_SET_FREEABLE(cachep, i); + } + #endif +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + ac->avail -= batchcount; + memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); + slabs_destroy(cachep, &list); +@@ -3849,9 +3849,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, + + node = cpu_to_mem(cpu); + n = get_node(cachep, node); +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + } + free_percpu(prev); +@@ -3946,9 +3946,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, + return; + } + +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, ac, node, false, &list); +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); + } +@@ -4032,7 +4032,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) + + for_each_kmem_cache_node(cachep, node, n) { + check_irq_on(); +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + + total_slabs += n->total_slabs; + free_slabs += n->free_slabs; +@@ -4041,7 +4041,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) + if (n->shared) + shared_avail += n->shared->avail; + +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + } + num_objs = total_slabs * cachep->num; + active_slabs = total_slabs - free_slabs; +diff --git a/mm/slab.h b/mm/slab.h +index 8414c3451..d937f8673 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -523,7 +523,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, + * The slab lists for all objects. 
+ */ + struct kmem_cache_node { +- spinlock_t list_lock; ++ raw_spinlock_t list_lock; + + #ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ +diff --git a/mm/slub.c b/mm/slub.c +index 98452815a..b0b21c2b5 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -458,7 +458,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + + #ifdef CONFIG_SLUB_DEBUG + static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; +-static DEFINE_SPINLOCK(object_map_lock); ++static DEFINE_RAW_SPINLOCK(object_map_lock); + + /* + * Determine a map of object in use on a page. +@@ -474,7 +474,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) + + VM_BUG_ON(!irqs_disabled()); + +- spin_lock(&object_map_lock); ++ raw_spin_lock(&object_map_lock); + + bitmap_zero(object_map, page->objects); + +@@ -487,7 +487,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) + static void put_map(unsigned long *map) __releases(&object_map_lock) + { + VM_BUG_ON(map != object_map); +- spin_unlock(&object_map_lock); ++ raw_spin_unlock(&object_map_lock); + } + + static inline unsigned int size_from_object(struct kmem_cache *s) +@@ -1238,7 +1238,7 @@ static noinline int free_debug_processing( + unsigned long flags; + int ret = 0; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + slab_lock(page); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { +@@ -1273,7 +1273,7 @@ static noinline int free_debug_processing( + bulk_cnt, cnt); + + slab_unlock(page); +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); + return ret; +@@ -1521,6 +1521,12 @@ static bool freelist_corrupted(struct kmem_cache *s, struct page *page, + } + #endif /* CONFIG_SLUB_DEBUG */ + ++struct slub_free_list { ++ raw_spinlock_t lock; ++ struct list_head list; ++}; ++static DEFINE_PER_CPU(struct slub_free_list, slub_free_list); ++ + /* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. 
+@@ -1776,10 +1782,18 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) + void *start, *p, *next; + int idx; + bool shuffle; ++ bool enableirqs = false; + + flags &= gfp_allowed_mask; + + if (gfpflags_allow_blocking(flags)) ++ enableirqs = true; ++ ++#ifdef CONFIG_PREEMPT_RT ++ if (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND) ++ enableirqs = true; ++#endif ++ if (enableirqs) + local_irq_enable(); + + flags |= s->allocflags; +@@ -1838,7 +1852,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) + page->frozen = 1; + + out: +- if (gfpflags_allow_blocking(flags)) ++ if (enableirqs) + local_irq_disable(); + if (!page) + return NULL; +@@ -1881,6 +1895,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) + __free_pages(page, order); + } + ++static void free_delayed(struct list_head *h) ++{ ++ while (!list_empty(h)) { ++ struct page *page = list_first_entry(h, struct page, lru); ++ ++ list_del(&page->lru); ++ __free_slab(page->slab_cache, page); ++ } ++} ++ + static void rcu_free_slab(struct rcu_head *h) + { + struct page *page = container_of(h, struct page, rcu_head); +@@ -1892,6 +1916,12 @@ static void free_slab(struct kmem_cache *s, struct page *page) + { + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { + call_rcu(&page->rcu_head, rcu_free_slab); ++ } else if (irqs_disabled()) { ++ struct slub_free_list *f = this_cpu_ptr(&slub_free_list); ++ ++ raw_spin_lock(&f->lock); ++ list_add(&page->lru, &f->list); ++ raw_spin_unlock(&f->lock); + } else + __free_slab(s, page); + } +@@ -1999,7 +2029,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + if (!n || !n->nr_partial) + return NULL; + +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + list_for_each_entry_safe(page, page2, &n->partial, slab_list) { + void *t; + +@@ -2024,7 +2054,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, + break; + + } +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + return object; + } + +@@ -2267,7 +2297,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, + * that acquire_slab() will see a slab page that + * is frozen + */ +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + } + } else { + m = M_FULL; +@@ -2279,7 +2309,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, + * slabs from diagnostic functions will not see + * any frozen slabs. 
+ */ +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + } + #endif + } +@@ -2304,7 +2334,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, + goto redo; + + if (lock) +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + if (m == M_PARTIAL) + stat(s, tail); +@@ -2343,10 +2373,10 @@ static void unfreeze_partials(struct kmem_cache *s, + n2 = get_node(s, page_to_nid(page)); + if (n != n2) { + if (n) +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + n = n2; +- spin_lock(&n->list_lock); ++ raw_spin_lock(&n->list_lock); + } + + do { +@@ -2375,7 +2405,7 @@ static void unfreeze_partials(struct kmem_cache *s, + } + + if (n) +- spin_unlock(&n->list_lock); ++ raw_spin_unlock(&n->list_lock); + + while (discard_page) { + page = discard_page; +@@ -2412,14 +2442,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > slub_cpu_partial(s)) { ++ struct slub_free_list *f; + unsigned long flags; ++ LIST_HEAD(tofree); + /* + * partial array is full. Move the existing + * set to the per node partial list. + */ + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); ++ f = this_cpu_ptr(&slub_free_list); ++ raw_spin_lock(&f->lock); ++ list_splice_init(&f->list, &tofree); ++ raw_spin_unlock(&f->lock); + local_irq_restore(flags); ++ free_delayed(&tofree); + oldpage = NULL; + pobjects = 0; + pages = 0; +@@ -2487,7 +2524,19 @@ static bool has_cpu_slab(int cpu, void *info) + + static void flush_all(struct kmem_cache *s) + { ++ LIST_HEAD(tofree); ++ int cpu; ++ + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); ++ for_each_online_cpu(cpu) { ++ struct slub_free_list *f; ++ ++ f = &per_cpu(slub_free_list, cpu); ++ raw_spin_lock_irq(&f->lock); ++ list_splice_init(&f->list, &tofree); ++ raw_spin_unlock_irq(&f->lock); ++ free_delayed(&tofree); ++ } + } + + /* +@@ -2542,10 +2591,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, + unsigned long x = 0; + struct page *page; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, slab_list) + x += get_count(page); +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + return x; + } + #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ +@@ -2684,8 +2733,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) + * already disabled (which is the case for bulk allocation). 
+ */ + static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +- unsigned long addr, struct kmem_cache_cpu *c) ++ unsigned long addr, struct kmem_cache_cpu *c, ++ struct list_head *to_free) + { ++ struct slub_free_list *f; + void *freelist; + struct page *page; + +@@ -2753,6 +2804,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + VM_BUG_ON(!c->page->frozen); + c->freelist = get_freepointer(s, freelist); + c->tid = next_tid(c->tid); ++ ++out: ++ f = this_cpu_ptr(&slub_free_list); ++ raw_spin_lock(&f->lock); ++ list_splice_init(&f->list, to_free); ++ raw_spin_unlock(&f->lock); ++ + return freelist; + + new_slab: +@@ -2768,7 +2826,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + + if (unlikely(!freelist)) { + slab_out_of_memory(s, gfpflags, node); +- return NULL; ++ goto out; + } + + page = c->page; +@@ -2781,7 +2839,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + goto new_slab; /* Slab failed checks. Next slab needed */ + + deactivate_slab(s, page, get_freepointer(s, freelist), c); +- return freelist; ++ goto out; + } + + /* +@@ -2793,6 +2851,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + { + void *p; + unsigned long flags; ++ LIST_HEAD(tofree); + + local_irq_save(flags); + #ifdef CONFIG_PREEMPTION +@@ -2804,8 +2863,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + c = this_cpu_ptr(s->cpu_slab); + #endif + +- p = ___slab_alloc(s, gfpflags, node, addr, c); ++ p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree); + local_irq_restore(flags); ++ free_delayed(&tofree); + return p; + } + +@@ -2839,6 +2899,10 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, + unsigned long tid; + struct obj_cgroup *objcg = NULL; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP)) ++ WARN_ON_ONCE(!preemptible() && ++ (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND)); ++ + s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); + if (!s) + return NULL; +@@ -3013,7 +3077,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, + + do { + if (unlikely(n)) { +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + n = NULL; + } + prior = page->freelist; +@@ -3045,7 +3109,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. 
+ */ +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + + } + } +@@ -3087,7 +3151,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, + add_partial(n, page, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + return; + + slab_empty: +@@ -3102,7 +3166,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, + remove_full(s, n, page); + } + +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, FREE_SLAB); + discard_slab(s, page); + } +@@ -3329,9 +3393,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) + { + struct kmem_cache_cpu *c; ++ LIST_HEAD(to_free); + int i; + struct obj_cgroup *objcg = NULL; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP)) ++ WARN_ON_ONCE(!preemptible() && ++ (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND)); ++ + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, &objcg, size, flags); + if (unlikely(!s)) +@@ -3368,7 +3437,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + * of re-populating per CPU c->freelist + */ + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, +- _RET_IP_, c); ++ _RET_IP_, c, &to_free); + if (unlikely(!p[i])) + goto error; + +@@ -3383,6 +3452,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + } + c->tid = next_tid(c->tid); + local_irq_enable(); ++ free_delayed(&to_free); + + /* Clear memory outside IRQ disabled fastpath loop */ + if (unlikely(slab_want_init_on_alloc(flags, s))) { +@@ -3397,6 +3467,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + return i; + error: + local_irq_enable(); ++ free_delayed(&to_free); + slab_post_alloc_hook(s, objcg, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; +@@ -3532,7 +3603,7 @@ static void + init_kmem_cache_node(struct kmem_cache_node *n) + { + n->nr_partial = 0; +- spin_lock_init(&n->list_lock); ++ raw_spin_lock_init(&n->list_lock); + INIT_LIST_HEAD(&n->partial); + #ifdef CONFIG_SLUB_DEBUG + atomic_long_set(&n->nr_slabs, 0); +@@ -3927,7 +3998,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) + struct page *page, *h; + + BUG_ON(irqs_disabled()); +- spin_lock_irq(&n->list_lock); ++ raw_spin_lock_irq(&n->list_lock); + list_for_each_entry_safe(page, h, &n->partial, slab_list) { + if (!page->inuse) { + remove_partial(n, page); +@@ -3937,7 +4008,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) + "Objects remaining in %s on __kmem_cache_shutdown()"); + } + } +- spin_unlock_irq(&n->list_lock); ++ raw_spin_unlock_irq(&n->list_lock); + + list_for_each_entry_safe(page, h, &discard, slab_list) + discard_slab(s, page); +@@ -4206,7 +4277,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) + INIT_LIST_HEAD(promote + i); + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists of slabs to discard or promote. 
+@@ -4237,7 +4308,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) + list_splice(promote + i, &n->partial); + +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, &discard, slab_list) +@@ -4413,6 +4484,12 @@ void __init kmem_cache_init(void) + static __initdata struct kmem_cache boot_kmem_cache, + boot_kmem_cache_node; + int node; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); ++ INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); ++ } + + if (debug_guardpage_minorder()) + slub_max_order = 0; +@@ -4611,7 +4688,7 @@ static int validate_slab_node(struct kmem_cache *s, + struct page *page; + unsigned long flags; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry(page, &n->partial, slab_list) { + validate_slab(s, page); +@@ -4633,7 +4710,7 @@ static int validate_slab_node(struct kmem_cache *s, + s->name, count, atomic_long_read(&n->nr_slabs)); + + out: +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + return count; + } + +@@ -4684,6 +4761,9 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) + struct location *l; + int order; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && flags == GFP_ATOMIC) ++ return 0; ++ + order = get_order(sizeof(struct location) * max); + + l = (void *)__get_free_pages(flags, order); +@@ -4812,12 +4892,12 @@ static int list_locations(struct kmem_cache *s, char *buf, + if (!atomic_long_read(&n->nr_slabs)) + continue; + +- spin_lock_irqsave(&n->list_lock, flags); ++ raw_spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, slab_list) + process_slab(&t, s, page, alloc); + list_for_each_entry(page, &n->full, slab_list) + process_slab(&t, s, page, alloc); +- spin_unlock_irqrestore(&n->list_lock, flags); ++ raw_spin_unlock_irqrestore(&n->list_lock, flags); + } + + for (i = 0; i < t.count; i++) { +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index dadbea292..dd7da773b 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -1886,7 +1886,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) + struct vmap_block *vb; + struct vmap_area *va; + unsigned long vb_idx; +- int node, err; ++ int node, err, cpu; + void *vaddr; + + node = numa_node_id(); +@@ -1923,11 +1923,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) + return ERR_PTR(err); + } + +- vbq = &get_cpu_var(vmap_block_queue); ++ cpu = get_cpu_light(); ++ vbq = this_cpu_ptr(&vmap_block_queue); + spin_lock(&vbq->lock); + list_add_tail_rcu(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); +- put_cpu_var(vmap_block_queue); ++ put_cpu_light(); + + return vaddr; + } +@@ -1992,6 +1993,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + struct vmap_block *vb; + void *vaddr = NULL; + unsigned int order; ++ int cpu; + + BUG_ON(offset_in_page(size)); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); +@@ -2006,7 +2008,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + order = get_order(size); + + rcu_read_lock(); +- vbq = &get_cpu_var(vmap_block_queue); ++ cpu = get_cpu_light(); ++ vbq = this_cpu_ptr(&vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + unsigned long pages_off; + +@@ -2029,7 +2032,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + break; + 
} + +- put_cpu_var(vmap_block_queue); ++ put_cpu_light(); + rcu_read_unlock(); + + /* Allocate new block if nothing was found */ +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 2d9d742ec..a0a557510 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -321,6 +321,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + long x; + long t; + ++ preempt_disable_rt(); + x = delta + __this_cpu_read(*p); + + t = __this_cpu_read(pcp->stat_threshold); +@@ -330,6 +331,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + x = 0; + } + __this_cpu_write(*p, x); ++ preempt_enable_rt(); + } + EXPORT_SYMBOL(__mod_zone_page_state); + +@@ -346,6 +348,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + delta >>= PAGE_SHIFT; + } + ++ preempt_disable_rt(); + x = delta + __this_cpu_read(*p); + + t = __this_cpu_read(pcp->stat_threshold); +@@ -355,6 +358,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, + x = 0; + } + __this_cpu_write(*p, x); ++ preempt_enable_rt(); + } + EXPORT_SYMBOL(__mod_node_page_state); + +@@ -387,6 +391,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; + ++ preempt_disable_rt(); + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { +@@ -395,6 +400,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) + zone_page_state_add(v + overstep, zone, item); + __this_cpu_write(*p, -overstep); + } ++ preempt_enable_rt(); + } + + void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -405,6 +411,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) + + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + ++ preempt_disable_rt(); + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { +@@ -413,6 +420,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) + node_page_state_add(v + overstep, pgdat, item); + __this_cpu_write(*p, -overstep); + } ++ preempt_enable_rt(); + } + + void __inc_zone_page_state(struct page *page, enum zone_stat_item item) +@@ -433,6 +441,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; + ++ preempt_disable_rt(); + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { +@@ -441,6 +450,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item) + zone_page_state_add(v - overstep, zone, item); + __this_cpu_write(*p, overstep); + } ++ preempt_enable_rt(); + } + + void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) +@@ -451,6 +461,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) + + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + ++ preempt_disable_rt(); + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { +@@ -459,6 +470,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) + node_page_state_add(v - overstep, pgdat, item); + __this_cpu_write(*p, overstep); + } ++ preempt_enable_rt(); + } + + void __dec_zone_page_state(struct page *page, enum zone_stat_item item) +diff --git a/mm/workingset.c b/mm/workingset.c +index 4a30e4a81..4c92584ca 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -431,6 +431,8 @@ static struct list_lru shadow_nodes; + + void 
workingset_update_node(struct xa_node *node) + { ++ struct address_space *mapping; ++ + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. +@@ -439,7 +441,8 @@ void workingset_update_node(struct xa_node *node) + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by the i_pages lock. + */ +- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ ++ mapping = container_of(node->array, struct address_space, i_pages); ++ lockdep_assert_held(&mapping->i_pages.xa_lock); + + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) { +diff --git a/mm/z3fold.c b/mm/z3fold.c +index f75c638c6..6fdf4774f 100644 +--- a/mm/z3fold.c ++++ b/mm/z3fold.c +@@ -623,14 +623,16 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool, + { + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || + zhdr->middle_chunks == 0) { +- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); +- ++ struct list_head *unbuddied; + int freechunks = num_free_chunks(zhdr); ++ ++ migrate_disable(); ++ unbuddied = this_cpu_ptr(pool->unbuddied); + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[freechunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); +- put_cpu_ptr(pool->unbuddied); ++ migrate_enable(); + } + } + +@@ -880,8 +882,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + int chunks = size_to_chunks(size), i; + + lookup: ++ migrate_disable(); + /* First, try to find an unbuddied z3fold page. */ +- unbuddied = get_cpu_ptr(pool->unbuddied); ++ unbuddied = this_cpu_ptr(pool->unbuddied); + for_each_unbuddied_list(i, chunks) { + struct list_head *l = &unbuddied[i]; + +@@ -899,7 +902,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; +- put_cpu_ptr(pool->unbuddied); ++ migrate_enable(); + if (can_sleep) + cond_resched(); + goto lookup; +@@ -913,7 +916,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + test_bit(PAGE_CLAIMED, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; +- put_cpu_ptr(pool->unbuddied); ++ migrate_enable(); + if (can_sleep) + cond_resched(); + goto lookup; +@@ -928,7 +931,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + kref_get(&zhdr->refcount); + break; + } +- put_cpu_ptr(pool->unbuddied); ++ migrate_enable(); + + if (!zhdr) { + int cpu; +diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c +index 73cd50735..142170f87 100644 +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -57,6 +57,7 @@ + #include + #include + #include ++#include + + #define ZSPAGE_MAGIC 0x58 + +@@ -77,6 +78,20 @@ + + #define ZS_HANDLE_SIZE (sizeof(unsigned long)) + ++#ifdef CONFIG_PREEMPT_RT ++ ++struct zsmalloc_handle { ++ unsigned long addr; ++ spinlock_t lock; ++}; ++ ++#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle)) ++ ++#else ++ ++#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long)) ++#endif ++ + /* + * Object location (, ) is encoded as + * a single (unsigned long) handle value. 
+@@ -293,6 +308,7 @@ struct zspage { + }; + + struct mapping_area { ++ local_lock_t lock; + char *vm_buf; /* copy buffer for objects that span pages */ + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ +@@ -322,7 +338,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} + + static int create_cache(struct zs_pool *pool) + { +- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, ++ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE, + 0, 0, NULL); + if (!pool->handle_cachep) + return 1; +@@ -346,10 +362,27 @@ static void destroy_cache(struct zs_pool *pool) + + static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) + { +- return (unsigned long)kmem_cache_alloc(pool->handle_cachep, +- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); ++ void *p; ++ ++ p = kmem_cache_alloc(pool->handle_cachep, ++ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); ++#ifdef CONFIG_PREEMPT_RT ++ if (p) { ++ struct zsmalloc_handle *zh = p; ++ ++ spin_lock_init(&zh->lock); ++ } ++#endif ++ return (unsigned long)p; + } + ++#ifdef CONFIG_PREEMPT_RT ++static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle) ++{ ++ return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1)); ++} ++#endif ++ + static void cache_free_handle(struct zs_pool *pool, unsigned long handle) + { + kmem_cache_free(pool->handle_cachep, (void *)handle); +@@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) + + static void record_obj(unsigned long handle, unsigned long obj) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ WRITE_ONCE(zh->addr, obj); ++#else + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. 
+ */ + WRITE_ONCE(*(unsigned long *)handle, obj); ++#endif + } + + /* zpool driver */ +@@ -455,7 +494,10 @@ MODULE_ALIAS("zpool-zsmalloc"); + #endif /* CONFIG_ZPOOL */ + + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +-static DEFINE_PER_CPU(struct mapping_area, zs_map_area); ++static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { ++ /* XXX remove this and use a spin_lock_t in pin_tag() */ ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static bool is_zspage_isolated(struct zspage *zspage) + { +@@ -865,7 +907,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) + + static unsigned long handle_to_obj(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return zh->addr; ++#else + return *(unsigned long *)handle; ++#endif + } + + static unsigned long obj_to_head(struct page *page, void *obj) +@@ -879,22 +927,46 @@ static unsigned long obj_to_head(struct page *page, void *obj) + + static inline int testpin_tag(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_is_locked(&zh->lock); ++#else + return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static inline int trypin_tag(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_trylock(&zh->lock); ++#else + return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void pin_tag(unsigned long handle) __acquires(bitlock) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_lock(&zh->lock); ++#else + bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void unpin_tag(unsigned long handle) __releases(bitlock) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_unlock(&zh->lock); ++#else + bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void reset_page(struct page *page) +@@ -1278,7 +1350,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, + class = pool->size_class[class_idx]; + off = (class->size * obj_idx) & ~PAGE_MASK; + +- area = &get_cpu_var(zs_map_area); ++ local_lock(&zs_map_area.lock); ++ area = this_cpu_ptr(&zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ +@@ -1332,7 +1405,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) + + __zs_unmap_object(area, pages, off, class->size); + } +- put_cpu_var(zs_map_area); ++ local_unlock(&zs_map_area.lock); + + migrate_read_unlock(zspage); + unpin_tag(handle); +diff --git a/mm/zswap.c b/mm/zswap.c +index 030254e04..f848f93a1 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + #include + #include + #include +diff --git a/net/Kconfig b/net/Kconfig +index d6567162c..05b0f041f 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -282,7 +282,7 @@ config CGROUP_NET_CLASSID + + config NET_RX_BUSY_POLL + bool +- default y ++ default y if !PREEMPT_RT + + config BQL + bool +diff --git a/net/core/dev.c b/net/core/dev.c +index f20f0d5e5..8b857021d 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -221,14 +221,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) + static inline void rps_lock(struct 
softnet_data *sd) + { + #ifdef CONFIG_RPS +- spin_lock(&sd->input_pkt_queue.lock); ++ raw_spin_lock(&sd->input_pkt_queue.raw_lock); + #endif + } + + static inline void rps_unlock(struct softnet_data *sd) + { + #ifdef CONFIG_RPS +- spin_unlock(&sd->input_pkt_queue.lock); ++ raw_spin_unlock(&sd->input_pkt_queue.raw_lock); + #endif + } + +@@ -3050,6 +3050,7 @@ static void __netif_reschedule(struct Qdisc *q) + sd->output_queue_tailp = &q->next_sched; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + + void __netif_schedule(struct Qdisc *q) +@@ -3112,6 +3113,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) + __this_cpu_write(softnet_data.completion_queue, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(__dev_kfree_skb_irq); + +@@ -3786,7 +3788,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + * This permits qdisc->running owner to get the lock more + * often and dequeue packets faster. + */ ++#ifdef CONFIG_PREEMPT_RT ++ contended = true; ++#else + contended = qdisc_is_running(q); ++#endif + if (unlikely(contended)) + spin_lock(&q->busylock); + +@@ -4585,6 +4591,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + rps_unlock(sd); + + local_irq_restore(flags); ++ preempt_check_resched_rt(); + + atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb(skb); +@@ -4800,7 +4807,7 @@ static int netif_rx_internal(struct sk_buff *skb) + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu; + +- preempt_disable(); ++ migrate_disable(); + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); +@@ -4810,14 +4817,14 @@ static int netif_rx_internal(struct sk_buff *skb) + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + + rcu_read_unlock(); +- preempt_enable(); ++ migrate_enable(); + } else + #endif + { + unsigned int qtail; + +- ret = enqueue_to_backlog(skb, get_cpu(), &qtail); +- put_cpu(); ++ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); ++ put_cpu_light(); + } + return ret; + } +@@ -4856,11 +4863,9 @@ int netif_rx_ni(struct sk_buff *skb) + + trace_netif_rx_ni_entry(skb); + +- preempt_disable(); ++ local_bh_disable(); + err = netif_rx_internal(skb); +- if (local_softirq_pending()) +- do_softirq(); +- preempt_enable(); ++ local_bh_enable(); + trace_netif_rx_ni_exit(err); + + return err; +@@ -6336,12 +6341,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) + sd->rps_ipi_list = NULL; + + local_irq_enable(); ++ preempt_check_resched_rt(); + + /* Send pending IPI's to kick RPS processing on remote cpus. 
*/ + net_rps_send_ipi(remsd); + } else + #endif + local_irq_enable(); ++ preempt_check_resched_rt(); + } + + static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +@@ -6419,6 +6426,7 @@ void __napi_schedule(struct napi_struct *n) + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(__napi_schedule); + +@@ -10981,6 +10989,7 @@ static int dev_cpu_dead(unsigned int oldcpu) + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); ++ preempt_check_resched_rt(); + + #ifdef CONFIG_RPS + remsd = oldsd->rps_ipi_list; +@@ -10994,7 +11003,7 @@ static int dev_cpu_dead(unsigned int oldcpu) + netif_rx_ni(skb); + input_queue_head_incr(oldsd); + } +- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { ++ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + netif_rx_ni(skb); + input_queue_head_incr(oldsd); + } +@@ -11310,7 +11319,7 @@ static int __init net_dev_init(void) + + INIT_WORK(flush, flush_backlog); + +- skb_queue_head_init(&sd->input_pkt_queue); ++ skb_queue_head_init_raw(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); + #ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c +index 8e582e29a..e51f4854d 100644 +--- a/net/core/gen_estimator.c ++++ b/net/core/gen_estimator.c +@@ -42,7 +42,7 @@ + struct net_rate_estimator { + struct gnet_stats_basic_packed *bstats; + spinlock_t *stats_lock; +- seqcount_t *running; ++ net_seqlock_t *running; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + u8 ewma_log; + u8 intvl_log; /* period : (250ms << intvl_log) */ +@@ -125,7 +125,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, ++ net_seqlock_t *running, + struct nlattr *opt) + { + struct gnet_estimator *parm = nla_data(opt); +@@ -226,7 +226,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt) ++ net_seqlock_t *running, struct nlattr *opt) + { + return gen_new_estimator(bstats, cpu_bstats, rate_est, + lock, running, opt); +diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c +index e491b083b..ef432cea2 100644 +--- a/net/core/gen_stats.c ++++ b/net/core/gen_stats.c +@@ -137,7 +137,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, + } + + void +-__gnet_stats_copy_basic(const seqcount_t *running, ++__gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_stats_basic_packed *bstats, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +@@ -150,15 +150,15 @@ __gnet_stats_copy_basic(const seqcount_t *running, + } + do { + if (running) +- seq = read_seqcount_begin(running); ++ seq = net_seq_begin(running); + bstats->bytes = b->bytes; + bstats->packets = b->packets; +- } while (running && read_seqcount_retry(running, seq)); ++ } while (running && net_seq_retry(running, seq)); + } + EXPORT_SYMBOL(__gnet_stats_copy_basic); + + static int +-___gnet_stats_copy_basic(const seqcount_t *running, ++___gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b, +@@ -204,7 +204,7 @@ ___gnet_stats_copy_basic(const seqcount_t 
*running, + * if the room in the socket buffer was not sufficient. + */ + int +-gnet_stats_copy_basic(const seqcount_t *running, ++gnet_stats_copy_basic(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +@@ -228,7 +228,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); + * if the room in the socket buffer was not sufficient. + */ + int +-gnet_stats_copy_basic_hw(const seqcount_t *running, ++gnet_stats_copy_basic_hw(net_seqlock_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +diff --git a/net/core/sock.c b/net/core/sock.c +index 2fa8863ca..e96d3695b 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -3057,12 +3057,11 @@ void lock_sock_nested(struct sock *sk, int subclass) + if (sk->sk_lock.owned) + __lock_sock(sk); + sk->sk_lock.owned = 1; +- spin_unlock(&sk->sk_lock.slock); ++ spin_unlock_bh(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); +- local_bh_enable(); + } + EXPORT_SYMBOL(lock_sock_nested); + +@@ -3111,12 +3110,11 @@ bool lock_sock_fast(struct sock *sk) + + __lock_sock(sk); + sk->sk_lock.owned = 1; +- spin_unlock(&sk->sk_lock.slock); ++ spin_unlock_bh(&sk->sk_lock.slock); + /* + * The sk_lock has mutex_lock() semantics here: + */ + mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); +- local_bh_enable(); + return true; + } + EXPORT_SYMBOL(lock_sock_fast); +diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c +index fe74b45ae..2c67e2fd9 100644 +--- a/net/ipv4/inet_hashtables.c ++++ b/net/ipv4/inet_hashtables.c +@@ -637,7 +637,9 @@ int __inet_hash(struct sock *sk, struct sock *osk) + int err = 0; + + if (sk->sk_state != TCP_LISTEN) { ++ local_bh_disable(); + inet_ehash_nolisten(sk, osk, NULL); ++ local_bh_enable(); + return 0; + } + WARN_ON(!sk_unhashed(sk)); +@@ -669,11 +671,8 @@ int inet_hash(struct sock *sk) + { + int err = 0; + +- if (sk->sk_state != TCP_CLOSE) { +- local_bh_disable(); ++ if (sk->sk_state != TCP_CLOSE) + err = __inet_hash(sk, NULL); +- local_bh_enable(); +- } + + return err; + } +@@ -684,17 +683,20 @@ void inet_unhash(struct sock *sk) + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_listen_hashbucket *ilb = NULL; + spinlock_t *lock; ++ bool state_listen; + + if (sk_unhashed(sk)) + return; + + if (sk->sk_state == TCP_LISTEN) { ++ state_listen = true; + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; +- lock = &ilb->lock; ++ spin_lock(&ilb->lock); + } else { ++ state_listen = false; + lock = inet_ehash_lockp(hashinfo, sk->sk_hash); ++ spin_lock_bh(lock); + } +- spin_lock_bh(lock); + if (sk_unhashed(sk)) + goto unlock; + +@@ -707,7 +709,10 @@ void inet_unhash(struct sock *sk) + __sk_nulls_del_node_init_rcu(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + unlock: +- spin_unlock_bh(lock); ++ if (state_listen) ++ spin_unlock(&ilb->lock); ++ else ++ spin_unlock_bh(lock); + } + EXPORT_SYMBOL_GPL(inet_unhash); + +diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c +index c9e7ecc7a..40203255e 100644 +--- a/net/ipv6/inet6_hashtables.c ++++ b/net/ipv6/inet6_hashtables.c +@@ -333,11 +333,8 @@ int inet6_hash(struct sock *sk) + { + int err = 0; + +- if (sk->sk_state != TCP_CLOSE) { +- local_bh_disable(); ++ if (sk->sk_state != TCP_CLOSE) + err = __inet_hash(sk, NULL); +- local_bh_enable(); +- } + + return err; + } +diff --git a/net/sched/sch_api.c 
b/net/sched/sch_api.c +index 6e18aa417..2d538f14e 100644 +--- a/net/sched/sch_api.c ++++ b/net/sched/sch_api.c +@@ -1264,7 +1264,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, + rcu_assign_pointer(sch->stab, stab); + } + if (tca[TCA_RATE]) { +- seqcount_t *running; ++ net_seqlock_t *running; + + err = -EOPNOTSUPP; + if (sch->flags & TCQ_F_MQROOT) { +diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c +index 5d5391adb..8fe206c7b 100644 +--- a/net/sched/sch_generic.c ++++ b/net/sched/sch_generic.c +@@ -578,7 +578,11 @@ struct Qdisc noop_qdisc = { + .ops = &noop_qdisc_ops, + .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), + .dev_queue = &noop_netdev_queue, ++#ifdef CONFIG_PREEMPT_RT ++ .running = __SEQLOCK_UNLOCKED(noop_qdisc.running), ++#else + .running = SEQCNT_ZERO(noop_qdisc.running), ++#endif + .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), + .gso_skb = { + .next = (struct sk_buff *)&noop_qdisc.gso_skb, +@@ -889,9 +893,15 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + lockdep_set_class(&sch->seqlock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + ++#ifdef CONFIG_PREEMPT_RT ++ seqlock_init(&sch->running); ++ lockdep_set_class(&sch->running.lock, ++ dev->qdisc_running_key ?: &qdisc_running_key); ++#else + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); ++#endif + + sch->ops = ops; + sch->flags = ops->static_flags; +diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c +index 362487f3a..5c6c31fc7 100644 +--- a/net/sunrpc/svc_xprt.c ++++ b/net/sunrpc/svc_xprt.c +@@ -422,7 +422,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + return; + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + + atomic_long_inc(&pool->sp_stats.packets); +@@ -446,7 +446,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) + rqstp = NULL; + out_unlock: + rcu_read_unlock(); +- put_cpu(); ++ put_cpu_light(); + trace_svc_xprt_do_enqueue(xprt, rqstp); + } + EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); +diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c +index 77499abd9..7a2840d53 100644 +--- a/net/xfrm/xfrm_state.c ++++ b/net/xfrm/xfrm_state.c +@@ -2663,7 +2663,8 @@ int __net_init xfrm_state_init(struct net *net) + net->xfrm.state_num = 0; + INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); + spin_lock_init(&net->xfrm.xfrm_state_lock); +- seqcount_init(&net->xfrm.xfrm_state_hash_generation); ++ seqcount_spinlock_init(&net->xfrm.xfrm_state_hash_generation, ++ &net->xfrm.xfrm_state_lock); + return 0; + + out_byspi: +-- +2.33.0 + diff --git a/0002-modify-bcm2711_defconfig-for-rt-rpi-kernel.patch b/0002-modify-bcm2711_defconfig-for-rt-rpi-kernel.patch new file mode 100644 index 0000000..ba8ec95 --- /dev/null +++ b/0002-modify-bcm2711_defconfig-for-rt-rpi-kernel.patch @@ -0,0 +1,34 @@ +From 5b8683729aea03fb81092be5817f80365f49eee6 Mon Sep 17 00:00:00 2001 +From: zhangyuanhang +Date: Fri, 29 Apr 2022 16:57:28 +0800 +Subject: [PATCH 2/2] modify bcm2711_defconfig for rt rpi kernel + +Signed-off-by: zhangyuanhang +--- + arch/arm64/configs/bcm2711_defconfig | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/arch/arm64/configs/bcm2711_defconfig b/arch/arm64/configs/bcm2711_defconfig +index 75333e69e..11a7eeb8d 100644 +--- a/arch/arm64/configs/bcm2711_defconfig ++++ b/arch/arm64/configs/bcm2711_defconfig +@@ -6,6 +6,7 @@ CONFIG_GENERIC_IRQ_DEBUGFS=y + CONFIG_NO_HZ=y + 
CONFIG_HIGH_RES_TIMERS=y + CONFIG_PREEMPT=y ++CONFIG_PREEMPT_RT=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_TASK_XACCT=y +@@ -55,7 +56,7 @@ CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + CONFIG_CPUFREQ_DT=y + CONFIG_ARM_RASPBERRYPI_CPUFREQ=y + CONFIG_RASPBERRYPI_FIRMWARE=y +-CONFIG_VIRTUALIZATION=y ++# CONFIG_VIRTUALIZATION is not set + CONFIG_KVM=y + CONFIG_CRYPTO_AES_ARM64_BS=m + CONFIG_JUMP_LABEL=y +-- +2.33.0 + diff --git a/_multibuild b/_multibuild index 7938bc7..0508980 100644 --- a/_multibuild +++ b/_multibuild @@ -1,4 +1,5 @@ raspberrypi-kernel kernel-rt + raspberrypi-kernel-rt diff --git a/raspberrypi-kernel-rt.spec b/raspberrypi-kernel-rt.spec new file mode 100644 index 0000000..66ba982 --- /dev/null +++ b/raspberrypi-kernel-rt.spec @@ -0,0 +1,2784 @@ +%global Arch $(echo %{_host_cpu} | sed -e s/i.86/x86/ -e s/x86_64/x86/ -e s/aarch64.*/arm64/) + +%global KernelVer %{version}-%{release}.raspi.%{_target_cpu} + +%global hulkrelease 99.0.0 + +%global debug_package %{nil} + +Name: raspberrypi-kernel-rt +Version: 5.10.0 +Release: %{hulkrelease}.rt62.8 +Summary: Linux Kernel +License: GPLv2 +URL: http://www.kernel.org/ +Source0: kernel.tar.gz +Patch0000: 0000-raspberrypi-kernel.patch +Patch0001: 0001-apply-preempt-RT-patch.patch +Patch0002: 0002-modify-bcm2711_defconfig-for-rt-rpi-kernel.patch + +BuildRequires: module-init-tools, patch >= 2.5.4, bash >= 2.03, tar +BuildRequires: bzip2, xz, findutils, gzip, m4, perl, make >= 3.78, diffutils, gawk +BuildRequires: gcc >= 3.4.2, binutils >= 2.12 +BuildRequires: hostname, net-tools, bc +BuildRequires: xmlto, asciidoc +BuildRequires: openssl-devel +BuildRequires: hmaccalc +BuildRequires: ncurses-devel +BuildRequires: elfutils-libelf-devel +BuildRequires: rpm >= 4.14.2 +BuildRequires: elfutils-devel zlib-devel binutils-devel newt-devel perl(ExtUtils::Embed) bison +BuildRequires: audit-libs-devel +BuildRequires: pciutils-devel gettext +BuildRequires: rpm-build, elfutils +BuildRequires: numactl-devel python3-devel glibc-static python3-docutils +BuildRequires: perl-generators perl(Carp) libunwind-devel gtk2-devel libbabeltrace-devel java-1.8.0-openjdk +AutoReq: no +AutoProv: yes + +Provides: raspberrypi-kernel-rt-aarch64 = %{version}-%{release} + +ExclusiveArch: aarch64 +ExclusiveOS: Linux + +%description +The Linux Kernel preempt-rt image for RaspberryPi. + +%prep +%setup -q -n kernel-%{version} -c +mv kernel linux-%{version} +cp -a linux-%{version} linux-%{KernelVer} + +cd linux-%{KernelVer} +%patch0000 -p1 +%patch0001 -p1 +%patch0002 -p1 + +find . \( -name "*.orig" -o -name "*~" \) -exec rm -f {} \; >/dev/null +find . 
-name .gitignore -exec rm -f {} \; >/dev/null + +%build +cd linux-%{KernelVer} + +perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = -%{release}.raspi.%{_target_cpu}/" Makefile + +make ARCH=%{Arch} %{?_smp_mflags} bcm2711_defconfig + +make ARCH=%{Arch} %{?_smp_mflags} KERNELRELEASE=%{KernelVer} + +%install +cd linux-%{KernelVer} + +## install linux + +make ARCH=%{Arch} INSTALL_MOD_PATH=$RPM_BUILD_ROOT modules_install KERNELRELEASE=%{KernelVer} +rm -rf $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/source $RPM_BUILD_ROOT/lib/modules/%{KernelVer}/build + +mkdir -p $RPM_BUILD_ROOT/boot +TargetImage=$(make -s image_name) +TargetImage=${TargetImage%.*} +install -m 755 $TargetImage $RPM_BUILD_ROOT/boot/vmlinuz-%{KernelVer} +install -m 644 .config $RPM_BUILD_ROOT/boot/config-%{KernelVer} +install -m 644 System.map $RPM_BUILD_ROOT/boot/System.map-%{KernelVer} + +mkdir -p $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}/overlays +install -m 644 $(find arch/%{Arch}/boot/dts/broadcom/ -name "*.dtb") $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}/ +install -m 644 $(find arch/%{Arch}/boot/dts/overlays/ -name "*.dtbo") $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}/overlays/ +if ls arch/%{Arch}/boot/dts/overlays/*.dtb > /dev/null 2>&1; then + install -m 644 $(find arch/%{Arch}/boot/dts/overlays/ -name "*.dtb") $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}/overlays/ +fi +install -m 644 arch/%{Arch}/boot/dts/overlays/README $RPM_BUILD_ROOT/boot/dtb-%{KernelVer}/overlays/ + +%postun +version_old=0 +if [ "$1" == "0" ]; then + echo "warning: something may go wrong when starting this device next time after uninstalling raspberrypi-kernel-rt." +else + version_tmp=0 + name_len=`echo -n %{name}-|wc -c` + for item in `rpm -qa %{name} 2>/dev/null` + do + cur_version=${item:name_len} + cpu_version=${cur_version##*.} + if [ "$cpu_version" == "%{_target_cpu}" ]; then + cur_version=${cur_version%.*} + cur_version=$cur_version.raspi.$cpu_version + if [[ "$cur_version" != "%{KernelVer}" && "$cur_version" > "$version_tmp" ]]; then + version_tmp=$cur_version + fi + fi + done + if [[ "$version_tmp" < "%{KernelVer}" ]]; then + version_old=$version_tmp + fi +fi +if [ "$version_old" != "0" ]; then + if [ -f /boot/vmlinuz-$version_old ] && [ -d /boot/dtb-$version_old ] && [ -d /lib/modules/$version_old ]; then + ls /boot/dtb-$version_old/overlays/*.dtbo > /dev/null 2>&1 + if [ "$?" == "0" ]; then + ls /boot/dtb-$version_old/*.dtb > /dev/null 2>&1 + if [ "$?" == "0" ]; then + rm -rf /boot/*.dtb /boot/overlays /boot/kernel8.img + mkdir /boot/overlays + install -m 755 /boot/vmlinuz-$version_old /boot/kernel8.img + for file in `ls /boot/dtb-$version_old/*.dtb 2>/dev/null` + do + if [ -f $file ]; then + install -m 644 $file /boot/`basename $file` + fi + done + install -m 644 $(find /boot/dtb-$version_old/overlays/ -name "*.dtbo") /boot/overlays/ + if ls /boot/dtb-$version_old/overlays/*.dtb > /dev/null 2>&1; then + install -m 644 $(find /boot/dtb-$version_old/overlays/ -name "*.dtb") /boot/overlays/ + fi + install -m 644 /boot/dtb-$version_old/overlays/README /boot/overlays/ + else + echo "warning: files in /boot/dtb-$version_old/*.dtb missing when resetting raspberrypi-kernel-rt as $version_old, something may go wrong when starting this device next time." + fi + else + echo "warning: files in /boot/dtb-$version_old/overlays missing when resetting raspberrypi-kernel-rt as $version_old, something may go wrong when starting this device next time." 
+ fi + else + echo "warning: files missing when resetting raspberrypi-kernel-rt as $version_old, something may go wrong when starting this device next time." + fi +fi + +%posttrans +rm -rf /boot/*.dtb /boot/overlays /boot/kernel8.img +mkdir -p /boot/overlays +install -m 755 /boot/vmlinuz-%{KernelVer} /boot/kernel8.img +for file in `ls /boot/dtb-%{KernelVer}/*.dtb 2>/dev/null` +do + if [ -f $file ]; then + install -m 644 $file /boot/`basename $file` + fi +done +install -m 644 $(find /boot/dtb-%{KernelVer}/overlays/ -name "*.dtbo") /boot/overlays/ +if ls /boot/dtb-%{KernelVer}/overlays/*.dtb > /dev/null 2>&1; then + install -m 644 $(find /boot/dtb-%{KernelVer}/overlays/ -name "*.dtb") /boot/overlays/ +fi +install -m 644 /boot/dtb-%{KernelVer}/overlays/README /boot/overlays/ + + +%files +%defattr (-, root, root) +%doc +/boot/config-* +/boot/System.map-* +/boot/vmlinuz-* +/boot/dtb-* +/lib/modules/%{KernelVer} + +%changelog +* Mon Jun 27 2022 zhangyuanhang - 5.10.0-99.0.0.8 +- - update preempt-RT to openEuler 5.10.0-99.0.0 + +* Tue Jun 21 2022 zhangyuanhang - 5.10.0-98.0.0.7 +- - update preempt-RT to openEuler 5.10.0-98.0.0 + +* Mon Jun 6 2022 zhangyuanhang - 5.10.0-95.0.0.6 +- - add preempt-RT to openEuler 5.10.0-95.0.0 + +* Fri Mar 11 2022 Yafen Fang - 5.10.0-52.0.0.5 +- update warning info when uninstall or update raspberrypi-kernel + +* Fri Mar 11 2022 Yafen Fang - 5.10.0-52.0.0.4 +- update kernel version to openEuler 5.10.0-52.0.0 +- update Raspberry Pi patch, last commit (b0272c695e99a8dcc3a01298db56361333f1fdcf): net: phy: lan87xx: Decrease phy polling rate + +* Mon Oct 25 2021 Yafen Fang - 5.10.0-15.0.0.3 +- update kernel version to openEuler 5.10.0-15.0.0 + +* Wed Oct 20 2021 Yafen Fang - 5.10.0-14.0.0.2 +- update Raspberry Pi patch, last commit (03ab8875d1fc756bd6d2fd8fdb211532eff33062): gpio: bcm-virt: Fix the get() method + +* Tue Oct 19 2021 Zheng Zengkai - 5.10.0-14.0.0.1 +- Revert "time: Handle negative seconds correctly in timespec64_to_ns()" +- Revert "posix-cpu-timers: Force next expiration recalc after itimer reset" +- Revert "block: nbd: add sanity check for first_minor" +- Revert "Bluetooth: Move shutdown callback before flushing tx and rx queue" +- clk: kirkwood: Fix a clocking boot regression +- backlight: pwm_bl: Improve bootloader/kernel device handover +- fbmem: don't allow too huge resolutions +- IMA: remove the dependency on CRYPTO_MD5 +- IMA: remove -Wmissing-prototypes warning +- fuse: flush extending writes +- fuse: truncate pagecache on atomic_o_trunc +- ARM: dts: at91: add pinctrl-{names, 0} for all gpios +- KVM: nVMX: Unconditionally clear nested.pi_pending on nested VM-Enter +- KVM: VMX: avoid running vmx_handle_exit_irqoff in case of emulation +- KVM: x86: Update vCPU's hv_clock before back to guest when tsc_offset is adjusted +- KVM: s390: index kvm->arch.idle_mask by vcpu_idx +- Revert "KVM: x86: mmu: Add guest physical address check in translate_gpa()" +- x86/resctrl: Fix a maybe-uninitialized build warning treated as error +- perf/x86/amd/ibs: Extend PERF_PMU_CAP_NO_EXCLUDE to IBS Op +- tty: Fix data race between tiocsti() and flush_to_ldisc() +- bio: fix page leak bio_add_hw_page failure +- io_uring: IORING_OP_WRITE needs hash_reg_file set +- time: Handle negative seconds correctly in timespec64_to_ns() +- f2fs: guarantee to write dirty data when enabling checkpoint back +- iwlwifi Add support for ax201 in Samsung Galaxy Book Flex2 Alpha +- ASoC: rt5682: Remove unused variable in rt5682_i2c_remove() +- ipv4: fix endianness issue in 
inet_rtm_getroute_build_skb() +- octeontx2-af: Set proper errorcode for IPv4 checksum errors +- octeontx2-af: Fix static code analyzer reported issues +- octeontx2-af: Fix loop in free and unmap counter +- net: qualcomm: fix QCA7000 checksum handling +- net: sched: Fix qdisc_rate_table refcount leak when get tcf_block failed +- ipv4: make exception cache less predictible +- ipv6: make exception cache less predictible +- brcmfmac: pcie: fix oops on failure to resume and reprobe +- bcma: Fix memory leak for internally-handled cores +- atlantic: Fix driver resume flow. +- ath6kl: wmi: fix an error code in ath6kl_wmi_sync_point() +- ice: Only lock to update netdev dev_addr +- iwlwifi: skip first element in the WTAS ACPI table +- iwlwifi: follow the new inclusive terminology +- ASoC: wcd9335: Disable irq on slave ports in the remove function +- ASoC: wcd9335: Fix a memory leak in the error handling path of the probe function +- ASoC: wcd9335: Fix a double irq free in the remove function +- tty: serial: fsl_lpuart: fix the wrong mapbase value +- usb: bdc: Fix a resource leak in the error handling path of 'bdc_probe()' +- usb: bdc: Fix an error handling path in 'bdc_probe()' when no suitable DMA config is available +- usb: ehci-orion: Handle errors of clk_prepare_enable() in probe +- i2c: xlp9xx: fix main IRQ check +- i2c: mt65xx: fix IRQ check +- CIFS: Fix a potencially linear read overflow +- bpf: Fix possible out of bound write in narrow load handling +- mmc: moxart: Fix issue with uninitialized dma_slave_config +- mmc: dw_mmc: Fix issue with uninitialized dma_slave_config +- mmc: sdhci: Fix issue with uninitialized dma_slave_config +- ASoC: Intel: Skylake: Fix module resource and format selection +- ASoC: Intel: Skylake: Leave data as is when invoking TLV IPCs +- ASoC: Intel: kbl_da7219_max98927: Fix format selection for max98373 +- rsi: fix an error code in rsi_probe() +- rsi: fix error code in rsi_load_9116_firmware() +- gfs2: init system threads before freeze lock +- i2c: hix5hd2: fix IRQ check +- i2c: fix platform_get_irq.cocci warnings +- i2c: s3c2410: fix IRQ check +- i2c: iop3xx: fix deferred probing +- Bluetooth: add timeout sanity check to hci_inquiry +- lkdtm: replace SCSI_DISPATCH_CMD with SCSI_QUEUE_RQ +- mm/swap: consider max pages in iomap_swapfile_add_extent +- usb: gadget: mv_u3d: request_irq() after initializing UDC +- firmware: raspberrypi: Fix a leak in 'rpi_firmware_get()' +- firmware: raspberrypi: Keep count of all consumers +- i2c: synquacer: fix deferred probing +- clk: staging: correct reference to config IOMEM to config HAS_IOMEM +- arm64: dts: marvell: armada-37xx: Extend PCIe MEM space +- nfsd4: Fix forced-expiry locking +- lockd: Fix invalid lockowner cast after vfs_test_lock +- locking/local_lock: Add missing owner initialization +- locking/lockdep: Mark local_lock_t +- mac80211: Fix insufficient headroom issue for AMSDU +- libbpf: Re-build libbpf.so when libbpf.map changes +- usb: phy: tahvo: add IRQ check +- usb: host: ohci-tmio: add IRQ check +- PM: cpu: Make notifier chain use a raw_spinlock_t +- Bluetooth: Move shutdown callback before flushing tx and rx queue +- samples: pktgen: add missing IPv6 option to pktgen scripts +- devlink: Clear whole devlink_flash_notify struct +- selftests/bpf: Fix test_core_autosize on big-endian machines +- usb: gadget: udc: renesas_usb3: Fix soc_device_match() abuse +- usb: phy: twl6030: add IRQ checks +- usb: phy: fsl-usb: add IRQ check +- usb: gadget: udc: s3c2410: add IRQ check +- usb: gadget: udc: at91: add IRQ check +- 
usb: dwc3: qcom: add IRQ check +- usb: dwc3: meson-g12a: add IRQ check +- ASoC: rt5682: Properly turn off regulators if wrong device ID +- ASoC: rt5682: Implement remove callback +- net/mlx5: Fix unpublish devlink parameters +- net/mlx5: Register to devlink ingress VLAN filter trap +- drm/msm/dsi: Fix some reference counted resource leaks +- Bluetooth: fix repeated calls to sco_sock_kill +- ASoC: Intel: Fix platform ID matching +- cgroup/cpuset: Fix violation of cpuset locking rule +- cgroup/cpuset: Miscellaneous code cleanup +- counter: 104-quad-8: Return error when invalid mode during ceiling_write +- arm64: dts: exynos: correct GIC CPU interfaces address range on Exynos7 +- drm/msm/dpu: make dpu_hw_ctl_clear_all_blendstages clear necessary LMs +- drm/msm/mdp4: move HW revision detection to earlier phase +- drm/msm/mdp4: refactor HW revision detection into read_mdp_hw_revision +- selftests/bpf: Fix bpf-iter-tcp4 test to print correctly the dest IP +- PM: EM: Increase energy calculation precision +- Bluetooth: increase BTNAMSIZ to 21 chars to fix potential buffer overflow +- debugfs: Return error during {full/open}_proxy_open() on rmmod +- soc: qcom: smsm: Fix missed interrupts if state changes while masked +- bpf, samples: Add missing mprog-disable to xdp_redirect_cpu's optstring +- PCI: PM: Enable PME if it can be signaled from D3cold +- PCI: PM: Avoid forcing PCI_D0 for wakeup reasons inconsistently +- media: venus: venc: Fix potential null pointer dereference on pointer fmt +- media: em28xx-input: fix refcount bug in em28xx_usb_disconnect +- leds: trigger: audio: Add an activate callback to ensure the initial brightness is set +- leds: lt3593: Put fwnode in any case during ->probe() +- i2c: highlander: add IRQ check +- net/mlx5: Fix missing return value in mlx5_devlink_eswitch_inline_mode_set() +- devlink: Break parameter notification sequence to be before/after unload/load driver +- arm64: dts: renesas: hihope-rzg2-ex: Add EtherAVB internal rx delay +- arm64: dts: renesas: rzg2: Convert EtherAVB to explicit delay handling +- Bluetooth: mgmt: Fix wrong opcode in the response for add_adv cmd +- net: cipso: fix warnings in netlbl_cipsov4_add_std +- drm: mxsfb: Clear FIFO_CLEAR bit +- drm: mxsfb: Increase number of outstanding requests on V4 and newer HW +- drm: mxsfb: Enable recovery on underflow +- cgroup/cpuset: Fix a partition bug with hotplug +- net/mlx5e: Block LRO if firmware asks for tunneled LRO +- net/mlx5e: Prohibit inner indir TIRs in IPoIB +- ARM: dts: meson8b: ec100: Fix the pwm regulator supply properties +- ARM: dts: meson8b: mxq: Fix the pwm regulator supply properties +- ARM: dts: meson8b: odroidc1: Fix the pwm regulator supply properties +- ARM: dts: meson8: Use a higher default GPU clock frequency +- tcp: seq_file: Avoid skipping sk during tcp_seek_last_pos +- drm/amdgpu/acp: Make PM domain really work +- 6lowpan: iphc: Fix an off-by-one check of array index +- Bluetooth: sco: prevent information leak in sco_conn_defer_accept() +- media: atomisp: fix the uninitialized use and rename "retvalue" +- media: coda: fix frame_mem_ctrl for YUV420 and YVU420 formats +- media: rockchip/rga: fix error handling in probe +- media: rockchip/rga: use pm_runtime_resume_and_get() +- media: go7007: remove redundant initialization +- media: go7007: fix memory leak in go7007_usb_probe +- media: dvb-usb: Fix error handling in dvb_usb_i2c_init +- media: dvb-usb: fix uninit-value in vp702x_read_mac_addr +- media: dvb-usb: fix uninit-value in dvb_usb_adapter_dvb_init +- ionic: cleanly 
release devlink instance +- driver core: Fix error return code in really_probe() +- firmware: fix theoretical UAF race with firmware cache and resume +- gfs2: Fix memory leak of object lsi on error return path +- libbpf: Fix removal of inner map in bpf_object__create_map +- soc: qcom: rpmhpd: Use corner in power_off +- i40e: improve locking of mac_filter_hash +- arm64: dts: renesas: r8a77995: draak: Remove bogus adv7511w properties +- ARM: dts: aspeed-g6: Fix HVI3C function-group in pinctrl dtsi +- libbpf: Fix the possible memory leak on error +- gve: fix the wrong AdminQ buffer overflow check +- drm/of: free the iterator object on failure +- bpf: Fix potential memleak and UAF in the verifier. +- bpf: Fix a typo of reuseport map in bpf.h. +- drm/of: free the right object +- media: cxd2880-spi: Fix an error handling path +- soc: rockchip: ROCKCHIP_GRF should not default to y, unconditionally +- leds: is31fl32xx: Fix missing error code in is31fl32xx_parse_dt() +- media: TDA1997x: enable EDID support +- ASoC: mediatek: mt8183: Fix Unbalanced pm_runtime_enable in mt8183_afe_pcm_dev_probe +- drm/gma500: Fix end of loop tests for list_for_each_entry +- drm/panfrost: Fix missing clk_disable_unprepare() on error in panfrost_clk_init() +- EDAC/i10nm: Fix NVDIMM detection +- spi: spi-zynq-qspi: use wait_for_completion_timeout to make zynq_qspi_exec_mem_op not interruptible +- spi: sprd: Fix the wrong WDG_LOAD_VAL +- regulator: vctrl: Avoid lockdep warning in enable/disable ops +- regulator: vctrl: Use locked regulator_get_voltage in probe path +- blk-crypto: fix check for too-large dun_bytes +- spi: davinci: invoke chipselect callback +- x86/mce: Defer processing of early errors +- tpm: ibmvtpm: Avoid error message when process gets signal while waiting +- certs: Trigger creation of RSA module signing key if it's not an RSA key +- crypto: qat - use proper type for vf_mask +- irqchip/gic-v3: Fix priority comparison when non-secure priorities are used +- spi: coldfire-qspi: Use clk_disable_unprepare in the remove function +- block: nbd: add sanity check for first_minor +- clocksource/drivers/sh_cmt: Fix wrong setting if don't request IRQ for clock source channel +- lib/mpi: use kcalloc in mpi_resize +- irqchip/loongson-pch-pic: Improve edge triggered interrupt support +- genirq/timings: Fix error return code in irq_timings_test_irqs() +- spi: spi-pic32: Fix issue with uninitialized dma_slave_config +- spi: spi-fsl-dspi: Fix issue with uninitialized dma_slave_config +- block: return ELEVATOR_DISCARD_MERGE if possible +- m68k: Fix invalid RMW_INSNS on CPUs that lack CAS +- rcu: Fix stall-warning deadlock due to non-release of rcu_node ->lock +- rcu: Add lockdep_assert_irqs_disabled() to rcu_sched_clock_irq() and callees +- rcu: Fix to include first blocked task in stall warning +- sched: Fix UCLAMP_FLAG_IDLE setting +- sched/numa: Fix is_core_idle() +- m68k: emu: Fix invalid free in nfeth_cleanup() +- power: supply: cw2015: use dev_err_probe to allow deferred probe +- s390/ap: fix state machine hang after failure to enable irq +- s390/debug: fix debug area life cycle +- s390/debug: keep debug data on resize +- s390/pci: fix misleading rc in clp_set_pci_fn() +- s390/kasan: fix large PMD pages address alignment check +- udf_get_extendedattr() had no boundary checks. 
+- fcntl: fix potential deadlock for &fasync_struct.fa_lock +- crypto: qat - do not export adf_iov_putmsg() +- crypto: qat - fix naming for init/shutdown VF to PF notifications +- crypto: qat - fix reuse of completion variable +- crypto: qat - handle both source of interrupt in VF ISR +- crypto: qat - do not ignore errors from enable_vf2pf_comms() +- crypto: omap - Fix inconsistent locking of device lists +- libata: fix ata_host_start() +- s390/zcrypt: fix wrong offset index for APKA master key valid state +- s390/cio: add dev_busid sysfs entry for each subchannel +- power: supply: max17042_battery: fix typo in MAx17042_TOFF +- power: supply: smb347-charger: Add missing pin control activation +- nvmet: pass back cntlid on successful completion +- nvme-rdma: don't update queue count when failing to set io queues +- nvme-tcp: don't update queue count when failing to set io queues +- blk-throtl: optimize IOPS throttle for large IO scenarios +- bcache: add proper error unwinding in bcache_device_init +- isofs: joliet: Fix iocharset=utf8 mount option +- udf: Fix iocharset=utf8 mount option +- udf: Check LVID earlier +- hrtimer: Ensure timerfd notification for HIGHRES=n +- hrtimer: Avoid double reprogramming in __hrtimer_start_range_ns() +- posix-cpu-timers: Force next expiration recalc after itimer reset +- EDAC/mce_amd: Do not load edac_mce_amd module on guests +- rcu/tree: Handle VM stoppage in stall detection +- sched/deadline: Fix missing clock update in migrate_task_rq_dl() +- crypto: omap-sham - clear dma flags only after omap_sham_update_dma_stop() +- power: supply: axp288_fuel_gauge: Report register-address on readb / writeb errors +- sched/deadline: Fix reset_on_fork reporting of DL tasks +- crypto: mxs-dcp - Check for DMA mapping errors +- regulator: tps65910: Silence deferred probe error +- regmap: fix the offset of register error log +- locking/mutex: Fix HANDOFF condition +- PCI: Call Max Payload Size-related fixup quirks early +- x86/reboot: Limit Dell Optiplex 990 quirk to early BIOS versions +- xhci: fix unsafe memory usage in xhci tracing +- xhci: fix even more unsafe memory usage in xhci tracing +- usb: mtu3: fix the wrong HS mult value +- usb: mtu3: use @mult for HS isoc or intr +- usb: mtu3: restore HS function when set SS/SSP +- usb: gadget: tegra-xudc: fix the wrong mult value for HS isoc or intr +- usb: host: xhci-rcar: Don't reload firmware after the completion +- ALSA: usb-audio: Add registration quirk for JBL Quantum 800 +- blk-mq: clearing flush request reference in tags->rqs[] +- netfilter: nftables: clone set element expression template +- netfilter: nf_tables: initialize set before expression setup +- blk-mq: fix is_flush_rq +- blk-mq: fix kernel panic during iterating over flush request +- x86/events/amd/iommu: Fix invalid Perf result due to IOMMU PMC power-gating +- Revert "r8169: avoid link-up interrupt issue on RTL8106e if user enables ASPM" +- tty: drop termiox user definitions +- net: linux/skbuff.h: combine SKB_EXTENSIONS + KCOV handling +- serial: 8250: 8250_omap: Fix unused variable warning +- net: kcov: don't select SKB_EXTENSIONS when there is no NET +- net: ll_temac: Remove left-over debug message +- USB: serial: mos7720: improve OOM-handling in read_mos_reg() +- livepatch: Adapt livepatch-sample for stop_machine model +- livepatch: Add klp_{register,unregister}_patch for stop_machine model +- media: stkwebcam: fix memory leak in stk_camera_probe +- fuse: fix illegal access to inode with reused nodeid +- new helper: inode_wrong_type() +- spi: Switch to 
signed types for *_native_cs SPI controller fields +- ALSA: pcm: fix divide error in snd_pcm_lib_ioctl +- ALSA: hda/realtek: Workaround for conflicting SSID on ASUS ROG Strix G17 +- ALSA: hda/realtek: Quirk for HP Spectre x360 14 amp setup +- cryptoloop: add a deprecation warning +- perf/x86/amd/power: Assign pmu.module +- perf/x86/amd/ibs: Work around erratum #1197 +- ceph: fix possible null-pointer dereference in ceph_mdsmap_decode() +- perf/x86/intel/pt: Fix mask of num_address_ranges +- qede: Fix memset corruption +- net: macb: Add a NULL check on desc_ptp +- qed: Fix the VF msix vectors flow +- reset: reset-zynqmp: Fixed the argument data type +- gpu: ipu-v3: Fix i.MX IPU-v3 offset calculations for (semi)planar U/V formats +- ARM: OMAP1: ams-delta: remove unused function ams_delta_camera_power +- xtensa: fix kconfig unmet dependency warning for HAVE_FUTEX_CMPXCHG +- static_call: Fix unused variable warn w/o MODULE +- Revert "Add a reference to ucounts for each cred" +- Revert "cred: add missing return error code when set_cred_ucounts() failed" +- Revert "ucounts: Increase ucounts reference counter before the security hook" +- ubifs: report correct st_size for encrypted symlinks +- f2fs: report correct st_size for encrypted symlinks +- ext4: report correct st_size for encrypted symlinks +- fscrypt: add fscrypt_symlink_getattr() for computing st_size +- bpf: Fix potentially incorrect results with bpf_get_local_storage() +- audit: move put_tree() to avoid trim_trees refcount underflow and UAF +- net: don't unconditionally copy_from_user a struct ifreq for socket ioctls +- Revert "parisc: Add assembly implementations for memset, strlen, strcpy, strncpy and strcat" +- Revert "floppy: reintroduce O_NDELAY fix" +- arm64: dts: qcom: msm8994-angler: Fix gpio-reserved-ranges 85-88 +- lkdtm: Enable DOUBLE_FAULT on all architectures +- net: dsa: mt7530: fix VLAN traffic leaks again +- usb: typec: ucsi: Clear pending after acking connector change +- usb: typec: ucsi: Work around PPM losing change information +- usb: typec: ucsi: acpi: Always decode connector change information +- tracepoint: Use rcu get state and cond sync for static call updates +- srcu: Provide polling interfaces for Tiny SRCU grace periods +- srcu: Make Tiny SRCU use multi-bit grace-period counter +- srcu: Provide internal interface to start a Tiny SRCU grace period +- srcu: Provide polling interfaces for Tree SRCU grace periods +- srcu: Provide internal interface to start a Tree SRCU grace period +- riscv: Fixup patch_text panic in ftrace +- riscv: Fixup wrong ftrace remove cflag +- Bluetooth: btusb: check conditions before enabling USB ALT 3 for WBS +- tipc: call tipc_wait_for_connect only when dlen is not 0 +- mtd: spinand: Fix incorrect parameters for on-die ECC +- pipe: do FASYNC notifications for every pipe IO, not just state changes +- pipe: avoid unnecessary EPOLLET wakeups under normal loads +- btrfs: fix race between marking inode needs to be logged and log syncing +- net/rds: dma_map_sg is entitled to merge entries +- drm/nouveau/kms/nv50: workaround EFI GOP window channel format differences +- drm/nouveau/disp: power down unused DP links during init +- drm: Copy drm_wait_vblank to user before returning +- blk-mq: don't grab rq's refcount in blk_mq_check_expired() +- drm/amd/pm: change the workload type for some cards +- Revert "drm/amd/pm: fix workload mismatch on vega10" +- qed: Fix null-pointer dereference in qed_rdma_create_qp() +- qed: qed ll2 race condition fixes +- tools/virtio: fix build +- vringh: Use 
wiov->used to check for read/write desc order +- virtio_vdpa: reject invalid vq indices +- virtio_pci: Support surprise removal of virtio pci device +- virtio: Improve vq->broken access to avoid any compiler optimization +- cpufreq: blocklist Qualcomm sm8150 in cpufreq-dt-platdev +- opp: remove WARN when no valid OPPs remain +- iwlwifi: pnvm: accept multiple HW-type TLVs +- clk: renesas: rcar-usb2-clock-sel: Fix kernel NULL pointer dereference +- perf/x86/intel/uncore: Fix integer overflow on 23 bit left shift of a u32 +- dt-bindings: sifive-l2-cache: Fix 'select' matching +- usb: gadget: u_audio: fix race condition on endpoint stop +- drm/i915: Fix syncmap memory leak +- net: stmmac: fix kernel panic due to NULL pointer dereference of plat->est +- net: stmmac: add mutex lock to protect est parameters +- Revert "mmc: sdhci-iproc: Set SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN on BCM2711" +- rtnetlink: Return correct error on changing device netns +- cxgb4: dont touch blocked freelist bitmap after free +- ipv4: use siphash instead of Jenkins in fnhe_hashfun() +- ipv6: use siphash in rt6_exception_hash() +- net/sched: ets: fix crash when flipping from 'strict' to 'quantum' +- ucounts: Increase ucounts reference counter before the security hook +- net: marvell: fix MVNETA_TX_IN_PRGRS bit number +- xgene-v2: Fix a resource leak in the error handling path of 'xge_probe()' +- ip_gre: add validation for csum_start +- RDMA/efa: Free IRQ vectors on error flow +- e1000e: Do not take care about recovery NVM checksum +- e1000e: Fix the max snoop/no-snoop latency for 10M +- igc: Use num_tx_queues when iterating over tx_ring queue +- igc: fix page fault when thunderbolt is unplugged +- net: usb: pegasus: fixes of set_register(s) return value evaluation; +- ice: do not abort devlink info if board identifier can't be found +- RDMA/bnxt_re: Remove unpaired rtnl unlock in bnxt_re_dev_init() +- IB/hfi1: Fix possible null-pointer dereference in _extend_sdma_tx_descs() +- RDMA/bnxt_re: Add missing spin lock initialization +- scsi: core: Fix hang of freezing queue between blocking and running device +- usb: dwc3: gadget: Stop EP0 transfers during pullup disable +- usb: dwc3: gadget: Fix dwc3_calc_trbs_left() +- usb: renesas-xhci: Prefer firmware loading on unknown ROM state +- USB: serial: option: add new VID/PID to support Fibocom FG150 +- Revert "USB: serial: ch341: fix character loss at high transfer rates" +- drm/amdgpu: Cancel delayed work when GFXOFF is disabled +- Revert "btrfs: compression: don't try to compress if we don't have enough pages" +- riscv: Ensure the value of FP registers in the core dump file is up to date +- ceph: correctly handle releasing an embedded cap flush +- can: usb: esd_usb2: esd_usb2_rx_event(): fix the interchange of the CAN RX and TX error counters +- net: mscc: Fix non-GPL export of regmap APIs +- ovl: fix uninitialized pointer read in ovl_lookup_real_one() +- blk-iocost: fix lockdep warning on blkcg->lock +- netfilter: conntrack: collect all entries in one cycle +- ARC: Fix CONFIG_STACKDEPOT +- ASoC: component: Remove misplaced prefix handling in pin control functions +- ASoC: rt5682: Adjust headset volume button threshold +- bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper +- bpf: Fix ringbuf helper function compatibility +- ARM: spectre-v2: turn off the mitigation via boot cmdline param +- ext4: fix potential uninitialized access to retval in kmmpd +- take LOOKUP_{ROOT,ROOT_GRABBED,JUMPED} out of LOOKUP_... 
space +- switch file_open_root() to struct path +- kyber: introduce kyber_depth_updated() +- perf annotate: Add itrace options support +- mm: Fix the uninitialized use in overcommit_policy_handler +- memcg: enable accounting for ldt_struct objects +- memcg: enable accounting for posix_timers_cache slab +- memcg: enable accounting for signals +- memcg: enable accounting for new namesapces and struct nsproxy +- memcg: enable accounting for fasync_cache +- memcg: enable accounting for mnt_cache entries +- memcg: charge fs_context and legacy_fs_context +- memcg: enable accounting for pids in nested pid namespaces +- blk-mq: fix divide by zero crash in tg_may_dispatch() +- ext4: prevent getting empty inode buffer +- ext4: move ext4_fill_raw_inode() related functions +- ext4: factor out ext4_fill_raw_inode() +- ext4: make the updating inode data procedure atomic +- ext4: move inode eio simulation behind io completeion +- sched: Aware multi-core system for optimize loadtracking +- livepatch: Fix compile warnning +- md: revert io stats accounting +- sched/idle: Reported an error when an illegal negative value is passed +- sched/idle: Optimize the loop time algorithm to reduce multicore disturb +- serial: 8250: 8250_omap: Fix possible array out of bounds access +- once: Fix panic when module unload +- ext4: wipe ext4_dir_entry2 upon file deletion +- livepatch: move arch_klp_mem_recycle after the return value judgment +- livepatch/x86: only check stack top +- livepatch/ppc64: only check stack top +- livepatch/ppc32: only check stack top +- livepatch/arm: only check stack top +- livepatch/arm64: only check stack top +- livepatch: checks only if the replaced instruction is on the stack +- livepatch: Add state describe for force +- blk-mq: clear active_queues before clearing BLK_MQ_F_TAG_QUEUE_SHARED +- sysctl: Refactor IAS framework +- io_uring: ensure symmetry in handling iter types in loop_rw_iter() +- ext4: fix race writing to an inline_data file while its xattrs are changing +- memcg: enable accounting of ipc resources +- vt_kdsetmode: extend console locking +- net: qrtr: fix another OOB Read in qrtr_endpoint_post +- btrfs: fix NULL pointer dereference when deleting device by invalid id +- acpi: acpica: fix acpi parse and parseext cache leaks +- acpi: acpica: fix acpi operand cache leak in dsutils.c +- sctp: add param size validation for SCTP_PARAM_SET_PRIMARY +- sctp: validate chunk size in __rcv_asconf_lookup +- ARM: footbridge: remove personal server platform +- hfs: fix null-ptr-deref in hfs_find_init() +- io_uring: only assign io_uring_enter() SQPOLL error in actual error case +- io_uring: fix xa_alloc_cycle() error return value check +- fs: warn about impending deprecation of mandatory locks +- mm: memcontrol: fix occasional OOMs due to proportional memory.low reclaim +- ASoC: intel: atom: Fix breakage for PCM buffer address setup +- ALSA: hda/realtek: Limit mic boost on HP ProBook 445 G8 +- PCI: Increase D3 delay for AMD Renoir/Cezanne XHCI +- s390/pci: fix use after free of zpci_dev +- ALSA: hda/via: Apply runtime PM workaround for ASUS B23E +- btrfs: prevent rename2 from exchanging a subvol with a directory from different parents +- mmc: sdhci-iproc: Set SDHCI_QUIRK_CAP_CLOCK_BASE_BROKEN on BCM2711 +- mmc: sdhci-iproc: Cap min clock frequency on BCM2711 +- ALSA: hda/realtek: Enable 4-speaker output for Dell XPS 15 9510 laptop +- ipack: tpci200: fix memory leak in the tpci200_register +- ipack: tpci200: fix many double free issues in tpci200_pci_probe +- slimbus: ngd: reset dma setup 
during runtime pm +- slimbus: messaging: check for valid transaction id +- slimbus: messaging: start transaction ids from 1 instead of zero +- tracing / histogram: Fix NULL pointer dereference on strcmp() on NULL event name +- ALSA: hda - fix the 'Capture Switch' value change notifications +- clk: qcom: gdsc: Ensure regulator init state matches GDSC state +- clk: imx6q: fix uart earlycon unwork +- mmc: sdhci-msm: Update the software timeout value for sdhc +- mmc: mmci: stm32: Check when the voltage switch procedure should be done +- mmc: dw_mmc: Fix hang on data CRC error +- Revert "flow_offload: action should not be NULL when it is referenced" +- iavf: Fix ping is lost after untrusted VF had tried to change MAC +- i40e: Fix ATR queue selection +- r8152: fix writing USB_BP2_EN +- iommu/vt-d: Fix incomplete cache flush in intel_pasid_tear_down_entry() +- iommu/vt-d: Consolidate duplicate cache invaliation code +- ovs: clear skb->tstamp in forwarding path +- net: mdio-mux: Handle -EPROBE_DEFER correctly +- net: mdio-mux: Don't ignore memory allocation errors +- sch_cake: fix srchost/dsthost hashing mode +- ixgbe, xsk: clean up the resources in ixgbe_xsk_pool_enable error path +- net: qlcnic: add missed unlock in qlcnic_83xx_flash_read32 +- virtio-net: use NETIF_F_GRO_HW instead of NETIF_F_LRO +- virtio-net: support XDP when not more queues +- vrf: Reset skb conntrack connection on VRF rcv +- bnxt_en: Add missing DMA memory barriers +- bnxt_en: Disable aRFS if running on 212 firmware +- ptp_pch: Restore dependency on PCI +- net: 6pack: fix slab-out-of-bounds in decode_data +- bnxt: count Tx drops +- bnxt: make sure xmit_more + errors does not miss doorbells +- bnxt: disable napi before canceling DIM +- bnxt: don't lock the tx queue from napi poll +- bpf: Clear zext_dst of dead insns +- drm/mediatek: Add AAL output size configuration +- drm/mediatek: Fix aal size config +- soc / drm: mediatek: Move DDP component defines into mtk-mmsys.h +- vdpa/mlx5: Avoid destroying MR on empty iotlb +- vhost: Fix the calculation in vhost_overflow() +- bus: ti-sysc: Fix error handling for sysc_check_active_timer() +- vhost-vdpa: Fix integer overflow in vhost_vdpa_process_iotlb_update() +- virtio: Protect vqs list access +- dccp: add do-while-0 stubs for dccp_pr_debug macros +- cpufreq: armada-37xx: forbid cpufreq for 1.2 GHz variant +- iommu: Check if group is NULL before remove device +- arm64: dts: qcom: msm8992-bullhead: Remove PSCI +- arm64: dts: qcom: c630: fix correct powerdown pin for WSA881x +- Bluetooth: hidp: use correct wait queue when removing ctrl_wait +- drm/amd/display: workaround for hard hang on HPD on native DP +- drm/amd/display: Fix Dynamic bpp issue with 8K30 with Navi 1X +- net: usb: lan78xx: don't modify phy_device state concurrently +- net: usb: pegasus: Check the return value of get_geristers() and friends; +- ARM: dts: nomadik: Fix up interrupt controller node names +- qede: fix crash in rmmod qede while automatic debug collection +- drm/amdgpu: fix the doorbell missing when in CGPG issue for renoir. 
+- scsi: core: Fix capacity set to zero after offlinining device +- scsi: core: Avoid printing an error if target_alloc() returns -ENXIO +- scsi: scsi_dh_rdac: Avoid crash during rdac_bus_attach() +- scsi: megaraid_mm: Fix end of loop tests for list_for_each_entry() +- scsi: pm80xx: Fix TMF task completion race condition +- dmaengine: of-dma: router_xlate to return -EPROBE_DEFER if controller is not yet available +- ARM: dts: am43x-epos-evm: Reduce i2c0 bus speed for tps65218 +- net: xfrm: Fix end of loop tests for list_for_each_entry +- spi: spi-mux: Add module info needed for autoloading +- dmaengine: usb-dmac: Fix PM reference leak in usb_dmac_probe() +- dmaengine: xilinx_dma: Fix read-after-free bug when terminating transfers +- USB: core: Fix incorrect pipe calculation in do_proc_control() +- USB: core: Avoid WARNings for 0-length descriptor requests +- KVM: X86: Fix warning caused by stale emulation context +- KVM: x86: Factor out x86 instruction emulation with decoding +- media: drivers/media/usb: fix memory leak in zr364xx_probe +- media: zr364xx: fix memory leaks in probe() +- media: zr364xx: propagate errors from zr364xx_start_readpipe() +- mtd: cfi_cmdset_0002: fix crash when erasing/writing AMD cards +- ath9k: Postpone key cache entry deletion for TXQ frames reference it +- ath: Modify ath_key_delete() to not need full key entry +- ath: Export ath_hw_keysetmac() +- ath9k: Clear key cache explicitly on disabling hardware +- ath: Use safer key clearing with key cache entries +- net: dsa: microchip: ksz8795: Use software untagging on CPU port +- net: dsa: microchip: ksz8795: Fix VLAN untagged flag change on deletion +- net: dsa: microchip: ksz8795: Reject unsupported VLAN configuration +- net: dsa: microchip: ksz8795: Fix PVID tag insertion +- net: dsa: microchip: Fix probing KSZ87xx switch with DT node for host port +- KVM: nSVM: always intercept VMLOAD/VMSAVE when nested (CVE-2021-3656) +- KVM: nSVM: avoid picking up unsupported bits from L2 in int_ctl (CVE-2021-3653) +- vmlinux.lds.h: Handle clang's module.{c,d}tor sections +- ceph: take snap_empty_lock atomically with snaprealm refcount change +- ceph: clean up locking annotation for ceph_get_snap_realm and __lookup_snap_realm +- ceph: add some lockdep assertions around snaprealm handling +- vboxsf: Add support for the atomic_open directory-inode op +- vboxsf: Add vboxsf_[create|release]_sf_handle() helpers +- KVM: nVMX: Use vmx_need_pf_intercept() when deciding if L0 wants a #PF +- KVM: VMX: Use current VMCS to query WAITPKG support for MSR emulation +- efi/libstub: arm64: Double check image alignment at entry +- powerpc/smp: Fix OOPS in topology_init() +- PCI/MSI: Protect msi_desc::masked for multi-MSI +- PCI/MSI: Use msi_mask_irq() in pci_msi_shutdown() +- PCI/MSI: Correct misleading comments +- PCI/MSI: Do not set invalid bits in MSI mask +- PCI/MSI: Enforce MSI[X] entry updates to be visible +- PCI/MSI: Enforce that MSI-X table entry is masked for update +- PCI/MSI: Mask all unused MSI-X entries +- PCI/MSI: Enable and mask MSI-X early +- genirq/timings: Prevent potential array overflow in __irq_timings_store() +- genirq/msi: Ensure deactivation on teardown +- x86/resctrl: Fix default monitoring groups reporting +- x86/ioapic: Force affinity setup before startup +- x86/msi: Force affinity setup before startup +- genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP +- x86/tools: Fix objdump version check again +- efi/libstub: arm64: Relax 2M alignment again for relocatable kernels +- efi/libstub: arm64: Force Image reallocation if 
BSS was not reserved +- arm64: efi: kaslr: Fix occasional random alloc (and boot) failure +- nbd: Aovid double completion of a request +- vsock/virtio: avoid potential deadlock when vsock device remove +- xen/events: Fix race in set_evtchn_to_irq +- drm/i915: Only access SFC_DONE when media domain is not fused off +- net: igmp: increase size of mr_ifc_count +- tcp_bbr: fix u32 wrap bug in round logic if bbr_init() called after 2B packets +- net: linkwatch: fix failure to restore device state across suspend/resume +- net: bridge: fix memleak in br_add_if() +- net: bridge: fix flags interpretation for extern learn fdb entries +- net: bridge: validate the NUD_PERMANENT bit when adding an extern_learn FDB entry +- net: dsa: sja1105: fix broken backpressure in .port_fdb_dump +- net: dsa: lantiq: fix broken backpressure in .port_fdb_dump +- net: dsa: lan9303: fix broken backpressure in .port_fdb_dump +- net: igmp: fix data-race in igmp_ifc_timer_expire() +- net: Fix memory leak in ieee802154_raw_deliver +- net: dsa: microchip: ksz8795: Fix VLAN filtering +- net: dsa: microchip: Fix ksz_read64() +- drm/meson: fix colour distortion from HDR set during vendor u-boot +- net/mlx5: Fix return value from tracer initialization +- net/mlx5: Synchronize correct IRQ when destroying CQ +- bareudp: Fix invalid read beyond skb's linear data +- psample: Add a fwd declaration for skbuff +- iavf: Set RSS LUT and key in reset handle path +- ice: don't remove netdev->dev_addr from uc sync list +- ice: Prevent probing virtual functions +- net: sched: act_mirred: Reset ct info when mirror/redirect skb +- net/smc: fix wait on already cleared link +- ppp: Fix generating ifname when empty IFLA_IFNAME is specified +- net: phy: micrel: Fix link detection on ksz87xx switch" +- bpf: Fix integer overflow involving bucket_size +- libbpf: Fix probe for BPF_PROG_TYPE_CGROUP_SOCKOPT +- platform/x86: pcengines-apuv2: Add missing terminating entries to gpio-lookup tables +- net: mvvp2: fix short frame size on s390 +- net: dsa: mt7530: add the missing RxUnicast MIB counter +- ASoC: cs42l42: Fix LRCLK frame start edge +- pinctrl: tigerlake: Fix GPIO mapping for newer version of software +- netfilter: nf_conntrack_bridge: Fix memory leak when error +- ASoC: cs42l42: Remove duplicate control for WNF filter frequency +- ASoC: cs42l42: Fix inversion of ADC Notch Switch control +- ASoC: SOF: Intel: hda-ipc: fix reply size checking +- ASoC: cs42l42: Don't allow SND_SOC_DAIFMT_LEFT_J +- ASoC: cs42l42: Correct definition of ADC Volume control +- pinctrl: mediatek: Fix fallback behavior for bias_set_combo +- ieee802154: hwsim: fix GPF in hwsim_new_edge_nl +- ieee802154: hwsim: fix GPF in hwsim_set_edge_lqi +- drm/amdgpu: don't enable baco on boco platforms in runpm +- drm/amd/display: use GFP_ATOMIC in amdgpu_dm_irq_schedule_work +- drm/amd/display: Remove invalid assert for ODM + MPC case +- libnvdimm/region: Fix label activation vs errors +- ACPI: NFIT: Fix support for virtual SPA ranges +- ceph: reduce contention in ceph_check_delayed_caps() +- ARC: fp: set FPU_STATUS.FWE to enable FPU_STATUS update on context switch +- net: ethernet: ti: cpsw: fix min eth packet size for non-switch use-cases +- seccomp: Fix setting loaded filter count during TSYNC +- scsi: lpfc: Move initialization of phba->poll_list earlier to avoid crash +- cifs: create sd context must be a multiple of 8 +- i2c: dev: zero out array used for i2c reads from userspace +- ASoC: intel: atom: Fix reference to PCM buffer address +- ASoC: tlv320aic31xx: Fix jack detection 
after suspend +- ASoC: uniphier: Fix reference to PCM buffer address +- ASoC: xilinx: Fix reference to PCM buffer address +- ASoC: amd: Fix reference to PCM buffer address +- iio: adc: Fix incorrect exit of for-loop +- iio: humidity: hdc100x: Add margin to the conversion time +- iio: adis: set GPIO reset pin direction +- iio: adc: ti-ads7950: Ensure CS is deasserted after reading channels +- net: xilinx_emaclite: Do not print real IOMEM pointer +- ovl: prevent private clone if bind mount is not allowed +- ppp: Fix generating ppp unit id when ifname is not specified +- ALSA: hda: Add quirk for ASUS Flow x13 +- ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 650 G8 Notebook PC +- ALSA: pcm: Fix mmap breakage without explicit buffer setup +- USB:ehci:fix Kunpeng920 ehci hardware problem +- vboxsf: Make vboxsf_dir_create() return the handle for the created file +- vboxsf: Honor excl flag to the dir-inode create op +- arm64: dts: renesas: beacon: Fix USB ref clock references +- arm64: dts: renesas: beacon: Fix USB extal reference +- arm64: dts: renesas: rzg2: Add usb2_clksel to RZ/G2 M/N/H +- mm: make zone_to_nid() and zone_set_nid() available for DISCONTIGMEM +- Revert "selftests/resctrl: Use resctrl/info for feature detection" +- bpf: Add lockdown check for probe_write_user helper +- firmware: tee_bnxt: Release TEE shm, session, and context during kexec +- tee: Correct inappropriate usage of TEE_SHM_DMA_BUF flag +- KVM: SVM: Fix off-by-one indexing when nullifying last used SEV VMCB +- sched: Add menuconfig option for CONFIG_SCHED_OPTIMIZE_LOAD_TRACKING +- sched/rt: Fix double enqueue caused by rt_effective_prio +- Revert "sched/rt: Fix double enqueue caused by rt_effective_prio" +- drm/amdgpu/display: only enable aux backlight control for OLED panels +- smb3: rc uninitialized in one fallocate path +- net/qla3xxx: fix schedule while atomic in ql_wait_for_drvr_lock and ql_adapter_reset +- alpha: Send stop IPI to send to online CPUs +- net: qede: Fix end of loop tests for list_for_each_entry +- virt_wifi: fix error on connect +- reiserfs: check directory items on read from disk +- reiserfs: add check for root_inode in reiserfs_fill_super +- libata: fix ata_pio_sector for CONFIG_HIGHMEM +- drm/i915: avoid uninitialised var in eb_parse() +- sched/rt: Fix double enqueue caused by rt_effective_prio +- perf/x86/amd: Don't touch the AMD64_EVENTSEL_HOSTONLY bit inside the guest +- soc: ixp4xx/qmgr: fix invalid __iomem access +- drm/i915: Correct SFC_DONE register offset +- interconnect: qcom: icc-rpmh: Ensure floor BW is enforced for all nodes +- interconnect: Always call pre_aggregate before aggregate +- interconnect: Zero initial BW after sync-state +- spi: meson-spicc: fix memory leak in meson_spicc_remove +- interconnect: Fix undersized devress_alloc allocation +- soc: ixp4xx: fix printing resources +- arm64: vdso: Avoid ISB after reading from cntvct_el0 +- KVM: x86/mmu: Fix per-cpu counter corruption on 32-bit builds +- KVM: Do not leak memory for duplicate debugfs directories +- KVM: x86: accept userspace interrupt only if no event is injected +- md/raid10: properly indicate failure when ending a failed write request +- ARM: omap2+: hwmod: fix potential NULL pointer access +- Revert "gpio: mpc8xxx: change the gpio interrupt flags." 
+- bus: ti-sysc: AM3: RNG is GP only +- selinux: correct the return value when loads initial sids +- pcmcia: i82092: fix a null pointer dereference bug +- net/xfrm/compat: Copy xfrm_spdattr_type_t atributes +- xfrm: Fix RCU vs hash_resize_mutex lock inversion +- timers: Move clearing of base::timer_running under base:: Lock +- fpga: dfl: fme: Fix cpu hotplug issue in performance reporting +- serial: 8250_pci: Avoid irq sharing for MSI(-X) interrupts. +- serial: 8250_pci: Enumerate Elkhart Lake UARTs via dedicated driver +- MIPS: Malta: Do not byte-swap accesses to the CBUS UART +- serial: 8250: Mask out floating 16/32-bit bus bits +- serial: 8250_mtk: fix uart corruption issue when rx power off +- serial: tegra: Only print FIFO error message when an error occurs +- ext4: fix potential htree corruption when growing large_dir directories +- pipe: increase minimum default pipe size to 2 pages +- media: rtl28xxu: fix zero-length control request +- drivers core: Fix oops when driver probe fails +- staging: rtl8712: error handling refactoring +- staging: rtl8712: get rid of flush_scheduled_work +- staging: rtl8723bs: Fix a resource leak in sd_int_dpc +- tpm_ftpm_tee: Free and unregister TEE shared memory during kexec +- optee: fix tee out of memory failure seen during kexec reboot +- optee: Refuse to load the driver under the kdump kernel +- optee: Fix memory leak when failing to register shm pages +- tee: add tee_shm_alloc_kernel_buf() +- optee: Clear stale cache entries during initialization +- arm64: stacktrace: avoid tracing arch_stack_walk() +- tracepoint: Fix static call function vs data state mismatch +- tracepoint: static call: Compare data on transition from 2->1 callees +- tracing: Fix NULL pointer dereference in start_creating +- tracing: Reject string operand in the histogram expression +- tracing / histogram: Give calculation hist_fields a size +- scripts/tracing: fix the bug that can't parse raw_trace_func +- clk: fix leak on devm_clk_bulk_get_all() unwind +- usb: otg-fsm: Fix hrtimer list corruption +- usb: typec: tcpm: Keep other events when receiving FRS and Sourcing_vbus events +- usb: host: ohci-at91: suspend/resume ports after/before OHCI accesses +- usb: gadget: f_hid: idle uses the highest byte for duration +- usb: gadget: f_hid: fixed NULL pointer dereference +- usb: gadget: f_hid: added GET_IDLE and SET_IDLE handlers +- usb: cdns3: Fixed incorrect gadget state +- usb: gadget: remove leaked entry from udc driver list +- usb: dwc3: gadget: Avoid runtime resume if disabling pullup +- ALSA: usb-audio: Add registration quirk for JBL Quantum 600 +- ALSA: usb-audio: Fix superfluous autosuspend recovery +- ALSA: hda/realtek: Fix headset mic for Acer SWIFT SF314-56 (ALC256) +- ALSA: hda/realtek: add mic quirk for Acer SF314-42 +- ALSA: pcm - fix mmap capability check for the snd-dummy driver +- drm/amdgpu/display: fix DMUB firmware version info +- firmware_loader: fix use-after-free in firmware_fallback_sysfs +- firmware_loader: use -ETIMEDOUT instead of -EAGAIN in fw_load_sysfs_fallback +- USB: serial: ftdi_sio: add device ID for Auto-M3 OP-COM v2 +- USB: serial: ch341: fix character loss at high transfer rates +- USB: serial: option: add Telit FD980 composition 0x1056 +- USB: usbtmc: Fix RCU stall warning +- Bluetooth: defer cleanup of resources in hci_unregister_dev() +- blk-iolatency: error out if blk_get_queue() failed in iolatency_set_limit() +- net: vxge: fix use-after-free in vxge_device_unregister +- net: fec: fix use-after-free in fec_drv_remove +- net: pegasus: fix 
uninit-value in get_interrupt_interval +- bnx2x: fix an error code in bnx2x_nic_load() +- mips: Fix non-POSIX regexp +- MIPS: check return value of pgtable_pmd_page_ctor +- net: sched: fix lockdep_set_class() typo error for sch->seqlock +- net: dsa: qca: ar9331: reorder MDIO write sequence +- net: ipv6: fix returned variable type in ip6_skb_dst_mtu +- nfp: update ethtool reporting of pauseframe control +- sctp: move the active_key update after sh_keys is added +- RDMA/mlx5: Delay emptying a cache entry when a new MR is added to it recently +- gpio: tqmx86: really make IRQ optional +- net: natsemi: Fix missing pci_disable_device() in probe and remove +- net: phy: micrel: Fix detection of ksz87xx switch +- net: dsa: sja1105: match FDB entries regardless of inner/outer VLAN tag +- net: dsa: sja1105: be stateless with FDB entries on SJA1105P/Q/R/S/SJA1110 too +- net: dsa: sja1105: invalidate dynamic FDB entries learned concurrently with statically added ones +- net: dsa: sja1105: overwrite dynamic FDB entries with static ones in .port_fdb_add +- net, gro: Set inner transport header offset in tcp/udp GRO hook +- dmaengine: imx-dma: configure the generic DMA type to make it work +- ARM: dts: stm32: Fix touchscreen IRQ line assignment on DHCOM +- ARM: dts: stm32: Disable LAN8710 EDPD on DHCOM +- media: videobuf2-core: dequeue if start_streaming fails +- scsi: sr: Return correct event when media event code is 3 +- spi: imx: mx51-ecspi: Fix low-speed CONFIGREG delay calculation +- spi: imx: mx51-ecspi: Reinstate low-speed CONFIGREG delay +- dmaengine: stm32-dmamux: Fix PM usage counter unbalance in stm32 dmamux ops +- dmaengine: stm32-dma: Fix PM usage counter imbalance in stm32 dma ops +- clk: tegra: Implement disable_unused() of tegra_clk_sdmmc_mux_ops +- dmaengine: uniphier-xdmac: Use readl_poll_timeout_atomic() in atomic state +- omap5-board-common: remove not physically existing vdds_1v8_main fixed-regulator +- ARM: dts: am437x-l4: fix typo in can@0 node +- clk: stm32f4: fix post divisor setup for I2S/SAI PLLs +- ALSA: usb-audio: fix incorrect clock source setting +- arm64: dts: armada-3720-turris-mox: remove mrvl,i2c-fast-mode +- arm64: dts: armada-3720-turris-mox: fixed indices for the SDHC controllers +- ARM: dts: imx: Swap M53Menlo pinctrl_power_button/pinctrl_power_out pins +- ARM: imx: fix missing 3rd argument in macro imx_mmdc_perf_init +- ARM: dts: colibri-imx6ull: limit SDIO clock to 25MHz +- arm64: dts: ls1028: sl28: fix networking for variant 2 +- ARM: dts: imx6qdl-sr-som: Increase the PHY reset duration to 10ms +- ARM: imx: add missing clk_disable_unprepare() +- ARM: imx: add missing iounmap() +- arm64: dts: ls1028a: fix node name for the sysclk +- net: xfrm: fix memory leak in xfrm_user_rcv_msg +- bus: ti-sysc: Fix gpt12 system timer issue with reserved status +- ALSA: seq: Fix racy deletion of subscriber +- Revert "ACPICA: Fix memory leak caused by _CID repair function" +- sched/idle: Add IAS_SMART_HALT_POLL config for smart halt polling feature +- sched/idle: introduce smart halt polling +- arm: Optimize ttwu IPI +- kthread: Fix PF_KTHREAD vs to_kthread() race +- mtd: mtdconcat: Check _read,_write callbacks existence before assignment +- mtd: mtdconcat: Judge callback existence based on the master +- lib: use PFN_PHYS() in devmem_is_allowed() +- arm64: fix compat syscall return truncation +- blk: reuse lookup_sem to serialize partition operations +- Revert "[Backport] block: take bd_mutex around delete_partitions in del_gendisk" +- Revert "[Huawei] block: avoid creating invalid 
symlink file for patitions" +- block: ensure the memory order between bi_private and bi_status +- amba-pl011: Fix no irq issue due to no IRQ domain found +- arm64: seccomp: fix the incorrect name of syscall __NR_compat_exit in secure computing mode +- seqlock: avoid -Wshadow warnings +- asm-generic: fix ffs -Wshadow warning +- spi: mediatek: Fix fifo transfer +- selftest/bpf: Verifier tests for var-off access +- bpf, selftests: Adjust few selftest outcomes wrt unreachable code +- bpf: Update selftests to reflect new error states +- bpf, selftests: Adjust few selftest result_unpriv outcomes +- selftest/bpf: Adjust expected verifier errors +- selftests/bpf: Add a test for ptr_to_map_value on stack for helper access +- Revert "watchdog: iTCO_wdt: Account for rebooting on second timeout" +- firmware: arm_scmi: Add delayed response status check +- firmware: arm_scmi: Ensure drivers provide a probe function +- Revert "Bluetooth: Shutdown controller after workqueues are flushed or cancelled" +- ACPI: fix NULL pointer dereference +- drm/amd/display: Fix max vstartup calculation for modes with borders +- drm/amd/display: Fix comparison error in dcn21 DML +- nvme: fix nvme_setup_command metadata trace event +- efi/mokvar: Reserve the table only if it is in boot services data +- ASoC: ti: j721e-evm: Check for not initialized parent_clk_id +- ASoC: ti: j721e-evm: Fix unbalanced domain activity tracking during startup +- net: Fix zero-copy head len calculation. +- ASoC: rt5682: Fix the issue of garbled recording after powerd_dbus_suspend +- qed: fix possible unpaired spin_{un}lock_bh in _qed_mcp_cmd_and_union() +- r8152: Fix potential PM refcount imbalance +- ASoC: tlv320aic31xx: fix reversed bclk/wclk master bits +- spi: stm32h7: fix full duplex irq handler handling +- regulator: rt5033: Fix n_voltages settings for BUCK and LDO +- regulator: rtmv20: Fix wrong mask for strobe-polarity-high +- btrfs: fix lost inode on log replay after mix of fsync, rename and inode eviction +- btrfs: fix race causing unnecessary inode logging during link and rename +- Revert "drm/i915: Propagate errors on awaiting already signaled fences" +- drm/i915: Revert "drm/i915/gem: Asynchronous cmdparser" +- powerpc/kprobes: Fix kprobe Oops happens in booke +- sched: Fix branch prediction error in static_key +- sched: Access control for sysctl_update_load_latency +- mm,hwpoison: return -EHWPOISON to denote that the page has already been poisoned +- mm/memory-failure: use a mutex to avoid memory_failure() races +- can: j1939: j1939_session_deactivate(): clarify lifetime of session object +- i40e: Add additional info to PHY type error +- Revert "perf map: Fix dso->nsinfo refcounting" +- powerpc/pseries: Fix regression while building external modules +- SMB3: fix readpage for large swap cache +- bpf: Fix pointer arithmetic mask tightening under state pruning +- bpf: verifier: Allocate idmap scratch in verifier env +- bpf: Remove superfluous aux sanitation on subprog rejection +- bpf: Fix leakage due to insufficient speculative store bypass mitigation +- bpf: Introduce BPF nospec instruction for mitigating Spectre v4 +- can: hi311x: fix a signedness bug in hi3110_cmd() +- sis900: Fix missing pci_disable_device() in probe and remove +- tulip: windbond-840: Fix missing pci_disable_device() in probe and remove +- sctp: fix return value check in __sctp_rcv_asconf_lookup +- net/mlx5e: Fix nullptr in mlx5e_hairpin_get_mdev() +- net/mlx5: Fix flow table chaining +- skmsg: Make sk_psock_destroy() static +- drm/msm/dp: Initialize the 
INTF_CONFIG register +- drm/msm/dpu: Fix sm8250_mdp register length +- net: llc: fix skb_over_panic +- KVM: x86: Check the right feature bit for MSR_KVM_ASYNC_PF_ACK access +- mlx4: Fix missing error code in mlx4_load_one() +- octeontx2-pf: Fix interface down flag on error +- tipc: do not write skb_shinfo frags when doing decrytion +- ionic: count csum_none when offload enabled +- ionic: fix up dim accounting for tx and rx +- ionic: remove intr coalesce update from napi +- net: qrtr: fix memory leaks +- net: Set true network header for ECN decapsulation +- tipc: fix sleeping in tipc accept routine +- tipc: fix implicit-connect for SYN+ +- i40e: Fix log TC creation failure when max num of queues is exceeded +- i40e: Fix queue-to-TC mapping on Tx +- i40e: Fix firmware LLDP agent related warning +- i40e: Fix logic of disabling queues +- netfilter: nft_nat: allow to specify layer 4 protocol NAT only +- netfilter: conntrack: adjust stop timestamp to real expiry value +- mac80211: fix enabling 4-address mode on a sta vif after assoc +- bpf: Fix OOB read when printing XDP link fdinfo +- RDMA/bnxt_re: Fix stats counters +- cfg80211: Fix possible memory leak in function cfg80211_bss_update +- nfc: nfcsim: fix use after free during module unload +- blk-iocost: fix operation ordering in iocg_wake_fn() +- drm/amdgpu: Fix resource leak on probe error path +- drm/amdgpu: Avoid printing of stack contents on firmware load error +- drm/amd/display: ensure dentist display clock update finished in DCN20 +- NIU: fix incorrect error return, missed in previous revert +- HID: wacom: Re-enable touch by default for Cintiq 24HDT / 27QHDT +- alpha: register early reserved memory in memblock +- can: esd_usb2: fix memory leak +- can: ems_usb: fix memory leak +- can: usb_8dev: fix memory leak +- can: mcba_usb_start(): add missing urb->transfer_dma initialization +- can: peak_usb: pcan_usb_handle_bus_evt(): fix reading rxerr/txerr values +- can: raw: raw_setsockopt(): fix raw_rcv panic for sock UAF +- can: j1939: j1939_xtp_rx_dat_one(): fix rxtimer value between consecutive TP.DT to 750ms +- ocfs2: issue zeroout to EOF blocks +- ocfs2: fix zero out valid data +- KVM: add missing compat KVM_CLEAR_DIRTY_LOG +- x86/kvm: fix vcpu-id indexed array sizes +- ACPI: DPTF: Fix reading of attributes +- Revert "ACPI: resources: Add checks for ACPI IRQ override" +- btrfs: mark compressed range uptodate only if all bio succeed +- btrfs: fix rw device counting in __btrfs_free_extra_devids +- pipe: make pipe writes always wake up readers +- x86/asm: Ensure asm/proto.h can be included stand-alone +- io_uring: fix null-ptr-deref in io_sq_offload_start() +- selftest: fix build error in tools/testing/selftests/vm/userfaultfd.c +- ipv6: ip6_finish_output2: set sk into newly allocated nskb +- ARM: dts: versatile: Fix up interrupt controller node names +- iomap: remove the length variable in iomap_seek_hole +- iomap: remove the length variable in iomap_seek_data +- cifs: fix the out of range assignment to bit fields in parse_server_interfaces +- firmware: arm_scmi: Fix range check for the maximum number of pending messages +- firmware: arm_scmi: Fix possible scmi_linux_errmap buffer overflow +- hfs: add lock nesting notation to hfs_find_init +- hfs: fix high memory mapping in hfs_bnode_read +- hfs: add missing clean-up in hfs_fill_super +- drm/ttm: add a check against null pointer dereference +- ipv6: allocate enough headroom in ip6_finish_output2() +- rcu-tasks: Don't delete holdouts within trc_wait_for_one_reader() +- rcu-tasks: Don't 
delete holdouts within trc_inspect_reader() +- sctp: move 198 addresses from unusable to private scope +- net: annotate data race around sk_ll_usec +- net/802/garp: fix memleak in garp_request_join() +- net/802/mrp: fix memleak in mrp_request_join() +- cgroup1: fix leaked context root causing sporadic NULL deref in LTP +- workqueue: fix UAF in pwq_unbound_release_workfn() +- af_unix: fix garbage collect vs MSG_PEEK +- KVM: x86: determine if an exception has an error code only when injecting it. +- io_uring: fix link timeout refs +- tools: Allow proper CC/CXX/... override with LLVM=1 in Makefile.include +- perf annotate: Add error log in symbol__annotate() +- perf env: Normalize aarch64.* and arm64.* to arm64 in normalize_arch() +- skbuff: Fix build with SKB extensions disabled +- xhci: add xhci_get_virt_ep() helper +- sfc: ensure correct number of XDP queues +- drm/i915/gvt: Clear d3_entered on elsp cmd submission. +- usb: ehci: Prevent missed ehci interrupts with edge-triggered MSI +- perf inject: Close inject.output on exit +- Documentation: Fix intiramfs script name +- skbuff: Release nfct refcount on napi stolen or re-used skbs +- bonding: fix build issue +- PCI: Mark AMD Navi14 GPU ATS as broken +- net: dsa: mv88e6xxx: enable SerDes PCS register dump via ethtool -d on Topaz +- net: dsa: mv88e6xxx: enable SerDes RX stats for Topaz +- drm/amdgpu: update golden setting for sienna_cichlid +- drm: Return -ENOTTY for non-drm ioctls +- driver core: Prevent warning when removing a device link from unregistered consumer +- nds32: fix up stack guard gap +- misc: eeprom: at24: Always append device id even if label property is set. +- rbd: always kick acquire on "acquired" and "released" notifications +- rbd: don't hold lock_rwsem while running_list is being drained +- hugetlbfs: fix mount mode command line processing +- memblock: make for_each_mem_range() traverse MEMBLOCK_HOTPLUG regions +- userfaultfd: do not untag user pointers +- io_uring: remove double poll entry on arm failure +- io_uring: explicitly count entries for poll reqs +- selftest: use mmap instead of posix_memalign to allocate memory +- posix-cpu-timers: Fix rearm racing against process tick +- bus: mhi: core: Validate channel ID when processing command completions +- ixgbe: Fix packet corruption due to missing DMA sync +- media: ngene: Fix out-of-bounds bug in ngene_command_config_free_buf() +- btrfs: check for missing device in btrfs_trim_fs +- tracing: Synthetic event field_pos is an index not a boolean +- tracing: Fix bug in rb_per_cpu_empty() that might cause deadloop. +- tracing/histogram: Rename "cpu" to "common_cpu" +- tracepoints: Update static_call before tp_funcs when adding a tracepoint +- firmware/efi: Tell memblock about EFI iomem reservations +- usb: typec: stusb160x: register role switch before interrupt registration +- usb: dwc2: gadget: Fix sending zero length packet in DDMA mode. +- usb: dwc2: gadget: Fix GOUTNAK flow for Slave mode. 
+- usb: gadget: Fix Unbalanced pm_runtime_enable in tegra_xudc_probe +- USB: serial: cp210x: add ID for CEL EM3588 USB ZigBee stick +- USB: serial: cp210x: fix comments for GE CS1000 +- USB: serial: option: add support for u-blox LARA-R6 family +- usb: renesas_usbhs: Fix superfluous irqs happen after usb_pkt_pop() +- usb: max-3421: Prevent corruption of freed memory +- USB: usb-storage: Add LaCie Rugged USB3-FW to IGNORE_UAS +- usb: hub: Fix link power management max exit latency (MEL) calculations +- usb: hub: Disable USB 3 device initiated lpm if exit latency is too high +- KVM: PPC: Book3S HV Nested: Sanitise H_ENTER_NESTED TM state +- KVM: PPC: Book3S: Fix H_RTAS rets buffer overflow +- xhci: Fix lost USB 2 remote wake +- usb: xhci: avoid renesas_usb_fw.mem when it's unusable +- Revert "usb: renesas-xhci: Fix handling of unknown ROM state" +- ALSA: pcm: Fix mmap capability check +- ALSA: pcm: Call substream ack() method upon compat mmap commit +- ALSA: hdmi: Expose all pins on MSI MS-7C94 board +- ALSA: hda/realtek: Fix pop noise and 2 Front Mic issues on a machine +- ALSA: sb: Fix potential ABBA deadlock in CSP driver +- ALSA: usb-audio: Add registration quirk for JBL Quantum headsets +- ALSA: usb-audio: Add missing proc text entry for BESPOKEN type +- s390/boot: fix use of expolines in the DMA code +- s390/ftrace: fix ftrace_update_ftrace_func implementation +- mmc: core: Don't allocate IDA for OF aliases +- proc: Avoid mixing integer types in mem_rw() +- cifs: fix fallocate when trying to allocate a hole. +- cifs: only write 64kb at a time when fallocating a small region of a file +- drm/panel: raspberrypi-touchscreen: Prevent double-free +- net: sched: cls_api: Fix the the wrong parameter +- net: dsa: sja1105: make VID 4095 a bridge VLAN too +- tcp: disable TFO blackhole logic by default +- sctp: update active_key for asoc when old key is being replaced +- nvme: set the PRACT bit when using Write Zeroes with T10 PI +- r8169: Avoid duplicate sysfs entry creation error +- afs: Fix tracepoint string placement with built-in AFS +- Revert "USB: quirks: ignore remote wake-up on Fibocom L850-GL LTE modem" +- nvme-pci: don't WARN_ON in nvme_reset_work if ctrl.state is not RESETTING +- ceph: don't WARN if we're still opening a session to an MDS +- ipv6: fix another slab-out-of-bounds in fib6_nh_flush_exceptions +- net/sched: act_skbmod: Skip non-Ethernet packets +- spi: spi-bcm2835: Fix deadlock +- ALSA: hda: intel-dsp-cfg: add missing ElkhartLake PCI ID +- net/tcp_fastopen: fix data races around tfo_active_disable_stamp +- net: hisilicon: rename CACHE_LINE_MASK to avoid redefinition +- bnxt_en: Check abort error state in bnxt_half_open_nic() +- bnxt_en: Validate vlan protocol ID on RX packets +- bnxt_en: Add missing check for BNXT_STATE_ABORT_ERR in bnxt_fw_rset_task() +- bnxt_en: Refresh RoCE capabilities in bnxt_ulp_probe() +- bnxt_en: don't disable an already disabled PCI device +- ACPI: Kconfig: Fix table override from built-in initrd +- spi: cadence: Correct initialisation of runtime PM again +- scsi: target: Fix protect handling in WRITE SAME(32) +- scsi: iscsi: Fix iface sysfs attr detection +- netrom: Decrease sock refcount when sock timers expire +- sctp: trim optlen when it's a huge value in sctp_setsockopt +- net: sched: fix memory leak in tcindex_partial_destroy_work +- KVM: PPC: Fix kvm_arch_vcpu_ioctl vcpu_load leak +- KVM: PPC: Book3S: Fix CONFIG_TRANSACTIONAL_MEM=n crash +- net: decnet: Fix sleeping inside in af_decnet +- efi/tpm: Differentiate missing and invalid final 
event log table. +- dma-mapping: handle vmalloc addresses in dma_common_{mmap,get_sgtable} +- usb: hso: fix error handling code of hso_create_net_device +- net: fix uninit-value in caif_seqpkt_sendmsg +- bpftool: Check malloc return value in mount_bpffs_for_pin +- bpf, sockmap, udp: sk_prot needs inuse_idx set for proc stats +- bpf, sockmap, tcp: sk_prot needs inuse_idx set for proc stats +- bpf, sockmap: Fix potential memory leak on unlikely error case +- s390/bpf: Perform r1 range checking before accessing jit->seen_reg[r1] +- liquidio: Fix unintentional sign extension issue on left shift of u16 +- timers: Fix get_next_timer_interrupt() with no timers pending +- xdp, net: Fix use-after-free in bpf_xdp_link_release +- bpf: Fix tail_call_reachable rejection for interpreter when jit failed +- bpf, test: fix NULL pointer dereference on invalid expected_attach_type +- ASoC: rt5631: Fix regcache sync errors on resume +- spi: mediatek: fix fifo rx mode +- regulator: hi6421: Fix getting wrong drvdata +- regulator: hi6421: Use correct variable type for regmap api val argument +- spi: stm32: fixes pm_runtime calls in probe/remove +- spi: imx: add a check for speed_hz before calculating the clock +- ASoC: wm_adsp: Correct wm_coeff_tlv_get handling +- perf sched: Fix record failure when CONFIG_SCHEDSTATS is not set +- perf lzma: Close lzma stream on exit +- perf script: Fix memory 'threads' and 'cpus' leaks on exit +- perf report: Free generated help strings for sort option +- perf env: Fix memory leak of cpu_pmu_caps +- perf test maps__merge_in: Fix memory leak of maps +- perf dso: Fix memory leak in dso__new_map() +- perf test event_update: Fix memory leak of evlist +- perf test session_topology: Delete session->evlist +- perf env: Fix sibling_dies memory leak +- perf probe: Fix dso->nsinfo refcounting +- perf map: Fix dso->nsinfo refcounting +- perf inject: Fix dso->nsinfo refcounting +- KVM: x86/pmu: Clear anythread deprecated bit when 0xa leaf is unsupported on the SVM +- nvme-pci: do not call nvme_dev_remove_admin from nvme_remove +- mptcp: fix warning in __skb_flow_dissect() when do syn cookie for subflow join +- cxgb4: fix IRQ free race during driver unload +- pwm: sprd: Ensure configuring period and duty_cycle isn't wrongly skipped +- selftests: icmp_redirect: IPv6 PMTU info should be cleared after redirect +- selftests: icmp_redirect: remove from checking for IPv6 route get +- stmmac: platform: Fix signedness bug in stmmac_probe_config_dt() +- ipv6: fix 'disable_policy' for fwd packets +- bonding: fix incorrect return value of bond_ipsec_offload_ok() +- bonding: fix suspicious RCU usage in bond_ipsec_offload_ok() +- bonding: Add struct bond_ipesc to manage SA +- bonding: disallow setting nested bonding + ipsec offload +- bonding: fix suspicious RCU usage in bond_ipsec_del_sa() +- ixgbevf: use xso.real_dev instead of xso.dev in callback functions of struct xfrmdev_ops +- bonding: fix null dereference in bond_ipsec_add_sa() +- bonding: fix suspicious RCU usage in bond_ipsec_add_sa() +- net: add kcov handle to skb extensions +- gve: Fix an error handling path in 'gve_probe()' +- igb: Fix position of assignment to *ring +- igb: Check if num of q_vectors is smaller than max before array access +- iavf: Fix an error handling path in 'iavf_probe()' +- e1000e: Fix an error handling path in 'e1000_probe()' +- fm10k: Fix an error handling path in 'fm10k_probe()' +- igb: Fix an error handling path in 'igb_probe()' +- igc: Fix an error handling path in 'igc_probe()' +- ixgbe: Fix an error handling 
path in 'ixgbe_probe()' +- igc: change default return of igc_read_phy_reg() +- igb: Fix use-after-free error during reset +- igc: Fix use-after-free error during reset +- sched: Add frequency control for load update in scheduler_tick +- sched: Add switch for update_blocked_averages +- sched: Introcude config option SCHED_OPTIMIZE_LOAD_TRACKING +- udp: annotate data races around unix_sk(sk)->gso_size +- drm/panel: nt35510: Do not fail if DSI read fails +- bpf: Track subprog poke descriptors correctly and fix use-after-free +- bpftool: Properly close va_list 'ap' by va_end() on error +- tools: bpf: Fix error in 'make -C tools/ bpf_install' +- tcp: call sk_wmem_schedule before sk_mem_charge in zerocopy path +- ipv6: tcp: drop silly ICMPv6 packet too big messages +- tcp: fix tcp_init_transfer() to not reset icsk_ca_initialized +- tcp: annotate data races around tp->mtu_info +- tcp: consistently disable header prediction for mptcp +- ARM: dts: tacoma: Add phase corrections for eMMC +- ARM: dts: aspeed: Fix AST2600 machines line names +- kbuild: do not suppress Kconfig prompts for silent build +- dma-buf/sync_file: Don't leak fences on merge failure +- net: fddi: fix UAF in fza_probe +- net: dsa: properly check for the bridge_leave methods in dsa_switch_bridge_leave() +- Revert "mm/shmem: fix shmem_swapin() race with swapoff" +- net: validate lwtstate->data before returning from skb_tunnel_info() +- net: send SYNACK packet with accepted fwmark +- net: ti: fix UAF in tlan_remove_one +- net: qcom/emac: fix UAF in emac_remove +- net: moxa: fix UAF in moxart_mac_probe +- net: ip_tunnel: fix mtu calculation for ETHER tunnel devices +- net: bcmgenet: Ensure all TX/RX queues DMAs are disabled +- net: netdevsim: use xso.real_dev instead of xso.dev in callback functions of struct xfrmdev_ops +- net: bridge: sync fdb to new unicast-filtering ports +- net/sched: act_ct: remove and free nf_table callbacks +- vmxnet3: fix cksum offload issues for tunnels with non-default udp ports +- net/sched: act_ct: fix err check for nf_conntrack_confirm +- netfilter: ctnetlink: suspicious RCU usage in ctnetlink_dump_helpinfo +- net: ipv6: fix return value of ip6_skb_dst_mtu +- net: dsa: mv88e6xxx: enable devlink ATU hash param for Topaz +- net: dsa: mv88e6xxx: enable .rmu_disable() on Topaz +- net: dsa: mv88e6xxx: use correct .stats_set_histogram() on Topaz +- net: dsa: mv88e6xxx: enable .port_set_policy() on Topaz +- net: bcmgenet: ensure EXT_ENERGY_DET_MASK is clear +- usb: cdns3: Enable TDL_CHK only for OUT ep +- mm/page_alloc: fix memory map initialization for descending nodes +- mm/userfaultfd: fix uffd-wp special cases for fork() +- mm/thp: simplify copying of huge zero page pmd when fork +- f2fs: Show casefolding support only when supported +- Revert "swap: fix do_swap_page() race with swapoff" +- arm64: dts: marvell: armada-37xx: move firmware node to generic dtsi file +- firmware: turris-mox-rwtm: add marvell,armada-3700-rwtm-firmware compatible string +- cifs: prevent NULL deref in cifs_compose_mount_options() +- s390: introduce proper type handling call_on_stack() macro +- s390/traps: do not test MONITOR CALL without CONFIG_BUG +- thermal/core/thermal_of: Stop zone device before unregistering it +- perf/x86/intel/uncore: Clean up error handling path of iio mapping +- sched/fair: Fix CFS bandwidth hrtimer expiry type +- scsi: qedf: Add check to synchronize abort and flush +- scsi: libfc: Fix array index out of bound exception +- scsi: aic7xxx: Fix unintentional sign extension issue on left shift of u8 +- rtc: 
max77686: Do not enforce (incorrect) interrupt trigger type +- arch/arm64/boot/dts/marvell: fix NAND partitioning scheme +- kbuild: mkcompile_h: consider timestamp if KBUILD_BUILD_TIMESTAMP is set +- thermal/drivers/sprd: Add missing of_node_put for loop iteration +- thermal/drivers/imx_sc: Add missing of_node_put for loop iteration +- thermal/drivers/rcar_gen3_thermal: Do not shadow rcar_gen3_ths_tj_1 +- thermal/core: Correct function name thermal_zone_device_unregister() +- arm64: dts: imx8mq: assign PCIe clocks +- arm64: dts: ls208xa: remove bus-num from dspi node +- firmware: tegra: bpmp: Fix Tegra234-only builds +- soc/tegra: fuse: Fix Tegra234-only builds +- ARM: OMAP2+: Block suspend for am3 and am4 if PM is not configured +- ARM: dts: stm32: fix stpmic node for stm32mp1 boards +- ARM: dts: stm32: Rename spi-flash/mx66l51235l@N to flash@N on DHCOM SoM +- ARM: dts: stm32: Drop unused linux,wakeup from touchscreen node on DHCOM SoM +- ARM: dts: stm32: fix the Odyssey SoM eMMC VQMMC supply +- ARM: dts: stm32: move stmmac axi config in ethernet node on stm32mp15 +- ARM: dts: stm32: fix i2c node name on stm32f746 to prevent warnings +- ARM: dts: rockchip: fix supply properties in io-domains nodes +- arm64: dts: juno: Update SCPI nodes as per the YAML schema +- ARM: dts: bcm283x: Fix up GPIO LED node names +- ARM: dts: bcm283x: Fix up MMC node names +- firmware: arm_scmi: Fix the build when CONFIG_MAILBOX is not selected +- firmware: arm_scmi: Add SMCCC discovery dependency in Kconfig +- memory: tegra: Fix compilation warnings on 64bit platforms +- ARM: dts: stm32: fix timer nodes on STM32 MCU to prevent warnings +- ARM: dts: stm32: fix RCC node name on stm32f429 MCU +- ARM: dts: stm32: fix gpio-keys node on STM32 MCU boards +- ARM: dts: stm32: fix stm32mp157c-odyssey card detect pin +- ARM: dts: stm32: Fix touchscreen node on dhcom-pdk2 +- ARM: dts: stm32: Remove extra size-cells on dhcom-pdk2 +- arm64: dts: qcom: sc7180: Move rmtfs memory region +- ARM: tegra: nexus7: Correct 3v3 regulator GPIO of PM269 variant +- ARM: tegra: wm8903: Fix polarity of headphones-detection GPIO in device-trees +- arm64: dts: ti: k3-am654x/j721e/j7200-common-proc-board: Fix MCU_RGMII1_TXC direction +- ARM: dts: OMAP2+: Replace underscores in sub-mailbox node names +- ARM: dts: am335x: fix ti,no-reset-on-init flag for gpios +- ARM: dts: am437x-gp-evm: fix ti,no-reset-on-init flag for gpios +- ARM: dts: am57xx-cl-som-am57x: fix ti,no-reset-on-init flag for gpios +- kbuild: sink stdout from cmd for silent build +- rtc: mxc_v2: add missing MODULE_DEVICE_TABLE +- ARM: dts: imx6dl-riotboard: configure PHY clock and set proper EEE value +- ARM: dts: ux500: Fix orientation of accelerometer +- ARM: dts: ux500: Rename gpio-controller node +- ARM: dts: ux500: Fix interrupt cells +- arm64: dts: rockchip: fix regulator-gpio states array +- ARM: imx: pm-imx5: Fix references to imx5_cpu_suspend_info +- ARM: dts: imx6: phyFLEX: Fix UART hardware flow control +- ARM: dts: Hurricane 2: Fix NAND nodes names +- ARM: dts: BCM63xx: Fix NAND nodes names +- ARM: NSP: dts: fix NAND nodes names +- ARM: Cygnus: dts: fix NAND nodes names +- ARM: brcmstb: dts: fix NAND nodes names +- reset: ti-syscon: fix to_ti_syscon_reset_data macro +- arm64: dts: rockchip: Fix power-controller node names for rk3399 +- arm64: dts: rockchip: Fix power-controller node names for rk3328 +- arm64: dts: rockchip: Fix power-controller node names for px30 +- ARM: dts: rockchip: Fix power-controller node names for rk3288 +- ARM: dts: rockchip: Fix 
power-controller node names for rk3188 +- ARM: dts: rockchip: Fix power-controller node names for rk3066a +- ARM: dts: rockchip: Fix IOMMU nodes properties on rk322x +- ARM: dts: rockchip: Fix the timer clocks order +- arm64: dts: rockchip: fix pinctrl sleep nodename for rk3399.dtsi +- ARM: dts: rockchip: fix pinctrl sleep nodename for rk3036-kylin and rk3288 +- ARM: dts: rockchip: Fix thermal sensor cells o rk322x +- ARM: dts: gemini: add device_type on pci +- ARM: dts: gemini: rename mdio to the right name +- scsi: scsi_dh_alua: Fix signedness bug in alua_rtpg() +- MIPS: vdso: Invalid GIC access through VDSO +- mips: disable branch profiling in boot/decompress.o +- mips: always link byteswap helpers into decompressor +- static_call: Fix static_call_text_reserved() vs __init +- jump_label: Fix jump_label_text_reserved() vs __init +- sched/uclamp: Ignore max aggregation if rq is idle +- scsi: be2iscsi: Fix an error handling path in beiscsi_dev_probe() +- arm64: dts: rockchip: Re-add regulator-always-on for vcc_sdio for rk3399-roc-pc +- arm64: dts: rockchip: Re-add regulator-boot-on, regulator-always-on for vdd_gpu on rk3399-roc-pc +- firmware: turris-mox-rwtm: show message about HWRNG registration +- firmware: turris-mox-rwtm: fail probing when firmware does not support hwrng +- firmware: turris-mox-rwtm: report failures better +- firmware: turris-mox-rwtm: fix reply status decoding function +- thermal/drivers/rcar_gen3_thermal: Fix coefficient calculations +- ARM: dts: imx6q-dhcom: Add gpios pinctrl for i2c bus recovery +- ARM: dts: imx6q-dhcom: Fix ethernet plugin detection problems +- ARM: dts: imx6q-dhcom: Fix ethernet reset time properties +- thermal/drivers/sprd: Add missing MODULE_DEVICE_TABLE +- ARM: dts: am437x: align ti,pindir-d0-out-d1-in property with dt-shema +- ARM: dts: am335x: align ti,pindir-d0-out-d1-in property with dt-shema +- ARM: dts: dra7: Fix duplicate USB4 target module node +- arm64: dts: allwinner: a64-sopine-baseboard: change RGMII mode to TXID +- memory: fsl_ifc: fix leak of private memory on probe failure +- memory: fsl_ifc: fix leak of IO mapping on probe failure +- arm64: dts: ti: k3-j721e-main: Fix external refclk input to SERDES +- arm64: dts: renesas: r8a779a0: Drop power-domains property from GIC node +- reset: bail if try_module_get() fails +- ARM: dts: BCM5301X: Fixup SPI binding +- dt-bindings: i2c: at91: fix example for scl-gpios +- firmware: arm_scmi: Reset Rx buffer to max size during async commands +- firmware: tegra: Fix error return code in tegra210_bpmp_init() +- arm64: dts: qcom: trogdor: Add no-hpd to DSI bridge node +- ARM: dts: stm32: Rework LAN8710Ai PHY reset on DHCOM SoM +- ARM: dts: stm32: Connect PHY IRQ line on DH STM32MP1 SoM +- arm64: dts: renesas: r8a7796[01]: Fix OPP table entry voltages +- arm64: dts: renesas: Add missing opp-suspend properties +- arm64: dts: ti: j7200-main: Enable USB2 PHY RX sensitivity workaround +- ARM: dts: r8a7779, marzen: Fix DU clock names +- arm64: dts: renesas: v3msk: Fix memory size +- rtc: fix snprintf() checking in is_rtc_hctosys() +- ARM: dts: sun8i: h3: orangepi-plus: Fix ethernet phy-mode +- memory: pl353: Fix error return code in pl353_smc_probe() +- reset: brcmstb: Add missing MODULE_DEVICE_TABLE +- memory: atmel-ebi: add missing of_node_put for loop iteration +- memory: stm32-fmc2-ebi: add missing of_node_put for loop iteration +- ARM: dts: exynos: fix PWM LED max brightness on Odroid XU4 +- ARM: dts: exynos: fix PWM LED max brightness on Odroid HC1 +- ARM: dts: exynos: fix PWM LED max 
brightness on Odroid XU/XU3 +- ARM: exynos: add missing of_node_put for loop iteration +- reset: a10sr: add missing of_match_table reference +- reset: RESET_INTEL_GW should depend on X86 +- reset: RESET_BRCMSTB_RESCAL should depend on ARCH_BRCMSTB +- ARM: dts: gemini-rut1xx: remove duplicate ethernet node +- hexagon: use common DISCARDS macro +- hexagon: handle {,SOFT}IRQENTRY_TEXT in linker script +- NFSv4/pNFS: Don't call _nfs4_pnfs_v3_ds_connect multiple times +- NFSv4/pnfs: Fix layoutget behaviour after invalidation +- NFSv4/pnfs: Fix the layout barrier update +- vdpa/mlx5: Clear vq ready indication upon device reset +- ALSA: isa: Fix error return code in snd_cmi8330_probe() +- nfsd: Reduce contention for the nfsd_file nf_rwsem +- nvme-tcp: can't set sk_user_data without write_lock +- virtio_net: move tx vq operation under tx queue lock +- vdpa/mlx5: Fix possible failure in umem size calculation +- vdpa/mlx5: Fix umem sizes assignments on VQ create +- PCI: tegra194: Fix tegra_pcie_ep_raise_msi_irq() ill-defined shift +- pwm: imx1: Don't disable clocks at device remove time +- PCI: intel-gw: Fix INTx enable +- x86/fpu: Limit xstate copy size in xstateregs_set() +- x86/fpu: Fix copy_xstate_to_kernel() gap handling +- f2fs: fix to avoid adding tab before doc section +- PCI: iproc: Support multi-MSI only on uniprocessor kernel +- PCI: iproc: Fix multi-MSI base vector number allocation +- ubifs: Set/Clear I_LINKABLE under i_lock for whiteout inode +- nfs: fix acl memory leak of posix_acl_create() +- SUNRPC: prevent port reuse on transports which don't request it. +- watchdog: jz4740: Fix return value check in jz4740_wdt_probe() +- watchdog: aspeed: fix hardware timeout calculation +- ubifs: journal: Fix error return code in ubifs_jnl_write_inode() +- ubifs: Fix off-by-one error +- um: fix error return code in winch_tramp() +- um: fix error return code in slip_open() +- misc: alcor_pci: fix inverted branch condition +- NFSv4: Fix an Oops in pnfs_mark_request_commit() when doing O_DIRECT +- NFSv4: Initialise connection to the server in nfs4_alloc_client() +- power: supply: rt5033_battery: Fix device tree enumeration +- PCI/sysfs: Fix dsm_label_utf16s_to_utf8s() buffer overrun +- remoteproc: k3-r5: Fix an error message +- f2fs: compress: fix to disallow temp extension +- f2fs: add MODULE_SOFTDEP to ensure crc32 is included in the initramfs +- x86/signal: Detect and prevent an alternate signal stack overflow +- NFSD: Fix TP_printk() format specifier in nfsd_clid_class +- f2fs: atgc: fix to set default age threshold +- virtio_console: Assure used length from device is limited +- virtio_net: Fix error handling in virtnet_restore() +- virtio-blk: Fix memory leak among suspend/resume procedure +- PCI: rockchip: Register IRQ handlers after device and data are ready +- ACPI: video: Add quirk for the Dell Vostro 3350 +- ACPI: AMBA: Fix resource name in /proc/iomem +- pwm: tegra: Don't modify HW state in .remove callback +- pwm: img: Fix PM reference leak in img_pwm_enable() +- drm/amdkfd: fix sysfs kobj leak +- power: supply: ab8500: add missing MODULE_DEVICE_TABLE +- power: supply: charger-manager: add missing MODULE_DEVICE_TABLE +- NFS: nfs_find_open_context() may only select open files +- drm/gma500: Add the missed drm_gem_object_put() in psb_user_framebuffer_create() +- ceph: remove bogus checks and WARN_ONs from ceph_set_page_dirty +- orangefs: fix orangefs df output. 
+- PCI: tegra: Add missing MODULE_DEVICE_TABLE +- remoteproc: core: Fix cdev remove and rproc del +- x86/fpu: Return proper error codes from user access functions +- watchdog: iTCO_wdt: Account for rebooting on second timeout +- watchdog: imx_sc_wdt: fix pretimeout +- watchdog: Fix possible use-after-free by calling del_timer_sync() +- watchdog: sc520_wdt: Fix possible use-after-free in wdt_turnoff() +- watchdog: Fix possible use-after-free in wdt_startup() +- PCI: pciehp: Ignore Link Down/Up caused by DPC +- NFSv4: Fix delegation return in cases where we have to retry +- PCI/P2PDMA: Avoid pci_get_slot(), which may sleep +- ARM: 9087/1: kprobes: test-thumb: fix for LLVM_IAS=1 +- power: reset: gpio-poweroff: add missing MODULE_DEVICE_TABLE +- power: supply: max17042: Do not enforce (incorrect) interrupt trigger type +- PCI: hv: Fix a race condition when removing the device +- power: supply: ab8500: Avoid NULL pointers +- PCI: ftpci100: Rename macro name collision +- pwm: spear: Don't modify HW state in .remove callback +- power: supply: sc2731_charger: Add missing MODULE_DEVICE_TABLE +- power: supply: sc27xx: Add missing MODULE_DEVICE_TABLE +- kcov: add __no_sanitize_coverage to fix noinstr for all architectures +- lib/decompress_unlz4.c: correctly handle zero-padding around initrds. +- phy: intel: Fix for warnings due to EMMC clock 175Mhz change in FIP +- i2c: core: Disable client irq on reboot/shutdown +- intel_th: Wait until port is in reset before programming it +- staging: rtl8723bs: fix macro value for 2.4Ghz only device +- leds: turris-omnia: add missing MODULE_DEVICE_TABLE +- ALSA: firewire-motu: fix detection for S/PDIF source on optical interface in v2 protocol +- ALSA: usb-audio: scarlett2: Fix 6i6 Gen 2 line out descriptions +- ALSA: hda: Add IRQ check for platform_get_irq() +- backlight: lm3630a: Fix return code of .update_status() callback +- ASoC: Intel: kbl_da7219_max98357a: shrink platform_id below 20 characters +- powerpc/boot: Fixup device-tree on little endian +- usb: gadget: hid: fix error return code in hid_bind() +- usb: gadget: f_hid: fix endianness issue with descriptors +- ALSA: usb-audio: scarlett2: Fix scarlett2_*_ctl_put() return values +- ALSA: usb-audio: scarlett2: Fix data_mutex lock +- ALSA: usb-audio: scarlett2: Fix 18i8 Gen 2 PCM Input count +- ALSA: bebob: add support for ToneWeal FW66 +- Input: hideep - fix the uninitialized use in hideep_nvm_unlock() +- s390/mem_detect: fix tprot() program check new psw handling +- s390/mem_detect: fix diag260() program check new psw handling +- s390/ipl_parm: fix program check new psw handling +- s390/processor: always inline stap() and __load_psw_mask() +- habanalabs: remove node from list before freeing the node +- habanalabs/gaudi: set the correct cpu_id on MME2_QM failure +- ASoC: soc-core: Fix the error return code in snd_soc_of_parse_audio_routing() +- powerpc/mm/book3s64: Fix possible build error +- gpio: pca953x: Add support for the On Semi pca9655 +- selftests/powerpc: Fix "no_handler" EBB selftest +- ALSA: ppc: fix error return code in snd_pmac_probe() +- scsi: storvsc: Correctly handle multiple flags in srb_status +- gpio: zynq: Check return value of irq_get_irq_data +- gpio: zynq: Check return value of pm_runtime_get_sync +- ASoC: soc-pcm: fix the return value in dpcm_apply_symmetry() +- iommu/arm-smmu: Fix arm_smmu_device refcount leak in address translation +- iommu/arm-smmu: Fix arm_smmu_device refcount leak when arm_smmu_rpm_get fails +- powerpc/ps3: Add dma_mask to ps3_dma_region +- ALSA: sb: Fix 
potential double-free of CSP mixer elements +- selftests: timers: rtcpie: skip test if default RTC device does not exist +- s390: disable SSP when needed +- s390/sclp_vt220: fix console name to match device +- serial: tty: uartlite: fix console setup +- fsi: Add missing MODULE_DEVICE_TABLE +- ASoC: img: Fix PM reference leak in img_i2s_in_probe() +- mfd: cpcap: Fix cpcap dmamask not set warnings +- mfd: da9052/stmpe: Add and modify MODULE_DEVICE_TABLE +- scsi: qedi: Fix cleanup session block/unblock use +- scsi: qedi: Fix TMF session block/unblock use +- scsi: qedi: Fix race during abort timeouts +- scsi: qedi: Fix null ref during abort handling +- scsi: iscsi: Fix shost->max_id use +- scsi: iscsi: Fix conn use after free during resets +- scsi: iscsi: Add iscsi_cls_conn refcount helpers +- scsi: megaraid_sas: Handle missing interrupts while re-enabling IRQs +- scsi: megaraid_sas: Early detection of VD deletion through RaidMap update +- scsi: megaraid_sas: Fix resource leak in case of probe failure +- fs/jfs: Fix missing error code in lmLogInit() +- scsi: scsi_dh_alua: Check for negative result value +- scsi: core: Fixup calling convention for scsi_mode_sense() +- scsi: mpt3sas: Fix deadlock while cancelling the running firmware event +- tty: serial: 8250: serial_cs: Fix a memory leak in error handling path +- ALSA: ac97: fix PM reference leak in ac97_bus_remove() +- scsi: core: Cap scsi_host cmd_per_lun at can_queue +- scsi: lpfc: Fix crash when lpfc_sli4_hba_setup() fails to initialize the SGLs +- scsi: lpfc: Fix "Unexpected timeout" error in direct attach topology +- scsi: arcmsr: Fix doorbell status being updated late on ARC-1886 +- w1: ds2438: fixing bug that would always get page0 +- usb: common: usb-conn-gpio: fix NULL pointer dereference of charger +- Revert "ALSA: bebob/oxfw: fix Kconfig entry for Mackie d.2 Pro" +- ALSA: usx2y: Don't call free_pages_exact() with NULL address +- ALSA: usx2y: Avoid camelCase +- iio: magn: bmc150: Balance runtime pm + use pm_runtime_resume_and_get() +- iio: gyro: fxa21002c: Balance runtime pm + use pm_runtime_resume_and_get(). 
+- partitions: msdos: fix one-byte get_unaligned() +- ASoC: intel/boards: add missing MODULE_DEVICE_TABLE +- misc: alcor_pci: fix null-ptr-deref when there is no PCI bridge +- misc/libmasm/module: Fix two use after free in ibmasm_init_one +- serial: fsl_lpuart: disable DMA for console and fix sysrq +- tty: serial: fsl_lpuart: fix the potential risk of division or modulo by zero +- rcu: Reject RCU_LOCKDEP_WARN() false positives +- srcu: Fix broken node geometry after early ssp init +- scsi: arcmsr: Fix the wrong CDB payload report to IOP +- dmaengine: fsl-qdma: check dma_set_mask return value +- ASoC: Intel: sof_sdw: add mutual exclusion between PCH DMIC and RT715 +- leds: tlc591xx: fix return value check in tlc591xx_probe() +- net: bridge: multicast: fix MRD advertisement router port marking race +- net: bridge: multicast: fix PIM hello router port marking race +- Revert "drm/ast: Remove reference to struct drm_device.pdev" +- drm/ingenic: Switch IPU plane to type OVERLAY +- drm/ingenic: Fix non-OSD mode +- drm/dp_mst: Add missing drm parameters to recently added call to drm_dbg_kms() +- drm/dp_mst: Avoid to mess up payload table by ports in stale topology +- drm/dp_mst: Do not set proposed vcpi directly +- fbmem: Do not delete the mode that is still in use +- cgroup: verify that source is a string +- drm/i915/gt: Fix -EDEADLK handling regression +- drm/i915/gtt: drop the page table optimisation +- tracing: Do not reference char * as a string in histograms +- scsi: zfcp: Report port fc_security as unknown early during remote cable pull +- scsi: core: Fix bad pointer dereference when ehandler kthread is invalid +- KVM: X86: Disable hardware breakpoints unconditionally before kvm_x86->run() +- KVM: nSVM: Check the value written to MSR_VM_HSAVE_PA +- KVM: x86/mmu: Do not apply HPA (memory encryption) mask to GPAs +- KVM: x86: Use guest MAXPHYADDR from CPUID.0x8000_0008 iff TDP is enabled +- KVM: mmio: Fix use-after-free Read in kvm_vm_ioctl_unregister_coalesced_mmio +- cifs: handle reconnect of tcon when there is no cached dfs referral +- certs: add 'x509_revocation_list' to gitignore +- f2fs: fix to avoid racing on fsync_entry_slab by multi filesystem instances +- smackfs: restrict bytes count in smk_set_cipso() +- jfs: fix GPF in diFree +- drm/ast: Remove reference to struct drm_device.pdev +- pinctrl: mcp23s08: Fix missing unlock on error in mcp23s08_irq() +- dm writecache: write at least 4k when committing +- io_uring: fix clear IORING_SETUP_R_DISABLED in wrong function +- media: uvcvideo: Fix pixel format change for Elgato Cam Link 4K +- media: gspca/sunplus: fix zero-length control requests +- media: gspca/sq905: fix control-request direction +- media: zr364xx: fix memory leak in zr364xx_start_readpipe +- media: dtv5100: fix control-request directions +- media: subdev: disallow ioctl for saa6588/davinci +- PCI: aardvark: Implement workaround for the readback value of VEND_ID +- PCI: aardvark: Fix checking for PIO Non-posted Request +- PCI: Leave Apple Thunderbolt controllers on for s2idle or standby +- dm writecache: flush origin device when writing and cache is full +- dm zoned: check zone capacity +- coresight: tmc-etf: Fix global-out-of-bounds in tmc_update_etf_buffer() +- coresight: Propagate symlink failure +- ipack/carriers/tpci200: Fix a double free in tpci200_pci_probe +- tracing: Resize tgid_map to pid_max, not PID_MAX_DEFAULT +- tracing: Simplify & fix saved_tgids logic +- rq-qos: fix missed wake-ups in rq_qos_throttle try two +- seq_buf: Fix overflow in seq_buf_putmem_hex() 
+- extcon: intel-mrfld: Sync hardware and software state on init +- selftests/lkdtm: Fix expected text for CR4 pinning +- lkdtm/bugs: XFAIL UNALIGNED_LOAD_STORE_WRITE +- nvmem: core: add a missing of_node_put +- mfd: syscon: Free the allocated name field of struct regmap_config +- power: supply: ab8500: Fix an old bug +- thermal/drivers/int340x/processor_thermal: Fix tcc setting +- ipmi/watchdog: Stop watchdog timer when the current action is 'none' +- qemu_fw_cfg: Make fw_cfg_rev_attr a proper kobj_attribute +- i40e: fix PTP on 5Gb links +- ASoC: tegra: Set driver_name=tegra for all machine drivers +- fpga: stratix10-soc: Add missing fpga_mgr_free() call +- clocksource/arm_arch_timer: Improve Allwinner A64 timer workaround +- cpu/hotplug: Cure the cpusets trainwreck +- arm64: tlb: fix the TTL value of tlb_get_level +- ata: ahci_sunxi: Disable DIPM +- mmc: core: Allow UHS-I voltage switch for SDSC cards if supported +- mmc: core: clear flags before allowing to retune +- mmc: sdhci: Fix warning message when accessing RPMB in HS400 mode +- mmc: sdhci-acpi: Disable write protect detection on Toshiba Encore 2 WT8-B +- drm/i915/display: Do not zero past infoframes.vsc +- drm/nouveau: Don't set allow_fb_modifiers explicitly +- drm/arm/malidp: Always list modifiers +- drm/msm/mdp4: Fix modifier support enabling +- drm/tegra: Don't set allow_fb_modifiers explicitly +- drm/amd/display: Reject non-zero src_y and src_x for video planes +- pinctrl/amd: Add device HID for new AMD GPIO controller +- drm/amd/display: fix incorrrect valid irq check +- drm/rockchip: dsi: remove extra component_del() call +- drm/dp: Handle zeroed port counts in drm_dp_read_downstream_info() +- drm/vc4: hdmi: Prevent clock unbalance +- drm/vc4: crtc: Skip the TXP +- drm/vc4: txp: Properly set the possible_crtcs mask +- drm/radeon: Call radeon_suspend_kms() in radeon_pci_shutdown() for Loongson64 +- drm/radeon: Add the missed drm_gem_object_put() in radeon_user_framebuffer_create() +- drm/amdgpu: enable sdma0 tmz for Raven/Renoir(V2) +- drm/amdgpu: Update NV SIMD-per-CU to 2 +- powerpc/powernv/vas: Release reference to tgid during window close +- powerpc/barrier: Avoid collision with clang's __lwsync macro +- powerpc/mm: Fix lockup on kernel exec fault +- arm64: dts: rockchip: Enable USB3 for rk3328 Rock64 +- arm64: dts: rockchip: add rk3328 dwc3 usb controller node +- ath11k: unlock on error path in ath11k_mac_op_add_interface() +- MIPS: MT extensions are not available on MIPS32r1 +- selftests/resctrl: Fix incorrect parsing of option "-t" +- MIPS: set mips32r5 for virt extensions +- MIPS: loongsoon64: Reserve memory below starting pfn to prevent Oops +- sctp: add size validation when walking chunks +- sctp: validate from_addr_param return +- flow_offload: action should not be NULL when it is referenced +- bpf: Fix false positive kmemleak report in bpf_ringbuf_area_alloc() +- sched/fair: Ensure _sum and _avg values stay consistent +- Bluetooth: btusb: fix bt fiwmare downloading failure issue for qca btsoc. 
+- Bluetooth: mgmt: Fix the command returns garbage parameter value +- Bluetooth: btusb: Add support USB ALT 3 for WBS +- Bluetooth: L2CAP: Fix invalid access on ECRED Connection response +- Bluetooth: L2CAP: Fix invalid access if ECRED Reconfigure fails +- Bluetooth: btusb: Add a new QCA_ROME device (0cf3:e500) +- Bluetooth: Shutdown controller after workqueues are flushed or cancelled +- Bluetooth: Fix alt settings for incoming SCO with transparent coding format +- Bluetooth: Fix the HCI to MGMT status conversion table +- Bluetooth: btusb: Fixed too many in-token issue for Mediatek Chip. +- RDMA/cma: Fix rdma_resolve_route() memory leak +- net: ip: avoid OOM kills with large UDP sends over loopback +- media, bpf: Do not copy more entries than user space requested +- IB/isert: Align target max I/O size to initiator size +- mac80211_hwsim: add concurrent channels scanning support over virtio +- mac80211: consider per-CPU statistics if present +- cfg80211: fix default HE tx bitrate mask in 2G band +- wireless: wext-spy: Fix out-of-bounds warning +- sfc: error code if SRIOV cannot be disabled +- sfc: avoid double pci_remove of VFs +- iwlwifi: pcie: fix context info freeing +- iwlwifi: pcie: free IML DMA memory allocation +- iwlwifi: mvm: fix error print when session protection ends +- iwlwifi: mvm: don't change band on bound PHY contexts +- RDMA/rxe: Don't overwrite errno from ib_umem_get() +- vsock: notify server to shutdown when client has pending signal +- atm: nicstar: register the interrupt handler in the right place +- atm: nicstar: use 'dma_free_coherent' instead of 'kfree' +- net: fec: add ndo_select_queue to fix TX bandwidth fluctuations +- MIPS: add PMD table accounting into MIPS'pmd_alloc_one +- rtl8xxxu: Fix device info for RTL8192EU devices +- mt76: mt7915: fix IEEE80211_HE_PHY_CAP7_MAX_NC for station mode +- drm/amdkfd: Walk through list with dqm lock hold +- drm/amdgpu: fix bad address translation for sienna_cichlid +- io_uring: fix false WARN_ONCE +- net: sched: fix error return code in tcf_del_walker() +- net: ipa: Add missing of_node_put() in ipa_firmware_load() +- net: fix mistake path for netdev_features_strings +- mt76: mt7615: fix fixed-rate tx status reporting +- ice: mark PTYPE 2 as reserved +- ice: fix incorrect payload indicator on PTYPE +- bpf: Fix up register-based shifts in interpreter to silence KUBSAN +- drm/amdkfd: Fix circular lock in nocpsch path +- drm/amdkfd: fix circular locking on get_wave_state +- cw1200: add missing MODULE_DEVICE_TABLE +- wl1251: Fix possible buffer overflow in wl1251_cmd_scan +- wlcore/wl12xx: Fix wl12xx get_mac error if device is in ELP +- dm writecache: commit just one block, not a full page +- xfrm: Fix error reporting in xfrm_state_construct. 
+- drm/amd/display: Verify Gamma & Degamma LUT sizes in amdgpu_dm_atomic_check +- r8169: avoid link-up interrupt issue on RTL8106e if user enables ASPM +- selinux: use __GFP_NOWARN with GFP_NOWAIT in the AVC +- fjes: check return value after calling platform_get_resource() +- drm/amdkfd: use allowed domain for vmbo validation +- net: sgi: ioc3-eth: check return value after calling platform_get_resource() +- selftests: Clean forgotten resources as part of cleanup() +- net: phy: realtek: add delay to fix RXC generation issue +- drm/amd/display: Fix off-by-one error in DML +- drm/amd/display: Set DISPCLK_MAX_ERRDET_CYCLES to 7 +- drm/amd/display: Release MST resources on switch from MST to SST +- drm/amd/display: Update scaling settings on modeset +- drm/amd/display: Fix DCN 3.01 DSCCLK validation +- net: moxa: Use devm_platform_get_and_ioremap_resource() +- net: micrel: check return value after calling platform_get_resource() +- net: mvpp2: check return value after calling platform_get_resource() +- net: bcmgenet: check return value after calling platform_get_resource() +- net: mscc: ocelot: check return value after calling platform_get_resource() +- virtio_net: Remove BUG() to avoid machine dead +- ice: fix clang warning regarding deadcode.DeadStores +- ice: set the value of global config lock timeout longer +- pinctrl: mcp23s08: fix race condition in irq handler +- net: bridge: mrp: Update ring transitions. +- dm: Fix dm_accept_partial_bio() relative to zone management commands +- dm writecache: don't split bios when overwriting contiguous cache content +- dm space maps: don't reset space map allocation cursor when committing +- RDMA/cxgb4: Fix missing error code in create_qp() +- net: tcp better handling of reordering then loss cases +- drm/amdgpu: remove unsafe optimization to drop preamble ib +- drm/amd/display: Avoid HDCP over-read and corruption +- MIPS: ingenic: Select CPU_SUPPORTS_CPUFREQ && MIPS_EXTERNAL_TIMER +- MIPS: cpu-probe: Fix FPU detection on Ingenic JZ4760(B) +- ipv6: use prandom_u32() for ID generation +- virtio-net: Add validation for used length +- drm: bridge: cdns-mhdp8546: Fix PM reference leak in +- clk: tegra: Ensure that PLLU configuration is applied properly +- clk: tegra: Fix refcounting of gate clocks +- RDMA/rtrs: Change MAX_SESS_QUEUE_DEPTH +- net: stmmac: the XPCS obscures a potential "PHY not found" error +- drm: rockchip: add missing registers for RK3066 +- drm: rockchip: add missing registers for RK3188 +- net/mlx5: Fix lag port remapping logic +- net/mlx5e: IPsec/rep_tc: Fix rep_tc_update_skb drops IPsec packet +- clk: renesas: r8a77995: Add ZA2 clock +- drm/bridge: cdns: Fix PM reference leak in cdns_dsi_transfer() +- igb: fix assignment on big endian machines +- igb: handle vlan types with checker enabled +- e100: handle eeprom as little endian +- drm/vc4: hdmi: Fix PM reference leak in vc4_hdmi_encoder_pre_crtc_co() +- drm/vc4: Fix clock source for VEC PixelValve on BCM2711 +- udf: Fix NULL pointer dereference in udf_symlink function +- drm/sched: Avoid data corruptions +- drm/scheduler: Fix hang when sched_entity released +- pinctrl: equilibrium: Add missing MODULE_DEVICE_TABLE +- net/sched: cls_api: increase max_reclassify_loop +- net: mdio: provide shim implementation of devm_of_mdiobus_register +- drm/virtio: Fix double free on probe failure +- reiserfs: add check for invalid 1st journal block +- drm/bridge: lt9611: Add missing MODULE_DEVICE_TABLE +- net: mdio: ipq8064: add regmap config to disable REGCACHE +- drm/mediatek: Fix PM reference leak 
in mtk_crtc_ddp_hw_init() +- net: Treat __napi_schedule_irqoff() as __napi_schedule() on PREEMPT_RT +- atm: nicstar: Fix possible use-after-free in nicstar_cleanup() +- mISDN: fix possible use-after-free in HFC_cleanup() +- atm: iphase: fix possible use-after-free in ia_module_exit() +- hugetlb: clear huge pte during flush function on mips platform +- clk: renesas: rcar-usb2-clock-sel: Fix error handling in .probe() +- drm/amd/display: fix use_max_lb flag for 420 pixel formats +- net: pch_gbe: Use proper accessors to BE data in pch_ptp_match() +- drm/bridge: nwl-dsi: Force a full modeset when crtc_state->active is changed to be true +- drm/vc4: fix argument ordering in vc4_crtc_get_margins() +- drm/amd/amdgpu/sriov disable all ip hw status by default +- drm/amd/display: fix HDCP reset sequence on reinitialize +- drm/ast: Fixed CVE for DP501 +- drm/zte: Don't select DRM_KMS_FB_HELPER +- drm/mxsfb: Don't select DRM_KMS_FB_HELPER +- perf data: Close all files in close_dir() +- perf test bpf: Free obj_buf +- perf probe-file: Delete namelist in del_events() on the error path +- igmp: Add ip_mc_list lock in ip_check_mc_rcu +- ACPI / PPTT: get PPTT table in the first beginning +- Revert "[Huawei] sched: export sched_setscheduler symbol" +- kcsan: Never set up watchpoints on NULL pointers +- ext4: inline jbd2_journal_[un]register_shrinker() +- jbd2: export jbd2_journal_[un]register_shrinker() +- fs: remove bdev_try_to_free_page callback +- ext4: remove bdev_try_to_free_page() callback +- jbd2: simplify journal_clean_one_cp_list() +- jbd2,ext4: add a shrinker to release checkpointed buffers +- jbd2: remove redundant buffer io error checks +- jbd2: don't abort the journal when freeing buffers +- jbd2: ensure abort the journal if detect IO error when writing original buffer back +- jbd2: remove the out label in __jbd2_journal_remove_checkpoint() +- net: spnic: add NIC layer +- net: spnic: initial commit the common module of Ramaxel NIC driver +- spraid: Add CONFIG_RAMAXEL_SPRAID in defconfig of arch arm64 and x86 +- spraid: support Ramaxel raid controller +- powerpc/preempt: Don't touch the idle task's preempt_count during hotplug +- iommu/dma: Fix compile warning in 32-bit builds +- cred: add missing return error code when set_cred_ucounts() failed +- s390: preempt: Fix preempt_count initialization +- crypto: qce - fix error return code in qce_skcipher_async_req_handle() +- scsi: core: Retry I/O for Notify (Enable Spinup) Required error +- media: exynos4-is: remove a now unused integer +- mmc: vub3000: fix control-request direction +- mmc: block: Disable CMDQ on the ioctl path +- io_uring: fix blocking inline submission +- block: return the correct bvec when checking for gaps +- erofs: fix error return code in erofs_read_superblock() +- tpm: Replace WARN_ONCE() with dev_err_once() in tpm_tis_status() +- fscrypt: fix derivation of SipHash keys on big endian CPUs +- fscrypt: don't ignore minor_hash when hash is 0 +- mailbox: qcom-ipcc: Fix IPCC mbox channel exhaustion +- scsi: target: cxgbit: Unmap DMA buffer before calling target_execute_cmd() +- scsi: fc: Correct RHBA attributes length +- exfat: handle wrong stream entry size in exfat_readdir() +- csky: syscache: Fixup duplicate cache flush +- csky: fix syscache.c fallthrough warning +- perf llvm: Return -ENOMEM when asprintf() fails +- selftests/vm/pkeys: refill shadow register after implicit kernel write +- selftests/vm/pkeys: handle negative sys_pkey_alloc() return code +- selftests/vm/pkeys: fix alloc_random_pkey() to make it really, really 
random +- lib/math/rational.c: fix divide by zero +- mm/z3fold: use release_z3fold_page_locked() to release locked z3fold page +- mm/z3fold: fix potential memory leak in z3fold_destroy_pool() +- include/linux/huge_mm.h: remove extern keyword +- hugetlb: remove prep_compound_huge_page cleanup +- mm/hugetlb: remove redundant check in preparing and destroying gigantic page +- mm/hugetlb: use helper huge_page_order and pages_per_huge_page +- mm/huge_memory.c: don't discard hugepage if other processes are mapping it +- mm/huge_memory.c: add missing read-only THP checking in transparent_hugepage_enabled() +- mm/huge_memory.c: remove dedicated macro HPAGE_CACHE_INDEX_MASK +- mm/pmem: avoid inserting hugepage PTE entry with fsdax if hugepage support is disabled +- vfio/pci: Handle concurrent vma faults +- arm64: dts: marvell: armada-37xx: Fix reg for standard variant of UART +- serial: mvebu-uart: correctly calculate minimal possible baudrate +- serial: mvebu-uart: do not allow changing baudrate when uartclk is not available +- ALSA: firewire-lib: Fix 'amdtp_domain_start()' when no AMDTP_OUT_STREAM stream is found +- powerpc/papr_scm: Make 'perf_stats' invisible if perf-stats unavailable +- powerpc/64s: Fix copy-paste data exposure into newly created tasks +- powerpc/papr_scm: Properly handle UUID types and API +- powerpc: Offline CPU in stop_this_cpu() +- serial: 8250: 8250_omap: Fix possible interrupt storm on K3 SoCs +- serial: 8250: 8250_omap: Disable RX interrupt after DMA enable +- selftests/ftrace: fix event-no-pid on 1-core machine +- leds: ktd2692: Fix an error handling path +- leds: as3645a: Fix error return code in as3645a_parse_node() +- ASoC: fsl_spdif: Fix unexpected interrupt after suspend +- ASoC: Intel: sof_sdw: add SOF_RT715_DAI_ID_FIX for AlderLake +- ASoC: atmel-i2s: Fix usage of capture and playback at the same time +- powerpc/powernv: Fix machine check reporting of async store errors +- extcon: max8997: Add missing modalias string +- extcon: sm5502: Drop invalid register write in sm5502_reg_data +- phy: ti: dm816x: Fix the error handling path in 'dm816x_usb_phy_probe() +- phy: uniphier-pcie: Fix updating phy parameters +- soundwire: stream: Fix test for DP prepare complete +- scsi: mpt3sas: Fix error return value in _scsih_expander_add() +- habanalabs: Fix an error handling path in 'hl_pci_probe()' +- mtd: rawnand: marvell: add missing clk_disable_unprepare() on error in marvell_nfc_resume() +- of: Fix truncation of memory sizes on 32-bit platforms +- ASoC: cs42l42: Correct definition of CS42L42_ADC_PDN_MASK +- iio: prox: isl29501: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: light: vcnl4035: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- serial: 8250: Actually allow UPF_MAGIC_MULTIPLIER baud rates +- staging: mt7621-dts: fix pci address for PCI memory range +- coresight: core: Fix use of uninitialized pointer +- staging: rtl8712: fix memory leak in rtl871x_load_fw_cb +- staging: rtl8712: fix error handling in r871xu_drv_init +- staging: gdm724x: check for overflow in gdm_lte_netif_rx() +- staging: gdm724x: check for buffer overflow in gdm_lte_multi_sdu_pkt() +- ASoC: fsl_spdif: Fix error handler with pm_runtime_enable +- iio: light: vcnl4000: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: magn: rm3100: Fix alignment of buffer in iio_push_to_buffers_with_timestamp() +- iio: adc: ti-ads8688: Fix alignment of buffer in iio_push_to_buffers_with_timestamp() +- iio: adc: mxs-lradc: Fix buffer alignment in 
iio_push_to_buffers_with_timestamp() +- iio: adc: hx711: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: adc: at91-sama5d2: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- thunderbolt: Bond lanes only when dual_link_port != NULL in alloc_dev_default() +- eeprom: idt_89hpesx: Restore printing the unsupported fwnode name +- eeprom: idt_89hpesx: Put fwnode in matching case during ->probe() +- usb: dwc2: Don't reset the core after setting turnaround time +- usb: gadget: f_fs: Fix setting of device and driver data cross-references +- ASoC: mediatek: mtk-btcvsd: Fix an error handling path in 'mtk_btcvsd_snd_probe()' +- ASoC: rt5682-sdw: set regcache_cache_only false before reading RT5682_DEVICE_ID +- ASoC: rt5682: fix getting the wrong device id when the suspend_stress_test +- ASoC: rt715-sdw: use first_hw_init flag on resume +- ASoC: rt711-sdw: use first_hw_init flag on resume +- ASoC: rt700-sdw: use first_hw_init flag on resume +- ASoC: rt5682-sdw: use first_hw_init flag on resume +- ASoC: rt1308-sdw: use first_hw_init flag on resume +- ASoC: max98373-sdw: use first_hw_init flag on resume +- iommu/dma: Fix IOVA reserve dma ranges +- selftests: splice: Adjust for handler fallback removal +- s390: appldata depends on PROC_SYSCTL +- s390: enable HAVE_IOREMAP_PROT +- s390/irq: select HAVE_IRQ_EXIT_ON_IRQ_STACK +- iommu/amd: Fix extended features logging +- visorbus: fix error return code in visorchipset_init() +- fsi/sbefifo: Fix reset timeout +- fsi/sbefifo: Clean up correct FIFO when receiving reset request from SBE +- fsi: occ: Don't accept response from un-initialized OCC +- fsi: scom: Reset the FSI2PIB engine for any error +- fsi: core: Fix return of error values on failures +- mfd: rn5t618: Fix IRQ trigger by changing it to level mode +- mfd: mp2629: Select MFD_CORE to fix build error +- scsi: iscsi: Flush block work before unblock +- scsi: FlashPoint: Rename si_flags field +- leds: lp50xx: Put fwnode in error case during ->probe() +- leds: lm3697: Don't spam logs when probe is deferred +- leds: lm3692x: Put fwnode in any case during ->probe() +- leds: lm36274: Put fwnode in error case during ->probe() +- leds: lm3532: select regmap I2C API +- leds: class: The -ENOTSUPP should never be seen by user space +- tty: nozomi: Fix the error handling path of 'nozomi_card_init()' +- firmware: stratix10-svc: Fix a resource leak in an error handling path +- char: pcmcia: error out if 'num_bytes_read' is greater than 4 in set_protocol() +- staging: mmal-vchiq: Fix incorrect static vchiq_instance. 
+- mtd: rawnand: arasan: Ensure proper configuration for the asserted target +- mtd: partitions: redboot: seek fis-index-block in the right node +- perf scripting python: Fix tuple_set_u64() +- Input: hil_kbd - fix error return code in hil_dev_connect() +- ASoC: rsnd: tidyup loop on rsnd_adg_clk_query() +- backlight: lm3630a_bl: Put fwnode in error case during ->probe() +- ASoC: hisilicon: fix missing clk_disable_unprepare() on error in hi6210_i2s_startup() +- ASoC: rk3328: fix missing clk_disable_unprepare() on error in rk3328_platform_probe() +- iio: potentiostat: lmp91000: Fix alignment of buffer in iio_push_to_buffers_with_timestamp() +- iio: cros_ec_sensors: Fix alignment of buffer in iio_push_to_buffers_with_timestamp() +- iio: chemical: atlas: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: light: tcs3472: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: light: tcs3414: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: light: isl29125: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: magn: bmc150: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: magn: hmc5843: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: prox: as3935: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: prox: pulsed-light: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: prox: srf08: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: humidity: am2315: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: gyro: bmg160: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: adc: vf610: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: adc: ti-ads1015: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: accel: stk8ba50: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: accel: stk8312: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: accel: mxc4005: Fix overread of data and alignment issue. 
+- iio: accel: kxcjk-1013: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: accel: hid: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: accel: bma220: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: accel: bma180: Fix buffer alignment in iio_push_to_buffers_with_timestamp() +- iio: adis16475: do not return ints in irq handlers +- iio: adis16400: do not return ints in irq handlers +- iio: adis_buffer: do not return ints in irq handlers +- mwifiex: re-fix for unaligned accesses +- tty: nozomi: Fix a resource leak in an error handling function +- serial: 8250_omap: fix a timeout loop condition +- serial: fsl_lpuart: remove RTSCTS handling from get_mctrl() +- serial: fsl_lpuart: don't modify arbitrary data on lpuart32 +- rcu: Invoke rcu_spawn_core_kthreads() from rcu_spawn_gp_kthread() +- ASoC: rt5682: Disable irq on shutdown +- staging: fbtft: Don't spam logs when probe is deferred +- staging: fbtft: Rectify GPIO handling +- MIPS: Fix PKMAP with 32-bit MIPS huge page support +- RDMA/core: Always release restrack object +- RDMA/mlx5: Don't access NULL-cleared mpi pointer +- net: tipc: fix FB_MTU eat two pages +- net: sched: fix warning in tcindex_alloc_perfect_hash +- net: lwtunnel: handle MTU calculation in forwading +- writeback: fix obtain a reference to a freeing memcg css +- clk: si5341: Update initialization magic +- clk: si5341: Check for input clock presence and PLL lock on startup +- clk: si5341: Avoid divide errors due to bogus register contents +- clk: si5341: Wait for DEVICE_READY on startup +- clk: qcom: clk-alpha-pll: fix CAL_L write in alpha_pll_fabia_prepare +- clk: actions: Fix AHPPREDIV-H-AHB clock chain on Owl S500 SoC +- clk: actions: Fix bisp_factor_table based clocks on Owl S500 SoC +- clk: actions: Fix SD clocks factor table on Owl S500 SoC +- clk: actions: Fix UART clock dividers on Owl S500 SoC +- Bluetooth: Fix handling of HCI_LE_Advertising_Set_Terminated event +- Bluetooth: Fix Set Extended (Scan Response) Data +- Bluetooth: Fix not sending Set Extended Scan Response +- Bluetooth: mgmt: Fix slab-out-of-bounds in tlv_data_is_valid +- Revert "be2net: disable bh with spin_lock in be_process_mcc" +- gve: Fix swapped vars when fetching max queues +- RDMA/cma: Fix incorrect Packet Lifetime calculation +- bpfilter: Specify the log level for the kmsg message +- net: dsa: sja1105: fix NULL pointer dereference in sja1105_reload_cbs() +- e1000e: Check the PCIm state +- ipv6: fix out-of-bound access in ip6_parse_tlv() +- net: atlantic: fix the macsec key length +- net: phy: mscc: fix macsec key length +- net: macsec: fix the length used to copy the key for offloading +- RDMA/cma: Protect RMW with qp_mutex +- ibmvnic: free tx_pool if tso_pool alloc fails +- ibmvnic: set ltb->buff to NULL after freeing +- Revert "ibmvnic: remove duplicate napi_schedule call in open function" +- i40e: Fix missing rtnl locking when setting up pf switch +- i40e: Fix autoneg disabling for non-10GBaseT links +- i40e: Fix error handling in i40e_vsi_open +- bpf: Do not change gso_size during bpf_skb_change_proto() +- can: j1939: j1939_sk_setsockopt(): prevent allocation of j1939 filter for optlen == 0 +- ipv6: exthdrs: do not blindly use init_net +- net: bcmgenet: Fix attaching to PYH failed on RPi 4B +- mac80211: remove iwlwifi specific workaround NDPs of null_response +- drm/msm/dpu: Fix error return code in dpu_mdss_init() +- drm/msm: Fix error return code in msm_drm_init() +- bpf: Fix null ptr deref with mixed tail calls and subprogs +- 
ieee802154: hwsim: avoid possible crash in hwsim_del_edge_nl() +- ieee802154: hwsim: Fix memory leak in hwsim_add_one +- tc-testing: fix list handling +- net: ti: am65-cpsw-nuss: Fix crash when changing number of TX queues +- net/ipv4: swap flow ports when validating source +- ip6_tunnel: fix GRE6 segmentation +- vxlan: add missing rcu_read_lock() in neigh_reduce() +- rtw88: 8822c: fix lc calibration timing +- iwlwifi: increase PNVM load timeout +- xfrm: Fix xfrm offload fallback fail case +- pkt_sched: sch_qfq: fix qfq_change_class() error path +- netfilter: nf_tables_offload: check FLOW_DISSECTOR_KEY_BASIC in VLAN transfer logic +- tls: prevent oversized sendfile() hangs by ignoring MSG_MORE +- net: sched: add barrier to ensure correct ordering for lockless qdisc +- vrf: do not push non-ND strict packets with a source LLA through packet taps again +- net: ethernet: ezchip: fix error handling +- net: ethernet: ezchip: fix UAF in nps_enet_remove +- net: ethernet: aeroflex: fix UAF in greth_of_remove +- mt76: mt7615: fix NULL pointer dereference in tx_prepare_skb() +- mt76: fix possible NULL pointer dereference in mt76_tx +- samples/bpf: Fix the error return code of xdp_redirect's main() +- samples/bpf: Fix Segmentation fault for xdp_redirect command +- RDMA/rtrs-srv: Set minimal max_send_wr and max_recv_wr +- bpf: Fix libelf endian handling in resolv_btfids +- xsk: Fix broken Tx ring validation +- xsk: Fix missing validation for skb and unaligned mode +- selftests/bpf: Whitelist test_progs.h from .gitignore +- RDMA/rxe: Fix qp reference counting for atomic ops +- netfilter: nft_tproxy: restrict support to TCP and UDP transport protocols +- netfilter: nft_osf: check for TCP packet before further processing +- netfilter: nft_exthdr: check for IPv6 packet before further processing +- RDMA/mlx5: Don't add slave port to unaffiliated list +- netlabel: Fix memory leak in netlbl_mgmt_add_common +- ath11k: send beacon template after vdev_start/restart during csa +- ath10k: Fix an error code in ath10k_add_interface() +- ath11k: Fix an error handling path in ath11k_core_fetch_board_data_api_n() +- cw1200: Revert unnecessary patches that fix unreal use-after-free bugs +- brcmsmac: mac80211_if: Fix a resource leak in an error handling path +- brcmfmac: Fix a double-free in brcmf_sdio_bus_reset +- brcmfmac: correctly report average RSSI in station info +- brcmfmac: fix setting of station info chains bitmask +- ssb: Fix error return code in ssb_bus_scan() +- wcn36xx: Move hal_buf allocation to devm_kmalloc in probe +- clk: imx8mq: remove SYS PLL 1/2 clock gates +- ieee802154: hwsim: Fix possible memory leak in hwsim_subscribe_all_others +- wireless: carl9170: fix LEDS build errors & warnings +- ath10k: add missing error return code in ath10k_pci_probe() +- ath10k: go to path err_unsupported when chip id is not supported +- tools/bpftool: Fix error return code in do_batch() +- drm: qxl: ensure surf.data is ininitialized +- clk: vc5: fix output disabling when enabling a FOD +- drm/vc4: hdmi: Fix error path of hpd-gpios +- drm/pl111: Actually fix CONFIG_VEXPRESS_CONFIG depends +- RDMA/rxe: Fix failure during driver load +- drm/pl111: depend on CONFIG_VEXPRESS_CONFIG +- RDMA/core: Sanitize WQ state received from the userspace +- net/sched: act_vlan: Fix modify to allow 0 +- xfrm: remove the fragment check for ipv6 beet mode +- clk: tegra30: Use 300MHz for video decoder by default +- ehea: fix error return code in ehea_restart_qps() +- RDMA/rtrs-clt: Fix memory leak of not-freed sess->stats and 
stats->pcpu_stats +- RDMA/rtrs-clt: Check if the queue_depth has changed during a reconnection +- RDMA/rtrs-srv: Fix memory leak when having multiple sessions +- RDMA/rtrs-srv: Fix memory leak of unfreed rtrs_srv_stats object +- RDMA/rtrs: Do not reset hb_missed_max after re-connection +- RDMA/rtrs-clt: Check state of the rtrs_clt_sess before reading its stats +- RDMA/srp: Fix a recently introduced memory leak +- mptcp: generate subflow hmac after mptcp_finish_join() +- mptcp: fix pr_debug in mptcp_token_new_connect +- drm/rockchip: cdn-dp: fix sign extension on an int multiply for a u64 result +- drm/rockchip: lvds: Fix an error handling path +- drm/rockchip: dsi: move all lane config except LCDC mux to bind() +- drm/rockchip: cdn-dp-core: add missing clk_disable_unprepare() on error in cdn_dp_grf_write() +- drm: rockchip: set alpha_en to 0 if it is not used +- net: ftgmac100: add missing error return code in ftgmac100_probe() +- clk: meson: g12a: fix gp0 and hifi ranges +- net: qrtr: ns: Fix error return code in qrtr_ns_init() +- drm/vmwgfx: Fix cpu updates of coherent multisample surfaces +- drm/vmwgfx: Mark a surface gpu-dirty after the SVGA3dCmdDXGenMips command +- pinctrl: renesas: r8a77990: JTAG pins do not have pull-down capabilities +- pinctrl: renesas: r8a7796: Add missing bias for PRESET# pin +- net: pch_gbe: Propagate error from devm_gpio_request_one() +- net: mvpp2: Put fwnode in error case during ->probe() +- video: fbdev: imxfb: Fix an error message +- drm/ast: Fix missing conversions to managed API +- drm/amd/dc: Fix a missing check bug in dm_dp_mst_detect() +- drm/bridge: Fix the stop condition of drm_bridge_chain_pre_enable() +- drm/bridge/sii8620: fix dependency on extcon +- xfrm: xfrm_state_mtu should return at least 1280 for ipv6 +- mm: memcg/slab: properly set up gfp flags for objcg pointer array +- mm/shmem: fix shmem_swapin() race with swapoff +- swap: fix do_swap_page() race with swapoff +- mm/debug_vm_pgtable: ensure THP availability via has_transparent_hugepage() +- mm/debug_vm_pgtable/basic: iterate over entire protection_map[] +- mm/debug_vm_pgtable/basic: add validation for dirtiness after write protect +- dax: fix ENOMEM handling in grab_mapping_entry() +- ocfs2: fix snprintf() checking +- blk-mq: update hctx->dispatch_busy in case of real scheduler +- cpufreq: Make cpufreq_online() call driver->offline() on errors +- ACPI: bgrt: Fix CFI violation +- ACPI: Use DEVICE_ATTR_ macros +- extcon: extcon-max8997: Fix IRQ freeing at error path +- clocksource/drivers/timer-ti-dm: Save and restore timer TIOCP_CFG +- mark pstore-blk as broken +- ACPI: sysfs: Fix a buffer overrun problem with description_show() +- nvme-pci: look for StorageD3Enable on companion ACPI device instead +- block: avoid double io accounting for flush request +- ACPI: PM / fan: Put fan device IDs into separate header file +- PM / devfreq: Add missing error code in devfreq_add_device() +- media: video-mux: Skip dangling endpoints +- media: v4l2-async: Clean v4l2_async_notifier_add_fwnode_remote_subdev +- psi: Fix race between psi_trigger_create/destroy +- crypto: nx - Fix RCU warning in nx842_OF_upd_status +- spi: spi-sun6i: Fix chipselect/clock bug +- lockdep/selftests: Fix selftests vs PROVE_RAW_LOCK_NESTING +- lockdep: Fix wait-type for empty stack +- sched/uclamp: Fix uclamp_tg_restrict() +- sched/rt: Fix Deadline utilization tracking during policy change +- sched/rt: Fix RT utilization tracking during policy change +- x86/sev: Split up runtime #VC handler for correct state tracking +- 
x86/sev: Make sure IRQs are disabled while GHCB is active +- btrfs: clear log tree recovering status if starting transaction fails +- regulator: hi655x: Fix pass wrong pointer to config.driver_data +- KVM: arm64: Don't zero the cycle count register when PMCR_EL0.P is set +- perf/arm-cmn: Fix invalid pointer when access dtc object sharing the same IRQ number +- KVM: x86/mmu: Fix return value in tdp_mmu_map_handle_target_level() +- KVM: nVMX: Don't clobber nested MMU's A/D status on EPTP switch +- KVM: nVMX: Ensure 64-bit shift when checking VMFUNC bitmap +- KVM: nVMX: Sync all PGDs on nested transition with shadow paging +- hwmon: (max31790) Fix fan speed reporting for fan7..12 +- hwmon: (max31722) Remove non-standard ACPI device IDs +- hwmon: (lm70) Revert "hwmon: (lm70) Add support for ACPI" +- hwmon: (lm70) Use device_get_match_data() +- media: s5p-g2d: Fix a memory leak on ctx->fh.m2m_ctx +- media: subdev: remove VIDIOC_DQEVENT_TIME32 handling +- arm64/mm: Fix ttbr0 values stored in struct thread_info for software-pan +- arm64: consistently use reserved_pg_dir +- mmc: usdhi6rol0: fix error return code in usdhi6_probe() +- crypto: sm2 - fix a memory leak in sm2 +- crypto: sm2 - remove unnecessary reset operations +- crypto: x86/curve25519 - fix cpu feature checking logic in mod_exit +- crypto: omap-sham - Fix PM reference leak in omap sham ops +- crypto: nitrox - fix unchecked variable in nitrox_register_interrupts +- regulator: fan53880: Fix vsel_mask setting for FAN53880_BUCK +- media: siano: Fix out-of-bounds warnings in smscore_load_firmware_family2() +- m68k: atari: Fix ATARI_KBD_CORE kconfig unmet dependency warning +- media: gspca/gl860: fix zero-length control requests +- media: tc358743: Fix error return code in tc358743_probe_of() +- media: au0828: fix a NULL vs IS_ERR() check +- media: exynos4-is: Fix a use after free in isp_video_release +- media: rkvdec: Fix .buf_prepare +- locking/lockdep: Reduce LOCKDEP dependency list +- pata_ep93xx: fix deferred probing +- media: rc: i2c: Fix an error message +- crypto: ccp - Fix a resource leak in an error handling path +- crypto: sa2ul - Fix pm_runtime enable in sa_ul_probe() +- crypto: sa2ul - Fix leaks on failure paths with sa_dma_init() +- x86/elf: Use _BITUL() macro in UAPI headers +- evm: fix writing /evm overflow +- pata_octeon_cf: avoid WARN_ON() in ata_host_activate() +- kbuild: Fix objtool dependency for 'OBJECT_FILES_NON_STANDARD_ := n' +- sched/uclamp: Fix locking around cpu_util_update_eff() +- sched/uclamp: Fix wrong implementation of cpu.uclamp.min +- media: I2C: change 'RST' to "RSET" to fix multiple build errors +- pata_rb532_cf: fix deferred probing +- sata_highbank: fix deferred probing +- crypto: ux500 - Fix error return code in hash_hw_final() +- crypto: ixp4xx - update IV after requests +- crypto: ixp4xx - dma_unmap the correct address +- media: hantro: do a PM resume earlier +- media: s5p_cec: decrement usage count if disabled +- media: venus: Rework error fail recover logic +- spi: Avoid undefined behaviour when counting unused native CSs +- spi: Allow to have all native CSs in use along with GPIOs +- writeback, cgroup: increment isw_nr_in_flight before grabbing an inode +- ia64: mca_drv: fix incorrect array size calculation +- kthread_worker: fix return value when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync() +- block: fix discard request merge +- mailbox: qcom: Use PLATFORM_DEVID_AUTO to register platform device +- cifs: fix missing spinlock around update to ses->status +- HID: wacom: 
Correct base usage for capacitive ExpressKey status bits +- ACPI: tables: Add custom DSDT file as makefile prerequisite +- tpm_tis_spi: add missing SPI device ID entries +- clocksource: Check per-CPU clock synchronization when marked unstable +- clocksource: Retry clock read if long delays detected +- ACPI: EC: trust DSDT GPE for certain HP laptop +- cifs: improve fallocate emulation +- PCI: hv: Add check for hyperv_initialized in init_hv_pci_drv() +- EDAC/Intel: Do not load EDAC driver when running as a guest +- nvmet-fc: do not check for invalid target port in nvmet_fc_handle_fcp_rqst() +- nvme-pci: fix var. type for increasing cq_head +- platform/x86: toshiba_acpi: Fix missing error code in toshiba_acpi_setup_keyboard() +- platform/x86: asus-nb-wmi: Revert "add support for ASUS ROG Zephyrus G14 and G15" +- platform/x86: asus-nb-wmi: Revert "Drop duplicate DMI quirk structures" +- block: fix race between adding/removing rq qos and normal IO +- ACPI: resources: Add checks for ACPI IRQ override +- ACPI: bus: Call kobject_put() in acpi_init() error path +- ACPICA: Fix memory leak caused by _CID repair function +- fs: dlm: fix memory leak when fenced +- drivers: hv: Fix missing error code in vmbus_connect() +- open: don't silently ignore unknown O-flags in openat2() +- random32: Fix implicit truncation warning in prandom_seed_state() +- fs: dlm: cancel work sync othercon +- blk-mq: clear stale request in tags->rq[] before freeing one request pool +- blk-mq: grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter +- ACPI: EC: Make more Asus laptops use ECDT _GPE +- platform/x86: touchscreen_dmi: Add info for the Goodix GT912 panel of TM800A550L tablets +- platform/x86: touchscreen_dmi: Add an extra entry for the upside down Goodix touchscreen on Teclast X89 tablets +- Input: goodix - platform/x86: touchscreen_dmi - Move upside down quirks to touchscreen_dmi.c +- lib: vsprintf: Fix handling of number field widths in vsscanf +- hv_utils: Fix passing zero to 'PTR_ERR' warning +- ACPI: processor idle: Fix up C-state latency if not ordered +- EDAC/ti: Add missing MODULE_DEVICE_TABLE +- HID: do not use down_interruptible() when unbinding devices +- ACPI: video: use native backlight for GA401/GA502/GA503 +- media: Fix Media Controller API config checks +- regulator: da9052: Ensure enough delay time for .set_voltage_time_sel +- regulator: mt6358: Fix vdram2 .vsel_mask +- KVM: s390: get rid of register asm usage +- lockding/lockdep: Avoid to find wrong lock dep path in check_irq_usage() +- locking/lockdep: Fix the dep path printing for backwards BFS +- btrfs: disable build on platforms having page size 256K +- btrfs: don't clear page extent mapped if we're not invalidating the full page +- btrfs: sysfs: fix format string for some discard stats +- btrfs: abort transaction if we fail to update the delayed inode +- btrfs: fix error handling in __btrfs_update_delayed_inode +- KVM: PPC: Book3S HV: Fix TLB management on SMT8 POWER9 and POWER10 processors +- drivers/perf: fix the missed ida_simple_remove() in ddr_perf_probe() +- hwmon: (max31790) Fix pwmX_enable attributes +- hwmon: (max31790) Report correct current pwm duty cycles +- media: imx-csi: Skip first few frames from a BT.656 source +- media: siano: fix device register error path +- media: dvb_net: avoid speculation from net slot +- crypto: shash - avoid comparing pointers to exported functions under CFI +- spi: meson-spicc: fix memory leak in meson_spicc_probe +- spi: meson-spicc: fix a wrong goto jump for avoiding memory leak. 
+- mmc: via-sdmmc: add a check against NULL pointer dereference
+- mmc: sdhci-sprd: use sdhci_sprd_writew
+- memstick: rtsx_usb_ms: fix UAF
+- media: dvd_usb: memory leak in cinergyt2_fe_attach
+- Makefile: fix GDB warning with CONFIG_RELR
+- media: st-hva: Fix potential NULL pointer dereferences
+- media: bt8xx: Fix a missing check bug in bt878_probe
+- media: v4l2-core: Avoid the dangling pointer in v4l2_fh_release
+- media: cedrus: Fix .buf_prepare
+- media: hantro: Fix .buf_prepare
+- media: em28xx: Fix possible memory leak of em28xx struct
+- media: bt878: do not schedule tasklet when it is not setup
+- media: i2c: ov2659: Use clk_{prepare_enable,disable_unprepare}() to set xvclk on/off
+- sched/fair: Fix ascii art by relpacing tabs
+- arm64: perf: Convert snprintf to sysfs_emit
+- crypto: qce: skcipher: Fix incorrect sg count for dma transfers
+- crypto: qat - remove unused macro in FW loader
+- crypto: qat - check return code of qat_hal_rd_rel_reg()
+- media: imx: imx7_mipi_csis: Fix logging of only error event counters
+- media: pvrusb2: fix warning in pvr2_i2c_core_done
+- media: hevc: Fix dependent slice segment flags
+- media: cobalt: fix race condition in setting HPD
+- media: cpia2: fix memory leak in cpia2_usb_probe
+- media: sti: fix obj-$(config) targets
+- crypto: nx - add missing MODULE_DEVICE_TABLE
+- hwrng: exynos - Fix runtime PM imbalance on error
+- sched/core: Initialize the idle task with preemption disabled
+- regulator: uniphier: Add missing MODULE_DEVICE_TABLE
+- spi: omap-100k: Fix the length judgment problem
+- spi: spi-topcliff-pch: Fix potential double free in pch_spi_process_messages()
+- spi: spi-loopback-test: Fix 'tx_buf' might be 'rx_buf'
+- media: exynos-gsc: fix pm_runtime_get_sync() usage count
+- media: exynos4-is: fix pm_runtime_get_sync() usage count
+- media: sti/bdisp: fix pm_runtime_get_sync() usage count
+- media: sunxi: fix pm_runtime_get_sync() usage count
+- media: s5p-jpeg: fix pm_runtime_get_sync() usage count
+- media: mtk-vcodec: fix PM runtime get logic
+- media: sh_vou: fix pm_runtime_get_sync() usage count
+- media: am437x: fix pm_runtime_get_sync() usage count
+- media: s5p: fix pm_runtime_get_sync() usage count
+- media: mdk-mdp: fix pm_runtime_get_sync() usage count
+- media: marvel-ccic: fix some issues when getting pm_runtime
+- staging: media: rkvdec: fix pm_runtime_get_sync() usage count
+- Add a reference to ucounts for each cred
+- spi: Make of_register_spi_device also set the fwnode
+- thermal/cpufreq_cooling: Update offline CPUs per-cpu thermal_pressure
+- fuse: reject internal errno
+- fuse: check connected before queueing on fpq->io
+- fuse: ignore PG_workingset after stealing
+- fuse: Fix infinite loop in sget_fc()
+- fuse: Fix crash if superblock of submount gets killed early
+- fuse: Fix crash in fuse_dentry_automount() error path
+- evm: Refuse EVM_ALLOW_METADATA_WRITES only if an HMAC key is loaded
+- loop: Fix missing discard support when using LOOP_CONFIGURE
+- powerpc/stacktrace: Fix spurious "stale" traces in raise_backtrace_ipi()
+- seq_buf: Make trace_seq_putmem_hex() support data longer than 8
+- tracepoint: Add tracepoint_probe_register_may_exist() for BPF tracing
+- tracing/histograms: Fix parsing of "sym-offset" modifier
+- rsi: fix AP mode with WPA failure due to encrypted EAPOL
+- rsi: Assign beacon rate settings to the correct rate_info descriptor field
+- ssb: sdio: Don't overwrite const buffer if block_write fails
+- ath9k: Fix kernel NULL pointer dereference during ath_reset_internal()
+- serial_cs: remove wrong GLOBETROTTER.cis entry
+- serial_cs: Add Option International GSM-Ready 56K/ISDN modem
+- serial: sh-sci: Stop dmaengine transfer in sci_stop_tx()
+- serial: mvebu-uart: fix calculation of clock divisor
+- iio: accel: bma180: Fix BMA25x bandwidth register values
+- iio: ltr501: ltr501_read_ps(): add missing endianness conversion
+- iio: ltr501: ltr559: fix initialization of LTR501_ALS_CONTR
+- iio: ltr501: mark register holding upper 8 bits of ALS_DATA{0,1} and PS_DATA as volatile, too
+- iio: light: tcs3472: do not free unallocated IRQ
+- iio: frequency: adf4350: disable reg and clk on error in adf4350_probe()
+- rtc: stm32: Fix unbalanced clk_disable_unprepare() on probe error path
+- clk: agilex/stratix10: fix bypass representation
+- clk: agilex/stratix10: remove noc_clk
+- clk: agilex/stratix10/n5x: fix how the bypass_reg is handled
+- f2fs: Prevent swap file in LFS mode
+- s390: mm: Fix secure storage access exception handling
+- s390/cio: dont call css_wait_for_slow_path() inside a lock
+- KVM: x86/mmu: Use MMU's role to detect CR4.SMEP value in nested NPT walk
+- KVM: x86/mmu: Treat NX as used (not reserved) for all !TDP shadow MMUs
+- KVM: PPC: Book3S HV: Workaround high stack usage with clang
+- KVM: nVMX: Handle split-lock #AC exceptions that happen in L2
+- mm/gup: fix try_grab_compound_head() race with split_huge_page()
+- bus: mhi: Wait for M2 state during system resume
+- mac80211: remove iwlwifi specific workaround that broke sta NDP tx
+- can: peak_pciefd: pucan_handle_status(): fix a potential starvation issue in TX path
+- can: j1939: j1939_sk_init(): set SOCK_RCU_FREE to call sk_destruct() after RCU is done
+- can: isotp: isotp_release(): omit unintended hrtimer restart on socket release
+- can: gw: synchronize rcu operations before removing gw job entry
+- can: bcm: delay release of struct bcm_op after synchronize_rcu()
+- ext4: use ext4_grp_locked_error in mb_find_extent
+- ext4: fix avefreec in find_group_orlov
+- ext4: remove check for zero nr_to_scan in ext4_es_scan()
+- ext4: correct the cache_nr in tracepoint ext4_es_shrink_exit
+- ext4: return error code when ext4_fill_flex_info() fails
+- ext4: fix overflow in ext4_iomap_alloc()
+- ext4: fix kernel infoleak via ext4_extent_header
+- btrfs: clear defrag status of a root if starting transaction fails
+- btrfs: compression: don't try to compress if we don't have enough pages
+- btrfs: send: fix invalid path for unlink operations after parent orphanization
+- ARM: dts: at91: sama5d4: fix pinctrl muxing
+- ARM: dts: ux500: Fix LED probing
+- crypto: ccp - Annotate SEV Firmware file names
+- crypto: nx - Fix memcpy() over-reading in nonce
+- Input: joydev - prevent use of not validated data in JSIOCSBTNMAP ioctl
+- iov_iter_fault_in_readable() should do nothing in xarray case
+- copy_page_to_iter(): fix ITER_DISCARD case
+- selftests/lkdtm: Avoid needing explicit sub-shell
+- ntfs: fix validity check for file name attribute
+- gfs2: Fix error handling in init_statfs
+- gfs2: Fix underflow in gfs2_page_mkwrite
+- xhci: solve a double free problem while doing s4
+- usb: typec: Add the missed altmode_id_remove() in typec_register_altmode()
+- usb: dwc3: Fix debugfs creation flow
+- USB: cdc-acm: blacklist Heimann USB Appset device
+- usb: renesas-xhci: Fix handling of unknown ROM state
+- usb: gadget: eem: fix echo command packet response issue
+- net: can: ems_usb: fix use-after-free in ems_usb_disconnect()
+- Input: usbtouchscreen - fix control-request directions
+- media: dvb-usb: fix wrong definition
+- ALSA: hda/realtek: fix mute/micmute LEDs for HP EliteBook 830 G8 Notebook PC
+- ALSA: hda/realtek: Apply LED fixup for HP Dragonfly G1, too
+- ALSA: hda/realtek: Fix bass speaker DAC mapping for Asus UM431D
+- ALSA: hda/realtek: Improve fixup for HP Spectre x360 15-df0xxx
+- ALSA: hda/realtek: fix mute/micmute LEDs for HP EliteBook x360 830 G8
+- ALSA: hda/realtek: Add another ALC236 variant support
+- ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 630 G8
+- ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 445 G8
+- ALSA: hda/realtek: fix mute/micmute LEDs for HP ProBook 450 G8
+- ALSA: intel8x0: Fix breakage at ac97 clock measurement
+- ALSA: usb-audio: scarlett2: Fix wrong resume call
+- ALSA: firewire-motu: fix stream format for MOTU 8pre FireWire
+- ALSA: usb-audio: Fix OOB access at proc output
+- ALSA: usb-audio: fix rate on Ozone Z90 USB headset
+- Bluetooth: Remove spurious error message
+- Bluetooth: btqca: Don't modify firmware contents in-place
+- Bluetooth: hci_qca: fix potential GPF
+- Revert "evm: Refuse EVM_ALLOW_METADATA_WRITES only if an HMAC key is loaded"
+- configfs: fix memleak in configfs_release_bin_file
+- init: only move down lockup_detector_init() when sdei_watchdog is enabled
+- arm64: fix AUDIT_ARCH_AARCH64ILP32 bug on audit subsystem
+- ext4: cleanup in-core orphan list if ext4_truncate() failed to get a transaction handle
+- ext4: fix WARN_ON_ONCE(!buffer_uptodate) after an error writing the superblock
+- tty/serial/imx: Enable TXEN bit in imx_poll_init().
+- xen/events: reset active flag for lateeoi events later
+- Hexagon: change jumps to must-extend in futex_atomic_*
+- Hexagon: add target builtins to kernel
+- Hexagon: fix build errors
+- media: uvcvideo: Support devices that report an OT as an entity source
+- KVM: PPC: Book3S HV: Save and restore FSCR in the P9 path
+- ubifs: Remove ui_mutex in ubifs_xattr_get and change_xattr
+- ubifs: Fix races between xattr_{set|get} and listxattr operations
+- block: stop wait rcu once we can ensure no io while elevator init
+- writeback: don't warn on an unregistered BDI in __mark_inode_dirty
+- mm/page_isolation: do not isolate the max order page
+- mm/zswap: fix passing zero to 'PTR_ERR' warning
+- mm/page_alloc: speed up the iteration of max_order
+- mm: hugetlb: fix type of delta parameter and related local variables in gather_surplus_pages()
+- mm: vmalloc: prevent use after free in _vm_unmap_aliases
+- arm32: kaslr: Fix the bitmap error
+- net: make sure devices go through netdev_wait_all_refs
+- net: fib_notifier: don't return positive values on fib registration
+- netfilter: nftables: avoid potential overflows on 32bit arches
+- netfilter: Dissect flow after packet mangling
+- net: fix a concurrency bug in l2tp_tunnel_register()
+- ext4: fix possible UAF when remounting r/o a mmp-protected file system
+- SUNRPC: Should wake up the privileged task firstly.
+- SUNRPC: Fix the batch tasks count wraparound.
+- Revert "KVM: x86/mmu: Drop kvm_mmu_extended_role.cr4_la57 hack"
+- RDMA/mlx5: Block FDB rules when not in switchdev mode
+- gpio: AMD8111 and TQMX86 require HAS_IOPORT_MAP
+- drm/nouveau: fix dma_address check for CPU/GPU sync
+- gpio: mxc: Fix disabled interrupt wake-up support
+- scsi: sr: Return appropriate error code when disk is ejected
+- arm64: seccomp: fix compilation error with ILP32 support
+- scsi: sd: block: Fix regressions in read-only block device handling
+- integrity: Load mokx variables into the blacklist keyring
+- certs: Add ability to preload revocation certs
+- certs: Move load_system_certificate_list to a common function
+- certs: Add EFI_CERT_X509_GUID support for dbx entries
+- Revert "drm: add a locked version of drm_is_current_master"
+- netfs: fix test for whether we can skip read when writing beyond EOF
+- swiotlb: manipulate orig_addr when tlb_addr has offset
+- KVM: SVM: Call SEV Guest Decommission if ASID binding fails
+- mm, futex: fix shared futex pgoff on shmem huge page
+- mm/thp: another PVMW_SYNC fix in page_vma_mapped_walk()
+- mm/thp: fix page_vma_mapped_walk() if THP mapped by ptes
+- mm: page_vma_mapped_walk(): get vma_address_end() earlier
+- mm: page_vma_mapped_walk(): use goto instead of while (1)
+- mm: page_vma_mapped_walk(): add a level of indentation
+- mm: page_vma_mapped_walk(): crossing page table boundary
+- mm: page_vma_mapped_walk(): prettify PVMW_MIGRATION block
+- mm: page_vma_mapped_walk(): use pmde for *pvmw->pmd
+- mm: page_vma_mapped_walk(): settle PageHuge on entry
+- mm: page_vma_mapped_walk(): use page for pvmw->page
+- mm: thp: replace DEBUG_VM BUG with VM_WARN when unmap fails for split
+- mm/thp: unmap_mapping_page() to fix THP truncate_cleanup_page()
+- mm/thp: fix page_address_in_vma() on file THP tails
+- mm/thp: fix vma_address() if virtual address below file offset
+- mm/thp: try_to_unmap() use TTU_SYNC for safe splitting
+- mm/thp: make is_huge_zero_pmd() safe and quicker
+- mm/thp: fix __split_huge_pmd_locked() on shmem migration entry
+- mm, thp: use head page in __migration_entry_wait()
+- mm/rmap: use page_not_mapped in try_to_unmap()
+- mm/rmap: remove unneeded semicolon in page_not_mapped()
+- mm: add VM_WARN_ON_ONCE_PAGE() macro
+- x86/fpu: Make init_fpstate correct with optimized XSAVE
+- x86/fpu: Preserve supervisor states in sanitize_restored_user_xstate()
+- kthread: prevent deadlock when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync()
+- kthread_worker: split code for canceling the delayed work timer
+- ceph: must hold snap_rwsem when filling inode for async create
+- i2c: robotfuzz-osif: fix control-request directions
+- KVM: do not allow mapping valid but non-reference-counted pages
+- s390/stack: fix possible register corruption with stack switch helper
+- nilfs2: fix memory leak in nilfs_sysfs_delete_device_group
+- gpiolib: cdev: zero padding during conversion to gpioline_info_changed
+- i2c: i801: Ensure that SMBHSTSTS_INUSE_STS is cleared when leaving i801_access
+- pinctrl: stm32: fix the reported number of GPIO lines per bank
+- perf/x86: Track pmu in per-CPU cpu_hw_events
+- net: ll_temac: Avoid ndo_start_xmit returning NETDEV_TX_BUSY
+- net: ll_temac: Add memory-barriers for TX BD access
+- PCI: Add AMD RS690 quirk to enable 64-bit DMA
+- recordmcount: Correct st_shndx handling
+- mac80211: handle various extensible elements correctly
+- mac80211: reset profile_periodicity/ema_ap
+- net: qed: Fix memcpy() overflow of qed_dcbx_params()
+- KVM: selftests: Fix kvm_check_cap() assertion
+- r8169: Avoid memcpy() over-reading of ETH_SS_STATS
+- sh_eth: Avoid memcpy() over-reading of ETH_SS_STATS
+- r8152: Avoid memcpy() over-reading of ETH_SS_STATS
+- net/packet: annotate accesses to po->ifindex
+- net/packet: annotate accesses to po->bind
+- net: caif: fix memory leak in ldisc_open
+- riscv32: Use medany C model for modules
+- net: phy: dp83867: perform soft reset and retain established link
+- net/packet: annotate data race in packet_sendmsg()
+- inet: annotate date races around sk->sk_txhash
+- net: annotate data race in sock_error()
+- ping: Check return value of function 'ping_queue_rcv_skb'
+- inet: annotate data race in inet_send_prepare() and inet_dgram_connect()
+- net: ethtool: clear heap allocations for ethtool function
+- mac80211: drop multicast fragments
+- net: ipv4: Remove unneed BUG() function
+- dmaengine: mediatek: use GFP_NOWAIT instead of GFP_ATOMIC in prep_dma
+- dmaengine: mediatek: do not issue a new desc if one is still current
+- dmaengine: mediatek: free the proper desc in desc_free handler
+- dmaengine: rcar-dmac: Fix PM reference leak in rcar_dmac_probe()
+- cfg80211: call cfg80211_leave_ocb when switching away from OCB
+- mac80211_hwsim: drop pending frames on stop
+- mac80211: remove warning in ieee80211_get_sband()
+- dmaengine: xilinx: dpdma: Limit descriptor IDs to 16 bits
+- dmaengine: xilinx: dpdma: Add missing dependencies to Kconfig
+- dmaengine: stm32-mdma: fix PM reference leak in stm32_mdma_alloc_chan_resourc()
+- dmaengine: zynqmp_dma: Fix PM reference leak in zynqmp_dma_alloc_chan_resourc()
+- perf/x86/intel/lbr: Zero the xstate buffer on allocation
+- perf/x86/lbr: Remove cpuc->lbr_xsave allocation from atomic context
+- locking/lockdep: Improve noinstr vs errors
+- x86/xen: Fix noinstr fail in exc_xen_unknown_trap()
+- x86/entry: Fix noinstr fail in __do_fast_syscall_32()
+- drm/vc4: hdmi: Make sure the controller is powered in detect
+- drm/vc4: hdmi: Move the HSM clock enable to runtime_pm
+- Revert "PCI: PM: Do not read power state in pci_enable_device_flags()"
+- spi: spi-nxp-fspi: move the register operation after the clock enable
+- arm64: Ignore any DMA offsets in the max_zone_phys() calculation
+- MIPS: generic: Update node names to avoid unit addresses
+- mmc: meson-gx: use memcpy_to/fromio for dram-access-quirk
+- ARM: 9081/1: fix gcc-10 thumb2-kernel regression
+- drm/amdgpu: wait for moving fence after pinning
+- drm/radeon: wait for moving fence after pinning
+- drm/nouveau: wait for moving fence after pinning v2
+- drm: add a locked version of drm_is_current_master
+- Revert "drm/amdgpu/gfx10: enlarge CP_MEC_DOORBELL_RANGE_UPPER to cover full doorbell."
+- Revert "drm/amdgpu/gfx9: fix the doorbell missing when in CGPG issue."
+- module: limit enabling module.sig_enforce
+- scsi: core: Treat device offline as a failure
+- blk-wbt: make sure throttle is enabled properly
+- blk-wbt: introduce a new disable state to prevent false positive by rwb_enabled()
+- arm64: fpsimd: run kernel mode NEON with softirqs disabled
+- arm64: assembler: introduce wxN aliases for wN registers
+- arm64: assembler: remove conditional NEON yield macros
+- crypto: arm64/crc-t10dif - move NEON yield to C code
+- crypto: arm64/aes-ce-mac - simplify NEON yield
+- crypto: arm64/aes-neonbs - remove NEON yield calls
+- crypto: arm64/sha512-ce - simplify NEON yield
+- crypto: arm64/sha3-ce - simplify NEON yield
+- crypto: arm64/sha2-ce - simplify NEON yield
+- crypto: arm64/sha1-ce - simplify NEON yield
+- arm64: assembler: add cond_yield macro
+- mm: fix page reference leak in soft_offline_page()
+- block_dump: remove comments in docs
+- block_dump: remove block_dump feature
+- block_dump: remove block_dump feature in mark_inode_dirty()
+- crypto: sun8i-ce - fix error return code in sun8i_ce_prng_generate()
+- crypto: nx - add missing call to of_node_put()
+- net: hns3: fix a return value error in hclge_get_reset_status()
+- net: hns3: check vlan id before using it
+- net: hns3: check queue id range before using
+- net: hns3: fix misuse vf id and vport id in some logs
+- net: hns3: fix inconsistent vf id print
+- net: hns3: fix change RSS 'hfunc' ineffective issue
+- net: hns3: fix the timing issue of VF clearing interrupt sources
+- net: hns3: fix the exception when query imp info
+- net: hns3: disable mac in flr process
+- net: hns3: change affinity_mask to numa node range
+- net: hns3: pad the short tunnel frame before sending to hardware
+- net: hns3: make hclgevf_cmd_caps_bit_map0 and hclge_cmd_caps_bit_map0 static
+- imans: Use initial ima namespace domain tag when IMANS is disabled.
+- IOMMU: SMMUv2: Bypass SMMU in default for some SoCs
+- arm64: phytium: using MIDR_PHYTIUM_FT2000PLUS instead of ARM_CPU_IMP_PHYTIUM
+- arm64: Add MIDR encoding for PHYTIUM CPUs
+- arm64: Add MIDR encoding for HiSilicon Taishan CPUs
+- usb: xhci: Add workaround for phytium
+- arm64: topology: Support PHYTIUM CPU
+- hugetlb: pass head page to remove_hugetlb_page()
+- userfaultfd: hugetlbfs: fix new flag usage in error path
+- hugetlb: fix uninitialized subpool pointer
+- percpu: flush tlb in pcpu_reclaim_populated()
+- percpu: implement partial chunk depopulation
+- percpu: use pcpu_free_slot instead of pcpu_nr_slots - 1
+- percpu: factor out pcpu_check_block_hint()
+- percpu: split __pcpu_balance_workfn()
+- percpu: fix a comment about the chunks ordering
+- slub: fix kmalloc_pagealloc_invalid_free unit test
+- slub: fix unreclaimable slab stat for bulk free
+- net: hns3: remove unnecessary spaces
+- net: hns3: add some required spaces
+- net: hns3: clean up a type mismatch warning
+- net: hns3: refine function hns3_set_default_feature()
+- net: hns3: uniform parameter name of hclge_ptp_clean_tx_hwts()
+- net: hnss3: use max() to simplify code
+- net: hns3: modify a print format of hns3_dbg_queue_map()
+- net: hns3: refine function hclge_dbg_dump_tm_pri()
+- net: hns3: reconstruct function hclge_ets_validate()
+- net: hns3: reconstruct function hns3_self_test
+- net: hns3: initialize each member of structure array on a separate line
+- net: hns3: add required space in comment
+- net: hns3: remove unnecessary "static" of local variables in function
+- net: hns3: don't config TM DWRR twice when set ETS
+- net: hns3: add new function hclge_get_speed_bit()
+- net: hns3: refactor function hclgevf_parse_capability()
+- net: hns3: refactor function hclge_parse_capability()
+- net: hns3: add trace event in hclge_gen_resp_to_vf()
+- net: hns3: uniform type of function parameter cmd
+- net: hns3: merge some repetitive macros
+- net: hns3: package new functions to simplify hclgevf_mbx_handler code
+- net: hns3: remove redundant param to simplify code
+- net: hns3: use memcpy to simplify code
+- net: hns3: remove redundant param mbx_event_pending
+- net: hns3: add hns3_state_init() to do state initialization
+- net: hns3: add macros for mac speeds of firmware command
+- sched: bugfix setscheduler unlock cpuset_rwsem
+- ima: fix db size overflow and Kconfig issues
+- mm: page_poison: print page info when corruption is caught
+- kasan: fix conflict with page poisoning
+- mm: fix page_owner initializing issue for arm32
+- net: hns3: add ethtool support for CQE/EQE mode configuration
+- net: hns3: add support for EQE/CQE mode configuration
+- ethtool: extend coalesce setting uAPI with CQE mode
+- ethtool: add two coalesce attributes for CQE mode
+- ethtool: add ETHTOOL_COALESCE_ALL_PARAMS define
+- net: hns3: fix get wrong pfc_en when query PFC configuration
+- net: hns3: fix GRO configuration error after reset
+- net: hns3: change the method of getting cmd index in debugfs
+- net: hns3: fix duplicate node in VLAN list
+- net: hns3: fix speed unknown issue in bond 4
+- net: hns3: add waiting time before cmdq memory is released
+- net: hns3: clear hardware resource when loading driver
+- net: hns3: make array spec_opcode static const, makes object smaller
+- digest list: disable digest lists in non-root ima namespaces
+- ima: Introduce ima-ns-sig template
+- ima: fix a potential crash owing to the compiler optimisation
+- ima: Set ML template per ima namespace
+- ima: Add dummy boot aggregate to per ima namespace measurement list
+- ima: Load per ima namespace x509 certificate
+- integrity: Add key domain tag to the search criteria
+- ima: Add key domain to the ima namespace
+- keys: Allow to set key domain tag separately from the key type
+- keys: Include key domain tag in the iterative search
+- keys: Add domain tag to the keyring search criteria
+- ima: Remap IDs of subject based rules if necessary
+- user namespace: Add function that checks if the UID map is defined
+- ima: Parse per ima namespace policy file
+- ima: Configure the new ima namespace from securityfs
+- ima: Change the owning user namespace of the ima namespace if necessary
+- ima: Add the violation counter to the namespace
+- ima: Extend permissions to the ima securityfs entries
+- ima: Add a reader counter to the integrity inode data
+- ima: Add per namespace view of the measurement list
+- ima: Add a new ima template that includes namespace ID
+- ima: Check ima namespace ID during digest entry lookup
+- ima: Keep track of the measurment list per ima namespace
+- ima: Add ima namespace id to the measurement list related structures
+- ima: Enable per ima namespace policy settings
+- ima: Add integrity inode related data to the ima namespace
+- ima: Extend the APIs in the integrity subsystem
+- ima: Add ima namespace to the ima subsystem APIs
+- ima: Add methods for parsing ima policy configuration string
+- ima: Add ima policy related data to the ima namespace
+- ima: Bind ima namespace to the file descriptor
+- ima: Add a list of the installed ima namespaces
+- ima: Introduce ima namespace
+- mm/page_alloc: further fix __alloc_pages_bulk() return value
+- mm/page_alloc: correct return value when failing at preparing
+- mm/page_alloc: avoid page allocator recursion with pagesets.lock held
+- mm: vmscan: shrink deferred objects proportional to priority
+- mm: memcontrol: reparent nr_deferred when memcg offline
+- mm: vmscan: don't need allocate shrinker->nr_deferred for memcg aware shrinkers
+- mm: vmscan: use per memcg nr_deferred of shrinker
+- mm: vmscan: add per memcg shrinker nr_deferred
+- mm: vmscan: use a new flag to indicate shrinker is registered
+- mm: vmscan: add shrinker_info_protected() helper
+- mm: memcontrol: rename shrinker_map to shrinker_info
+- mm: vmscan: use kvfree_rcu instead of call_rcu
+- mm: vmscan: remove memcg_shrinker_map_size
+- mm: vmscan: use shrinker_rwsem to protect shrinker_maps allocation
+- mm: vmscan: consolidate shrinker_maps handling code
+- mm: vmscan: use nid from shrink_control for tracepoint
+- scsi/hifc: Fix memory leakage bug
+- crypto: hisilicon/qm - set a qp error flag for userspace
+- vfio/hisilicon: add acc live migration driver
+- vfio/hisilicon: modify QM for live migration driver
+- vfio/pci: provide customized live migration VFIO driver framework
+- PCI: Set dma-can-stall for HiSilicon chips
+- PCI: Add a quirk to set pasid_no_tlp for HiSilicon chips
+- PCI: PASID can be enabled without TLP prefix
+- crypto: hisilicon/sec - fix the CTR mode BD configuration
+- crypto: hisilicon/sec - fix the max length of AAD for the CCM mode
+- crypto: hisilicon/sec - fixup icv checking enabled on Kunpeng 930
+- crypto: hisilicon - check _PS0 and _PR0 method
+- crypto: hisilicon - change parameter passing of debugfs function
+- crypto: hisilicon - support runtime PM for accelerator device
+- crypto: hisilicon - add runtime PM ops
+- crypto: hisilicon - using 'debugfs_create_file' instead of 'debugfs_create_regset32'
+- crypto: hisilicon/sec - modify the hardware endian configuration
+- crypto: hisilicon/sec - fix the abnormal exiting process
+- crypto: hisilicon - enable hpre device clock gating
+- crypto: hisilicon - enable sec device clock gating
+- crypto: hisilicon - enable zip device clock gating
+- crypto: hisilicon/sec - fix the process of disabling sva prefetching
+
+* Wed Sep 15 2021 Zheng Zengkai - 5.10.0-6.0.0.0
+- mm/page_alloc: correct return value of populated elements if bulk array is populated
+- mm: fix oom killing for disabled pid
+- X86/config: Enable CONFIG_USERSWAP
+- eulerfs: change default config file
+- eulerfs: add Kconfig and Makefile
+- eulerfs: add super_operations and module_init/exit
+- eulerfs: add inode_operations for symlink inode
+- eulerfs: add file_operations for dir inode
+- eulerfs: add inode_operations for dir inode and special inode
+- eulerfs: add file operations and inode operations for regular file
+- eulerfs: add dax operations
+- eulerfs: add inode related interfaces
+- eulerfs: add dependency operations
+- eulerfs: add nv dict operations
+- eulerfs: add filename interfaces
+- eulerfs: add interfaces for page wear
+- eulerfs: add interfaces for inode lock transfer
+- eulerfs: add flush interfaces
+- eulerfs: add memory allocation interfaces
+- eulerfs: add kmeme_cache definitions and interfaces
+- eulerfs: common definitions
+- vfio/pci: Fix wrong return value when get iommu attribute DOMAIN_ATTR_NESTING
+- net: hns3: remove always exist devlink pointer check
+- net: hns3: add support ethtool extended link state
+- net: hns3: add header file hns3_ethtoo.h
+- ethtool: add two link extended substates of bad signal integrity
+- docs: ethtool: Add two link extended substates of bad signal integrity
+- net: hns3: add support for triggering reset by ethtool
+
+* Mon Aug 16 2021 Yafen Fang - 5.10.0-5.3.0.2
+- package init based on openEuler 5.10.0-5.3.0
+
+* Mon Aug 9 2021 Yafen Fang - 5.10.0-5.1.0.1
+- package init based on openEuler 5.10.0-5.1.0