summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.mailmap7
-rw-r--r--Documentation/ABI/testing/sysfs-devices-system-cpu3
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt10
-rw-r--r--Documentation/admin-guide/perf/hns3-pmu.rst136
-rw-r--r--Documentation/admin-guide/perf/index.rst1
-rw-r--r--Documentation/admin-guide/pm/cpuidle.rst15
-rw-r--r--Documentation/arm64/elf_hwcaps.rst4
-rw-r--r--Documentation/arm64/memory.rst10
-rw-r--r--Documentation/arm64/silicon-errata.rst6
-rw-r--r--Documentation/core-api/protection-keys.rst44
-rw-r--r--Documentation/devicetree/bindings/net/ethernet-controller.yaml123
-rw-r--r--Documentation/devicetree/bindings/net/fsl,fec.yaml3
-rw-r--r--Documentation/features/vm/ioremap_prot/arch-support.txt2
-rw-r--r--Documentation/filesystems/overlayfs.rst4
-rw-r--r--Documentation/memory-barriers.txt11
-rw-r--r--Documentation/networking/ip-sysctl.rst9
-rw-r--r--Documentation/staging/static-keys.rst3
-rw-r--r--Documentation/virt/kvm/arm/hyp-abi.rst11
-rw-r--r--MAINTAINERS9
-rw-r--r--Makefile2
-rw-r--r--arch/Kconfig3
-rw-r--r--arch/arc/kernel/jump_label.c13
-rw-r--r--arch/arm/boot/dts/lan966x.dtsi2
-rw-r--r--arch/arm/include/asm/dma.h2
-rw-r--r--arch/arm/include/asm/io.h4
-rw-r--r--arch/arm/kernel/jump_label.c6
-rw-r--r--arch/arm/lib/findbit.S16
-rw-r--r--arch/arm/mach-pxa/corgi.c2
-rw-r--r--arch/arm/mach-pxa/hx4700.c2
-rw-r--r--arch/arm/mach-pxa/icontrol.c4
-rw-r--r--arch/arm/mach-pxa/littleton.c2
-rw-r--r--arch/arm/mach-pxa/magician.c2
-rw-r--r--arch/arm/mach-pxa/spitz.c2
-rw-r--r--arch/arm/mach-pxa/z2.c4
-rw-r--r--arch/arm/mm/ioremap.c9
-rw-r--r--arch/arm/mm/nommu.c9
-rw-r--r--arch/arm64/Kconfig37
-rw-r--r--arch/arm64/boot/Makefile5
-rw-r--r--arch/arm64/include/asm/asm-extable.h79
-rw-r--r--arch/arm64/include/asm/asm-uaccess.h12
-rw-r--r--arch/arm64/include/asm/asm_pointer_auth.h4
-rw-r--r--arch/arm64/include/asm/assembler.h35
-rw-r--r--arch/arm64/include/asm/barrier.h12
-rw-r--r--arch/arm64/include/asm/cache.h41
-rw-r--r--arch/arm64/include/asm/cacheflush.h7
-rw-r--r--arch/arm64/include/asm/cpu.h1
-rw-r--r--arch/arm64/include/asm/cpu_ops.h9
-rw-r--r--arch/arm64/include/asm/cpufeature.h7
-rw-r--r--arch/arm64/include/asm/cpuidle.h15
-rw-r--r--arch/arm64/include/asm/el2_setup.h60
-rw-r--r--arch/arm64/include/asm/fixmap.h4
-rw-r--r--arch/arm64/include/asm/hwcap.h3
-rw-r--r--arch/arm64/include/asm/io.h24
-rw-r--r--arch/arm64/include/asm/kernel-pgtable.h18
-rw-r--r--arch/arm64/include/asm/memory.h9
-rw-r--r--arch/arm64/include/asm/mmu_context.h16
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h3
-rw-r--r--arch/arm64/include/asm/pgtable.h16
-rw-r--r--arch/arm64/include/asm/processor.h3
-rw-r--r--arch/arm64/include/asm/sysreg.h128
-rw-r--r--arch/arm64/include/asm/uaccess.h94
-rw-r--r--arch/arm64/include/asm/virt.h11
-rw-r--r--arch/arm64/include/uapi/asm/hwcap.h4
-rw-r--r--arch/arm64/kernel/Makefile7
-rw-r--r--arch/arm64/kernel/acpi.c2
-rw-r--r--arch/arm64/kernel/acpi_numa.c2
-rw-r--r--arch/arm64/kernel/alternative.c2
-rw-r--r--arch/arm64/kernel/armv8_deprecated.c9
-rw-r--r--arch/arm64/kernel/cpu_errata.c26
-rw-r--r--arch/arm64/kernel/cpufeature.c376
-rw-r--r--arch/arm64/kernel/cpuidle.c29
-rw-r--r--arch/arm64/kernel/cpuinfo.c51
-rw-r--r--arch/arm64/kernel/entry.S53
-rw-r--r--arch/arm64/kernel/fpsimd.c1
-rw-r--r--arch/arm64/kernel/head.S525
-rw-r--r--arch/arm64/kernel/hibernate.c5
-rw-r--r--arch/arm64/kernel/hyp-stub.S117
-rw-r--r--arch/arm64/kernel/idreg-override.c93
-rw-r--r--arch/arm64/kernel/image-vars.h59
-rw-r--r--arch/arm64/kernel/jump_label.c11
-rw-r--r--arch/arm64/kernel/kaslr.c149
-rw-r--r--arch/arm64/kernel/kuser32.S1
-rw-r--r--arch/arm64/kernel/mte.c9
-rw-r--r--arch/arm64/kernel/pi/Makefile33
-rw-r--r--arch/arm64/kernel/pi/kaslr_early.c112
-rw-r--r--arch/arm64/kernel/signal.c20
-rw-r--r--arch/arm64/kernel/sigreturn32.S1
-rw-r--r--arch/arm64/kernel/sleep.S3
-rw-r--r--arch/arm64/kernel/stacktrace.c99
-rw-r--r--arch/arm64/kernel/suspend.c2
-rw-r--r--arch/arm64/kernel/traps.c6
-rw-r--r--arch/arm64/kernel/vdso/Makefile8
-rw-r--r--arch/arm64/kernel/vdso/vdso.lds.S16
-rw-r--r--arch/arm64/kernel/vdso32/Makefile1
-rw-r--r--arch/arm64/kernel/vdso32/vdso.lds.S27
-rw-r--r--arch/arm64/kernel/vmlinux.lds.S22
-rw-r--r--arch/arm64/kvm/hyp/include/nvhe/fixed_config.h32
-rw-r--r--arch/arm64/kvm/hyp/nvhe/sys_regs.c12
-rw-r--r--arch/arm64/kvm/sys_regs.c14
-rw-r--r--arch/arm64/lib/mte.S2
-rw-r--r--arch/arm64/mm/cache.S41
-rw-r--r--arch/arm64/mm/copypage.c9
-rw-r--r--arch/arm64/mm/dma-mapping.c19
-rw-r--r--arch/arm64/mm/extable.c10
-rw-r--r--arch/arm64/mm/fault.c1
-rw-r--r--arch/arm64/mm/hugetlbpage.c10
-rw-r--r--arch/arm64/mm/init.c4
-rw-r--r--arch/arm64/mm/ioremap.c90
-rw-r--r--arch/arm64/mm/kasan_init.c4
-rw-r--r--arch/arm64/mm/mmu.c78
-rw-r--r--arch/arm64/mm/mteswap.c9
-rw-r--r--arch/arm64/mm/proc.S188
-rw-r--r--arch/arm64/tools/cpucaps2
-rwxr-xr-xarch/arm64/tools/gen-sysreg.awk2
-rw-r--r--arch/arm64/tools/sysreg264
-rw-r--r--arch/loongarch/Kconfig1
-rw-r--r--arch/loongarch/include/asm/asmmacro.h12
-rw-r--r--arch/loongarch/include/asm/atomic.h37
-rw-r--r--arch/loongarch/include/asm/barrier.h4
-rw-r--r--arch/loongarch/include/asm/cmpxchg.h4
-rw-r--r--arch/loongarch/include/asm/compiler.h15
-rw-r--r--arch/loongarch/include/asm/elf.h2
-rw-r--r--arch/loongarch/include/asm/futex.h11
-rw-r--r--arch/loongarch/include/asm/irqflags.h1
-rw-r--r--arch/loongarch/include/asm/local.h1
-rw-r--r--arch/loongarch/include/asm/loongson.h16
-rw-r--r--arch/loongarch/include/asm/stacktrace.h12
-rw-r--r--arch/loongarch/include/asm/thread_info.h4
-rw-r--r--arch/loongarch/include/asm/uaccess.h2
-rw-r--r--arch/loongarch/kernel/cacheinfo.c11
-rw-r--r--arch/loongarch/kernel/entry.S4
-rw-r--r--arch/loongarch/kernel/env.c20
-rw-r--r--arch/loongarch/kernel/fpu.S174
-rw-r--r--arch/loongarch/kernel/genex.S12
-rw-r--r--arch/loongarch/kernel/head.S8
-rw-r--r--arch/loongarch/kernel/ptrace.c12
-rw-r--r--arch/loongarch/kernel/reset.c1
-rw-r--r--arch/loongarch/kernel/setup.c2
-rw-r--r--arch/loongarch/kernel/smp.c113
-rw-r--r--arch/loongarch/kernel/switch.S4
-rw-r--r--arch/loongarch/lib/clear_user.S2
-rw-r--r--arch/loongarch/lib/copy_user.S2
-rw-r--r--arch/loongarch/lib/delay.c1
-rw-r--r--arch/loongarch/mm/page.S118
-rw-r--r--arch/loongarch/mm/tlbex.S98
-rw-r--r--arch/m68k/Kconfig.cpu7
-rw-r--r--arch/m68k/Kconfig.debug6
-rw-r--r--arch/m68k/Kconfig.machine5
-rw-r--r--arch/m68k/configs/amiga_defconfig14
-rw-r--r--arch/m68k/configs/apollo_defconfig14
-rw-r--r--arch/m68k/configs/atari_defconfig14
-rw-r--r--arch/m68k/configs/bvme6000_defconfig14
-rw-r--r--arch/m68k/configs/hp300_defconfig14
-rw-r--r--arch/m68k/configs/mac_defconfig14
-rw-r--r--arch/m68k/configs/multi_defconfig14
-rw-r--r--arch/m68k/configs/mvme147_defconfig14
-rw-r--r--arch/m68k/configs/mvme16x_defconfig14
-rw-r--r--arch/m68k/configs/q40_defconfig14
-rw-r--r--arch/m68k/configs/sun3_defconfig14
-rw-r--r--arch/m68k/configs/sun3x_defconfig14
-rw-r--r--arch/m68k/include/asm/bitops.h2
-rw-r--r--arch/m68k/include/asm/processor.h1
-rw-r--r--arch/m68k/include/uapi/asm/bootinfo-virt.h8
-rw-r--r--arch/m68k/kernel/traps.c1
-rw-r--r--arch/m68k/mac/iop.c4
-rw-r--r--arch/m68k/mac/macints.c35
-rw-r--r--arch/m68k/q40/q40ints.c2
-rw-r--r--arch/m68k/sun3/mmu_emu.c11
-rw-r--r--arch/m68k/virt/config.c11
-rw-r--r--arch/m68k/virt/ints.c3
-rw-r--r--arch/m68k/virt/platform.c58
-rw-r--r--arch/mips/include/asm/jump_label.h2
-rw-r--r--arch/mips/kernel/jump_label.c19
-rw-r--r--arch/mips/kernel/module.c5
-rw-r--r--arch/parisc/kernel/jump_label.c11
-rw-r--r--arch/powerpc/Kconfig4
-rw-r--r--arch/powerpc/kernel/Makefile1
-rw-r--r--arch/riscv/Makefile2
-rw-r--r--arch/riscv/kernel/jump_label.c12
-rw-r--r--arch/s390/include/asm/archrandom.h9
-rw-r--r--arch/s390/include/asm/jump_label.h5
-rw-r--r--arch/s390/kernel/jump_label.c28
-rw-r--r--arch/s390/kernel/module.c1
-rw-r--r--arch/sparc/kernel/module.c3
-rw-r--r--arch/x86/.gitignore2
-rw-r--r--arch/x86/Kconfig7
-rw-r--r--arch/x86/Kconfig.debug3
-rw-r--r--arch/x86/events/amd/uncore.c146
-rw-r--r--arch/x86/events/intel/core.c7
-rw-r--r--arch/x86/events/intel/ds.c129
-rw-r--r--arch/x86/events/perf_event.h17
-rw-r--r--arch/x86/include/asm/amd-ibs.h16
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/fpu/api.h2
-rw-r--r--arch/x86/include/asm/mwait.h1
-rw-r--r--arch/x86/include/asm/nospec-branch.h24
-rw-r--r--arch/x86/include/asm/perf_event.h16
-rw-r--r--arch/x86/include/asm/sev.h7
-rw-r--r--arch/x86/include/asm/special_insns.h9
-rw-r--r--arch/x86/include/asm/tlbflush.h1
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h15
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/amd_nb.c13
-rw-r--r--arch/x86/kernel/cpu/bugs.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c4
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c47
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h2
-rw-r--r--arch/x86/kernel/cpu/vmware.c4
-rw-r--r--arch/x86/kernel/e820.c6
-rw-r--r--arch/x86/kernel/fpu/core.c14
-rw-r--r--arch/x86/kernel/jump_label.c13
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c74
-rw-r--r--arch/x86/kernel/module.c3
-rw-r--r--arch/x86/kernel/pmem.c7
-rw-r--r--arch/x86/kernel/process.c44
-rw-r--r--arch/x86/kernel/setup.c73
-rw-r--r--arch/x86/kernel/sev-shared.c25
-rw-r--r--arch/x86/kernel/sev.c17
-rw-r--r--arch/x86/mm/extable.c16
-rw-r--r--arch/x86/mm/init.c2
-rw-r--r--arch/x86/mm/pkeys.c15
-rw-r--r--arch/x86/mm/tlb.c31
-rw-r--r--arch/x86/purgatory/Makefile10
-rw-r--r--arch/x86/purgatory/kexec-purgatory.S14
-rw-r--r--drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c1
-rw-r--r--drivers/cpuidle/Kconfig.arm3
-rw-r--r--drivers/edac/ghes_edac.c11
-rw-r--r--drivers/edac/synopsys_edac.c44
-rw-r--r--drivers/gpu/drm/amd/display/Kconfig2
-rw-r--r--drivers/gpu/drm/i915/gt/intel_engine.h2
-rw-r--r--drivers/gpu/drm/i915/gt/intel_engine_cs.c88
-rw-r--r--drivers/gpu/drm/i915/gt/intel_execlists_submission.c7
-rw-r--r--drivers/gpu/drm/i915/gt/uc/intel_guc.c4
-rw-r--r--drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c81
-rw-r--r--drivers/gpu/drm/nouveau/nouveau_dmem.c6
-rw-r--r--drivers/gpu/drm/tiny/simpledrm.c2
-rw-r--r--drivers/hwmon/k10temp.c12
-rw-r--r--drivers/idle/intel_idle.c33
-rw-r--r--drivers/net/ethernet/fungible/funeth/funeth_rx.c5
-rw-r--r--drivers/net/ethernet/fungible/funeth/funeth_tx.c20
-rw-r--r--drivers/net/ethernet/fungible/funeth/funeth_txrx.h6
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_main.c4
-rw-r--r--drivers/net/ethernet/intel/ice/ice_ethtool.c3
-rw-r--r--drivers/net/ethernet/intel/ice/ice_main.c10
-rw-r--r--drivers/net/ethernet/intel/ice/ice_sriov.c40
-rw-r--r--drivers/net/ethernet/intel/ice/ice_txrx.c8
-rw-r--r--drivers/net/ethernet/intel/ice/ice_virtchnl.c3
-rw-r--r--drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c106
-rw-r--r--drivers/net/ethernet/netronome/nfp/bpf/jit.c2
-rw-r--r--drivers/net/ethernet/sfc/ptp.c22
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c9
-rw-r--r--drivers/net/ipa/ipa_qmi_msg.h2
-rw-r--r--drivers/net/macsec.c33
-rw-r--r--drivers/net/pcs/pcs-xpcs.c2
-rw-r--r--drivers/net/sungem_phy.c1
-rw-r--r--drivers/net/virtio_net.c37
-rw-r--r--drivers/nvme/host/pci.c2
-rw-r--r--drivers/of/kexec.c13
-rw-r--r--drivers/perf/arm-cci.c11
-rw-r--r--drivers/perf/arm-ccn.c6
-rw-r--r--drivers/perf/arm_spe_pmu.c22
-rw-r--r--drivers/perf/fsl_imx8_ddr_perf.c6
-rw-r--r--drivers/perf/hisilicon/Kconfig10
-rw-r--r--drivers/perf/hisilicon/Makefile1
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c16
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_hha_pmu.c16
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c16
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_pa_pmu.c16
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_pmu.c18
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_pmu.h2
-rw-r--r--drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c15
-rw-r--r--drivers/perf/hisilicon/hns3_pmu.c1671
-rw-r--r--drivers/perf/marvell_cn10k_tad_pmu.c12
-rw-r--r--drivers/perf/riscv_pmu.c4
-rw-r--r--drivers/perf/riscv_pmu_sbi.c106
-rw-r--r--drivers/powercap/dtpm_cpu.c33
-rw-r--r--drivers/ptp/Kconfig1
-rw-r--r--drivers/s390/net/qeth_core_main.c2
-rw-r--r--drivers/scsi/mpt3sas/mpt3sas_scsih.c1
-rw-r--r--drivers/scsi/scsi_ioctl.c2
-rw-r--r--drivers/thermal/cpufreq_cooling.c6
-rw-r--r--drivers/ufs/core/ufshcd.c58
-rw-r--r--drivers/ufs/host/ufshcd-pltfrm.c15
-rw-r--r--fs/attr.c74
-rw-r--r--fs/dlm/Kconfig9
-rw-r--r--fs/dlm/Makefile2
-rw-r--r--fs/dlm/ast.c4
-rw-r--r--fs/dlm/config.c21
-rw-r--r--fs/dlm/config.h3
-rw-r--r--fs/dlm/dlm_internal.h32
-rw-r--r--fs/dlm/lock.c143
-rw-r--r--fs/dlm/lock.h17
-rw-r--r--fs/dlm/lockspace.c31
-rw-r--r--fs/dlm/lowcomms.c4
-rw-r--r--fs/dlm/member.c30
-rw-r--r--fs/dlm/plock.c51
-rw-r--r--fs/dlm/recoverd.c35
-rw-r--r--fs/dlm/user.c21
-rw-r--r--fs/erofs/compress.h2
-rw-r--r--fs/erofs/data.c39
-rw-r--r--fs/erofs/decompressor.c18
-rw-r--r--fs/erofs/decompressor_lzma.c1
-rw-r--r--fs/erofs/dir.c20
-rw-r--r--fs/erofs/zdata.c797
-rw-r--r--fs/erofs/zdata.h119
-rw-r--r--fs/erofs/zpvec.h159
-rw-r--r--fs/ext2/inode.c8
-rw-r--r--fs/ext2/super.c18
-rw-r--r--fs/ext4/inode.c14
-rw-r--r--fs/f2fs/file.c22
-rw-r--r--fs/f2fs/recovery.c10
-rw-r--r--fs/fat/file.c9
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/ksmbd/vfs.c2
-rw-r--r--fs/ksmbd/vfs.h2
-rw-r--r--fs/locks.c77
-rw-r--r--fs/notify/fanotify/fanotify.c19
-rw-r--r--fs/notify/fanotify/fanotify.h2
-rw-r--r--fs/notify/fanotify/fanotify_user.c110
-rw-r--r--fs/notify/fdinfo.c6
-rw-r--r--fs/notify/fsnotify.c23
-rw-r--r--fs/notify/inotify/inotify_user.c2
-rw-r--r--fs/ntfs/attrib.c8
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/slot_map.c46
-rw-r--r--fs/ocfs2/super.c21
-rw-r--r--fs/open.c60
-rw-r--r--fs/overlayfs/copy_up.c4
-rw-r--r--fs/overlayfs/inode.c87
-rw-r--r--fs/overlayfs/overlayfs.h15
-rw-r--r--fs/overlayfs/super.c25
-rw-r--r--fs/posix_acl.c168
-rw-r--r--fs/quota/dquot.c17
-rw-r--r--fs/read_write.c3
-rw-r--r--fs/reiserfs/inode.c16
-rw-r--r--fs/userfaultfd.c12
-rw-r--r--fs/xattr.c25
-rw-r--r--fs/xfs/xfs_iops.c14
-rw-r--r--fs/zonefs/super.c2
-rw-r--r--include/asm-generic/barrier.h8
-rw-r--r--include/asm-generic/io.h31
-rw-r--r--include/linux/cgroup-defs.h4
-rw-r--r--include/linux/cpuhotplug.h2
-rw-r--r--include/linux/evm.h6
-rw-r--r--include/linux/fanotify.h14
-rw-r--r--include/linux/fs.h140
-rw-r--r--include/linux/fsnotify_backend.h89
-rw-r--r--include/linux/gfp.h2
-rw-r--r--include/linux/huge_mm.h12
-rw-r--r--include/linux/ima.h5
-rw-r--r--include/linux/jump_label.h9
-rw-r--r--include/linux/kernel_stat.h7
-rw-r--r--include/linux/lockdep.h30
-rw-r--r--include/linux/mm.h14
-rw-r--r--include/linux/mnt_idmapping.h305
-rw-r--r--include/linux/of.h2
-rw-r--r--include/linux/once_lite.h20
-rw-r--r--include/linux/pci_ids.h3
-rw-r--r--include/linux/perf/riscv_pmu.h4
-rw-r--r--include/linux/perf_event.h2
-rw-r--r--include/linux/posix_acl.h1
-rw-r--r--include/linux/posix_acl_xattr.h34
-rw-r--r--include/linux/quotaops.h15
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/sched/rt.h8
-rw-r--r--include/linux/sched/topology.h1
-rw-r--r--include/linux/security.h8
-rw-r--r--include/linux/xattr.h2
-rw-r--r--include/net/addrconf.h3
-rw-r--r--include/net/bluetooth/l2cap.h1
-rw-r--r--include/net/inet_connection_sock.h10
-rw-r--r--include/net/sock.h8
-rw-r--r--include/net/tcp.h2
-rw-r--r--include/trace/events/dlm.h118
-rw-r--r--include/trace/events/kmem.h40
-rw-r--r--include/uapi/asm-generic/fcntl.h2
-rw-r--r--include/uapi/linux/fanotify.h8
-rw-r--r--include/uapi/linux/perf_event.h5
-rw-r--r--kernel/cgroup/rstat.c44
-rw-r--r--kernel/configs/x86_debug.config3
-rw-r--r--kernel/events/core.c21
-rw-r--r--kernel/events/ring_buffer.c5
-rw-r--r--kernel/jump_label.c41
-rw-r--r--kernel/locking/lockdep.c7
-rw-r--r--kernel/locking/rwsem.c30
-rw-r--r--kernel/rseq.c23
-rw-r--r--kernel/sched/core.c215
-rw-r--r--kernel/sched/core_sched.c15
-rw-r--r--kernel/sched/cpufreq_schedutil.c5
-rw-r--r--kernel/sched/cputime.c15
-rw-r--r--kernel/sched/deadline.c6
-rw-r--r--kernel/sched/fair.c818
-rw-r--r--kernel/sched/features.h3
-rw-r--r--kernel/sched/pelt.h40
-rw-r--r--kernel/sched/rt.c15
-rw-r--r--kernel/sched/sched.h63
-rw-r--r--kernel/sched/topology.c23
-rw-r--r--kernel/watch_queue.c58
-rw-r--r--kernel/workqueue.c5
-rw-r--r--mm/gup.c6
-rw-r--r--mm/hmm.c19
-rw-r--r--mm/hugetlb.c10
-rw-r--r--mm/ioremap.c26
-rw-r--r--mm/kasan/common.c3
-rw-r--r--mm/kfence/core.c18
-rw-r--r--mm/memory.c9
-rw-r--r--mm/memremap.c6
-rw-r--r--mm/page_alloc.c31
-rw-r--r--mm/secretmem.c33
-rw-r--r--mm/shmem.c7
-rw-r--r--mm/slab.c20
-rw-r--r--mm/slab.h39
-rw-r--r--mm/slab_common.c36
-rw-r--r--mm/slob.c33
-rw-r--r--mm/slub.c98
-rw-r--r--mm/swap_slots.c2
-rw-r--r--net/bluetooth/hci_sync.c6
-rw-r--r--net/bluetooth/l2cap_core.c61
-rw-r--r--net/bluetooth/mgmt.c1
-rw-r--r--net/bridge/br_netlink.c8
-rw-r--r--net/caif/caif_socket.c20
-rw-r--r--net/decnet/af_decnet.c4
-rw-r--r--net/dsa/switch.c1
-rw-r--r--net/ipv4/fib_trie.c7
-rw-r--r--net/ipv4/tcp.c23
-rw-r--r--net/ipv4/tcp_input.c41
-rw-r--r--net/ipv4/tcp_ipv4.c4
-rw-r--r--net/ipv4/tcp_metrics.c10
-rw-r--r--net/ipv4/tcp_output.c27
-rw-r--r--net/ipv6/mcast.c14
-rw-r--r--net/ipv6/ping.c6
-rw-r--r--net/ipv6/tcp_ipv6.c4
-rw-r--r--net/mac80211/iface.c3
-rw-r--r--net/mptcp/options.c2
-rw-r--r--net/mptcp/protocol.c8
-rw-r--r--net/mptcp/subflow.c2
-rw-r--r--net/netfilter/nf_tables_api.c6
-rw-r--r--net/netfilter/nfnetlink_queue.c7
-rw-r--r--net/netfilter/nft_queue.c27
-rw-r--r--net/sctp/associola.c5
-rw-r--r--net/sctp/stream.c19
-rw-r--r--net/sctp/stream_sched.c2
-rw-r--r--net/tipc/socket.c2
-rw-r--r--net/tls/tls_device.c7
-rwxr-xr-xscripts/remove-stale-files2
-rw-r--r--security/integrity/evm/evm_main.c12
-rw-r--r--security/integrity/ima/ima_kexec.c2
-rw-r--r--security/security.c5
-rw-r--r--tools/arch/x86/include/asm/cpufeatures.h1
-rw-r--r--tools/include/uapi/asm-generic/fcntl.h11
-rwxr-xr-xtools/perf/scripts/python/arm-cs-trace-disasm.py34
-rw-r--r--tools/perf/util/bpf-loader.c18
-rw-r--r--tools/perf/util/symbol-elf.c56
-rw-r--r--tools/testing/selftests/rseq/rseq-riscv.h50
-rw-r--r--tools/testing/selftests/rseq/rseq.c3
-rw-r--r--tools/vm/slabinfo.c26
457 files changed, 9203 insertions, 5140 deletions
diff --git a/.mailmap b/.mailmap
index 13e4f504e17f..71577c396252 100644
--- a/.mailmap
+++ b/.mailmap
@@ -60,6 +60,10 @@ Arnd Bergmann <arnd@arndb.de>
Atish Patra <atishp@atishpatra.org> <atish.patra@wdc.com>
Axel Dyks <xl@xlsigned.net>
Axel Lin <axel.lin@gmail.com>
+Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang@linaro.org>
+Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang@spreadtrum.com>
+Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang@unisoc.com>
+Baolin Wang <baolin.wang@linux.alibaba.com> <baolin.wang7@gmail.com>
Bart Van Assche <bvanassche@acm.org> <bart.vanassche@sandisk.com>
Bart Van Assche <bvanassche@acm.org> <bart.vanassche@wdc.com>
Ben Gardner <bgardner@wabtec.com>
@@ -135,6 +139,8 @@ Frank Rowand <frowand.list@gmail.com> <frowand@mvista.com>
Frank Zago <fzago@systemfabricworks.com>
Gao Xiang <xiang@kernel.org> <gaoxiang25@huawei.com>
Gao Xiang <xiang@kernel.org> <hsiangkao@aol.com>
+Gao Xiang <xiang@kernel.org> <hsiangkao@linux.alibaba.com>
+Gao Xiang <xiang@kernel.org> <hsiangkao@redhat.com>
Gerald Schaefer <gerald.schaefer@linux.ibm.com> <geraldsc@de.ibm.com>
Gerald Schaefer <gerald.schaefer@linux.ibm.com> <gerald.schaefer@de.ibm.com>
Gerald Schaefer <gerald.schaefer@linux.ibm.com> <geraldsc@linux.vnet.ibm.com>
@@ -371,6 +377,7 @@ Sean Nyekjaer <sean@geanix.com> <sean.nyekjaer@prevas.dk>
Sebastian Reichel <sre@kernel.org> <sebastian.reichel@collabora.co.uk>
Sebastian Reichel <sre@kernel.org> <sre@debian.org>
Sedat Dilek <sedat.dilek@gmail.com> <sedat.dilek@credativ.de>
+Seth Forshee <sforshee@kernel.org> <seth.forshee@canonical.com>
Shiraz Hashim <shiraz.linux.kernel@gmail.com> <shiraz.hashim@st.com>
Shuah Khan <shuah@kernel.org> <shuahkhan@gmail.com>
Shuah Khan <shuah@kernel.org> <shuah.khan@hp.com>
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index bcc974d276dc..df79e129d097 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -493,12 +493,13 @@ What: /sys/devices/system/cpu/cpuX/regs/
/sys/devices/system/cpu/cpuX/regs/identification/
/sys/devices/system/cpu/cpuX/regs/identification/midr_el1
/sys/devices/system/cpu/cpuX/regs/identification/revidr_el1
+ /sys/devices/system/cpu/cpuX/regs/identification/smidr_el1
Date: June 2016
Contact: Linux ARM Kernel Mailing list <linux-arm-kernel@lists.infradead.org>
Description: AArch64 CPU registers
'identification' directory exposes the CPU ID registers for
- identifying model and revision of the CPU.
+ identifying model and revision of the CPU and SMCU.
What: /sys/devices/system/cpu/aarch32_el0
Date: May 2021
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index c0fdb04a0435..5e9147fe8968 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -400,6 +400,12 @@
arm64.nomte [ARM64] Unconditionally disable Memory Tagging Extension
support
+ arm64.nosve [ARM64] Unconditionally disable Scalable Vector
+ Extension support
+
+ arm64.nosme [ARM64] Unconditionally disable Scalable Matrix
+ Extension support
+
ataflop= [HW,M68k]
atarimouse= [HW,MOUSE] Atari Mouse
@@ -3161,7 +3167,7 @@
improves system performance, but it may also
expose users to several CPU vulnerabilities.
Equivalent to: nopti [X86,PPC]
- kpti=0 [ARM64]
+ if nokaslr then kpti=0 [ARM64]
nospectre_v1 [X86,PPC]
nobp=0 [S390]
nospectre_v2 [X86,PPC,S390,ARM64]
@@ -3176,6 +3182,7 @@
no_entry_flush [PPC]
no_uaccess_flush [PPC]
mmio_stale_data=off [X86]
+ retbleed=off [X86]
Exceptions:
This does not have any effect on
@@ -3198,6 +3205,7 @@
mds=full,nosmt [X86]
tsx_async_abort=full,nosmt [X86]
mmio_stale_data=full,nosmt [X86]
+ retbleed=auto,nosmt [X86]
mminit_loglevel=
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
diff --git a/Documentation/admin-guide/perf/hns3-pmu.rst b/Documentation/admin-guide/perf/hns3-pmu.rst
new file mode 100644
index 000000000000..578407e487d6
--- /dev/null
+++ b/Documentation/admin-guide/perf/hns3-pmu.rst
@@ -0,0 +1,136 @@
+======================================
+HNS3 Performance Monitoring Unit (PMU)
+======================================
+
+HNS3(HiSilicon network system 3) Performance Monitoring Unit (PMU) is an
+End Point device to collect performance statistics of HiSilicon SoC NIC.
+On Hip09, each SICL(Super I/O cluster) has one PMU device.
+
+HNS3 PMU supports collection of performance statistics such as bandwidth,
+latency, packet rate and interrupt rate.
+
+Each HNS3 PMU supports 8 hardware events.
+
+HNS3 PMU driver
+===============
+
+The HNS3 PMU driver registers a perf PMU with the name of its sicl id.::
+
+ /sys/devices/hns3_pmu_sicl_<sicl_id>
+
+PMU driver provides description of available events, filter modes, format,
+identifier and cpumask in sysfs.
+
+The "events" directory describes the event code of all supported events
+shown in perf list.
+
+The "filtermode" directory describes the supported filter modes of each
+event.
+
+The "format" directory describes all formats of the config (events) and
+config1 (filter options) fields of the perf_event_attr structure.
+
+The "identifier" file shows version of PMU hardware device.
+
+The "bdf_min" and "bdf_max" files show the supported bdf range of each
+pmu device.
+
+The "hw_clk_freq" file shows the hardware clock frequency of each pmu
+device.
+
+Example usage of checking event code and subevent code::
+
+ $# cat /sys/devices/hns3_pmu_sicl_0/events/dly_tx_normal_to_mac_time
+ config=0x00204
+ $# cat /sys/devices/hns3_pmu_sicl_0/events/dly_tx_normal_to_mac_packet_num
+ config=0x10204
+
+Each performance statistic has a pair of events to get two values to
+calculate real performance data in userspace.
+
+The bits 0~15 of config (here 0x0204) are the true hardware event code. If
+two events have same value of bits 0~15 of config, that means they are
+event pair. And the bit 16 of config indicates getting counter 0 or
+counter 1 of hardware event.
+
+After getting two values of event pair in usersapce, the formula of
+computation to calculate real performance data is:::
+
+ counter 0 / counter 1
+
+Example usage of checking supported filter mode::
+
+ $# cat /sys/devices/hns3_pmu_sicl_0/filtermode/bw_ssu_rpu_byte_num
+ filter mode supported: global/port/port-tc/func/func-queue/
+
+Example usage of perf::
+
+ $# perf list
+ hns3_pmu_sicl_0/bw_ssu_rpu_byte_num/ [kernel PMU event]
+ hns3_pmu_sicl_0/bw_ssu_rpu_time/ [kernel PMU event]
+ ------------------------------------------
+
+ $# perf stat -g -e hns3_pmu_sicl_0/bw_ssu_rpu_byte_num,global=1/ -e hns3_pmu_sicl_0/bw_ssu_rpu_time,global=1/ -I 1000
+ or
+ $# perf stat -g -e hns3_pmu_sicl_0/config=0x00002,global=1/ -e hns3_pmu_sicl_0/config=0x10002,global=1/ -I 1000
+
+
+Filter modes
+--------------
+
+1. global mode
+PMU collect performance statistics for all HNS3 PCIe functions of IO DIE.
+Set the "global" filter option to 1 will enable this mode.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,global=1/ -I 1000
+
+2. port mode
+PMU collect performance statistic of one whole physical port. The port id
+is same as mac id. The "tc" filter option must be set to 0xF in this mode,
+here tc stands for traffic class.
+
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,port=0,tc=0xF/ -I 1000
+
+3. port-tc mode
+PMU collect performance statistic of one tc of physical port. The port id
+is same as mac id. The "tc" filter option must be set to 0 ~ 7 in this
+mode.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,port=0,tc=0/ -I 1000
+
+4. func mode
+PMU collect performance statistic of one PF/VF. The function id is BDF of
+PF/VF, its conversion formula::
+
+ func = (bus << 8) + (device << 3) + (function)
+
+for example:
+ BDF func
+ 35:00.0 0x3500
+ 35:00.1 0x3501
+ 35:01.0 0x3508
+
+In this mode, the "queue" filter option must be set to 0xFFFF.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,bdf=0x3500,queue=0xFFFF/ -I 1000
+
+5. func-queue mode
+PMU collect performance statistic of one queue of PF/VF. The function id
+is BDF of PF/VF, the "queue" filter option must be set to the exact queue
+id of function.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x1020F,bdf=0x3500,queue=0/ -I 1000
+
+6. func-intr mode
+PMU collect performance statistic of one interrupt of PF/VF. The function
+id is BDF of PF/VF, the "intr" filter option must be set to the exact
+interrupt id of function.
+Example usage of perf::
+
+ $# perf stat -a -e hns3_pmu_sicl_0/config=0x00301,bdf=0x3500,intr=0/ -I 1000
diff --git a/Documentation/admin-guide/perf/index.rst b/Documentation/admin-guide/perf/index.rst
index 69b23f087c05..9c9ece88ce53 100644
--- a/Documentation/admin-guide/perf/index.rst
+++ b/Documentation/admin-guide/perf/index.rst
@@ -9,6 +9,7 @@ Performance monitor support
hisi-pmu
hisi-pcie-pmu
+ hns3-pmu
imx-ddr
qcom_l2_pmu
qcom_l3_pmu
diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst
index aec2cd2aaea7..19754beb5a4e 100644
--- a/Documentation/admin-guide/pm/cpuidle.rst
+++ b/Documentation/admin-guide/pm/cpuidle.rst
@@ -612,8 +612,8 @@ the ``menu`` governor to be used on the systems that use the ``ladder`` governor
by default this way, for example.
The other kernel command line parameters controlling CPU idle time management
-described below are only relevant for the *x86* architecture and some of
-them affect Intel processors only.
+described below are only relevant for the *x86* architecture and references
+to ``intel_idle`` affect Intel processors only.
The *x86* architecture support code recognizes three kernel command line
options related to CPU idle time management: ``idle=poll``, ``idle=halt``,
@@ -635,10 +635,13 @@ idle, so it very well may hurt single-thread computations performance as well as
energy-efficiency. Thus using it for performance reasons may not be a good idea
at all.]
-The ``idle=nomwait`` option disables the ``intel_idle`` driver and causes
-``acpi_idle`` to be used (as long as all of the information needed by it is
-there in the system's ACPI tables), but it is not allowed to use the
-``MWAIT`` instruction of the CPUs to ask the hardware to enter idle states.
+The ``idle=nomwait`` option prevents the use of ``MWAIT`` instruction of
+the CPU to enter idle states. When this option is used, the ``acpi_idle``
+driver will use the ``HLT`` instruction instead of ``MWAIT``. On systems
+running Intel processors, this option disables the ``intel_idle`` driver
+and forces the use of the ``acpi_idle`` driver instead. Note that in either
+case, ``acpi_idle`` driver will function only if all the information needed
+by it is in the system's ACPI tables.
In addition to the architecture-level kernel command line options affecting CPU
idle time management, there are parameters affecting individual ``CPUIdle``
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index 3d116fb536c5..31fc10b833dd 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -301,6 +301,10 @@ HWCAP2_WFXT
Functionality implied by ID_AA64ISAR2_EL1.WFXT == 0b0010.
+HWCAP2_EBF16
+
+ Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0010.
+
4. Unused AT_HWCAP bits
-----------------------
diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst
index 901cd094f4ec..2a641ba7be3b 100644
--- a/Documentation/arm64/memory.rst
+++ b/Documentation/arm64/memory.rst
@@ -33,9 +33,8 @@ AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit)::
0000000000000000 0000ffffffffffff 256TB user
ffff000000000000 ffff7fffffffffff 128TB kernel logical memory map
[ffff600000000000 ffff7fffffffffff] 32TB [kasan shadow region]
- ffff800000000000 ffff800007ffffff 128MB bpf jit region
- ffff800008000000 ffff80000fffffff 128MB modules
- ffff800010000000 fffffbffefffffff 124TB vmalloc
+ ffff800000000000 ffff800007ffffff 128MB modules
+ ffff800008000000 fffffbffefffffff 124TB vmalloc
fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down)
fffffbfffe000000 fffffbfffe7fffff 8MB [guard region]
fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space
@@ -51,9 +50,8 @@ AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support):
0000000000000000 000fffffffffffff 4PB user
fff0000000000000 ffff7fffffffffff ~4PB kernel logical memory map
[fffd800000000000 ffff7fffffffffff] 512TB [kasan shadow region]
- ffff800000000000 ffff800007ffffff 128MB bpf jit region
- ffff800008000000 ffff80000fffffff 128MB modules
- ffff800010000000 fffffbffefffffff 124TB vmalloc
+ ffff800000000000 ffff800007ffffff 128MB modules
+ ffff800008000000 fffffbffefffffff 124TB vmalloc
fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down)
fffffbfffe000000 fffffbfffe7fffff 8MB [guard region]
fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space
diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst
index d27db84d585e..33b04db8408f 100644
--- a/Documentation/arm64/silicon-errata.rst
+++ b/Documentation/arm64/silicon-errata.rst
@@ -82,10 +82,14 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A57 | #1319537 | ARM64_ERRATUM_1319367 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A57 | #1742098 | ARM64_ERRATUM_1742098 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A72 | #853709 | N/A |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A72 | #1319367 | ARM64_ERRATUM_1319367 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A72 | #1655431 | ARM64_ERRATUM_1742098 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A76 | #1188873,1418040| ARM64_ERRATUM_1418040 |
@@ -102,6 +106,8 @@ stable kernels.
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A510 | #2077057 | ARM64_ERRATUM_2077057 |
+----------------+-----------------+-----------------+-----------------------------+
+| ARM | Cortex-A510 | #2441009 | ARM64_ERRATUM_2441009 |
++----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 |
+----------------+-----------------+-----------------+-----------------------------+
| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 |
diff --git a/Documentation/core-api/protection-keys.rst b/Documentation/core-api/protection-keys.rst
index ec575e72d0b2..bf28ac0401f3 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -4,31 +4,29 @@
Memory Protection Keys
======================
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature
-which is found on Intel's Skylake (and later) "Scalable Processor"
-Server CPUs. It will be available in future non-server Intel parts
-and future AMD processors.
-
-For anyone wishing to test or use this feature, it is available in
-Amazon's EC2 C5 instances and is known to work there using an Ubuntu
-17.04 image.
-
-Memory Protection Keys provides a mechanism for enforcing page-based
-protections, but without requiring modification of the page tables
-when an application changes protection domains. It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
-
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key. Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
+Memory Protection Keys provide a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables when an
+application changes protection domains.
+
+Pkeys Userspace (PKU) is a feature which can be found on:
+ * Intel server CPUs, Skylake and later
+ * Intel client CPUs, Tiger Lake (11th Gen Core) and later
+ * Future AMD CPUs
+
+Pkeys work by dedicating 4 previously Reserved bits in each page table entry to
+a "protection key", giving 16 possible keys.
+
+Protections for each key are defined with a per-CPU user-accessible register
+(PKRU). Each of these is a 32-bit register storing two bits (Access Disable
+and Write Disable) for each of 16 keys.
+
+Being a CPU register, PKRU is inherently thread-local, potentially giving each
thread a different set of protections from every other thread.
-There are two new instructions (RDPKRU/WRPKRU) for reading and writing
-to the new register. The feature is only available in 64-bit mode,
-even though there is theoretically space in the PAE PTEs. These
-permissions are enforced on data access only and have no effect on
-instruction fetches.
+There are two instructions (RDPKRU/WRPKRU) for reading and writing to the
+register. The feature is only available in 64-bit mode, even though there is
+theoretically space in the PAE PTEs. These permissions are enforced on data
+access only and have no effect on instruction fetches.
Syscalls
========
diff --git a/Documentation/devicetree/bindings/net/ethernet-controller.yaml b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
index 4f15463611f8..170cd201adc2 100644
--- a/Documentation/devicetree/bindings/net/ethernet-controller.yaml
+++ b/Documentation/devicetree/bindings/net/ethernet-controller.yaml
@@ -167,70 +167,65 @@ properties:
- in-band-status
fixed-link:
- allOf:
- - if:
- type: array
- then:
- deprecated: true
- items:
- - minimum: 0
- maximum: 31
- description:
- Emulated PHY ID, choose any but unique to the all
- specified fixed-links
-
- - enum: [0, 1]
- description:
- Duplex configuration. 0 for half duplex or 1 for
- full duplex
-
- - enum: [10, 100, 1000, 2500, 10000]
- description:
- Link speed in Mbits/sec.
-
- - enum: [0, 1]
- description:
- Pause configuration. 0 for no pause, 1 for pause
-
- - enum: [0, 1]
- description:
- Asymmetric pause configuration. 0 for no asymmetric
- pause, 1 for asymmetric pause
-
-
- - if:
- type: object
- then:
- properties:
- speed:
- description:
- Link speed.
- $ref: /schemas/types.yaml#/definitions/uint32
- enum: [10, 100, 1000, 2500, 10000]
-
- full-duplex:
- $ref: /schemas/types.yaml#/definitions/flag
- description:
- Indicates that full-duplex is used. When absent, half
- duplex is assumed.
-
- pause:
- $ref: /schemas/types.yaml#definitions/flag
- description:
- Indicates that pause should be enabled.
-
- asym-pause:
- $ref: /schemas/types.yaml#/definitions/flag
- description:
- Indicates that asym_pause should be enabled.
-
- link-gpios:
- maxItems: 1
- description:
- GPIO to determine if the link is up
-
- required:
- - speed
+ oneOf:
+ - $ref: /schemas/types.yaml#/definitions/uint32-array
+ deprecated: true
+ items:
+ - minimum: 0
+ maximum: 31
+ description:
+ Emulated PHY ID, choose any but unique to the all
+ specified fixed-links
+
+ - enum: [0, 1]
+ description:
+ Duplex configuration. 0 for half duplex or 1 for
+ full duplex
+
+ - enum: [10, 100, 1000, 2500, 10000]
+ description:
+ Link speed in Mbits/sec.
+
+ - enum: [0, 1]
+ description:
+ Pause configuration. 0 for no pause, 1 for pause
+
+ - enum: [0, 1]
+ description:
+ Asymmetric pause configuration. 0 for no asymmetric
+ pause, 1 for asymmetric pause
+ - type: object
+ additionalProperties: false
+ properties:
+ speed:
+ description:
+ Link speed.
+ $ref: /schemas/types.yaml#/definitions/uint32
+ enum: [10, 100, 1000, 2500, 10000]
+
+ full-duplex:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Indicates that full-duplex is used. When absent, half
+ duplex is assumed.
+
+ pause:
+ $ref: /schemas/types.yaml#definitions/flag
+ description:
+ Indicates that pause should be enabled.
+
+ asym-pause:
+ $ref: /schemas/types.yaml#/definitions/flag
+ description:
+ Indicates that asym_pause should be enabled.
+
+ link-gpios:
+ maxItems: 1
+ description:
+ GPIO to determine if the link is up
+
+ required:
+ - speed
additionalProperties: true
diff --git a/Documentation/devicetree/bindings/net/fsl,fec.yaml b/Documentation/devicetree/bindings/net/fsl,fec.yaml
index daa2f79a294f..1b1853062cd3 100644
--- a/Documentation/devicetree/bindings/net/fsl,fec.yaml
+++ b/Documentation/devicetree/bindings/net/fsl,fec.yaml
@@ -183,6 +183,7 @@ properties:
Should specify the gpio for phy reset.
phy-reset-duration:
+ $ref: /schemas/types.yaml#/definitions/uint32
deprecated: true
description:
Reset duration in milliseconds. Should present only if property
@@ -191,12 +192,14 @@ properties:
and 1 millisecond will be used instead.
phy-reset-active-high:
+ type: boolean
deprecated: true
description:
If present then the reset sequence using the GPIO specified in the
"phy-reset-gpios" property is reversed (H=reset state, L=operation state).
phy-reset-post-delay:
+ $ref: /schemas/types.yaml#/definitions/uint32
deprecated: true
description:
Post reset delay in milliseconds. If present then a delay of phy-reset-post-delay
diff --git a/Documentation/features/vm/ioremap_prot/arch-support.txt b/Documentation/features/vm/ioremap_prot/arch-support.txt
index b01bf7bca3e6..6bd78eb4dc6e 100644
--- a/Documentation/features/vm/ioremap_prot/arch-support.txt
+++ b/Documentation/features/vm/ioremap_prot/arch-support.txt
@@ -9,7 +9,7 @@
| alpha: | TODO |
| arc: | ok |
| arm: | TODO |
- | arm64: | TODO |
+ | arm64: | ok |
| csky: | TODO |
| hexagon: | TODO |
| ia64: | TODO |
diff --git a/Documentation/filesystems/overlayfs.rst b/Documentation/filesystems/overlayfs.rst
index 316cfd8b1891..7da6c30ed596 100644
--- a/Documentation/filesystems/overlayfs.rst
+++ b/Documentation/filesystems/overlayfs.rst
@@ -466,10 +466,6 @@ overlay filesystem and the value of st_ino for filesystem objects may not be
persistent and could change even while the overlay filesystem is mounted, as
summarized in the `Inode properties`_ table above.
-4) "idmapped mounts"
-When the upper or lower layers are idmapped mounts overlayfs will be mounted
-without support for POSIX Access Control Lists (ACLs). This limitation will
-eventually be lifted.
Changes to underlying filesystems
---------------------------------
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index b12df9137e1c..832b5d36e279 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1894,6 +1894,7 @@ There are some more advanced barrier functions:
(*) dma_wmb();
(*) dma_rmb();
+ (*) dma_mb();
These are for use with consistent memory to guarantee the ordering
of writes or reads of shared memory accessible to both the CPU and a
@@ -1925,11 +1926,11 @@ There are some more advanced barrier functions:
The dma_rmb() allows us guarantee the device has released ownership
before we read the data from the descriptor, and the dma_wmb() allows
us to guarantee the data is written to the descriptor before the device
- can see it now has ownership. Note that, when using writel(), a prior
- wmb() is not needed to guarantee that the cache coherent memory writes
- have completed before writing to the MMIO region. The cheaper
- writel_relaxed() does not provide this guarantee and must not be used
- here.
+ can see it now has ownership. The dma_mb() implies both a dma_rmb() and
+ a dma_wmb(). Note that, when using writel(), a prior wmb() is not needed
+ to guarantee that the cache coherent memory writes have completed before
+ writing to the MMIO region. The cheaper writel_relaxed() does not provide
+ this guarantee and must not be used here.
See the subsection "Kernel I/O barrier effects" for more information on
relaxed I/O accessors and the Documentation/core-api/dma-api.rst file for
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 66c72230eaad..d7a1bf1a55b5 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -2866,7 +2866,14 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max
Default: 4K
sctp_wmem - vector of 3 INTEGERs: min, default, max
- Currently this tunable has no effect.
+ Only the first value ("min") is used, "default" and "max" are
+ ignored.
+
+ min: Minimum size of send buffer that can be used by SCTP sockets.
+ It is guaranteed to each SCTP socket (but not association) even
+ under moderate memory pressure.
+
+ Default: 4K
addr_scope_policy - INTEGER
Control IPv4 address scoping - draft-stewart-tsvwg-sctp-ipv4-00
diff --git a/Documentation/staging/static-keys.rst b/Documentation/staging/static-keys.rst
index 38290b9f25eb..b0a519f456cf 100644
--- a/Documentation/staging/static-keys.rst
+++ b/Documentation/staging/static-keys.rst
@@ -201,9 +201,6 @@ static_key->entry field makes use of the two least significant bits.
* ``void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type)``,
see: arch/x86/kernel/jump_label.c
-* ``__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type)``,
- see: arch/x86/kernel/jump_label.c
-
* ``struct jump_entry``,
see: arch/x86/include/asm/jump_label.h
diff --git a/Documentation/virt/kvm/arm/hyp-abi.rst b/Documentation/virt/kvm/arm/hyp-abi.rst
index 4d43fbc25195..412b276449d3 100644
--- a/Documentation/virt/kvm/arm/hyp-abi.rst
+++ b/Documentation/virt/kvm/arm/hyp-abi.rst
@@ -60,12 +60,13 @@ these functions (see arch/arm{,64}/include/asm/virt.h):
* ::
- x0 = HVC_VHE_RESTART (arm64 only)
+ x0 = HVC_FINALISE_EL2 (arm64 only)
- Attempt to upgrade the kernel's exception level from EL1 to EL2 by enabling
- the VHE mode. This is conditioned by the CPU supporting VHE, the EL2 MMU
- being off, and VHE not being disabled by any other means (command line
- option, for example).
+ Finish configuring EL2 depending on the command-line options,
+ including an attempt to upgrade the kernel's exception level from
+ EL1 to EL2 by enabling the VHE mode. This is conditioned by the CPU
+ supporting VHE, the EL2 MMU being off, and VHE not being disabled by
+ any other means (command line option, for example).
Any other value of r0/x0 triggers a hypervisor-specific handling,
which is not documented here.
diff --git a/MAINTAINERS b/MAINTAINERS
index 64379c699903..04ec80ee7352 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7485,6 +7485,8 @@ F: include/video/s1d13xxxfb.h
EROFS FILE SYSTEM
M: Gao Xiang <xiang@kernel.org>
M: Chao Yu <chao@kernel.org>
+R: Yue Hu <huyue2@coolpad.com>
+R: Jeffle Xu <jefflexu@linux.alibaba.com>
L: linux-erofs@lists.ozlabs.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git
@@ -9036,6 +9038,12 @@ F: Documentation/admin-guide/perf/hisi-pcie-pmu.rst
F: Documentation/admin-guide/perf/hisi-pmu.rst
F: drivers/perf/hisilicon
+HISILICON HNS3 PMU DRIVER
+M: Guangbin Huang <huangguangbin2@huawei.com>
+S: Supported
+F: Documentation/admin-guide/perf/hns3-pmu.rst
+F: drivers/perf/hisilicon/hns3_pmu.c
+
HISILICON QM AND ZIP Controller DRIVER
M: Zhou Wang <wangzhou1@hisilicon.com>
L: linux-crypto@vger.kernel.org
@@ -9618,6 +9626,7 @@ F: drivers/input/misc/ideapad_slidebar.c
IDMAPPED MOUNTS
M: Christian Brauner <brauner@kernel.org>
+M: Seth Forshee <sforshee@kernel.org>
L: linux-fsdevel@vger.kernel.org
S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
diff --git a/Makefile b/Makefile
index b79c1c18149d..df92892325ae 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
VERSION = 5
PATCHLEVEL = 19
SUBLEVEL = 0
-EXTRAVERSION = -rc8
+EXTRAVERSION =
NAME = Superb Owl
# *DOCUMENTATION*
diff --git a/arch/Kconfig b/arch/Kconfig
index 71b9272acb28..5ea3e3838c21 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -223,6 +223,9 @@ config HAVE_FUNCTION_DESCRIPTORS
config TRACE_IRQFLAGS_SUPPORT
bool
+config TRACE_IRQFLAGS_NMI_SUPPORT
+ bool
+
#
# An arch should select this if it provides all these things:
#
diff --git a/arch/arc/kernel/jump_label.c b/arch/arc/kernel/jump_label.c
index b8600dc325b5..70b74a5d047b 100644
--- a/arch/arc/kernel/jump_label.c
+++ b/arch/arc/kernel/jump_label.c
@@ -96,19 +96,6 @@ void arch_jump_label_transform(struct jump_entry *entry,
flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);
}
-void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
-{
- /*
- * We use only one NOP type (1x, 4 byte) in arch_static_branch, so
- * there's no need to patch an identical NOP over the top of it here.
- * The generic code calls 'arch_jump_label_transform' if the NOP needs
- * to be replaced by a branch, so 'arch_jump_label_transform_static' is
- * never called with type other than JUMP_LABEL_NOP.
- */
- BUG_ON(type != JUMP_LABEL_NOP);
-}
-
#ifdef CONFIG_ARC_DBG_JUMP_LABEL
#define SELFTEST_MSG "ARC: instruction generation self-test: "
diff --git a/arch/arm/boot/dts/lan966x.dtsi b/arch/arm/boot/dts/lan966x.dtsi
index 3cb02fffe716..38e90a31d2dd 100644
--- a/arch/arm/boot/dts/lan966x.dtsi
+++ b/arch/arm/boot/dts/lan966x.dtsi
@@ -38,7 +38,7 @@
sys_clk: sys_clk {
compatible = "fixed-clock";
#clock-cells = <0>;
- clock-frequency = <162500000>;
+ clock-frequency = <165625000>;
};
cpu_clk: cpu_clk {
diff --git a/arch/arm/include/asm/dma.h b/arch/arm/include/asm/dma.h
index a81dda65c576..45180a2cc47c 100644
--- a/arch/arm/include/asm/dma.h
+++ b/arch/arm/include/asm/dma.h
@@ -10,7 +10,7 @@
#else
#define MAX_DMA_ADDRESS ({ \
extern phys_addr_t arm_dma_zone_size; \
- arm_dma_zone_size && arm_dma_zone_size < (0x10000000 - PAGE_OFFSET) ? \
+ arm_dma_zone_size && arm_dma_zone_size < (0x100000000ULL - PAGE_OFFSET) ? \
(PAGE_OFFSET + arm_dma_zone_size) : 0xffffffffUL; })
#endif
diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index eba7cbc93b86..7fcdc785366c 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -139,11 +139,9 @@ extern void __iomem *__arm_ioremap_caller(phys_addr_t, size_t, unsigned int,
extern void __iomem *__arm_ioremap_pfn(unsigned long, unsigned long, size_t, unsigned int);
extern void __iomem *__arm_ioremap_exec(phys_addr_t, size_t, bool cached);
void __arm_iomem_set_ro(void __iomem *ptr, size_t size);
-extern void __iounmap(volatile void __iomem *addr);
extern void __iomem * (*arch_ioremap_caller)(phys_addr_t, size_t,
unsigned int, void *);
-extern void (*arch_iounmap)(volatile void __iomem *);
/*
* Bad read/write accesses...
@@ -380,7 +378,7 @@ void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size);
#define ioremap_wc ioremap_wc
#define ioremap_wt ioremap_wc
-void iounmap(volatile void __iomem *iomem_cookie);
+void iounmap(volatile void __iomem *io_addr);
#define iounmap iounmap
void *arch_memremap_wb(phys_addr_t phys_addr, size_t size);
diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c
index 303b3ab87f7e..eb9c24b6e8e2 100644
--- a/arch/arm/kernel/jump_label.c
+++ b/arch/arm/kernel/jump_label.c
@@ -27,9 +27,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
{
__arch_jump_label_transform(entry, type, false);
}
-
-void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
-{
- __arch_jump_label_transform(entry, type, true);
-}
diff --git a/arch/arm/lib/findbit.S b/arch/arm/lib/findbit.S
index b5e8b9ae4c7d..7fd3600db8ef 100644
--- a/arch/arm/lib/findbit.S
+++ b/arch/arm/lib/findbit.S
@@ -40,8 +40,8 @@ ENDPROC(_find_first_zero_bit_le)
* Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
*/
ENTRY(_find_next_zero_bit_le)
- teq r1, #0
- beq 3b
+ cmp r2, r1
+ bhs 3b
ands ip, r2, #7
beq 1b @ If new byte, goto old routine
ARM( ldrb r3, [r0, r2, lsr #3] )
@@ -81,8 +81,8 @@ ENDPROC(_find_first_bit_le)
* Prototype: int find_next_zero_bit(void *addr, unsigned int maxbit, int offset)
*/
ENTRY(_find_next_bit_le)
- teq r1, #0
- beq 3b
+ cmp r2, r1
+ bhs 3b
ands ip, r2, #7
beq 1b @ If new byte, goto old routine
ARM( ldrb r3, [r0, r2, lsr #3] )
@@ -115,8 +115,8 @@ ENTRY(_find_first_zero_bit_be)
ENDPROC(_find_first_zero_bit_be)
ENTRY(_find_next_zero_bit_be)
- teq r1, #0
- beq 3b
+ cmp r2, r1
+ bhs 3b
ands ip, r2, #7
beq 1b @ If new byte, goto old routine
eor r3, r2, #0x18 @ big endian byte ordering
@@ -149,8 +149,8 @@ ENTRY(_find_first_bit_be)
ENDPROC(_find_first_bit_be)
ENTRY(_find_next_bit_be)
- teq r1, #0
- beq 3b
+ cmp r2, r1
+ bhs 3b
ands ip, r2, #7
beq 1b @ If new byte, goto old routine
eor r3, r2, #0x18 @ big endian byte ordering
diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c
index c546356d0f02..5738496717e2 100644
--- a/arch/arm/mach-pxa/corgi.c
+++ b/arch/arm/mach-pxa/corgi.c
@@ -549,7 +549,7 @@ static struct pxa2xx_spi_controller corgi_spi_info = {
};
static struct gpiod_lookup_table corgi_spi_gpio_table = {
- .dev_id = "pxa2xx-spi.1",
+ .dev_id = "spi1",
.table = {
GPIO_LOOKUP_IDX("gpio-pxa", CORGI_GPIO_ADS7846_CS, "cs", 0, GPIO_ACTIVE_LOW),
GPIO_LOOKUP_IDX("gpio-pxa", CORGI_GPIO_LCDCON_CS, "cs", 1, GPIO_ACTIVE_LOW),
diff --git a/arch/arm/mach-pxa/hx4700.c b/arch/arm/mach-pxa/hx4700.c
index 2ae06edf413c..2fd665944103 100644
--- a/arch/arm/mach-pxa/hx4700.c
+++ b/arch/arm/mach-pxa/hx4700.c
@@ -635,7 +635,7 @@ static struct pxa2xx_spi_controller pxa_ssp2_master_info = {
};
static struct gpiod_lookup_table pxa_ssp2_gpio_table = {
- .dev_id = "pxa2xx-spi.2",
+ .dev_id = "spi2",
.table = {
GPIO_LOOKUP_IDX("gpio-pxa", GPIO88_HX4700_TSC2046_CS, "cs", 0, GPIO_ACTIVE_LOW),
{ },
diff --git a/arch/arm/mach-pxa/icontrol.c b/arch/arm/mach-pxa/icontrol.c
index 753fe166ab68..624088257cfc 100644
--- a/arch/arm/mach-pxa/icontrol.c
+++ b/arch/arm/mach-pxa/icontrol.c
@@ -140,7 +140,7 @@ struct platform_device pxa_spi_ssp4 = {
};
static struct gpiod_lookup_table pxa_ssp3_gpio_table = {
- .dev_id = "pxa2xx-spi.3",
+ .dev_id = "spi3",
.table = {
GPIO_LOOKUP_IDX("gpio-pxa", ICONTROL_MCP251x_nCS1, "cs", 0, GPIO_ACTIVE_LOW),
GPIO_LOOKUP_IDX("gpio-pxa", ICONTROL_MCP251x_nCS2, "cs", 1, GPIO_ACTIVE_LOW),
@@ -149,7 +149,7 @@ static struct gpiod_lookup_table pxa_ssp3_gpio_table = {
};
static struct gpiod_lookup_table pxa_ssp4_gpio_table = {
- .dev_id = "pxa2xx-spi.4",
+ .dev_id = "spi4",
.table = {
GPIO_LOOKUP_IDX("gpio-pxa", ICONTROL_MCP251x_nCS3, "cs", 0, GPIO_ACTIVE_LOW),
GPIO_LOOKUP_IDX("gpio-pxa", ICONTROL_MCP251x_nCS4, "cs", 1, GPIO_ACTIVE_LOW),
diff --git a/arch/arm/mach-pxa/littleton.c b/arch/arm/mach-pxa/littleton.c
index f98dc61e87af..98423a96f440 100644
--- a/arch/arm/mach-pxa/littleton.c
+++ b/arch/arm/mach-pxa/littleton.c
@@ -207,7 +207,7 @@ static struct spi_board_info littleton_spi_devices[] __initdata = {
};
static struct gpiod_lookup_table littleton_spi_gpio_table = {
- .dev_id = "pxa2xx-spi.2",
+ .dev_id = "spi2",
.table = {
GPIO_LOOKUP_IDX("gpio-pxa", LITTLETON_GPIO_LCD_CS, "cs", 0, GPIO_ACTIVE_LOW),
{ },
diff --git a/arch/arm/mach-pxa/magician.c b/arch/arm/mach-pxa/magician.c
index 20456a55c4c5..0827ebca1d38 100644
--- a/arch/arm/mach-pxa/magician.c
+++ b/arch/arm/mach-pxa/magician.c
@@ -994,7 +994,7 @@ static struct pxa2xx_spi_controller magician_spi_info = {
};
static struct gpiod_lookup_table magician_spi_gpio_table = {
- .dev_id = "pxa2xx-spi.2",
+ .dev_id = "spi2",
.table = {
/* NOTICE must be GPIO, incompatibility with hw PXA SPI framing */
GPIO_LOOKUP_IDX("gpio-pxa", GPIO14_MAGICIAN_TSC2046_CS, "cs", 0, GPIO_ACTIVE_LOW),
diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index dd88953adc9d..9964729cd428 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -578,7 +578,7 @@ static struct pxa2xx_spi_controller spitz_spi_info = {
};
static struct gpiod_lookup_table spitz_spi_gpio_table = {
- .dev_id = "pxa2xx-spi.2",
+ .dev_id = "spi2",
.table = {
GPIO_LOOKUP_IDX("gpio-pxa", SPITZ_GPIO_ADS7846_CS, "cs", 0, GPIO_ACTIVE_LOW),
GPIO_LOOKUP_IDX("gpio-pxa", SPITZ_GPIO_LCDCON_CS, "cs", 1, GPIO_ACTIVE_LOW),
diff --git a/arch/arm/mach-pxa/z2.c b/arch/arm/mach-pxa/z2.c
index d03520555497..c4d4162a7e6e 100644
--- a/arch/arm/mach-pxa/z2.c
+++ b/arch/arm/mach-pxa/z2.c
@@ -623,7 +623,7 @@ static struct pxa2xx_spi_controller pxa_ssp2_master_info = {
};
static struct gpiod_lookup_table pxa_ssp1_gpio_table = {
- .dev_id = "pxa2xx-spi.1",
+ .dev_id = "spi1",
.table = {
GPIO_LOOKUP_IDX("gpio-pxa", GPIO24_ZIPITZ2_WIFI_CS, "cs", 0, GPIO_ACTIVE_LOW),
{ },
@@ -631,7 +631,7 @@ static struct gpiod_lookup_table pxa_ssp1_gpio_table = {
};
static struct gpiod_lookup_table pxa_ssp2_gpio_table = {
- .dev_id = "pxa2xx-spi.2",
+ .dev_id = "spi2",
.table = {
GPIO_LOOKUP_IDX("gpio-pxa", GPIO88_ZIPITZ2_LCD_CS, "cs", 0, GPIO_ACTIVE_LOW),
{ },
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 576c0e6c92fc..2129070065c3 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -418,7 +418,7 @@ void *arch_memremap_wb(phys_addr_t phys_addr, size_t size)
__builtin_return_address(0));
}
-void __iounmap(volatile void __iomem *io_addr)
+void iounmap(volatile void __iomem *io_addr)
{
void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr);
struct static_vm *svm;
@@ -446,13 +446,6 @@ void __iounmap(volatile void __iomem *io_addr)
vunmap(addr);
}
-
-void (*arch_iounmap)(volatile void __iomem *) = __iounmap;
-
-void iounmap(volatile void __iomem *cookie)
-{
- arch_iounmap(cookie);
-}
EXPORT_SYMBOL(iounmap);
#if defined(CONFIG_PCI) || IS_ENABLED(CONFIG_PCMCIA)
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index 2658f52903da..c42debaded95 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -230,14 +230,7 @@ void *arch_memremap_wb(phys_addr_t phys_addr, size_t size)
return (void *)phys_addr;
}
-void __iounmap(volatile void __iomem *addr)
-{
-}
-EXPORT_SYMBOL(__iounmap);
-
-void (*arch_iounmap)(volatile void __iomem *);
-
-void iounmap(volatile void __iomem *addr)
+void iounmap(volatile void __iomem *io_addr)
{
}
EXPORT_SYMBOL(iounmap);
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1652a9800ebe..340e61199057 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -101,6 +101,7 @@ config ARM64
select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
select ARCH_WANT_LD_ORPHAN_WARN
select ARCH_WANTS_NO_INSTR
+ select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARM_AMBA
select ARM_ARCH_TIMER
@@ -126,6 +127,7 @@ config ARM64
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_IDLE_POLL_SETUP
+ select GENERIC_IOREMAP
select GENERIC_IRQ_IPI
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
@@ -188,6 +190,7 @@ config ARM64
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_GCC_PLUGINS
select HAVE_HW_BREAKPOINT if PERF_EVENTS
+ select HAVE_IOREMAP_PROT
select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_KVM
select HAVE_NMI
@@ -226,6 +229,7 @@ config ARM64
select THREAD_INFO_IN_TASK
select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
select TRACE_IRQFLAGS_SUPPORT
+ select TRACE_IRQFLAGS_NMI_SUPPORT
help
ARM 64-bit (AArch64) Linux support.
@@ -503,6 +507,22 @@ config ARM64_ERRATUM_834220
If unsure, say Y.
+config ARM64_ERRATUM_1742098
+ bool "Cortex-A57/A72: 1742098: ELR recorded incorrectly on interrupt taken between cryptographic instructions in a sequence"
+ depends on COMPAT
+ default y
+ help
+ This option removes the AES hwcap for aarch32 user-space to
+ workaround erratum 1742098 on Cortex-A57 and Cortex-A72.
+
+ Affected parts may corrupt the AES state if an interrupt is
+ taken between a pair of AES instructions. These instructions
+ are only present if the cryptography extensions are present.
+ All software should have a fallback implementation for CPUs
+ that don't implement the cryptography extensions.
+
+ If unsure, say Y.
+
config ARM64_ERRATUM_845719
bool "Cortex-A53: 845719: a load might read incorrect data"
depends on COMPAT
@@ -821,6 +841,23 @@ config ARM64_ERRATUM_2224489
If unsure, say Y.
+config ARM64_ERRATUM_2441009
+ bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
+ default y
+ select ARM64_WORKAROUND_REPEAT_TLBI
+ help
+ This option adds a workaround for ARM Cortex-A510 erratum #2441009.
+
+ Under very rare circumstances, affected Cortex-A510 CPUs
+ may not handle a race between a break-before-make sequence on one
+ CPU, and another CPU accessing the same page. This could allow a
+ store to a page that has been unmapped.
+
+ Work around this by adding the affected CPUs to the list that needs
+ TLB sequences to be done twice.
+
+ If unsure, say Y.
+
config ARM64_ERRATUM_2064142
bool "Cortex-A510: 2064142: workaround TRBE register writes while disabled"
depends on CORESIGHT_TRBE
diff --git a/arch/arm64/boot/Makefile b/arch/arm64/boot/Makefile
index ebe80faab883..a0e3dedd2883 100644
--- a/arch/arm64/boot/Makefile
+++ b/arch/arm64/boot/Makefile
@@ -16,7 +16,7 @@
OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
-targets := Image Image.bz2 Image.gz Image.lz4 Image.lzma Image.lzo
+targets := Image Image.bz2 Image.gz Image.lz4 Image.lzma Image.lzo Image.zst
$(obj)/Image: vmlinux FORCE
$(call if_changed,objcopy)
@@ -35,3 +35,6 @@ $(obj)/Image.lzma: $(obj)/Image FORCE
$(obj)/Image.lzo: $(obj)/Image FORCE
$(call if_changed,lzo)
+
+$(obj)/Image.zst: $(obj)/Image FORCE
+ $(call if_changed,zstd)
diff --git a/arch/arm64/include/asm/asm-extable.h b/arch/arm64/include/asm/asm-extable.h
index c39f2437e08e..980d1dd8e1a3 100644
--- a/arch/arm64/include/asm/asm-extable.h
+++ b/arch/arm64/include/asm/asm-extable.h
@@ -2,12 +2,27 @@
#ifndef __ASM_ASM_EXTABLE_H
#define __ASM_ASM_EXTABLE_H
+#include <linux/bits.h>
+#include <asm/gpr-num.h>
+
#define EX_TYPE_NONE 0
-#define EX_TYPE_FIXUP 1
-#define EX_TYPE_BPF 2
-#define EX_TYPE_UACCESS_ERR_ZERO 3
+#define EX_TYPE_BPF 1
+#define EX_TYPE_UACCESS_ERR_ZERO 2
+#define EX_TYPE_KACCESS_ERR_ZERO 3
#define EX_TYPE_LOAD_UNALIGNED_ZEROPAD 4
+/* Data fields for EX_TYPE_UACCESS_ERR_ZERO */
+#define EX_DATA_REG_ERR_SHIFT 0
+#define EX_DATA_REG_ERR GENMASK(4, 0)
+#define EX_DATA_REG_ZERO_SHIFT 5
+#define EX_DATA_REG_ZERO GENMASK(9, 5)
+
+/* Data fields for EX_TYPE_LOAD_UNALIGNED_ZEROPAD */
+#define EX_DATA_REG_DATA_SHIFT 0
+#define EX_DATA_REG_DATA GENMASK(4, 0)
+#define EX_DATA_REG_ADDR_SHIFT 5
+#define EX_DATA_REG_ADDR GENMASK(9, 5)
+
#ifdef __ASSEMBLY__
#define __ASM_EXTABLE_RAW(insn, fixup, type, data) \
@@ -19,31 +34,45 @@
.short (data); \
.popsection;
+#define EX_DATA_REG(reg, gpr) \
+ (.L__gpr_num_##gpr << EX_DATA_REG_##reg##_SHIFT)
+
+#define _ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, err, zero) \
+ __ASM_EXTABLE_RAW(insn, fixup, \
+ EX_TYPE_UACCESS_ERR_ZERO, \
+ ( \
+ EX_DATA_REG(ERR, err) | \
+ EX_DATA_REG(ZERO, zero) \
+ ))
+
+#define _ASM_EXTABLE_UACCESS_ERR(insn, fixup, err) \
+ _ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, err, wzr)
+
+#define _ASM_EXTABLE_UACCESS(insn, fixup) \
+ _ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, wzr, wzr)
+
/*
- * Create an exception table entry for `insn`, which will branch to `fixup`
+ * Create an exception table entry for uaccess `insn`, which will branch to `fixup`
* when an unhandled fault is taken.
*/
- .macro _asm_extable, insn, fixup
- __ASM_EXTABLE_RAW(\insn, \fixup, EX_TYPE_FIXUP, 0)
+ .macro _asm_extable_uaccess, insn, fixup
+ _ASM_EXTABLE_UACCESS(\insn, \fixup)
.endm
/*
* Create an exception table entry for `insn` if `fixup` is provided. Otherwise
* do nothing.
*/
- .macro _cond_extable, insn, fixup
- .ifnc \fixup,
- _asm_extable \insn, \fixup
+ .macro _cond_uaccess_extable, insn, fixup
+ .ifnc \fixup,
+ _asm_extable_uaccess \insn, \fixup
.endif
.endm
#else /* __ASSEMBLY__ */
-#include <linux/bits.h>
#include <linux/stringify.h>
-#include <asm/gpr-num.h>
-
#define __ASM_EXTABLE_RAW(insn, fixup, type, data) \
".pushsection __ex_table, \"a\"\n" \
".align 2\n" \
@@ -53,14 +82,6 @@
".short (" data ")\n" \
".popsection\n"
-#define _ASM_EXTABLE(insn, fixup) \
- __ASM_EXTABLE_RAW(#insn, #fixup, __stringify(EX_TYPE_FIXUP), "0")
-
-#define EX_DATA_REG_ERR_SHIFT 0
-#define EX_DATA_REG_ERR GENMASK(4, 0)
-#define EX_DATA_REG_ZERO_SHIFT 5
-#define EX_DATA_REG_ZERO GENMASK(9, 5)
-
#define EX_DATA_REG(reg, gpr) \
"((.L__gpr_num_" #gpr ") << " __stringify(EX_DATA_REG_##reg##_SHIFT) ")"
@@ -73,13 +94,23 @@
EX_DATA_REG(ZERO, zero) \
")")
+#define _ASM_EXTABLE_KACCESS_ERR_ZERO(insn, fixup, err, zero) \
+ __DEFINE_ASM_GPR_NUMS \
+ __ASM_EXTABLE_RAW(#insn, #fixup, \
+ __stringify(EX_TYPE_KACCESS_ERR_ZERO), \
+ "(" \
+ EX_DATA_REG(ERR, err) " | " \
+ EX_DATA_REG(ZERO, zero) \
+ ")")
+
#define _ASM_EXTABLE_UACCESS_ERR(insn, fixup, err) \
_ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, err, wzr)
-#define EX_DATA_REG_DATA_SHIFT 0
-#define EX_DATA_REG_DATA GENMASK(4, 0)
-#define EX_DATA_REG_ADDR_SHIFT 5
-#define EX_DATA_REG_ADDR GENMASK(9, 5)
+#define _ASM_EXTABLE_UACCESS(insn, fixup) \
+ _ASM_EXTABLE_UACCESS_ERR_ZERO(insn, fixup, wzr, wzr)
+
+#define _ASM_EXTABLE_KACCESS_ERR(insn, fixup, err) \
+ _ASM_EXTABLE_KACCESS_ERR_ZERO(insn, fixup, err, wzr)
#define _ASM_EXTABLE_LOAD_UNALIGNED_ZEROPAD(insn, fixup, data, addr) \
__DEFINE_ASM_GPR_NUMS \
diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 0557af834e03..75b211c98dea 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -61,7 +61,7 @@ alternative_else_nop_endif
#define USER(l, x...) \
9999: x; \
- _asm_extable 9999b, l
+ _asm_extable_uaccess 9999b, l
/*
* Generate the assembly for LDTR/STTR with exception table entries.
@@ -73,8 +73,8 @@ alternative_else_nop_endif
8889: ldtr \reg2, [\addr, #8];
add \addr, \addr, \post_inc;
- _asm_extable 8888b,\l;
- _asm_extable 8889b,\l;
+ _asm_extable_uaccess 8888b, \l;
+ _asm_extable_uaccess 8889b, \l;
.endm
.macro user_stp l, reg1, reg2, addr, post_inc
@@ -82,14 +82,14 @@ alternative_else_nop_endif
8889: sttr \reg2, [\addr, #8];
add \addr, \addr, \post_inc;
- _asm_extable 8888b,\l;
- _asm_extable 8889b,\l;
+ _asm_extable_uaccess 8888b,\l;
+ _asm_extable_uaccess 8889b,\l;
.endm
.macro user_ldst l, inst, reg, addr, post_inc
8888: \inst \reg, [\addr];
add \addr, \addr, \post_inc;
- _asm_extable 8888b,\l;
+ _asm_extable_uaccess 8888b, \l;
.endm
#endif
diff --git a/arch/arm64/include/asm/asm_pointer_auth.h b/arch/arm64/include/asm/asm_pointer_auth.h
index ead62f7dd269..13ecc79854ee 100644
--- a/arch/arm64/include/asm/asm_pointer_auth.h
+++ b/arch/arm64/include/asm/asm_pointer_auth.h
@@ -59,9 +59,9 @@ alternative_else_nop_endif
.macro __ptrauth_keys_init_cpu tsk, tmp1, tmp2, tmp3
mrs \tmp1, id_aa64isar1_el1
- ubfx \tmp1, \tmp1, #ID_AA64ISAR1_APA_SHIFT, #8
+ ubfx \tmp1, \tmp1, #ID_AA64ISAR1_EL1_APA_SHIFT, #8
mrs_s \tmp2, SYS_ID_AA64ISAR2_EL1
- ubfx \tmp2, \tmp2, #ID_AA64ISAR2_APA3_SHIFT, #4
+ ubfx \tmp2, \tmp2, #ID_AA64ISAR2_EL1_APA3_SHIFT, #4
orr \tmp1, \tmp1, \tmp2
cbz \tmp1, .Lno_addr_auth\@
mov_q \tmp1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 8c5a61aeaf8e..5846145be523 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -360,6 +360,20 @@ alternative_cb_end
.endm
/*
+ * idmap_get_t0sz - get the T0SZ value needed to cover the ID map
+ *
+ * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
+ * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
+ * this number conveniently equals the number of leading zeroes in
+ * the physical address of _end.
+ */
+ .macro idmap_get_t0sz, reg
+ adrp \reg, _end
+ orr \reg, \reg, #(1 << VA_BITS_MIN) - 1
+ clz \reg, \reg
+ .endm
+
+/*
* tcr_compute_pa_size - set TCR.(I)PS to the highest supported
* ID_AA64MMFR0_EL1.PARange value
*
@@ -423,7 +437,7 @@ alternative_endif
b.lo .Ldcache_op\@
dsb \domain
- _cond_extable .Ldcache_op\@, \fixup
+ _cond_uaccess_extable .Ldcache_op\@, \fixup
.endm
/*
@@ -462,7 +476,19 @@ alternative_endif
dsb ish
isb
- _cond_extable .Licache_op\@, \fixup
+ _cond_uaccess_extable .Licache_op\@, \fixup
+ .endm
+
+/*
+ * load_ttbr1 - install @pgtbl as a TTBR1 page table
+ * pgtbl preserved
+ * tmp1/tmp2 clobbered, either may overlap with pgtbl
+ */
+ .macro load_ttbr1, pgtbl, tmp1, tmp2
+ phys_to_ttbr \tmp1, \pgtbl
+ offset_ttbr1 \tmp1, \tmp2
+ msr ttbr1_el1, \tmp1
+ isb
.endm
/*
@@ -478,10 +504,7 @@ alternative_endif
isb
tlbi vmalle1
dsb nsh
- phys_to_ttbr \tmp, \page_table
- offset_ttbr1 \tmp, \tmp2
- msr ttbr1_el1, \tmp
- isb
+ load_ttbr1 \page_table, \tmp, \tmp2
.endm
/*
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 9f3e2c3d2ca0..2cfc4245d2e2 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -50,13 +50,13 @@
#define pmr_sync() do {} while (0)
#endif
-#define mb() dsb(sy)
-#define rmb() dsb(ld)
-#define wmb() dsb(st)
+#define __mb() dsb(sy)
+#define __rmb() dsb(ld)
+#define __wmb() dsb(st)
-#define dma_mb() dmb(osh)
-#define dma_rmb() dmb(oshld)
-#define dma_wmb() dmb(oshst)
+#define __dma_mb() dmb(osh)
+#define __dma_rmb() dmb(oshld)
+#define __dma_wmb() dmb(oshst)
#define io_stop_wc() dgh()
diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 7c2181c72116..ca9b487112cc 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -5,34 +5,9 @@
#ifndef __ASM_CACHE_H
#define __ASM_CACHE_H
-#include <asm/cputype.h>
-#include <asm/mte-def.h>
-
-#define CTR_L1IP_SHIFT 14
-#define CTR_L1IP_MASK 3
-#define CTR_DMINLINE_SHIFT 16
-#define CTR_IMINLINE_SHIFT 0
-#define CTR_IMINLINE_MASK 0xf
-#define CTR_ERG_SHIFT 20
-#define CTR_CWG_SHIFT 24
-#define CTR_CWG_MASK 15
-#define CTR_IDC_SHIFT 28
-#define CTR_DIC_SHIFT 29
-
-#define CTR_CACHE_MINLINE_MASK \
- (0xf << CTR_DMINLINE_SHIFT | CTR_IMINLINE_MASK << CTR_IMINLINE_SHIFT)
-
-#define CTR_L1IP(ctr) (((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK)
-
-#define ICACHE_POLICY_VPIPT 0
-#define ICACHE_POLICY_RESERVED 1
-#define ICACHE_POLICY_VIPT 2
-#define ICACHE_POLICY_PIPT 3
-
#define L1_CACHE_SHIFT (6)
#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
-
#define CLIDR_LOUU_SHIFT 27
#define CLIDR_LOC_SHIFT 24
#define CLIDR_LOUIS_SHIFT 21
@@ -55,6 +30,10 @@
#include <linux/bitops.h>
#include <linux/kasan-enabled.h>
+#include <asm/cputype.h>
+#include <asm/mte-def.h>
+#include <asm/sysreg.h>
+
#ifdef CONFIG_KASAN_SW_TAGS
#define ARCH_SLAB_MINALIGN (1ULL << KASAN_SHADOW_SCALE_SHIFT)
#elif defined(CONFIG_KASAN_HW_TAGS)
@@ -66,6 +45,12 @@ static inline unsigned int arch_slab_minalign(void)
#define arch_slab_minalign() arch_slab_minalign()
#endif
+#define CTR_CACHE_MINLINE_MASK \
+ (0xf << CTR_EL0_DMINLINE_SHIFT | \
+ CTR_EL0_IMINLINE_MASK << CTR_EL0_IMINLINE_SHIFT)
+
+#define CTR_L1IP(ctr) SYS_FIELD_GET(CTR_EL0, L1Ip, ctr)
+
#define ICACHEF_ALIASING 0
#define ICACHEF_VPIPT 1
extern unsigned long __icache_flags;
@@ -86,7 +71,7 @@ static __always_inline int icache_is_vpipt(void)
static inline u32 cache_type_cwg(void)
{
- return (read_cpuid_cachetype() >> CTR_CWG_SHIFT) & CTR_CWG_MASK;
+ return (read_cpuid_cachetype() >> CTR_EL0_CWG_SHIFT) & CTR_EL0_CWG_MASK;
}
#define __read_mostly __section(".data..read_mostly")
@@ -120,12 +105,12 @@ static inline u32 __attribute_const__ read_cpuid_effective_cachetype(void)
{
u32 ctr = read_cpuid_cachetype();
- if (!(ctr & BIT(CTR_IDC_SHIFT))) {
+ if (!(ctr & BIT(CTR_EL0_IDC_SHIFT))) {
u64 clidr = read_sysreg(clidr_el1);
if (CLIDR_LOC(clidr) == 0 ||
(CLIDR_LOUIS(clidr) == 0 && CLIDR_LOUU(clidr) == 0))
- ctr |= BIT(CTR_IDC_SHIFT);
+ ctr |= BIT(CTR_EL0_IDC_SHIFT);
}
return ctr;
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 5a228e203ef9..37185e978aeb 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -105,13 +105,6 @@ static inline void flush_icache_range(unsigned long start, unsigned long end)
#define flush_icache_range flush_icache_range
/*
- * Cache maintenance functions used by the DMA API. No to be used directly.
- */
-extern void __dma_map_area(const void *, size_t, int);
-extern void __dma_unmap_area(const void *, size_t, int);
-extern void __dma_flush_area(const void *, size_t);
-
-/*
* Copy user data from/to a page which is mapped into a different
* processes address space. Really, we want to allow our "user
* space" model to handle this.
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index 115cdec1ae87..fd7a92219eea 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -46,6 +46,7 @@ struct cpuinfo_arm64 {
u64 reg_midr;
u64 reg_revidr;
u64 reg_gmid;
+ u64 reg_smidr;
u64 reg_id_aa64dfr0;
u64 reg_id_aa64dfr1;
diff --git a/arch/arm64/include/asm/cpu_ops.h b/arch/arm64/include/asm/cpu_ops.h
index e95c4df83911..a444c8915e88 100644
--- a/arch/arm64/include/asm/cpu_ops.h
+++ b/arch/arm64/include/asm/cpu_ops.h
@@ -31,11 +31,6 @@
* @cpu_die: Makes a cpu leave the kernel. Must not fail. Called from the
* cpu being killed.
* @cpu_kill: Ensures a cpu has left the kernel. Called from another cpu.
- * @cpu_init_idle: Reads any data necessary to initialize CPU idle states for
- * a proposed logical id.
- * @cpu_suspend: Suspends a cpu and saves the required context. May fail owing
- * to wrong parameters or error conditions. Called from the
- * CPU being suspended. Must be called with IRQs disabled.
*/
struct cpu_operations {
const char *name;
@@ -49,10 +44,6 @@ struct cpu_operations {
void (*cpu_die)(unsigned int cpu);
int (*cpu_kill)(unsigned int cpu);
#endif
-#ifdef CONFIG_CPU_IDLE
- int (*cpu_init_idle)(unsigned int);
- int (*cpu_suspend)(unsigned long);
-#endif
};
int __init init_cpu_ops(int cpu);
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 14a8f3d93add..fd7d75a275f6 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -11,7 +11,7 @@
#include <asm/hwcap.h>
#include <asm/sysreg.h>
-#define MAX_CPU_FEATURES 64
+#define MAX_CPU_FEATURES 128
#define cpu_feature(x) KERNEL_HWCAP_ ## x
#ifndef __ASSEMBLY__
@@ -673,7 +673,7 @@ static inline bool supports_clearbhb(int scope)
isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
return cpuid_feature_extract_unsigned_field(isar2,
- ID_AA64ISAR2_CLEARBHB_SHIFT);
+ ID_AA64ISAR2_EL1_BC_SHIFT);
}
const struct cpumask *system_32bit_el0_cpumask(void);
@@ -908,7 +908,10 @@ static inline unsigned int get_vmid_bits(u64 mmfr1)
}
extern struct arm64_ftr_override id_aa64mmfr1_override;
+extern struct arm64_ftr_override id_aa64pfr0_override;
extern struct arm64_ftr_override id_aa64pfr1_override;
+extern struct arm64_ftr_override id_aa64zfr0_override;
+extern struct arm64_ftr_override id_aa64smfr0_override;
extern struct arm64_ftr_override id_aa64isar1_override;
extern struct arm64_ftr_override id_aa64isar2_override;
diff --git a/arch/arm64/include/asm/cpuidle.h b/arch/arm64/include/asm/cpuidle.h
index 14a19d1141bd..2047713e097d 100644
--- a/arch/arm64/include/asm/cpuidle.h
+++ b/arch/arm64/include/asm/cpuidle.h
@@ -4,21 +4,6 @@
#include <asm/proc-fns.h>
-#ifdef CONFIG_CPU_IDLE
-extern int arm_cpuidle_init(unsigned int cpu);
-extern int arm_cpuidle_suspend(int index);
-#else
-static inline int arm_cpuidle_init(unsigned int cpu)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int arm_cpuidle_suspend(int index)
-{
- return -EOPNOTSUPP;
-}
-#endif
-
#ifdef CONFIG_ARM64_PSEUDO_NMI
#include <asm/arch_gicv3.h>
diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 34ceff08cac4..2630faa5bc08 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -129,64 +129,6 @@
msr cptr_el2, x0 // Disable copro. traps to EL2
.endm
-/* SVE register access */
-.macro __init_el2_nvhe_sve
- mrs x1, id_aa64pfr0_el1
- ubfx x1, x1, #ID_AA64PFR0_SVE_SHIFT, #4
- cbz x1, .Lskip_sve_\@
-
- bic x0, x0, #CPTR_EL2_TZ // Also disable SVE traps
- msr cptr_el2, x0 // Disable copro. traps to EL2
- isb
- mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector
- msr_s SYS_ZCR_EL2, x1 // length for EL1.
-.Lskip_sve_\@:
-.endm
-
-/* SME register access and priority mapping */
-.macro __init_el2_nvhe_sme
- mrs x1, id_aa64pfr1_el1
- ubfx x1, x1, #ID_AA64PFR1_SME_SHIFT, #4
- cbz x1, .Lskip_sme_\@
-
- bic x0, x0, #CPTR_EL2_TSM // Also disable SME traps
- msr cptr_el2, x0 // Disable copro. traps to EL2
- isb
-
- mrs x1, sctlr_el2
- orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps
- msr sctlr_el2, x1
- isb
-
- mov x1, #0 // SMCR controls
-
- mrs_s x2, SYS_ID_AA64SMFR0_EL1
- ubfx x2, x2, #ID_AA64SMFR0_FA64_SHIFT, #1 // Full FP in SM?
- cbz x2, .Lskip_sme_fa64_\@
-
- orr x1, x1, SMCR_ELx_FA64_MASK
-.Lskip_sme_fa64_\@:
-
- orr x1, x1, #SMCR_ELx_LEN_MASK // Enable full SME vector
- msr_s SYS_SMCR_EL2, x1 // length for EL1.
-
- mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported?
- ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1
- cbz x1, .Lskip_sme_\@
-
- msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal
-
- mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present?
- ubfx x1, x1, #ID_AA64MMFR1_HCX_SHIFT, #4
- cbz x1, .Lskip_sme_\@
-
- mrs_s x1, SYS_HCRX_EL2
- orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping
- msr_s SYS_HCRX_EL2, x1
-
-.Lskip_sme_\@:
-.endm
-
/* Disable any fine grained traps */
.macro __init_el2_fgt
mrs x1, id_aa64mmfr0_el1
@@ -250,8 +192,6 @@
__init_el2_hstr
__init_el2_nvhe_idregs
__init_el2_nvhe_cptr
- __init_el2_nvhe_sve
- __init_el2_nvhe_sme
__init_el2_fgt
__init_el2_nvhe_prepare_eret
.endm
diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index daff882883f9..71ed5fdf718b 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -62,10 +62,12 @@ enum fixed_addresses {
#endif /* CONFIG_ACPI_APEI_GHES */
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#ifdef CONFIG_RELOCATABLE
+ FIX_ENTRY_TRAMP_TEXT4, /* one extra slot for the data page */
+#endif
FIX_ENTRY_TRAMP_TEXT3,
FIX_ENTRY_TRAMP_TEXT2,
FIX_ENTRY_TRAMP_TEXT1,
- FIX_ENTRY_TRAMP_DATA,
#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT1))
#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
__end_of_permanent_fixed_addresses,
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index aa443d8f8cfb..cef4ae7a3d8b 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -85,7 +85,7 @@
#define KERNEL_HWCAP_PACA __khwcap_feature(PACA)
#define KERNEL_HWCAP_PACG __khwcap_feature(PACG)
-#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 32)
+#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 64)
#define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP)
#define KERNEL_HWCAP_SVE2 __khwcap2_feature(SVE2)
#define KERNEL_HWCAP_SVEAES __khwcap2_feature(SVEAES)
@@ -118,6 +118,7 @@
#define KERNEL_HWCAP_SME_F32F32 __khwcap2_feature(SME_F32F32)
#define KERNEL_HWCAP_SME_FA64 __khwcap2_feature(SME_FA64)
#define KERNEL_HWCAP_WFXT __khwcap2_feature(WFXT)
+#define KERNEL_HWCAP_EBF16 __khwcap2_feature(EBF16)
/*
* This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 3995652daf81..87dd42d74afe 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -163,13 +163,16 @@ extern void __memset_io(volatile void __iomem *, int, size_t);
/*
* I/O memory mapping functions.
*/
-extern void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot);
-extern void iounmap(volatile void __iomem *addr);
-extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size);
-#define ioremap(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
-#define ioremap_wc(addr, size) __ioremap((addr), (size), __pgprot(PROT_NORMAL_NC))
-#define ioremap_np(addr, size) __ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRnE))
+bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot);
+#define ioremap_allowed ioremap_allowed
+
+#define _PAGE_IOREMAP PROT_DEVICE_nGnRE
+
+#define ioremap_wc(addr, size) \
+ ioremap_prot((addr), (size), PROT_NORMAL_NC)
+#define ioremap_np(addr, size) \
+ ioremap_prot((addr), (size), PROT_DEVICE_nGnRnE)
/*
* io{read,write}{16,32,64}be() macros
@@ -184,6 +187,15 @@ extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size);
#include <asm-generic/io.h>
+#define ioremap_cache ioremap_cache
+static inline void __iomem *ioremap_cache(phys_addr_t addr, size_t size)
+{
+ if (pfn_is_map_memory(__phys_to_pfn(addr)))
+ return (void __iomem *)__phys_to_virt(addr);
+
+ return ioremap_prot(addr, size, PROT_NORMAL);
+}
+
/*
* More restrictive address range checking than the default implementation
* (PHYS_OFFSET and PHYS_MASK taken into account).
diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 96dc0f7da258..02e59fa8f293 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -8,6 +8,7 @@
#ifndef __ASM_KERNEL_PGTABLE_H
#define __ASM_KERNEL_PGTABLE_H
+#include <asm/boot.h>
#include <asm/pgtable-hwdef.h>
#include <asm/sparsemem.h>
@@ -35,10 +36,8 @@
*/
#if ARM64_KERNEL_USES_PMD_MAPS
#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS - 1)
-#define IDMAP_PGTABLE_LEVELS (ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT) - 1)
#else
#define SWAPPER_PGTABLE_LEVELS (CONFIG_PGTABLE_LEVELS)
-#define IDMAP_PGTABLE_LEVELS (ARM64_HW_PGTABLE_LEVELS(PHYS_MASK_SHIFT))
#endif
@@ -87,7 +86,14 @@
+ EARLY_PUDS((vstart), (vend)) /* each PUD needs a next level page table */ \
+ EARLY_PMDS((vstart), (vend))) /* each PMD needs a next level page table */
#define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end))
-#define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE)
+
+/* the initial ID map may need two extra pages if it needs to be extended */
+#if VA_BITS < 48
+#define INIT_IDMAP_DIR_SIZE ((INIT_IDMAP_DIR_PAGES + 2) * PAGE_SIZE)
+#else
+#define INIT_IDMAP_DIR_SIZE (INIT_IDMAP_DIR_PAGES * PAGE_SIZE)
+#endif
+#define INIT_IDMAP_DIR_PAGES EARLY_PAGES(KIMAGE_VADDR, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE)
/* Initial memory map size */
#if ARM64_KERNEL_USES_PMD_MAPS
@@ -107,9 +113,11 @@
#define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S)
#if ARM64_KERNEL_USES_PMD_MAPS
-#define SWAPPER_MM_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS)
+#define SWAPPER_RW_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS)
+#define SWAPPER_RX_MMUFLAGS (SWAPPER_RW_MMUFLAGS | PMD_SECT_RDONLY)
#else
-#define SWAPPER_MM_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
+#define SWAPPER_RW_MMUFLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
+#define SWAPPER_RX_MMUFLAGS (SWAPPER_RW_MMUFLAGS | PTE_RDONLY)
#endif
/*
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 0af70d9abede..227d256cd4b9 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -174,7 +174,11 @@
#include <linux/types.h>
#include <asm/bug.h>
+#if VA_BITS > 48
extern u64 vabits_actual;
+#else
+#define vabits_actual ((u64)VA_BITS)
+#endif
extern s64 memstart_addr;
/* PHYS_OFFSET - the physical address of the start of memory. */
@@ -351,6 +355,11 @@ static inline void *phys_to_virt(phys_addr_t x)
})
void dump_mem_limit(void);
+
+static inline bool defer_reserve_crashkernel(void)
+{
+ return IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32);
+}
#endif /* !ASSEMBLY */
/*
diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index 6770667b34a3..c7ccd82db1d2 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -60,8 +60,7 @@ static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm)
* TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in
* physical memory, in which case it will be smaller.
*/
-extern u64 idmap_t0sz;
-extern u64 idmap_ptrs_per_pgd;
+extern int idmap_t0sz;
/*
* Ensure TCR.T0SZ is set to the provided value.
@@ -106,13 +105,18 @@ static inline void cpu_uninstall_idmap(void)
cpu_switch_mm(mm->pgd, mm);
}
-static inline void cpu_install_idmap(void)
+static inline void __cpu_install_idmap(pgd_t *idmap)
{
cpu_set_reserved_ttbr0();
local_flush_tlb_all();
cpu_set_idmap_tcr_t0sz();
- cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm);
+ cpu_switch_mm(lm_alias(idmap), &init_mm);
+}
+
+static inline void cpu_install_idmap(void)
+{
+ __cpu_install_idmap(idmap_pg_dir);
}
/*
@@ -143,7 +147,7 @@ static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz)
* Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
* avoiding the possibility of conflicting TLB entries being allocated.
*/
-static inline void __nocfi cpu_replace_ttbr1(pgd_t *pgdp)
+static inline void __nocfi cpu_replace_ttbr1(pgd_t *pgdp, pgd_t *idmap)
{
typedef void (ttbr_replace_func)(phys_addr_t);
extern ttbr_replace_func idmap_cpu_replace_ttbr1;
@@ -166,7 +170,7 @@ static inline void __nocfi cpu_replace_ttbr1(pgd_t *pgdp)
replace_phys = (void *)__pa_symbol(function_nocfi(idmap_cpu_replace_ttbr1));
- cpu_install_idmap();
+ __cpu_install_idmap(idmap);
replace_phys(ttbr1);
cpu_uninstall_idmap();
}
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index dd3d12bce07b..5ab8d163198f 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -281,10 +281,9 @@
*/
#ifdef CONFIG_ARM64_PA_BITS_52
/*
- * This should be GENMASK_ULL(47, 2).
* TTBR_ELx[1] is RES0 in this configuration.
*/
-#define TTBR_BADDR_MASK_52 (((UL(1) << 46) - 1) << 2)
+#define TTBR_BADDR_MASK_52 GENMASK_ULL(47, 2)
#endif
#ifdef CONFIG_ARM64_VA_BITS_52
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0b6632f18364..b5df82aa99e6 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -45,6 +45,12 @@
__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline bool arch_thp_swp_supported(void)
+{
+ return !system_supports_mte();
+}
+#define arch_thp_swp_supported arch_thp_swp_supported
+
/*
* Outside of a few very special situations (e.g. hibernation), we always
* use broadcast TLB invalidation instructions, therefore a spurious page
@@ -427,6 +433,16 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));
}
+/*
+ * Select all bits except the pfn
+ */
+static inline pgprot_t pte_pgprot(pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
+}
+
#ifdef CONFIG_NUMA_BALANCING
/*
* See the comment in include/linux/pgtable.h
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index 9e58749db21d..86eb0bfe3b38 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -272,8 +272,9 @@ void tls_preserve_current_state(void);
static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
{
+ s32 previous_syscall = regs->syscallno;
memset(regs, 0, sizeof(*regs));
- forget_syscall(regs);
+ regs->syscallno = previous_syscall;
regs->pc = pc;
if (system_uses_irq_prio_masking())
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 42ff95dba6da..7c71358d44c4 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -192,8 +192,6 @@
#define SYS_ID_AA64PFR0_EL1 sys_reg(3, 0, 0, 4, 0)
#define SYS_ID_AA64PFR1_EL1 sys_reg(3, 0, 0, 4, 1)
-#define SYS_ID_AA64ZFR0_EL1 sys_reg(3, 0, 0, 4, 4)
-#define SYS_ID_AA64SMFR0_EL1 sys_reg(3, 0, 0, 4, 5)
#define SYS_ID_AA64DFR0_EL1 sys_reg(3, 0, 0, 5, 0)
#define SYS_ID_AA64DFR1_EL1 sys_reg(3, 0, 0, 5, 1)
@@ -201,9 +199,6 @@
#define SYS_ID_AA64AFR0_EL1 sys_reg(3, 0, 0, 5, 4)
#define SYS_ID_AA64AFR1_EL1 sys_reg(3, 0, 0, 5, 5)
-#define SYS_ID_AA64ISAR1_EL1 sys_reg(3, 0, 0, 6, 1)
-#define SYS_ID_AA64ISAR2_EL1 sys_reg(3, 0, 0, 6, 2)
-
#define SYS_ID_AA64MMFR0_EL1 sys_reg(3, 0, 0, 7, 0)
#define SYS_ID_AA64MMFR1_EL1 sys_reg(3, 0, 0, 7, 1)
#define SYS_ID_AA64MMFR2_EL1 sys_reg(3, 0, 0, 7, 2)
@@ -410,12 +405,6 @@
#define SYS_MAIR_EL1 sys_reg(3, 0, 10, 2, 0)
#define SYS_AMAIR_EL1 sys_reg(3, 0, 10, 3, 0)
-#define SYS_LORSA_EL1 sys_reg(3, 0, 10, 4, 0)
-#define SYS_LOREA_EL1 sys_reg(3, 0, 10, 4, 1)
-#define SYS_LORN_EL1 sys_reg(3, 0, 10, 4, 2)
-#define SYS_LORC_EL1 sys_reg(3, 0, 10, 4, 3)
-#define SYS_LORID_EL1 sys_reg(3, 0, 10, 4, 7)
-
#define SYS_VBAR_EL1 sys_reg(3, 0, 12, 0, 0)
#define SYS_DISR_EL1 sys_reg(3, 0, 12, 1, 1)
@@ -454,16 +443,12 @@
#define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0)
#define SYS_CCSIDR_EL1 sys_reg(3, 1, 0, 0, 0)
-#define SYS_GMID_EL1 sys_reg(3, 1, 0, 0, 4)
#define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7)
#define SMIDR_EL1_IMPLEMENTER_SHIFT 24
#define SMIDR_EL1_SMPS_SHIFT 15
#define SMIDR_EL1_AFFINITY_SHIFT 0
-#define SYS_CTR_EL0 sys_reg(3, 3, 0, 0, 1)
-#define SYS_DCZID_EL0 sys_reg(3, 3, 0, 0, 7)
-
#define SYS_RNDR_EL0 sys_reg(3, 3, 2, 4, 0)
#define SYS_RNDRRS_EL0 sys_reg(3, 3, 2, 4, 1)
@@ -704,66 +689,6 @@
/* Position the attr at the correct index */
#define MAIR_ATTRIDX(attr, idx) ((attr) << ((idx) * 8))
-/* id_aa64isar1 */
-#define ID_AA64ISAR1_I8MM_SHIFT 52
-#define ID_AA64ISAR1_DGH_SHIFT 48
-#define ID_AA64ISAR1_BF16_SHIFT 44
-#define ID_AA64ISAR1_SPECRES_SHIFT 40
-#define ID_AA64ISAR1_SB_SHIFT 36
-#define ID_AA64ISAR1_FRINTTS_SHIFT 32
-#define ID_AA64ISAR1_GPI_SHIFT 28
-#define ID_AA64ISAR1_GPA_SHIFT 24
-#define ID_AA64ISAR1_LRCPC_SHIFT 20
-#define ID_AA64ISAR1_FCMA_SHIFT 16
-#define ID_AA64ISAR1_JSCVT_SHIFT 12
-#define ID_AA64ISAR1_API_SHIFT 8
-#define ID_AA64ISAR1_APA_SHIFT 4
-#define ID_AA64ISAR1_DPB_SHIFT 0
-
-#define ID_AA64ISAR1_APA_NI 0x0
-#define ID_AA64ISAR1_APA_ARCHITECTED 0x1
-#define ID_AA64ISAR1_APA_ARCH_EPAC 0x2
-#define ID_AA64ISAR1_APA_ARCH_EPAC2 0x3
-#define ID_AA64ISAR1_APA_ARCH_EPAC2_FPAC 0x4
-#define ID_AA64ISAR1_APA_ARCH_EPAC2_FPAC_CMB 0x5
-#define ID_AA64ISAR1_API_NI 0x0
-#define ID_AA64ISAR1_API_IMP_DEF 0x1
-#define ID_AA64ISAR1_API_IMP_DEF_EPAC 0x2
-#define ID_AA64ISAR1_API_IMP_DEF_EPAC2 0x3
-#define ID_AA64ISAR1_API_IMP_DEF_EPAC2_FPAC 0x4
-#define ID_AA64ISAR1_API_IMP_DEF_EPAC2_FPAC_CMB 0x5
-#define ID_AA64ISAR1_GPA_NI 0x0
-#define ID_AA64ISAR1_GPA_ARCHITECTED 0x1
-#define ID_AA64ISAR1_GPI_NI 0x0
-#define ID_AA64ISAR1_GPI_IMP_DEF 0x1
-
-/* id_aa64isar2 */
-#define ID_AA64ISAR2_CLEARBHB_SHIFT 28
-#define ID_AA64ISAR2_APA3_SHIFT 12
-#define ID_AA64ISAR2_GPA3_SHIFT 8
-#define ID_AA64ISAR2_RPRES_SHIFT 4
-#define ID_AA64ISAR2_WFXT_SHIFT 0
-
-#define ID_AA64ISAR2_RPRES_8BIT 0x0
-#define ID_AA64ISAR2_RPRES_12BIT 0x1
-/*
- * Value 0x1 has been removed from the architecture, and is
- * reserved, but has not yet been removed from the ARM ARM
- * as of ARM DDI 0487G.b.
- */
-#define ID_AA64ISAR2_WFXT_NI 0x0
-#define ID_AA64ISAR2_WFXT_SUPPORTED 0x2
-
-#define ID_AA64ISAR2_APA3_NI 0x0
-#define ID_AA64ISAR2_APA3_ARCHITECTED 0x1
-#define ID_AA64ISAR2_APA3_ARCH_EPAC 0x2
-#define ID_AA64ISAR2_APA3_ARCH_EPAC2 0x3
-#define ID_AA64ISAR2_APA3_ARCH_EPAC2_FPAC 0x4
-#define ID_AA64ISAR2_APA3_ARCH_EPAC2_FPAC_CMB 0x5
-
-#define ID_AA64ISAR2_GPA3_NI 0x0
-#define ID_AA64ISAR2_GPA3_ARCHITECTED 0x1
-
/* id_aa64pfr0 */
#define ID_AA64PFR0_CSV3_SHIFT 60
#define ID_AA64PFR0_CSV2_SHIFT 56
@@ -811,45 +736,6 @@
#define ID_AA64PFR1_MTE 0x2
#define ID_AA64PFR1_MTE_ASYMM 0x3
-/* id_aa64zfr0 */
-#define ID_AA64ZFR0_F64MM_SHIFT 56
-#define ID_AA64ZFR0_F32MM_SHIFT 52
-#define ID_AA64ZFR0_I8MM_SHIFT 44
-#define ID_AA64ZFR0_SM4_SHIFT 40
-#define ID_AA64ZFR0_SHA3_SHIFT 32
-#define ID_AA64ZFR0_BF16_SHIFT 20
-#define ID_AA64ZFR0_BITPERM_SHIFT 16
-#define ID_AA64ZFR0_AES_SHIFT 4
-#define ID_AA64ZFR0_SVEVER_SHIFT 0
-
-#define ID_AA64ZFR0_F64MM 0x1
-#define ID_AA64ZFR0_F32MM 0x1
-#define ID_AA64ZFR0_I8MM 0x1
-#define ID_AA64ZFR0_BF16 0x1
-#define ID_AA64ZFR0_SM4 0x1
-#define ID_AA64ZFR0_SHA3 0x1
-#define ID_AA64ZFR0_BITPERM 0x1
-#define ID_AA64ZFR0_AES 0x1
-#define ID_AA64ZFR0_AES_PMULL 0x2
-#define ID_AA64ZFR0_SVEVER_SVE2 0x1
-
-/* id_aa64smfr0 */
-#define ID_AA64SMFR0_FA64_SHIFT 63
-#define ID_AA64SMFR0_I16I64_SHIFT 52
-#define ID_AA64SMFR0_F64F64_SHIFT 48
-#define ID_AA64SMFR0_I8I32_SHIFT 36
-#define ID_AA64SMFR0_F16F32_SHIFT 35
-#define ID_AA64SMFR0_B16F32_SHIFT 34
-#define ID_AA64SMFR0_F32F32_SHIFT 32
-
-#define ID_AA64SMFR0_FA64 0x1
-#define ID_AA64SMFR0_I16I64 0xf
-#define ID_AA64SMFR0_F64F64 0x1
-#define ID_AA64SMFR0_I8I32 0xf
-#define ID_AA64SMFR0_F16F32 0x1
-#define ID_AA64SMFR0_B16F32 0x1
-#define ID_AA64SMFR0_F32F32 0x1
-
/* id_aa64mmfr0 */
#define ID_AA64MMFR0_ECV_SHIFT 60
#define ID_AA64MMFR0_FGT_SHIFT 56
@@ -902,6 +788,7 @@
/* id_aa64mmfr1 */
#define ID_AA64MMFR1_ECBHB_SHIFT 60
+#define ID_AA64MMFR1_TIDCP1_SHIFT 52
#define ID_AA64MMFR1_HCX_SHIFT 40
#define ID_AA64MMFR1_AFP_SHIFT 44
#define ID_AA64MMFR1_ETS_SHIFT 36
@@ -918,6 +805,9 @@
#define ID_AA64MMFR1_VMIDBITS_8 0
#define ID_AA64MMFR1_VMIDBITS_16 2
+#define ID_AA64MMFR1_TIDCP1_NI 0
+#define ID_AA64MMFR1_TIDCP1_IMP 1
+
/* id_aa64mmfr2 */
#define ID_AA64MMFR2_E0PD_SHIFT 60
#define ID_AA64MMFR2_EVT_SHIFT 56
@@ -1084,9 +974,6 @@
#define MVFR2_FPMISC_SHIFT 4
#define MVFR2_SIMDMISC_SHIFT 0
-#define DCZID_DZP_SHIFT 4
-#define DCZID_BS_SHIFT 0
-
#define CPACR_EL1_FPEN_EL1EN (BIT(20)) /* enable EL1 access */
#define CPACR_EL1_FPEN_EL0EN (BIT(21)) /* enable EL0 access, if EL1EN set */
@@ -1121,8 +1008,8 @@
#define SYS_RGSR_EL1_SEED_MASK 0xffffUL
/* GMID_EL1 field definitions */
-#define SYS_GMID_EL1_BS_SHIFT 0
-#define SYS_GMID_EL1_BS_SIZE 4
+#define GMID_EL1_BS_SHIFT 0
+#define GMID_EL1_BS_SIZE 4
/* TFSR{,E0}_EL1 bit definitions */
#define SYS_TFSR_EL1_TF0_SHIFT 0
@@ -1324,6 +1211,9 @@
#endif
+#define SYS_FIELD_GET(reg, field, val) \
+ FIELD_GET(reg##_##field##_MASK, val)
+
#define SYS_FIELD_PREP(reg, field, val) \
FIELD_PREP(reg##_##field##_MASK, val)
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index 63f9c828f1a7..2fc9f0861769 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -232,34 +232,34 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
* The "__xxx_error" versions set the third argument to -EFAULT if an error
* occurs, and leave it unchanged on success.
*/
-#define __get_mem_asm(load, reg, x, addr, err) \
+#define __get_mem_asm(load, reg, x, addr, err, type) \
asm volatile( \
"1: " load " " reg "1, [%2]\n" \
"2:\n" \
- _ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 2b, %w0, %w1) \
+ _ASM_EXTABLE_##type##ACCESS_ERR_ZERO(1b, 2b, %w0, %w1) \
: "+r" (err), "=&r" (x) \
: "r" (addr))
-#define __raw_get_mem(ldr, x, ptr, err) \
-do { \
- unsigned long __gu_val; \
- switch (sizeof(*(ptr))) { \
- case 1: \
- __get_mem_asm(ldr "b", "%w", __gu_val, (ptr), (err)); \
- break; \
- case 2: \
- __get_mem_asm(ldr "h", "%w", __gu_val, (ptr), (err)); \
- break; \
- case 4: \
- __get_mem_asm(ldr, "%w", __gu_val, (ptr), (err)); \
- break; \
- case 8: \
- __get_mem_asm(ldr, "%x", __gu_val, (ptr), (err)); \
- break; \
- default: \
- BUILD_BUG(); \
- } \
- (x) = (__force __typeof__(*(ptr)))__gu_val; \
+#define __raw_get_mem(ldr, x, ptr, err, type) \
+do { \
+ unsigned long __gu_val; \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ __get_mem_asm(ldr "b", "%w", __gu_val, (ptr), (err), type); \
+ break; \
+ case 2: \
+ __get_mem_asm(ldr "h", "%w", __gu_val, (ptr), (err), type); \
+ break; \
+ case 4: \
+ __get_mem_asm(ldr, "%w", __gu_val, (ptr), (err), type); \
+ break; \
+ case 8: \
+ __get_mem_asm(ldr, "%x", __gu_val, (ptr), (err), type); \
+ break; \
+ default: \
+ BUILD_BUG(); \
+ } \
+ (x) = (__force __typeof__(*(ptr)))__gu_val; \
} while (0)
/*
@@ -274,7 +274,7 @@ do { \
__chk_user_ptr(ptr); \
\
uaccess_ttbr0_enable(); \
- __raw_get_mem("ldtr", __rgu_val, __rgu_ptr, err); \
+ __raw_get_mem("ldtr", __rgu_val, __rgu_ptr, err, U); \
uaccess_ttbr0_disable(); \
\
(x) = __rgu_val; \
@@ -314,40 +314,40 @@ do { \
\
__uaccess_enable_tco_async(); \
__raw_get_mem("ldr", *((type *)(__gkn_dst)), \
- (__force type *)(__gkn_src), __gkn_err); \
+ (__force type *)(__gkn_src), __gkn_err, K); \
__uaccess_disable_tco_async(); \
\
if (unlikely(__gkn_err)) \
goto err_label; \
} while (0)
-#define __put_mem_asm(store, reg, x, addr, err) \
+#define __put_mem_asm(store, reg, x, addr, err, type) \
asm volatile( \
"1: " store " " reg "1, [%2]\n" \
"2:\n" \
- _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0) \
+ _ASM_EXTABLE_##type##ACCESS_ERR(1b, 2b, %w0) \
: "+r" (err) \
: "r" (x), "r" (addr))
-#define __raw_put_mem(str, x, ptr, err) \
-do { \
- __typeof__(*(ptr)) __pu_val = (x); \
- switch (sizeof(*(ptr))) { \
- case 1: \
- __put_mem_asm(str "b", "%w", __pu_val, (ptr), (err)); \
- break; \
- case 2: \
- __put_mem_asm(str "h", "%w", __pu_val, (ptr), (err)); \
- break; \
- case 4: \
- __put_mem_asm(str, "%w", __pu_val, (ptr), (err)); \
- break; \
- case 8: \
- __put_mem_asm(str, "%x", __pu_val, (ptr), (err)); \
- break; \
- default: \
- BUILD_BUG(); \
- } \
+#define __raw_put_mem(str, x, ptr, err, type) \
+do { \
+ __typeof__(*(ptr)) __pu_val = (x); \
+ switch (sizeof(*(ptr))) { \
+ case 1: \
+ __put_mem_asm(str "b", "%w", __pu_val, (ptr), (err), type); \
+ break; \
+ case 2: \
+ __put_mem_asm(str "h", "%w", __pu_val, (ptr), (err), type); \
+ break; \
+ case 4: \
+ __put_mem_asm(str, "%w", __pu_val, (ptr), (err), type); \
+ break; \
+ case 8: \
+ __put_mem_asm(str, "%x", __pu_val, (ptr), (err), type); \
+ break; \
+ default: \
+ BUILD_BUG(); \
+ } \
} while (0)
/*
@@ -362,7 +362,7 @@ do { \
__chk_user_ptr(__rpu_ptr); \
\
uaccess_ttbr0_enable(); \
- __raw_put_mem("sttr", __rpu_val, __rpu_ptr, err); \
+ __raw_put_mem("sttr", __rpu_val, __rpu_ptr, err, U); \
uaccess_ttbr0_disable(); \
} while (0)
@@ -400,7 +400,7 @@ do { \
\
__uaccess_enable_tco_async(); \
__raw_put_mem("str", *((type *)(__pkn_src)), \
- (__force type *)(__pkn_dst), __pkn_err); \
+ (__force type *)(__pkn_dst), __pkn_err, K); \
__uaccess_disable_tco_async(); \
\
if (unlikely(__pkn_err)) \
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index 0e80db4327b6..4eb601e7de50 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -36,9 +36,9 @@
#define HVC_RESET_VECTORS 2
/*
- * HVC_VHE_RESTART - Upgrade the CPU from EL1 to EL2, if possible
+ * HVC_FINALISE_EL2 - Upgrade the CPU from EL1 to EL2, if possible
*/
-#define HVC_VHE_RESTART 3
+#define HVC_FINALISE_EL2 3
/* Max number of HYP stub hypercalls */
#define HVC_STUB_HCALL_NR 4
@@ -49,6 +49,13 @@
#define BOOT_CPU_MODE_EL1 (0xe11)
#define BOOT_CPU_MODE_EL2 (0xe12)
+/*
+ * Flags returned together with the boot mode, but not preserved in
+ * __boot_cpu_mode. Used by the idreg override code to work out the
+ * boot state.
+ */
+#define BOOT_CPU_FLAG_E2H BIT_ULL(32)
+
#ifndef __ASSEMBLY__
#include <asm/ptrace.h>
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 4bb2cc8ac446..1ad2568a2569 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -19,6 +19,9 @@
/*
* HWCAP flags - for AT_HWCAP
+ *
+ * Bits 62 and 63 are reserved for use by libc.
+ * Bits 32-61 are unallocated for potential use by libc.
*/
#define HWCAP_FP (1 << 0)
#define HWCAP_ASIMD (1 << 1)
@@ -88,5 +91,6 @@
#define HWCAP2_SME_F32F32 (1 << 29)
#define HWCAP2_SME_FA64 (1 << 30)
#define HWCAP2_WFXT (1UL << 31)
+#define HWCAP2_EBF16 (1UL << 32)
#endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index fa7981d0d917..1add7b01efa7 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -14,6 +14,11 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong
CFLAGS_syscall.o += -fno-stack-protector
+# When KASAN is enabled, a stack trace is recorded for every alloc/free, which
+# can significantly impact performance. Avoid instrumenting the stack trace
+# collection code to minimize this impact.
+KASAN_SANITIZE_stacktrace.o := n
+
# It's not safe to invoke KCOV when portions of the kernel environment aren't
# available or are out-of-sync with HW state. Since `noinstr` doesn't always
# inhibit KCOV instrumentation, disable it for the entire compilation unit.
@@ -59,7 +64,7 @@ obj-$(CONFIG_ACPI) += acpi.o
obj-$(CONFIG_ACPI_NUMA) += acpi_numa.o
obj-$(CONFIG_ARM64_ACPI_PARKING_PROTOCOL) += acpi_parking_protocol.o
obj-$(CONFIG_PARAVIRT) += paravirt.o
-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o pi/
obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o
obj-$(CONFIG_ELF_CORE) += elfcore.o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec.o relocate_kernel.o \
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index e4dea8db6924..a5a256e3f9fe 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -351,7 +351,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
prot = __acpi_get_writethrough_mem_attribute();
}
}
- return __ioremap(phys, size, prot);
+ return ioremap_prot(phys, size, pgprot_val(prot));
}
/*
diff --git a/arch/arm64/kernel/acpi_numa.c b/arch/arm64/kernel/acpi_numa.c
index fdfecf0991ce..e51535a5f939 100644
--- a/arch/arm64/kernel/acpi_numa.c
+++ b/arch/arm64/kernel/acpi_numa.c
@@ -109,7 +109,7 @@ void __init acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa)
pxm = pa->proximity_domain;
node = acpi_map_pxm_to_node(pxm);
- if (node == NUMA_NO_NODE || node >= MAX_NUMNODES) {
+ if (node == NUMA_NO_NODE) {
pr_err("SRAT: Too many proximity domains %d\n", pxm);
bad_srat();
return;
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 7bbf5104b7b7..9bcaa5eacf16 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -121,7 +121,7 @@ static void clean_dcache_range_nopatch(u64 start, u64 end)
ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0);
d_size = 4 << cpuid_feature_extract_unsigned_field(ctr_el0,
- CTR_DMINLINE_SHIFT);
+ CTR_EL0_DminLine_SHIFT);
cur = start & ~(d_size - 1);
do {
/*
diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c
index 6875a16b09d2..fb0e7c7b2e20 100644
--- a/arch/arm64/kernel/armv8_deprecated.c
+++ b/arch/arm64/kernel/armv8_deprecated.c
@@ -59,6 +59,7 @@ struct insn_emulation {
static LIST_HEAD(insn_emulation);
static int nr_insn_emulated __initdata;
static DEFINE_RAW_SPINLOCK(insn_emulation_lock);
+static DEFINE_MUTEX(insn_emulation_mutex);
static void register_emulation_hooks(struct insn_emulation_ops *ops)
{
@@ -207,10 +208,10 @@ static int emulation_proc_handler(struct ctl_table *table, int write,
loff_t *ppos)
{
int ret = 0;
- struct insn_emulation *insn = (struct insn_emulation *) table->data;
+ struct insn_emulation *insn = container_of(table->data, struct insn_emulation, current_mode);
enum insn_emulation_mode prev_mode = insn->current_mode;
- table->data = &insn->current_mode;
+ mutex_lock(&insn_emulation_mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write || prev_mode == insn->current_mode)
@@ -223,7 +224,7 @@ static int emulation_proc_handler(struct ctl_table *table, int write,
update_insn_emulation_mode(insn, INSN_UNDEF);
}
ret:
- table->data = insn;
+ mutex_unlock(&insn_emulation_mutex);
return ret;
}
@@ -247,7 +248,7 @@ static void __init register_insn_emulation_sysctl(void)
sysctl->maxlen = sizeof(int);
sysctl->procname = insn->ops->name;
- sysctl->data = insn;
+ sysctl->data = &insn->current_mode;
sysctl->extra1 = &insn->min;
sysctl->extra2 = &insn->max;
sysctl->proc_handler = emulation_proc_handler;
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index c05cc3b6162e..7e6289e709fc 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -187,7 +187,7 @@ has_neoverse_n1_erratum_1542419(const struct arm64_cpu_capabilities *entry,
int scope)
{
u32 midr = read_cpuid_id();
- bool has_dic = read_cpuid_cachetype() & BIT(CTR_DIC_SHIFT);
+ bool has_dic = read_cpuid_cachetype() & BIT(CTR_EL0_DIC_SHIFT);
const struct midr_range range = MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1);
WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());
@@ -212,6 +212,12 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = {
ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe),
},
#endif
+#ifdef CONFIG_ARM64_ERRATUM_2441009
+ {
+ /* Cortex-A510 r0p0 -> r1p1. Fixed in r1p2 */
+ ERRATA_MIDR_RANGE(MIDR_CORTEX_A510, 0, 0, 1, 1),
+ },
+#endif
{},
};
#endif
@@ -395,6 +401,14 @@ static struct midr_range trbe_write_out_of_range_cpus[] = {
};
#endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */
+#ifdef CONFIG_ARM64_ERRATUM_1742098
+static struct midr_range broken_aarch32_aes[] = {
+ MIDR_RANGE(MIDR_CORTEX_A57, 0, 1, 0xf, 0xf),
+ MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
+ {},
+};
+#endif /* CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE */
+
const struct arm64_cpu_capabilities arm64_errata[] = {
#ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE
{
@@ -480,7 +494,7 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
#endif
#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
{
- .desc = "Qualcomm erratum 1009, or ARM erratum 1286807",
+ .desc = "Qualcomm erratum 1009, or ARM erratum 1286807, 2441009",
.capability = ARM64_WORKAROUND_REPEAT_TLBI,
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
.matches = cpucap_multi_entry_cap_matches,
@@ -658,6 +672,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 1)
},
#endif
+#ifdef CONFIG_ARM64_ERRATUM_1742098
+ {
+ .desc = "ARM erratum 1742098",
+ .capability = ARM64_WORKAROUND_1742098,
+ CAP_MIDR_RANGE_LIST(broken_aarch32_aes),
+ .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
+ },
+#endif
{
}
};
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 8d88433de81d..ad64cab0a2ba 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -79,6 +79,7 @@
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
#include <asm/fpsimd.h>
+#include <asm/hwcap.h>
#include <asm/insn.h>
#include <asm/kvm_host.h>
#include <asm/mmu_context.h>
@@ -91,7 +92,7 @@
#include <asm/virt.h>
/* Kernel representation of AT_HWCAP and AT_HWCAP2 */
-static unsigned long elf_hwcap __read_mostly;
+static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly;
#ifdef CONFIG_COMPAT
#define COMPAT_ELF_HWCAP_DEFAULT \
@@ -209,35 +210,35 @@ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
};
static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_I8MM_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DGH_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_BF16_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_SPECRES_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_SB_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FRINTTS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SPECRES_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SB_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_GPI_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPI_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_GPA_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_LRCPC_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_FCMA_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPA_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
- FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_API_SHIFT, 4, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_API_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
- FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_APA_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_DPB_SHIFT, 4, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_APA_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, 0),
ARM64_FTR_END,
};
static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_CLEARBHB_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
- FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_APA3_SHIFT, 4, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_GPA3_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_RPRES_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_WFXT_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_GPA3_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -276,41 +277,41 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_F64MM_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_F32MM_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_I8MM_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SM4_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SHA3_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_BF16_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_BITPERM_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_AES_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
- FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SVEVER_SHIFT, 4, 0),
+ FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0),
ARM64_FTR_END,
};
static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = {
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
- FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_FA64_SHIFT, 1, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
- FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I16I64_SHIFT, 4, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
- FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F64F64_SHIFT, 1, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
- FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_I8I32_SHIFT, 4, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
- FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F16F32_SHIFT, 1, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
- FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_B16F32_SHIFT, 1, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
- FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_F32F32_SHIFT, 1, 0),
+ FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0),
ARM64_FTR_END,
};
@@ -361,6 +362,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
};
static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_TIDCP1_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_AFP_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_ETS_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_TWED_SHIFT, 4, 0),
@@ -396,18 +398,18 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
static const struct arm64_ftr_bits ftr_ctr[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DIC_SHIFT, 1, 1),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IDC_SHIFT, 1, 1),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_CWG_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_ERG_SHIFT, 4, 0),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_DMINLINE_SHIFT, 4, 1),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IDC_SHIFT, 1, 1),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_CWG_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_ERG_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DminLine_SHIFT, 4, 1),
/*
* Linux can handle differing I-cache policies. Userspace JITs will
* make use of *minLine.
* If we have differing I-cache policies, report it as the weakest - VIPT.
*/
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_L1IP_SHIFT, 2, ICACHE_POLICY_VIPT), /* L1Ip */
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IMINLINE_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_EL0_L1Ip_SHIFT, 2, CTR_EL0_L1Ip_VIPT), /* L1Ip */
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IminLine_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -453,13 +455,13 @@ static const struct arm64_ftr_bits ftr_mvfr2[] = {
};
static const struct arm64_ftr_bits ftr_dczid[] = {
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_DZP_SHIFT, 1, 1),
- ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_BS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_EL0_DZP_SHIFT, 1, 1),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_EL0_BS_SHIFT, 4, 0),
ARM64_FTR_END,
};
static const struct arm64_ftr_bits ftr_gmid[] = {
- ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, SYS_GMID_EL1_BS_SHIFT, 4, 0),
+ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, GMID_EL1_BS_SHIFT, 4, 0),
ARM64_FTR_END,
};
@@ -561,7 +563,7 @@ static const struct arm64_ftr_bits ftr_id_pfr2[] = {
static const struct arm64_ftr_bits ftr_id_dfr0[] = {
/* [31:28] TraceFilt */
- S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_PERFMON_SHIFT, 4, 0xf),
+ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_PERFMON_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MPROFDBG_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MMAPTRC_SHIFT, 4, 0),
ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPTRC_SHIFT, 4, 0),
@@ -631,7 +633,10 @@ static const struct arm64_ftr_bits ftr_raz[] = {
__ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override)
struct arm64_ftr_override __ro_after_init id_aa64mmfr1_override;
+struct arm64_ftr_override __ro_after_init id_aa64pfr0_override;
struct arm64_ftr_override __ro_after_init id_aa64pfr1_override;
+struct arm64_ftr_override __ro_after_init id_aa64zfr0_override;
+struct arm64_ftr_override __ro_after_init id_aa64smfr0_override;
struct arm64_ftr_override __ro_after_init id_aa64isar1_override;
struct arm64_ftr_override __ro_after_init id_aa64isar2_override;
@@ -668,11 +673,14 @@ static const struct __ftr_reg_entry {
ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5),
/* Op1 = 0, CRn = 0, CRm = 4 */
- ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0),
+ ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0,
+ &id_aa64pfr0_override),
ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1,
&id_aa64pfr1_override),
- ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0),
- ARM64_FTR_REG(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0),
+ ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0,
+ &id_aa64zfr0_override),
+ ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0,
+ &id_aa64smfr0_override),
/* Op1 = 0, CRn = 0, CRm = 5 */
ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0),
@@ -993,15 +1001,24 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
init_32bit_cpu_features(&info->aarch32);
- if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) {
+ if (IS_ENABLED(CONFIG_ARM64_SVE) &&
+ id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
+ info->reg_zcr = read_zcr_features();
init_cpu_ftr_reg(SYS_ZCR_EL1, info->reg_zcr);
vec_init_vq_map(ARM64_VEC_SVE);
}
- if (id_aa64pfr1_sme(info->reg_id_aa64pfr1)) {
+ if (IS_ENABLED(CONFIG_ARM64_SME) &&
+ id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
+ info->reg_smcr = read_smcr_features();
+ /*
+ * We mask out SMPS since even if the hardware
+ * supports priorities the kernel does not at present
+ * and we block access to them.
+ */
+ info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS;
init_cpu_ftr_reg(SYS_SMCR_EL1, info->reg_smcr);
- if (IS_ENABLED(CONFIG_ARM64_SME))
- vec_init_vq_map(ARM64_VEC_SME);
+ vec_init_vq_map(ARM64_VEC_SME);
}
if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
@@ -1233,23 +1250,31 @@ void update_cpu_features(int cpu,
taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu,
info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0);
- if (id_aa64pfr0_sve(info->reg_id_aa64pfr0)) {
+ if (IS_ENABLED(CONFIG_ARM64_SVE) &&
+ id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
+ info->reg_zcr = read_zcr_features();
taint |= check_update_ftr_reg(SYS_ZCR_EL1, cpu,
info->reg_zcr, boot->reg_zcr);
- /* Probe vector lengths, unless we already gave up on SVE */
- if (id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1)) &&
- !system_capabilities_finalized())
+ /* Probe vector lengths */
+ if (!system_capabilities_finalized())
vec_update_vq_map(ARM64_VEC_SVE);
}
- if (id_aa64pfr1_sme(info->reg_id_aa64pfr1)) {
+ if (IS_ENABLED(CONFIG_ARM64_SME) &&
+ id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
+ info->reg_smcr = read_smcr_features();
+ /*
+ * We mask out SMPS since even if the hardware
+ * supports priorities the kernel does not at present
+ * and we block access to them.
+ */
+ info->reg_smidr = read_cpuid(SMIDR_EL1) & ~SMIDR_EL1_SMPS;
taint |= check_update_ftr_reg(SYS_SMCR_EL1, cpu,
info->reg_smcr, boot->reg_smcr);
- /* Probe vector lengths, unless we already gave up on SME */
- if (id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1)) &&
- !system_capabilities_finalized())
+ /* Probe vector lengths */
+ if (!system_capabilities_finalized())
vec_update_vq_map(ARM64_VEC_SME);
}
@@ -1480,7 +1505,7 @@ static bool has_cache_idc(const struct arm64_cpu_capabilities *entry,
else
ctr = read_cpuid_effective_cachetype();
- return ctr & BIT(CTR_IDC_SHIFT);
+ return ctr & BIT(CTR_EL0_IDC_SHIFT);
}
static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unused)
@@ -1491,7 +1516,7 @@ static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unu
* to the CTR_EL0 on this CPU and emulate it with the real/safe
* value.
*/
- if (!(read_cpuid_cachetype() & BIT(CTR_IDC_SHIFT)))
+ if (!(read_cpuid_cachetype() & BIT(CTR_EL0_IDC_SHIFT)))
sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
}
@@ -1505,7 +1530,7 @@ static bool has_cache_dic(const struct arm64_cpu_capabilities *entry,
else
ctr = read_cpuid_cachetype();
- return ctr & BIT(CTR_DIC_SHIFT);
+ return ctr & BIT(CTR_EL0_DIC_SHIFT);
}
static bool __maybe_unused
@@ -1645,14 +1670,34 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
}
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+#define KPTI_NG_TEMP_VA (-(1UL << PMD_SHIFT))
+
+extern
+void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
+ phys_addr_t size, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int), int flags);
+
+static phys_addr_t kpti_ng_temp_alloc;
+
+static phys_addr_t kpti_ng_pgd_alloc(int shift)
+{
+ kpti_ng_temp_alloc -= PAGE_SIZE;
+ return kpti_ng_temp_alloc;
+}
+
static void __nocfi
kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
{
- typedef void (kpti_remap_fn)(int, int, phys_addr_t);
+ typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
extern kpti_remap_fn idmap_kpti_install_ng_mappings;
kpti_remap_fn *remap_fn;
int cpu = smp_processor_id();
+ int levels = CONFIG_PGTABLE_LEVELS;
+ int order = order_base_2(levels);
+ u64 kpti_ng_temp_pgd_pa = 0;
+ pgd_t *kpti_ng_temp_pgd;
+ u64 alloc = 0;
if (__this_cpu_read(this_cpu_vector) == vectors) {
const char *v = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI);
@@ -1670,12 +1715,40 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused)
remap_fn = (void *)__pa_symbol(function_nocfi(idmap_kpti_install_ng_mappings));
+ if (!cpu) {
+ alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
+ kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE);
+ kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd);
+
+ //
+ // Create a minimal page table hierarchy that permits us to map
+ // the swapper page tables temporarily as we traverse them.
+ //
+ // The physical pages are laid out as follows:
+ //
+ // +--------+-/-------+-/------ +-\\--------+
+ // : PTE[] : | PMD[] : | PUD[] : || PGD[] :
+ // +--------+-\-------+-\------ +-//--------+
+ // ^
+ // The first page is mapped into this hierarchy at a PMD_SHIFT
+ // aligned virtual address, so that we can manipulate the PTE
+ // level entries while the mapping is active. The first entry
+ // covers the PTE[] page itself, the remaining entries are free
+ // to be used as a ad-hoc fixmap.
+ //
+ create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc),
+ KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL,
+ kpti_ng_pgd_alloc, 0);
+ }
+
cpu_install_idmap();
- remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir));
+ remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA);
cpu_uninstall_idmap();
- if (!cpu)
+ if (!cpu) {
+ free_pages(alloc, order);
arm64_use_ng_mappings = true;
+ }
}
#else
static void
@@ -1971,6 +2044,14 @@ static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
}
#endif /* CONFIG_ARM64_MTE */
+static void elf_hwcap_fixup(void)
+{
+#ifdef CONFIG_ARM64_ERRATUM_1742098
+ if (cpus_have_const_cap(ARM64_WORKAROUND_1742098))
+ compat_elf_hwcap2 &= ~COMPAT_HWCAP2_AES;
+#endif /* ARM64_ERRATUM_1742098 */
+}
+
#ifdef CONFIG_KVM
static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused)
{
@@ -1978,6 +2059,11 @@ static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, in
}
#endif /* CONFIG_KVM */
+static void cpu_trap_el0_impdef(const struct arm64_cpu_capabilities *__unused)
+{
+ sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_TIDCP);
+}
+
/* Internal helper functions to match cpu capability type */
static bool
cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
@@ -2132,7 +2218,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = has_cpuid_feature,
.sys_reg = SYS_ID_AA64ISAR1_EL1,
- .field_pos = ID_AA64ISAR1_DPB_SHIFT,
+ .field_pos = ID_AA64ISAR1_EL1_DPB_SHIFT,
.field_width = 4,
.min_field_value = 1,
},
@@ -2143,7 +2229,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.matches = has_cpuid_feature,
.sys_reg = SYS_ID_AA64ISAR1_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64ISAR1_DPB_SHIFT,
+ .field_pos = ID_AA64ISAR1_EL1_DPB_SHIFT,
.field_width = 4,
.min_field_value = 2,
},
@@ -2303,7 +2389,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.matches = has_cpuid_feature,
.sys_reg = SYS_ID_AA64ISAR1_EL1,
- .field_pos = ID_AA64ISAR1_SB_SHIFT,
+ .field_pos = ID_AA64ISAR1_EL1_SB_SHIFT,
.field_width = 4,
.sign = FTR_UNSIGNED,
.min_field_value = 1,
@@ -2315,9 +2401,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
.sys_reg = SYS_ID_AA64ISAR1_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64ISAR1_APA_SHIFT,
+ .field_pos = ID_AA64ISAR1_EL1_APA_SHIFT,
.field_width = 4,
- .min_field_value = ID_AA64ISAR1_APA_ARCHITECTED,
+ .min_field_value = ID_AA64ISAR1_EL1_APA_PAuth,
.matches = has_address_auth_cpucap,
},
{
@@ -2326,9 +2412,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
.sys_reg = SYS_ID_AA64ISAR2_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64ISAR2_APA3_SHIFT,
+ .field_pos = ID_AA64ISAR2_EL1_APA3_SHIFT,
.field_width = 4,
- .min_field_value = ID_AA64ISAR2_APA3_ARCHITECTED,
+ .min_field_value = ID_AA64ISAR2_EL1_APA3_PAuth,
.matches = has_address_auth_cpucap,
},
{
@@ -2337,9 +2423,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
.sys_reg = SYS_ID_AA64ISAR1_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64ISAR1_API_SHIFT,
+ .field_pos = ID_AA64ISAR1_EL1_API_SHIFT,
.field_width = 4,
- .min_field_value = ID_AA64ISAR1_API_IMP_DEF,
+ .min_field_value = ID_AA64ISAR1_EL1_API_PAuth,
.matches = has_address_auth_cpucap,
},
{
@@ -2353,9 +2439,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.sys_reg = SYS_ID_AA64ISAR1_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64ISAR1_GPA_SHIFT,
+ .field_pos = ID_AA64ISAR1_EL1_GPA_SHIFT,
.field_width = 4,
- .min_field_value = ID_AA64ISAR1_GPA_ARCHITECTED,
+ .min_field_value = ID_AA64ISAR1_EL1_GPA_IMP,
.matches = has_cpuid_feature,
},
{
@@ -2364,9 +2450,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.sys_reg = SYS_ID_AA64ISAR2_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64ISAR2_GPA3_SHIFT,
+ .field_pos = ID_AA64ISAR2_EL1_GPA3_SHIFT,
.field_width = 4,
- .min_field_value = ID_AA64ISAR2_GPA3_ARCHITECTED,
+ .min_field_value = ID_AA64ISAR2_EL1_GPA3_IMP,
.matches = has_cpuid_feature,
},
{
@@ -2375,9 +2461,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.sys_reg = SYS_ID_AA64ISAR1_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64ISAR1_GPI_SHIFT,
+ .field_pos = ID_AA64ISAR1_EL1_GPI_SHIFT,
.field_width = 4,
- .min_field_value = ID_AA64ISAR1_GPI_IMP_DEF,
+ .min_field_value = ID_AA64ISAR1_EL1_GPI_IMP,
.matches = has_cpuid_feature,
},
{
@@ -2478,7 +2564,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.sys_reg = SYS_ID_AA64ISAR1_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64ISAR1_LRCPC_SHIFT,
+ .field_pos = ID_AA64ISAR1_EL1_LRCPC_SHIFT,
.field_width = 4,
.matches = has_cpuid_feature,
.min_field_value = 1,
@@ -2503,9 +2589,9 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.capability = ARM64_SME_FA64,
.sys_reg = SYS_ID_AA64SMFR0_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64SMFR0_FA64_SHIFT,
+ .field_pos = ID_AA64SMFR0_EL1_FA64_SHIFT,
.field_width = 1,
- .min_field_value = ID_AA64SMFR0_FA64,
+ .min_field_value = ID_AA64SMFR0_EL1_FA64_IMP,
.matches = has_cpuid_feature,
.cpu_enable = fa64_kernel_enable,
},
@@ -2516,10 +2602,22 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.type = ARM64_CPUCAP_SYSTEM_FEATURE,
.sys_reg = SYS_ID_AA64ISAR2_EL1,
.sign = FTR_UNSIGNED,
- .field_pos = ID_AA64ISAR2_WFXT_SHIFT,
+ .field_pos = ID_AA64ISAR2_EL1_WFxT_SHIFT,
.field_width = 4,
.matches = has_cpuid_feature,
- .min_field_value = ID_AA64ISAR2_WFXT_SUPPORTED,
+ .min_field_value = ID_AA64ISAR2_EL1_WFxT_IMP,
+ },
+ {
+ .desc = "Trap EL0 IMPLEMENTATION DEFINED functionality",
+ .capability = ARM64_HAS_TIDCP1,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .sys_reg = SYS_ID_AA64MMFR1_EL1,
+ .sign = FTR_UNSIGNED,
+ .field_pos = ID_AA64MMFR1_TIDCP1_SHIFT,
+ .field_width = 4,
+ .min_field_value = ID_AA64MMFR1_TIDCP1_IMP,
+ .matches = has_cpuid_feature,
+ .cpu_enable = cpu_trap_el0_impdef,
},
{},
};
@@ -2560,33 +2658,33 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
#ifdef CONFIG_ARM64_PTR_AUTH
static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = {
{
- HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_APA_SHIFT,
+ HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_APA_SHIFT,
4, FTR_UNSIGNED,
- ID_AA64ISAR1_APA_ARCHITECTED)
+ ID_AA64ISAR1_EL1_APA_PAuth)
},
{
- HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_APA3_SHIFT,
- 4, FTR_UNSIGNED, ID_AA64ISAR2_APA3_ARCHITECTED)
+ HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_APA3_SHIFT,
+ 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_APA3_PAuth)
},
{
- HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_API_SHIFT,
- 4, FTR_UNSIGNED, ID_AA64ISAR1_API_IMP_DEF)
+ HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_API_SHIFT,
+ 4, FTR_UNSIGNED, ID_AA64ISAR1_EL1_API_PAuth)
},
{},
};
static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = {
{
- HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_GPA_SHIFT,
- 4, FTR_UNSIGNED, ID_AA64ISAR1_GPA_ARCHITECTED)
+ HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_GPA_SHIFT,
+ 4, FTR_UNSIGNED, ID_AA64ISAR1_EL1_GPA_IMP)
},
{
- HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_GPA3_SHIFT,
- 4, FTR_UNSIGNED, ID_AA64ISAR2_GPA3_ARCHITECTED)
+ HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_GPA3_SHIFT,
+ 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_GPA3_IMP)
},
{
- HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_GPI_SHIFT,
- 4, FTR_UNSIGNED, ID_AA64ISAR1_GPI_IMP_DEF)
+ HWCAP_CPUID_MATCH(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_GPI_SHIFT,
+ 4, FTR_UNSIGNED, ID_AA64ISAR1_EL1_GPI_IMP)
},
{},
};
@@ -2614,30 +2712,31 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, 4, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, 4, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, 4, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DIT),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_DCPODP),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FRINTTS_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_BF16_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_BF16),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DGH_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DGH),
- HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_I8MM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_I8MM),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_DCPODP),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FRINT),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_SB_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_BF16),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_EBF16),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DGH),
+ HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_I8MM),
HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT),
#ifdef CONFIG_ARM64_SVE
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, KERNEL_HWCAP_SVE),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SVEVER_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_SVEVER_SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_AES, CAP_HWCAP, KERNEL_HWCAP_SVEAES),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_AES_PMULL, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_BITPERM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_BITPERM, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_BF16_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_BF16, CAP_HWCAP, KERNEL_HWCAP_SVEBF16),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SHA3_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_SHA3, CAP_HWCAP, KERNEL_HWCAP_SVESHA3),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SM4_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_SM4, CAP_HWCAP, KERNEL_HWCAP_SVESM4),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_I8MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_I8MM, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_F32MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_F32MM, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM),
- HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_F64MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_F64MM, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SVEver_SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_AES_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_AES_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_AES_PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BitPerm_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_BF16_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SHA3_IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_SM4_IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_I8MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_F32MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM),
+ HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, FTR_UNSIGNED, ID_AA64ZFR0_EL1_F64MM_IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM),
#endif
HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS),
#ifdef CONFIG_ARM64_BTI
@@ -2653,17 +2752,17 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
#endif /* CONFIG_ARM64_MTE */
HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV),
HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_AFP_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP),
- HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_RPRES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES),
- HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_WFXT_SHIFT, 4, FTR_UNSIGNED, ID_AA64ISAR2_WFXT_SUPPORTED, CAP_HWCAP, KERNEL_HWCAP_WFXT),
+ HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES),
+ HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, FTR_UNSIGNED, ID_AA64ISAR2_EL1_WFxT_IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT),
#ifdef CONFIG_ARM64_SME
HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SME_SHIFT, 4, FTR_UNSIGNED, ID_AA64PFR1_SME, CAP_HWCAP, KERNEL_HWCAP_SME),
- HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_FA64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_FA64, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
- HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I16I64_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_I16I64, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
- HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F64F64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_F64F64, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
- HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_I8I32_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_I8I32, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
- HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_F16F32, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
- HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_B16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_B16F32, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
- HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_F32F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_F32F32, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
+ HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_FA64_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
+ HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_EL1_I16I64_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
+ HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_F64F64_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
+ HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, FTR_UNSIGNED, ID_AA64SMFR0_EL1_I8I32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
+ HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_F16F32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
+ HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_B16F32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
+ HWCAP_CAP(SYS_ID_AA64SMFR0_EL1, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, FTR_UNSIGNED, ID_AA64SMFR0_EL1_F32F32_IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
#endif /* CONFIG_ARM64_SME */
{},
};
@@ -3098,14 +3197,12 @@ static bool __maybe_unused __system_matches_cap(unsigned int n)
void cpu_set_feature(unsigned int num)
{
- WARN_ON(num >= MAX_CPU_FEATURES);
- elf_hwcap |= BIT(num);
+ set_bit(num, elf_hwcap);
}
bool cpu_have_feature(unsigned int num)
{
- WARN_ON(num >= MAX_CPU_FEATURES);
- return elf_hwcap & BIT(num);
+ return test_bit(num, elf_hwcap);
}
EXPORT_SYMBOL_GPL(cpu_have_feature);
@@ -3116,12 +3213,12 @@ unsigned long cpu_get_elf_hwcap(void)
* note that for userspace compatibility we guarantee that bits 62
* and 63 will always be returned as 0.
*/
- return lower_32_bits(elf_hwcap);
+ return elf_hwcap[0];
}
unsigned long cpu_get_elf_hwcap2(void)
{
- return upper_32_bits(elf_hwcap);
+ return elf_hwcap[1];
}
static void __init setup_system_capabilities(void)
@@ -3143,8 +3240,10 @@ void __init setup_cpu_features(void)
setup_system_capabilities();
setup_elf_hwcaps(arm64_elf_hwcaps);
- if (system_supports_32bit_el0())
+ if (system_supports_32bit_el0()) {
setup_elf_hwcaps(compat_elf_hwcaps);
+ elf_hwcap_fixup();
+ }
if (system_uses_ttbr0_pan())
pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
@@ -3197,6 +3296,7 @@ static int enable_mismatched_32bit_el0(unsigned int cpu)
cpu_active_mask);
get_cpu_device(lucky_winner)->offline_disabled = true;
setup_elf_hwcaps(compat_elf_hwcaps);
+ elf_hwcap_fixup();
pr_info("Asymmetric 32-bit EL0 support detected on CPU %u; CPU hot-unplug disabled on CPU %u\n",
cpu, lucky_winner);
return 0;
@@ -3218,7 +3318,7 @@ subsys_initcall_sync(init_32bit_el0_mask);
static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap)
{
- cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+ cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir);
}
/*
diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c
index 3006f4324808..4150e308e99c 100644
--- a/arch/arm64/kernel/cpuidle.c
+++ b/arch/arm64/kernel/cpuidle.c
@@ -13,35 +13,6 @@
#include <linux/of_device.h>
#include <linux/psci.h>
-#include <asm/cpuidle.h>
-#include <asm/cpu_ops.h>
-
-int arm_cpuidle_init(unsigned int cpu)
-{
- const struct cpu_operations *ops = get_cpu_ops(cpu);
- int ret = -EOPNOTSUPP;
-
- if (ops && ops->cpu_suspend && ops->cpu_init_idle)
- ret = ops->cpu_init_idle(cpu);
-
- return ret;
-}
-
-/**
- * arm_cpuidle_suspend() - function to enter a low-power idle state
- * @index: argument to pass to CPU suspend operations
- *
- * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU
- * operations back-end error code otherwise.
- */
-int arm_cpuidle_suspend(int index)
-{
- int cpu = smp_processor_id();
- const struct cpu_operations *ops = get_cpu_ops(cpu);
-
- return ops->cpu_suspend(index);
-}
-
#ifdef CONFIG_ACPI
#include <acpi/processor.h>
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 8eff0a34ffd4..d7702f39b4d3 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -33,12 +33,19 @@
DEFINE_PER_CPU(struct cpuinfo_arm64, cpu_data);
static struct cpuinfo_arm64 boot_cpu_data;
-static const char *icache_policy_str[] = {
- [ICACHE_POLICY_VPIPT] = "VPIPT",
- [ICACHE_POLICY_RESERVED] = "RESERVED/UNKNOWN",
- [ICACHE_POLICY_VIPT] = "VIPT",
- [ICACHE_POLICY_PIPT] = "PIPT",
-};
+static inline const char *icache_policy_str(int l1ip)
+{
+ switch (l1ip) {
+ case CTR_EL0_L1Ip_VPIPT:
+ return "VPIPT";
+ case CTR_EL0_L1Ip_VIPT:
+ return "VIPT";
+ case CTR_EL0_L1Ip_PIPT:
+ return "PIPT";
+ default:
+ return "RESERVED/UNKNOWN";
+ }
+}
unsigned long __icache_flags;
@@ -107,6 +114,7 @@ static const char *const hwcap_str[] = {
[KERNEL_HWCAP_SME_F32F32] = "smef32f32",
[KERNEL_HWCAP_SME_FA64] = "smefa64",
[KERNEL_HWCAP_WFXT] = "wfxt",
+ [KERNEL_HWCAP_EBF16] = "ebf16",
};
#ifdef CONFIG_COMPAT
@@ -267,6 +275,7 @@ static struct kobj_type cpuregs_kobj_type = {
CPUREGS_ATTR_RO(midr_el1, midr);
CPUREGS_ATTR_RO(revidr_el1, revidr);
+CPUREGS_ATTR_RO(smidr_el1, smidr);
static struct attribute *cpuregs_id_attrs[] = {
&cpuregs_attr_midr_el1.attr,
@@ -279,6 +288,16 @@ static const struct attribute_group cpuregs_attr_group = {
.name = "identification"
};
+static struct attribute *sme_cpuregs_id_attrs[] = {
+ &cpuregs_attr_smidr_el1.attr,
+ NULL
+};
+
+static const struct attribute_group sme_cpuregs_attr_group = {
+ .attrs = sme_cpuregs_id_attrs,
+ .name = "identification"
+};
+
static int cpuid_cpu_online(unsigned int cpu)
{
int rc;
@@ -296,6 +315,8 @@ static int cpuid_cpu_online(unsigned int cpu)
rc = sysfs_create_group(&info->kobj, &cpuregs_attr_group);
if (rc)
kobject_del(&info->kobj);
+ if (system_supports_sme())
+ rc = sysfs_merge_group(&info->kobj, &sme_cpuregs_attr_group);
out:
return rc;
}
@@ -342,19 +363,19 @@ static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info)
u32 l1ip = CTR_L1IP(info->reg_ctr);
switch (l1ip) {
- case ICACHE_POLICY_PIPT:
+ case CTR_EL0_L1Ip_PIPT:
break;
- case ICACHE_POLICY_VPIPT:
+ case CTR_EL0_L1Ip_VPIPT:
set_bit(ICACHEF_VPIPT, &__icache_flags);
break;
- case ICACHE_POLICY_RESERVED:
- case ICACHE_POLICY_VIPT:
+ case CTR_EL0_L1Ip_VIPT:
+ default:
/* Assume aliasing */
set_bit(ICACHEF_ALIASING, &__icache_flags);
break;
}
- pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu);
+ pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str(l1ip), cpu);
}
static void __cpuinfo_store_cpu_32bit(struct cpuinfo_32bit *info)
@@ -418,14 +439,6 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
__cpuinfo_store_cpu_32bit(&info->aarch32);
- if (IS_ENABLED(CONFIG_ARM64_SVE) &&
- id_aa64pfr0_sve(info->reg_id_aa64pfr0))
- info->reg_zcr = read_zcr_features();
-
- if (IS_ENABLED(CONFIG_ARM64_SME) &&
- id_aa64pfr1_sme(info->reg_id_aa64pfr1))
- info->reg_smcr = read_smcr_features();
-
cpuinfo_detect_icache_policy(info);
}
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5b82b9292400..254fe31c03a0 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -636,18 +636,28 @@ alternative_else_nop_endif
*/
.endm
- .macro tramp_data_page dst
- adr_l \dst, .entry.tramp.text
- sub \dst, \dst, PAGE_SIZE
- .endm
-
- .macro tramp_data_read_var dst, var
-#ifdef CONFIG_RANDOMIZE_BASE
- tramp_data_page \dst
- add \dst, \dst, #:lo12:__entry_tramp_data_\var
- ldr \dst, [\dst]
+ .macro tramp_data_read_var dst, var
+#ifdef CONFIG_RELOCATABLE
+ ldr \dst, .L__tramp_data_\var
+ .ifndef .L__tramp_data_\var
+ .pushsection ".entry.tramp.rodata", "a", %progbits
+ .align 3
+.L__tramp_data_\var:
+ .quad \var
+ .popsection
+ .endif
#else
- ldr \dst, =\var
+ /*
+ * As !RELOCATABLE implies !RANDOMIZE_BASE the address is always a
+ * compile time constant (and hence not secret and not worth hiding).
+ *
+ * As statically allocated kernel code and data always live in the top
+ * 47 bits of the address space we can sign-extend bit 47 and avoid an
+ * instruction to load the upper 16 bits (which must be 0xFFFF).
+ */
+ movz \dst, :abs_g2_s:\var
+ movk \dst, :abs_g1_nc:\var
+ movk \dst, :abs_g0_nc:\var
#endif
.endm
@@ -695,7 +705,7 @@ alternative_else_nop_endif
msr vbar_el1, x30
isb
.else
- ldr x30, =vectors
+ adr_l x30, vectors
.endif // \kpti == 1
.if \bhb == BHB_MITIGATION_FW
@@ -764,24 +774,7 @@ SYM_CODE_END(tramp_exit_native)
SYM_CODE_START(tramp_exit_compat)
tramp_exit 32
SYM_CODE_END(tramp_exit_compat)
-
- .ltorg
.popsection // .entry.tramp.text
-#ifdef CONFIG_RANDOMIZE_BASE
- .pushsection ".rodata", "a"
- .align PAGE_SHIFT
-SYM_DATA_START(__entry_tramp_data_start)
-__entry_tramp_data_vectors:
- .quad vectors
-#ifdef CONFIG_ARM_SDE_INTERFACE
-__entry_tramp_data___sdei_asm_handler:
- .quad __sdei_asm_handler
-#endif /* CONFIG_ARM_SDE_INTERFACE */
-__entry_tramp_data_this_cpu_vector:
- .quad this_cpu_vector
-SYM_DATA_END(__entry_tramp_data_start)
- .popsection // .rodata
-#endif /* CONFIG_RANDOMIZE_BASE */
#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
/*
@@ -932,7 +925,6 @@ NOKPROBE(call_on_irq_stack)
* This clobbers x4, __sdei_handler() will restore this from firmware's
* copy.
*/
-.ltorg
.pushsection ".entry.tramp.text", "ax"
SYM_CODE_START(__sdei_asm_entry_trampoline)
mrs x4, ttbr1_el1
@@ -967,7 +959,6 @@ SYM_CODE_START(__sdei_asm_exit_trampoline)
1: sdei_handler_exit exit_mode=x2
SYM_CODE_END(__sdei_asm_exit_trampoline)
NOKPROBE(__sdei_asm_exit_trampoline)
- .ltorg
.popsection // .entry.tramp.text
#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index aecf3071efdd..dd63ffc3a2fa 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -445,7 +445,6 @@ static void fpsimd_save(void)
if (system_supports_sme()) {
u64 *svcr = last->svcr;
- *svcr = read_sysreg_s(SYS_SVCR);
*svcr = read_sysreg_s(SYS_SVCR);
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 6a98f1a38c29..cefe6a73ee54 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -37,8 +37,6 @@
#include "efi-header.S"
-#define __PHYS_OFFSET KERNEL_START
-
#if (PAGE_OFFSET & 0x1fffff) != 0
#error PAGE_OFFSET must be at least 2MB aligned
#endif
@@ -51,9 +49,6 @@
* MMU = off, D-cache = off, I-cache = on or off,
* x0 = physical address to the FDT blob.
*
- * This code is mostly position independent so you call this at
- * __pa(PAGE_OFFSET).
- *
* Note that the callee-saved registers are used for storing variables
* that are useful before the MMU is enabled. The allocations are described
* in the entry routines.
@@ -82,25 +77,34 @@
* primary lowlevel boot path:
*
* Register Scope Purpose
+ * x20 primary_entry() .. __primary_switch() CPU boot mode
* x21 primary_entry() .. start_kernel() FDT pointer passed at boot in x0
+ * x22 create_idmap() .. start_kernel() ID map VA of the DT blob
* x23 primary_entry() .. start_kernel() physical misalignment/KASLR offset
- * x28 __create_page_tables() callee preserved temp register
- * x19/x20 __primary_switch() callee preserved temp registers
- * x24 __primary_switch() .. relocate_kernel() current RELR displacement
+ * x24 __primary_switch() linear map KASLR seed
+ * x25 primary_entry() .. start_kernel() supported VA size
+ * x28 create_idmap() callee preserved temp register
*/
SYM_CODE_START(primary_entry)
bl preserve_boot_args
bl init_kernel_el // w0=cpu_boot_mode
- adrp x23, __PHYS_OFFSET
- and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0
- bl set_cpu_boot_mode_flag
- bl __create_page_tables
+ mov x20, x0
+ bl create_idmap
+
/*
* The following calls CPU setup code, see arch/arm64/mm/proc.S for
* details.
* On return, the CPU will be ready for the MMU to be turned on and
* the TCR will have been set.
*/
+#if VA_BITS > 48
+ mrs_s x0, SYS_ID_AA64MMFR2_EL1
+ tst x0, #0xf << ID_AA64MMFR2_LVA_SHIFT
+ mov x0, #VA_BITS
+ mov x25, #VA_BITS_MIN
+ csel x25, x25, x0, eq
+ mov x0, x25
+#endif
bl __cpu_setup // initialise processor
b __primary_switch
SYM_CODE_END(primary_entry)
@@ -122,28 +126,16 @@ SYM_CODE_START_LOCAL(preserve_boot_args)
b dcache_inval_poc // tail call
SYM_CODE_END(preserve_boot_args)
-/*
- * Macro to create a table entry to the next page.
- *
- * tbl: page table address
- * virt: virtual address
- * shift: #imm page table shift
- * ptrs: #imm pointers per table page
- *
- * Preserves: virt
- * Corrupts: ptrs, tmp1, tmp2
- * Returns: tbl -> next level table page address
- */
- .macro create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
- add \tmp1, \tbl, #PAGE_SIZE
- phys_to_pte \tmp2, \tmp1
- orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type
- lsr \tmp1, \virt, #\shift
- sub \ptrs, \ptrs, #1
- and \tmp1, \tmp1, \ptrs // table index
- str \tmp2, [\tbl, \tmp1, lsl #3]
- add \tbl, \tbl, #PAGE_SIZE // next level table page
- .endm
+SYM_FUNC_START_LOCAL(clear_page_tables)
+ /*
+ * Clear the init page tables.
+ */
+ adrp x0, init_pg_dir
+ adrp x1, init_pg_end
+ sub x2, x1, x0
+ mov x1, xzr
+ b __pi_memset // tail call
+SYM_FUNC_END(clear_page_tables)
/*
* Macro to populate page table entries, these entries can be pointers to the next level
@@ -179,31 +171,20 @@ SYM_CODE_END(preserve_boot_args)
* vstart: virtual address of start of range
* vend: virtual address of end of range - we map [vstart, vend]
* shift: shift used to transform virtual address into index
- * ptrs: number of entries in page table
+ * order: #imm 2log(number of entries in page table)
* istart: index in table corresponding to vstart
* iend: index in table corresponding to vend
* count: On entry: how many extra entries were required in previous level, scales
* our end index.
* On exit: returns how many extra entries required for next page table level
*
- * Preserves: vstart, vend, shift, ptrs
+ * Preserves: vstart, vend
* Returns: istart, iend, count
*/
- .macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count
- lsr \iend, \vend, \shift
- mov \istart, \ptrs
- sub \istart, \istart, #1
- and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1)
- mov \istart, \ptrs
- mul \istart, \istart, \count
- add \iend, \iend, \istart // iend += count * ptrs
- // our entries span multiple tables
-
- lsr \istart, \vstart, \shift
- mov \count, \ptrs
- sub \count, \count, #1
- and \istart, \istart, \count
-
+ .macro compute_indices, vstart, vend, shift, order, istart, iend, count
+ ubfx \istart, \vstart, \shift, \order
+ ubfx \iend, \vend, \shift, \order
+ add \iend, \iend, \count, lsl \order
sub \count, \iend, \istart
.endm
@@ -218,119 +199,116 @@ SYM_CODE_END(preserve_boot_args)
* vend: virtual address of end of range - we map [vstart, vend - 1]
* flags: flags to use to map last level entries
* phys: physical address corresponding to vstart - physical memory is contiguous
- * pgds: the number of pgd entries
+ * order: #imm 2log(number of entries in PGD table)
+ *
+ * If extra_shift is set, an extra level will be populated if the end address does
+ * not fit in 'extra_shift' bits. This assumes vend is in the TTBR0 range.
*
* Temporaries: istart, iend, tmp, count, sv - these need to be different registers
* Preserves: vstart, flags
* Corrupts: tbl, rtbl, vend, istart, iend, tmp, count, sv
*/
- .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv
+ .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, order, istart, iend, tmp, count, sv, extra_shift
sub \vend, \vend, #1
add \rtbl, \tbl, #PAGE_SIZE
- mov \sv, \rtbl
mov \count, #0
- compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count
+
+ .ifnb \extra_shift
+ tst \vend, #~((1 << (\extra_shift)) - 1)
+ b.eq .L_\@
+ compute_indices \vstart, \vend, #\extra_shift, #(PAGE_SHIFT - 3), \istart, \iend, \count
+ mov \sv, \rtbl
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
+ .endif
+.L_\@:
+ compute_indices \vstart, \vend, #PGDIR_SHIFT, #\order, \istart, \iend, \count
mov \sv, \rtbl
+ populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
+ mov \tbl, \sv
#if SWAPPER_PGTABLE_LEVELS > 3
- compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count
+ compute_indices \vstart, \vend, #PUD_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count
+ mov \sv, \rtbl
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
- mov \sv, \rtbl
#endif
#if SWAPPER_PGTABLE_LEVELS > 2
- compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count
+ compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count
+ mov \sv, \rtbl
populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp
mov \tbl, \sv
#endif
- compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count
- bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1
- populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
+ compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #(PAGE_SHIFT - 3), \istart, \iend, \count
+ bic \rtbl, \phys, #SWAPPER_BLOCK_SIZE - 1
+ populate_entries \tbl, \rtbl, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp
.endm
/*
- * Setup the initial page tables. We only setup the barest amount which is
- * required to get the kernel running. The following sections are required:
- * - identity mapping to enable the MMU (low address, TTBR0)
- * - first few MB of the kernel linear mapping to jump to once the MMU has
- * been enabled
+ * Remap a subregion created with the map_memory macro with modified attributes
+ * or output address. The entire remapped region must have been covered in the
+ * invocation of map_memory.
+ *
+ * x0: last level table address (returned in first argument to map_memory)
+ * x1: start VA of the existing mapping
+ * x2: start VA of the region to update
+ * x3: end VA of the region to update (exclusive)
+ * x4: start PA associated with the region to update
+ * x5: attributes to set on the updated region
+ * x6: order of the last level mappings
*/
-SYM_FUNC_START_LOCAL(__create_page_tables)
- mov x28, lr
+SYM_FUNC_START_LOCAL(remap_region)
+ sub x3, x3, #1 // make end inclusive
- /*
- * Invalidate the init page tables to avoid potential dirty cache lines
- * being evicted. Other page tables are allocated in rodata as part of
- * the kernel image, and thus are clean to the PoC per the boot
- * protocol.
- */
- adrp x0, init_pg_dir
- adrp x1, init_pg_end
- bl dcache_inval_poc
+ // Get the index offset for the start of the last level table
+ lsr x1, x1, x6
+ bfi x1, xzr, #0, #PAGE_SHIFT - 3
- /*
- * Clear the init page tables.
- */
- adrp x0, init_pg_dir
- adrp x1, init_pg_end
- sub x1, x1, x0
-1: stp xzr, xzr, [x0], #16
- stp xzr, xzr, [x0], #16
- stp xzr, xzr, [x0], #16
- stp xzr, xzr, [x0], #16
- subs x1, x1, #64
- b.ne 1b
+ // Derive the start and end indexes into the last level table
+ // associated with the provided region
+ lsr x2, x2, x6
+ lsr x3, x3, x6
+ sub x2, x2, x1
+ sub x3, x3, x1
- mov x7, SWAPPER_MM_MMUFLAGS
+ mov x1, #1
+ lsl x6, x1, x6 // block size at this level
- /*
- * Create the identity mapping.
- */
- adrp x0, idmap_pg_dir
- adrp x3, __idmap_text_start // __pa(__idmap_text_start)
-
-#ifdef CONFIG_ARM64_VA_BITS_52
- mrs_s x6, SYS_ID_AA64MMFR2_EL1
- and x6, x6, #(0xf << ID_AA64MMFR2_LVA_SHIFT)
- mov x5, #52
- cbnz x6, 1f
-#endif
- mov x5, #VA_BITS_MIN
-1:
- adr_l x6, vabits_actual
- str x5, [x6]
- dmb sy
- dc ivac, x6 // Invalidate potentially stale cache line
+ populate_entries x0, x4, x2, x3, x5, x6, x7
+ ret
+SYM_FUNC_END(remap_region)
+SYM_FUNC_START_LOCAL(create_idmap)
+ mov x28, lr
/*
- * VA_BITS may be too small to allow for an ID mapping to be created
- * that covers system RAM if that is located sufficiently high in the
- * physical address space. So for the ID map, use an extended virtual
- * range in that case, and configure an additional translation level
- * if needed.
+ * The ID map carries a 1:1 mapping of the physical address range
+ * covered by the loaded image, which could be anywhere in DRAM. This
+ * means that the required size of the VA (== PA) space is decided at
+ * boot time, and could be more than the configured size of the VA
+ * space for ordinary kernel and user space mappings.
+ *
+ * There are three cases to consider here:
+ * - 39 <= VA_BITS < 48, and the ID map needs up to 48 VA bits to cover
+ * the placement of the image. In this case, we configure one extra
+ * level of translation on the fly for the ID map only. (This case
+ * also covers 42-bit VA/52-bit PA on 64k pages).
*
- * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
- * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
- * this number conveniently equals the number of leading zeroes in
- * the physical address of __idmap_text_end.
+ * - VA_BITS == 48, and the ID map needs more than 48 VA bits. This can
+ * only happen when using 64k pages, in which case we need to extend
+ * the root level table rather than add a level. Note that we can
+ * treat this case as 'always extended' as long as we take care not
+ * to program an unsupported T0SZ value into the TCR register.
+ *
+ * - Combinations that would require two additional levels of
+ * translation are not supported, e.g., VA_BITS==36 on 16k pages, or
+ * VA_BITS==39/4k pages with 5-level paging, where the input address
+ * requires more than 47 or 48 bits, respectively.
*/
- adrp x5, __idmap_text_end
- clz x5, x5
- cmp x5, TCR_T0SZ(VA_BITS_MIN) // default T0SZ small enough?
- b.ge 1f // .. then skip VA range extension
-
- adr_l x6, idmap_t0sz
- str x5, [x6]
- dmb sy
- dc ivac, x6 // Invalidate potentially stale cache line
-
#if (VA_BITS < 48)
+#define IDMAP_PGD_ORDER (VA_BITS - PGDIR_SHIFT)
#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3)
-#define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT))
/*
* If VA_BITS < 48, we have to configure an additional table level.
@@ -342,36 +320,40 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
#if VA_BITS != EXTRA_SHIFT
#error "Mismatch between VA_BITS and page size/number of translation levels"
#endif
-
- mov x4, EXTRA_PTRS
- create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6
#else
+#define IDMAP_PGD_ORDER (PHYS_MASK_SHIFT - PGDIR_SHIFT)
+#define EXTRA_SHIFT
/*
* If VA_BITS == 48, we don't have to configure an additional
* translation level, but the top-level table has more entries.
*/
- mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT)
- str_l x4, idmap_ptrs_per_pgd, x5
#endif
-1:
- ldr_l x4, idmap_ptrs_per_pgd
- adr_l x6, __idmap_text_end // __pa(__idmap_text_end)
-
- map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14
-
- /*
- * Map the kernel image (starting with PHYS_OFFSET).
- */
- adrp x0, init_pg_dir
- mov_q x5, KIMAGE_VADDR // compile time __va(_text)
- add x5, x5, x23 // add KASLR displacement
- mov x4, PTRS_PER_PGD
- adrp x6, _end // runtime __pa(_end)
- adrp x3, _text // runtime __pa(_text)
- sub x6, x6, x3 // _end - _text
- add x6, x6, x5 // runtime __va(_end)
-
- map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14
+ adrp x0, init_idmap_pg_dir
+ adrp x3, _text
+ adrp x6, _end + MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE
+ mov x7, SWAPPER_RX_MMUFLAGS
+
+ map_memory x0, x1, x3, x6, x7, x3, IDMAP_PGD_ORDER, x10, x11, x12, x13, x14, EXTRA_SHIFT
+
+ /* Remap the kernel page tables r/w in the ID map */
+ adrp x1, _text
+ adrp x2, init_pg_dir
+ adrp x3, init_pg_end
+ bic x4, x2, #SWAPPER_BLOCK_SIZE - 1
+ mov x5, SWAPPER_RW_MMUFLAGS
+ mov x6, #SWAPPER_BLOCK_SHIFT
+ bl remap_region
+
+ /* Remap the FDT after the kernel image */
+ adrp x1, _text
+ adrp x22, _end + SWAPPER_BLOCK_SIZE
+ bic x2, x22, #SWAPPER_BLOCK_SIZE - 1
+ bfi x22, x21, #0, #SWAPPER_BLOCK_SHIFT // remapped FDT address
+ add x3, x2, #MAX_FDT_SIZE + SWAPPER_BLOCK_SIZE
+ bic x4, x21, #SWAPPER_BLOCK_SIZE - 1
+ mov x5, SWAPPER_RW_MMUFLAGS
+ mov x6, #SWAPPER_BLOCK_SHIFT
+ bl remap_region
/*
* Since the page tables have been populated with non-cacheable
@@ -380,16 +362,27 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
*/
dmb sy
- adrp x0, idmap_pg_dir
- adrp x1, idmap_pg_end
+ adrp x0, init_idmap_pg_dir
+ adrp x1, init_idmap_pg_end
bl dcache_inval_poc
+ ret x28
+SYM_FUNC_END(create_idmap)
+SYM_FUNC_START_LOCAL(create_kernel_mapping)
adrp x0, init_pg_dir
- adrp x1, init_pg_end
- bl dcache_inval_poc
+ mov_q x5, KIMAGE_VADDR // compile time __va(_text)
+ add x5, x5, x23 // add KASLR displacement
+ adrp x6, _end // runtime __pa(_end)
+ adrp x3, _text // runtime __pa(_text)
+ sub x6, x6, x3 // _end - _text
+ add x6, x6, x5 // runtime __va(_end)
+ mov x7, SWAPPER_RW_MMUFLAGS
- ret x28
-SYM_FUNC_END(__create_page_tables)
+ map_memory x0, x1, x5, x6, x7, x3, (VA_BITS - PGDIR_SHIFT), x10, x11, x12, x13, x14
+
+ dsb ishst // sync with page table walker
+ ret
+SYM_FUNC_END(create_kernel_mapping)
/*
* Initialize CPU registers with task-specific and cpu-specific context.
@@ -420,7 +413,7 @@ SYM_FUNC_END(__create_page_tables)
/*
* The following fragment of code is executed with the MMU enabled.
*
- * x0 = __PHYS_OFFSET
+ * x0 = __pa(KERNEL_START)
*/
SYM_FUNC_START_LOCAL(__primary_switched)
adr_l x4, init_task
@@ -439,6 +432,9 @@ SYM_FUNC_START_LOCAL(__primary_switched)
sub x4, x4, x0 // the kernel virtual and
str_l x4, kimage_voffset, x5 // physical mappings
+ mov x0, x20
+ bl set_cpu_boot_mode_flag
+
// Clear BSS
adr_l x0, __bss_start
mov x1, xzr
@@ -447,35 +443,30 @@ SYM_FUNC_START_LOCAL(__primary_switched)
bl __pi_memset
dsb ishst // Make zero page visible to PTW
+#if VA_BITS > 48
+ adr_l x8, vabits_actual // Set this early so KASAN early init
+ str x25, [x8] // ... observes the correct value
+ dc civac, x8 // Make visible to booting secondaries
+#endif
+
+#ifdef CONFIG_RANDOMIZE_BASE
+ adrp x5, memstart_offset_seed // Save KASLR linear map seed
+ strh w24, [x5, :lo12:memstart_offset_seed]
+#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
bl kasan_early_init
#endif
mov x0, x21 // pass FDT address in x0
bl early_fdt_map // Try mapping the FDT early
+ mov x0, x20 // pass the full boot status
bl init_feature_override // Parse cpu feature overrides
-#ifdef CONFIG_RANDOMIZE_BASE
- tst x23, ~(MIN_KIMG_ALIGN - 1) // already running randomized?
- b.ne 0f
- bl kaslr_early_init // parse FDT for KASLR options
- cbz x0, 0f // KASLR disabled? just proceed
- orr x23, x23, x0 // record KASLR offset
- ldp x29, x30, [sp], #16 // we must enable KASLR, return
- ret // to __primary_switch()
-0:
-#endif
- bl switch_to_vhe // Prefer VHE if possible
+ mov x0, x20
+ bl finalise_el2 // Prefer VHE if possible
ldp x29, x30, [sp], #16
bl start_kernel
ASM_BUG()
SYM_FUNC_END(__primary_switched)
- .pushsection ".rodata", "a"
-SYM_DATA_START(kimage_vaddr)
- .quad _text
-SYM_DATA_END(kimage_vaddr)
-EXPORT_SYMBOL(kimage_vaddr)
- .popsection
-
/*
* end early head section, begin head code that is also used for
* hotplug and needs to have the same protections as the text region
@@ -490,8 +481,9 @@ EXPORT_SYMBOL(kimage_vaddr)
* Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if
* SCTLR_ELx.EOS is clear), we place an ISB prior to ERET.
*
- * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if
- * booted in EL1 or EL2 respectively.
+ * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in x0 if
+ * booted in EL1 or EL2 respectively, with the top 32 bits containing
+ * potential context flags. These flags are *not* stored in __boot_cpu_mode.
*/
SYM_FUNC_START(init_kernel_el)
mrs x0, CurrentEL
@@ -520,6 +512,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
msr vbar_el2, x0
isb
+ mov_q x1, INIT_SCTLR_EL1_MMU_OFF
+
/*
* Fruity CPUs seem to have HCR_EL2.E2H set to RES1,
* making it impossible to start in nVHE mode. Is that
@@ -529,34 +523,19 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL)
and x0, x0, #HCR_E2H
cbz x0, 1f
- /* Switching to VHE requires a sane SCTLR_EL1 as a start */
- mov_q x0, INIT_SCTLR_EL1_MMU_OFF
- msr_s SYS_SCTLR_EL12, x0
-
- /*
- * Force an eret into a helper "function", and let it return
- * to our original caller... This makes sure that we have
- * initialised the basic PSTATE state.
- */
- mov x0, #INIT_PSTATE_EL2
- msr spsr_el1, x0
- adr x0, __cpu_stick_to_vhe
- msr elr_el1, x0
- eret
+ /* Set a sane SCTLR_EL1, the VHE way */
+ msr_s SYS_SCTLR_EL12, x1
+ mov x2, #BOOT_CPU_FLAG_E2H
+ b 2f
1:
- mov_q x0, INIT_SCTLR_EL1_MMU_OFF
- msr sctlr_el1, x0
-
+ msr sctlr_el1, x1
+ mov x2, xzr
+2:
msr elr_el2, lr
mov w0, #BOOT_CPU_MODE_EL2
+ orr x0, x0, x2
eret
-
-__cpu_stick_to_vhe:
- mov x0, #HVC_VHE_RESTART
- hvc #0
- mov x0, #BOOT_CPU_MODE_EL2
- ret
SYM_FUNC_END(init_kernel_el)
/*
@@ -569,52 +548,21 @@ SYM_FUNC_START_LOCAL(set_cpu_boot_mode_flag)
b.ne 1f
add x1, x1, #4
1: str w0, [x1] // Save CPU boot mode
- dmb sy
- dc ivac, x1 // Invalidate potentially stale cache line
ret
SYM_FUNC_END(set_cpu_boot_mode_flag)
-/*
- * These values are written with the MMU off, but read with the MMU on.
- * Writers will invalidate the corresponding address, discarding up to a
- * 'Cache Writeback Granule' (CWG) worth of data. The linker script ensures
- * sufficient alignment that the CWG doesn't overlap another section.
- */
- .pushsection ".mmuoff.data.write", "aw"
-/*
- * We need to find out the CPU boot mode long after boot, so we need to
- * store it in a writable variable.
- *
- * This is not in .bss, because we set it sufficiently early that the boot-time
- * zeroing of .bss would clobber it.
- */
-SYM_DATA_START(__boot_cpu_mode)
- .long BOOT_CPU_MODE_EL2
- .long BOOT_CPU_MODE_EL1
-SYM_DATA_END(__boot_cpu_mode)
-/*
- * The booting CPU updates the failed status @__early_cpu_boot_status,
- * with MMU turned off.
- */
-SYM_DATA_START(__early_cpu_boot_status)
- .quad 0
-SYM_DATA_END(__early_cpu_boot_status)
-
- .popsection
-
/*
* This provides a "holding pen" for platforms to hold all secondary
* cores are held until we're ready for them to initialise.
*/
SYM_FUNC_START(secondary_holding_pen)
bl init_kernel_el // w0=cpu_boot_mode
- bl set_cpu_boot_mode_flag
- mrs x0, mpidr_el1
+ mrs x2, mpidr_el1
mov_q x1, MPIDR_HWID_BITMASK
- and x0, x0, x1
+ and x2, x2, x1
adr_l x3, secondary_holding_pen_release
pen: ldr x4, [x3]
- cmp x4, x0
+ cmp x4, x2
b.eq secondary_startup
wfe
b pen
@@ -626,7 +574,6 @@ SYM_FUNC_END(secondary_holding_pen)
*/
SYM_FUNC_START(secondary_entry)
bl init_kernel_el // w0=cpu_boot_mode
- bl set_cpu_boot_mode_flag
b secondary_startup
SYM_FUNC_END(secondary_entry)
@@ -634,16 +581,24 @@ SYM_FUNC_START_LOCAL(secondary_startup)
/*
* Common entry point for secondary CPUs.
*/
- bl switch_to_vhe
+ mov x20, x0 // preserve boot mode
+ bl finalise_el2
bl __cpu_secondary_check52bitva
+#if VA_BITS > 48
+ ldr_l x0, vabits_actual
+#endif
bl __cpu_setup // initialise processor
adrp x1, swapper_pg_dir
+ adrp x2, idmap_pg_dir
bl __enable_mmu
ldr x8, =__secondary_switched
br x8
SYM_FUNC_END(secondary_startup)
SYM_FUNC_START_LOCAL(__secondary_switched)
+ mov x0, x20
+ bl set_cpu_boot_mode_flag
+ str_l xzr, __early_cpu_boot_status, x3
adr_l x5, vectors
msr vbar_el1, x5
isb
@@ -691,6 +646,7 @@ SYM_FUNC_END(__secondary_too_slow)
*
* x0 = SCTLR_EL1 value for turning on the MMU.
* x1 = TTBR1_EL1 value
+ * x2 = ID map root table address
*
* Returns to the caller via x30/lr. This requires the caller to be covered
* by the .idmap.text section.
@@ -699,20 +655,15 @@ SYM_FUNC_END(__secondary_too_slow)
* If it isn't, park the CPU
*/
SYM_FUNC_START(__enable_mmu)
- mrs x2, ID_AA64MMFR0_EL1
- ubfx x2, x2, #ID_AA64MMFR0_TGRAN_SHIFT, 4
- cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN
+ mrs x3, ID_AA64MMFR0_EL1
+ ubfx x3, x3, #ID_AA64MMFR0_TGRAN_SHIFT, 4
+ cmp x3, #ID_AA64MMFR0_TGRAN_SUPPORTED_MIN
b.lt __no_granule_support
- cmp x2, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX
+ cmp x3, #ID_AA64MMFR0_TGRAN_SUPPORTED_MAX
b.gt __no_granule_support
- update_early_cpu_boot_status 0, x2, x3
- adrp x2, idmap_pg_dir
- phys_to_ttbr x1, x1
phys_to_ttbr x2, x2
msr ttbr0_el1, x2 // load TTBR0
- offset_ttbr1 x1, x3
- msr ttbr1_el1, x1 // load TTBR1
- isb
+ load_ttbr1 x1, x1, x3
set_sctlr_el1 x0
@@ -720,7 +671,7 @@ SYM_FUNC_START(__enable_mmu)
SYM_FUNC_END(__enable_mmu)
SYM_FUNC_START(__cpu_secondary_check52bitva)
-#ifdef CONFIG_ARM64_VA_BITS_52
+#if VA_BITS > 48
ldr_l x0, vabits_actual
cmp x0, #52
b.ne 2f
@@ -755,13 +706,10 @@ SYM_FUNC_START_LOCAL(__relocate_kernel)
* Iterate over each entry in the relocation table, and apply the
* relocations in place.
*/
- ldr w9, =__rela_offset // offset to reloc table
- ldr w10, =__rela_size // size of reloc table
-
+ adr_l x9, __rela_start
+ adr_l x10, __rela_end
mov_q x11, KIMAGE_VADDR // default virtual offset
add x11, x11, x23 // actual virtual offset
- add x9, x9, x11 // __va(.rela)
- add x10, x9, x10 // __va(.rela) + sizeof(.rela)
0: cmp x9, x10
b.hs 1f
@@ -804,21 +752,9 @@ SYM_FUNC_START_LOCAL(__relocate_kernel)
* entry in x9, the address being relocated by the current address or
* bitmap entry in x13 and the address being relocated by the current
* bit in x14.
- *
- * Because addends are stored in place in the binary, RELR relocations
- * cannot be applied idempotently. We use x24 to keep track of the
- * currently applied displacement so that we can correctly relocate if
- * __relocate_kernel is called twice with non-zero displacements (i.e.
- * if there is both a physical misalignment and a KASLR displacement).
*/
- ldr w9, =__relr_offset // offset to reloc table
- ldr w10, =__relr_size // size of reloc table
- add x9, x9, x11 // __va(.relr)
- add x10, x9, x10 // __va(.relr) + sizeof(.relr)
-
- sub x15, x23, x24 // delta from previous offset
- cbz x15, 7f // nothing to do if unchanged
- mov x24, x23 // save new offset
+ adr_l x9, __relr_start
+ adr_l x10, __relr_end
2: cmp x9, x10
b.hs 7f
@@ -826,7 +762,7 @@ SYM_FUNC_START_LOCAL(__relocate_kernel)
tbnz x11, #0, 3f // branch to handle bitmaps
add x13, x11, x23
ldr x12, [x13] // relocate address entry
- add x12, x12, x15
+ add x12, x12, x23
str x12, [x13], #8 // adjust to start of bitmap
b 2b
@@ -835,7 +771,7 @@ SYM_FUNC_START_LOCAL(__relocate_kernel)
cbz x11, 6f
tbz x11, #0, 5f // skip bit if not set
ldr x12, [x14] // relocate bit
- add x12, x12, x15
+ add x12, x12, x23
str x12, [x14]
5: add x14, x14, #8 // move to next bit's address
@@ -856,43 +792,32 @@ SYM_FUNC_END(__relocate_kernel)
#endif
SYM_FUNC_START_LOCAL(__primary_switch)
+ adrp x1, reserved_pg_dir
+ adrp x2, init_idmap_pg_dir
+ bl __enable_mmu
+#ifdef CONFIG_RELOCATABLE
+ adrp x23, KERNEL_START
+ and x23, x23, MIN_KIMG_ALIGN - 1
#ifdef CONFIG_RANDOMIZE_BASE
- mov x19, x0 // preserve new SCTLR_EL1 value
- mrs x20, sctlr_el1 // preserve old SCTLR_EL1 value
+ mov x0, x22
+ adrp x1, init_pg_end
+ mov sp, x1
+ mov x29, xzr
+ bl __pi_kaslr_early_init
+ and x24, x0, #SZ_2M - 1 // capture memstart offset seed
+ bic x0, x0, #SZ_2M - 1
+ orr x23, x23, x0 // record kernel offset
+#endif
#endif
+ bl clear_page_tables
+ bl create_kernel_mapping
adrp x1, init_pg_dir
- bl __enable_mmu
+ load_ttbr1 x1, x1, x2
#ifdef CONFIG_RELOCATABLE
-#ifdef CONFIG_RELR
- mov x24, #0 // no RELR displacement yet
-#endif
bl __relocate_kernel
-#ifdef CONFIG_RANDOMIZE_BASE
- ldr x8, =__primary_switched
- adrp x0, __PHYS_OFFSET
- blr x8
-
- /*
- * If we return here, we have a KASLR displacement in x23 which we need
- * to take into account by discarding the current kernel mapping and
- * creating a new one.
- */
- pre_disable_mmu_workaround
- msr sctlr_el1, x20 // disable the MMU
- isb
- bl __create_page_tables // recreate kernel mapping
-
- tlbi vmalle1 // Remove any stale TLB entries
- dsb nsh
- isb
-
- set_sctlr_el1 x19 // re-enable the MMU
-
- bl __relocate_kernel
-#endif
#endif
ldr x8, =__primary_switched
- adrp x0, __PHYS_OFFSET
+ adrp x0, KERNEL_START // __pa(KERNEL_START)
br x8
SYM_FUNC_END(__primary_switch)
diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 2e248342476e..af5df48ba915 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -300,11 +300,6 @@ static void swsusp_mte_restore_tags(void)
unsigned long pfn = xa_state.xa_index;
struct page *page = pfn_to_online_page(pfn);
- /*
- * It is not required to invoke page_kasan_tag_reset(page)
- * at this point since the tags stored in page->flags are
- * already restored.
- */
mte_restore_page_tags(page_address(page), tags);
mte_free_tag_storage(tags);
diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S
index 43d212618834..12c7fad02ae5 100644
--- a/arch/arm64/kernel/hyp-stub.S
+++ b/arch/arm64/kernel/hyp-stub.S
@@ -16,6 +16,30 @@
#include <asm/ptrace.h>
#include <asm/virt.h>
+// Warning, hardcoded register allocation
+// This will clobber x1 and x2, and expect x1 to contain
+// the id register value as read from the HW
+.macro __check_override idreg, fld, width, pass, fail
+ ubfx x1, x1, #\fld, #\width
+ cbz x1, \fail
+
+ adr_l x1, \idreg\()_override
+ ldr x2, [x1, FTR_OVR_VAL_OFFSET]
+ ldr x1, [x1, FTR_OVR_MASK_OFFSET]
+ ubfx x2, x2, #\fld, #\width
+ ubfx x1, x1, #\fld, #\width
+ cmp x1, xzr
+ and x2, x2, x1
+ csinv x2, x2, xzr, ne
+ cbnz x2, \pass
+ b \fail
+.endm
+
+.macro check_override idreg, fld, pass, fail
+ mrs x1, \idreg\()_el1
+ __check_override \idreg \fld 4 \pass \fail
+.endm
+
.text
.pushsection .hyp.text, "ax"
@@ -51,8 +75,8 @@ SYM_CODE_START_LOCAL(elx_sync)
msr vbar_el2, x1
b 9f
-1: cmp x0, #HVC_VHE_RESTART
- b.eq mutate_to_vhe
+1: cmp x0, #HVC_FINALISE_EL2
+ b.eq __finalise_el2
2: cmp x0, #HVC_SOFT_RESTART
b.ne 3f
@@ -73,27 +97,67 @@ SYM_CODE_START_LOCAL(elx_sync)
eret
SYM_CODE_END(elx_sync)
-// nVHE? No way! Give me the real thing!
-SYM_CODE_START_LOCAL(mutate_to_vhe)
+SYM_CODE_START_LOCAL(__finalise_el2)
+ check_override id_aa64pfr0 ID_AA64PFR0_SVE_SHIFT .Linit_sve .Lskip_sve
+
+.Linit_sve: /* SVE register access */
+ mrs x0, cptr_el2 // Disable SVE traps
+ bic x0, x0, #CPTR_EL2_TZ
+ msr cptr_el2, x0
+ isb
+ mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector
+ msr_s SYS_ZCR_EL2, x1 // length for EL1.
+
+.Lskip_sve:
+ check_override id_aa64pfr1 ID_AA64PFR1_SME_SHIFT .Linit_sme .Lskip_sme
+
+.Linit_sme: /* SME register access and priority mapping */
+ mrs x0, cptr_el2 // Disable SME traps
+ bic x0, x0, #CPTR_EL2_TSM
+ msr cptr_el2, x0
+ isb
+
+ mrs x1, sctlr_el2
+ orr x1, x1, #SCTLR_ELx_ENTP2 // Disable TPIDR2 traps
+ msr sctlr_el2, x1
+ isb
+
+ mov x0, #0 // SMCR controls
+
+ // Full FP in SM?
+ mrs_s x1, SYS_ID_AA64SMFR0_EL1
+ __check_override id_aa64smfr0 ID_AA64SMFR0_EL1_FA64_SHIFT 1 .Linit_sme_fa64 .Lskip_sme_fa64
+
+.Linit_sme_fa64:
+ orr x0, x0, SMCR_ELx_FA64_MASK
+.Lskip_sme_fa64:
+
+ orr x0, x0, #SMCR_ELx_LEN_MASK // Enable full SME vector
+ msr_s SYS_SMCR_EL2, x0 // length for EL1.
+
+ mrs_s x1, SYS_SMIDR_EL1 // Priority mapping supported?
+ ubfx x1, x1, #SMIDR_EL1_SMPS_SHIFT, #1
+ cbz x1, .Lskip_sme
+
+ msr_s SYS_SMPRIMAP_EL2, xzr // Make all priorities equal
+
+ mrs x1, id_aa64mmfr1_el1 // HCRX_EL2 present?
+ ubfx x1, x1, #ID_AA64MMFR1_HCX_SHIFT, #4
+ cbz x1, .Lskip_sme
+
+ mrs_s x1, SYS_HCRX_EL2
+ orr x1, x1, #HCRX_EL2_SMPME_MASK // Enable priority mapping
+ msr_s SYS_HCRX_EL2, x1
+
+.Lskip_sme:
+
+ // nVHE? No way! Give me the real thing!
// Sanity check: MMU *must* be off
mrs x1, sctlr_el2
tbnz x1, #0, 1f
// Needs to be VHE capable, obviously
- mrs x1, id_aa64mmfr1_el1
- ubfx x1, x1, #ID_AA64MMFR1_VHE_SHIFT, #4
- cbz x1, 1f
-
- // Check whether VHE is disabled from the command line
- adr_l x1, id_aa64mmfr1_override
- ldr x2, [x1, FTR_OVR_VAL_OFFSET]
- ldr x1, [x1, FTR_OVR_MASK_OFFSET]
- ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4
- ubfx x1, x1, #ID_AA64MMFR1_VHE_SHIFT, #4
- cmp x1, xzr
- and x2, x2, x1
- csinv x2, x2, xzr, ne
- cbnz x2, 2f
+ check_override id_aa64mmfr1 ID_AA64MMFR1_VHE_SHIFT 2f 1f
1: mov_q x0, HVC_STUB_ERR
eret
@@ -140,10 +204,10 @@ SYM_CODE_START_LOCAL(mutate_to_vhe)
msr spsr_el1, x0
b enter_vhe
-SYM_CODE_END(mutate_to_vhe)
+SYM_CODE_END(__finalise_el2)
// At the point where we reach enter_vhe(), we run with
- // the MMU off (which is enforced by mutate_to_vhe()).
+ // the MMU off (which is enforced by __finalise_el2()).
// We thus need to be in the idmap, or everything will
// explode when enabling the MMU.
@@ -222,12 +286,12 @@ SYM_FUNC_START(__hyp_reset_vectors)
SYM_FUNC_END(__hyp_reset_vectors)
/*
- * Entry point to switch to VHE if deemed capable
+ * Entry point to finalise EL2 and switch to VHE if deemed capable
+ *
+ * w0: boot mode, as returned by init_kernel_el()
*/
-SYM_FUNC_START(switch_to_vhe)
+SYM_FUNC_START(finalise_el2)
// Need to have booted at EL2
- adr_l x1, __boot_cpu_mode
- ldr w0, [x1]
cmp w0, #BOOT_CPU_MODE_EL2
b.ne 1f
@@ -236,9 +300,8 @@ SYM_FUNC_START(switch_to_vhe)
cmp x0, #CurrentEL_EL1
b.ne 1f
- // Turn the world upside down
- mov x0, #HVC_VHE_RESTART
+ mov x0, #HVC_FINALISE_EL2
hvc #0
1:
ret
-SYM_FUNC_END(switch_to_vhe)
+SYM_FUNC_END(finalise_el2)
diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c
index 8a2ceb591686..1b0542c69738 100644
--- a/arch/arm64/kernel/idreg-override.c
+++ b/arch/arm64/kernel/idreg-override.c
@@ -19,16 +19,21 @@
#define FTR_ALIAS_NAME_LEN 30
#define FTR_ALIAS_OPTION_LEN 116
+static u64 __boot_status __initdata;
+
struct ftr_set_desc {
char name[FTR_DESC_NAME_LEN];
struct arm64_ftr_override *override;
struct {
char name[FTR_DESC_FIELD_LEN];
u8 shift;
+ u8 width;
bool (*filter)(u64 val);
} fields[];
};
+#define FIELD(n, s, f) { .name = n, .shift = s, .width = 4, .filter = f }
+
static bool __init mmfr1_vh_filter(u64 val)
{
/*
@@ -37,24 +42,65 @@ static bool __init mmfr1_vh_filter(u64 val)
* the user was trying to force nVHE on us, proceed with
* attitude adjustment.
*/
- return !(is_kernel_in_hyp_mode() && val == 0);
+ return !(__boot_status == (BOOT_CPU_FLAG_E2H | BOOT_CPU_MODE_EL2) &&
+ val == 0);
}
static const struct ftr_set_desc mmfr1 __initconst = {
.name = "id_aa64mmfr1",
.override = &id_aa64mmfr1_override,
.fields = {
- { "vh", ID_AA64MMFR1_VHE_SHIFT, mmfr1_vh_filter },
+ FIELD("vh", ID_AA64MMFR1_VHE_SHIFT, mmfr1_vh_filter),
+ {}
+ },
+};
+
+static bool __init pfr0_sve_filter(u64 val)
+{
+ /*
+ * Disabling SVE also means disabling all the features that
+ * are associated with it. The easiest way to do it is just to
+ * override id_aa64zfr0_el1 to be 0.
+ */
+ if (!val) {
+ id_aa64zfr0_override.val = 0;
+ id_aa64zfr0_override.mask = GENMASK(63, 0);
+ }
+
+ return true;
+}
+
+static const struct ftr_set_desc pfr0 __initconst = {
+ .name = "id_aa64pfr0",
+ .override = &id_aa64pfr0_override,
+ .fields = {
+ FIELD("sve", ID_AA64PFR0_SVE_SHIFT, pfr0_sve_filter),
{}
},
};
+static bool __init pfr1_sme_filter(u64 val)
+{
+ /*
+ * Similarly to SVE, disabling SME also means disabling all
+ * the features that are associated with it. Just set
+ * id_aa64smfr0_el1 to 0 and don't look back.
+ */
+ if (!val) {
+ id_aa64smfr0_override.val = 0;
+ id_aa64smfr0_override.mask = GENMASK(63, 0);
+ }
+
+ return true;
+}
+
static const struct ftr_set_desc pfr1 __initconst = {
.name = "id_aa64pfr1",
.override = &id_aa64pfr1_override,
.fields = {
- { "bt", ID_AA64PFR1_BT_SHIFT },
- { "mte", ID_AA64PFR1_MTE_SHIFT},
+ FIELD("bt", ID_AA64PFR1_BT_SHIFT, NULL ),
+ FIELD("mte", ID_AA64PFR1_MTE_SHIFT, NULL),
+ FIELD("sme", ID_AA64PFR1_SME_SHIFT, pfr1_sme_filter),
{}
},
};
@@ -63,10 +109,10 @@ static const struct ftr_set_desc isar1 __initconst = {
.name = "id_aa64isar1",
.override = &id_aa64isar1_override,
.fields = {
- { "gpi", ID_AA64ISAR1_GPI_SHIFT },
- { "gpa", ID_AA64ISAR1_GPA_SHIFT },
- { "api", ID_AA64ISAR1_API_SHIFT },
- { "apa", ID_AA64ISAR1_APA_SHIFT },
+ FIELD("gpi", ID_AA64ISAR1_EL1_GPI_SHIFT, NULL),
+ FIELD("gpa", ID_AA64ISAR1_EL1_GPA_SHIFT, NULL),
+ FIELD("api", ID_AA64ISAR1_EL1_API_SHIFT, NULL),
+ FIELD("apa", ID_AA64ISAR1_EL1_APA_SHIFT, NULL),
{}
},
};
@@ -75,8 +121,18 @@ static const struct ftr_set_desc isar2 __initconst = {
.name = "id_aa64isar2",
.override = &id_aa64isar2_override,
.fields = {
- { "gpa3", ID_AA64ISAR2_GPA3_SHIFT },
- { "apa3", ID_AA64ISAR2_APA3_SHIFT },
+ FIELD("gpa3", ID_AA64ISAR2_EL1_GPA3_SHIFT, NULL),
+ FIELD("apa3", ID_AA64ISAR2_EL1_APA3_SHIFT, NULL),
+ {}
+ },
+};
+
+static const struct ftr_set_desc smfr0 __initconst = {
+ .name = "id_aa64smfr0",
+ .override = &id_aa64smfr0_override,
+ .fields = {
+ /* FA64 is a one bit field... :-/ */
+ { "fa64", ID_AA64SMFR0_EL1_FA64_SHIFT, 1, },
{}
},
};
@@ -89,16 +145,18 @@ static const struct ftr_set_desc kaslr __initconst = {
.override = &kaslr_feature_override,
#endif
.fields = {
- { "disabled", 0 },
+ FIELD("disabled", 0, NULL),
{}
},
};
static const struct ftr_set_desc * const regs[] __initconst = {
&mmfr1,
+ &pfr0,
&pfr1,
&isar1,
&isar2,
+ &smfr0,
&kaslr,
};
@@ -108,6 +166,8 @@ static const struct {
} aliases[] __initconst = {
{ "kvm-arm.mode=nvhe", "id_aa64mmfr1.vh=0" },
{ "kvm-arm.mode=protected", "id_aa64mmfr1.vh=0" },
+ { "arm64.nosve", "id_aa64pfr0.sve=0 id_aa64pfr1.sme=0" },
+ { "arm64.nosme", "id_aa64pfr1.sme=0" },
{ "arm64.nobti", "id_aa64pfr1.bt=0" },
{ "arm64.nopauth",
"id_aa64isar1.gpi=0 id_aa64isar1.gpa=0 "
@@ -144,7 +204,8 @@ static void __init match_options(const char *cmdline)
for (f = 0; strlen(regs[i]->fields[f].name); f++) {
u64 shift = regs[i]->fields[f].shift;
- u64 mask = 0xfUL << shift;
+ u64 width = regs[i]->fields[f].width ?: 4;
+ u64 mask = GENMASK_ULL(shift + width - 1, shift);
u64 v;
if (find_field(cmdline, regs[i], f, &v))
@@ -152,7 +213,7 @@ static void __init match_options(const char *cmdline)
/*
* If an override gets filtered out, advertise
- * it by setting the value to 0xf, but
+ * it by setting the value to the all-ones while
* clearing the mask... Yes, this is fragile.
*/
if (regs[i]->fields[f].filter &&
@@ -234,9 +295,9 @@ static __init void parse_cmdline(void)
}
/* Keep checkers quiet */
-void init_feature_override(void);
+void init_feature_override(u64 boot_status);
-asmlinkage void __init init_feature_override(void)
+asmlinkage void __init init_feature_override(u64 boot_status)
{
int i;
@@ -247,6 +308,8 @@ asmlinkage void __init init_feature_override(void)
}
}
+ __boot_status = boot_status;
+
parse_cmdline();
for (i = 0; i < ARRAY_SIZE(regs); i++) {
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 241c86b67d01..afa69e04e75e 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -10,11 +10,8 @@
#error This file should only be included in vmlinux.lds.S
#endif
-#ifdef CONFIG_EFI
-
-__efistub_kernel_size = _edata - _text;
-__efistub_primary_entry_offset = primary_entry - _text;
-
+PROVIDE(__efistub_kernel_size = _edata - _text);
+PROVIDE(__efistub_primary_entry_offset = primary_entry - _text);
/*
* The EFI stub has its own symbol namespace prefixed by __efistub_, to
@@ -25,31 +22,37 @@ __efistub_primary_entry_offset = primary_entry - _text;
* linked at. The routines below are all implemented in assembler in a
* position independent manner
*/
-__efistub_memcmp = __pi_memcmp;
-__efistub_memchr = __pi_memchr;
-__efistub_memcpy = __pi_memcpy;
-__efistub_memmove = __pi_memmove;
-__efistub_memset = __pi_memset;
-__efistub_strlen = __pi_strlen;
-__efistub_strnlen = __pi_strnlen;
-__efistub_strcmp = __pi_strcmp;
-__efistub_strncmp = __pi_strncmp;
-__efistub_strrchr = __pi_strrchr;
-__efistub_dcache_clean_poc = __pi_dcache_clean_poc;
-
-#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
-__efistub___memcpy = __pi_memcpy;
-__efistub___memmove = __pi_memmove;
-__efistub___memset = __pi_memset;
-#endif
+PROVIDE(__efistub_memcmp = __pi_memcmp);
+PROVIDE(__efistub_memchr = __pi_memchr);
+PROVIDE(__efistub_memcpy = __pi_memcpy);
+PROVIDE(__efistub_memmove = __pi_memmove);
+PROVIDE(__efistub_memset = __pi_memset);
+PROVIDE(__efistub_strlen = __pi_strlen);
+PROVIDE(__efistub_strnlen = __pi_strnlen);
+PROVIDE(__efistub_strcmp = __pi_strcmp);
+PROVIDE(__efistub_strncmp = __pi_strncmp);
+PROVIDE(__efistub_strrchr = __pi_strrchr);
+PROVIDE(__efistub_dcache_clean_poc = __pi_dcache_clean_poc);
+
+PROVIDE(__efistub__text = _text);
+PROVIDE(__efistub__end = _end);
+PROVIDE(__efistub__edata = _edata);
+PROVIDE(__efistub_screen_info = screen_info);
+PROVIDE(__efistub__ctype = _ctype);
-__efistub__text = _text;
-__efistub__end = _end;
-__efistub__edata = _edata;
-__efistub_screen_info = screen_info;
-__efistub__ctype = _ctype;
+/*
+ * The __ prefixed memcpy/memset/memmove symbols are provided by KASAN, which
+ * instruments the conventional ones. Therefore, any references from the EFI
+ * stub or other position independent, low level C code should be redirected to
+ * the non-instrumented versions as well.
+ */
+PROVIDE(__efistub___memcpy = __pi_memcpy);
+PROVIDE(__efistub___memmove = __pi_memmove);
+PROVIDE(__efistub___memset = __pi_memset);
-#endif
+PROVIDE(__pi___memcpy = __pi_memcpy);
+PROVIDE(__pi___memmove = __pi_memmove);
+PROVIDE(__pi___memset = __pi_memset);
#ifdef CONFIG_KVM
diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c
index fc98037e1220..faf88ec9c48e 100644
--- a/arch/arm64/kernel/jump_label.c
+++ b/arch/arm64/kernel/jump_label.c
@@ -26,14 +26,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
aarch64_insn_patch_text_nosync(addr, insn);
}
-
-void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
-{
- /*
- * We use the architected A64 NOP in arch_static_branch, so there's no
- * need to patch an identical A64 NOP over the top of it here. The core
- * will call arch_jump_label_transform from a module notifier if the
- * NOP needs to be replaced by a branch.
- */
-}
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c
index 418b2bba1521..325455d16dbc 100644
--- a/arch/arm64/kernel/kaslr.c
+++ b/arch/arm64/kernel/kaslr.c
@@ -13,7 +13,6 @@
#include <linux/pgtable.h>
#include <linux/random.h>
-#include <asm/cacheflush.h>
#include <asm/fixmap.h>
#include <asm/kernel-pgtable.h>
#include <asm/memory.h>
@@ -21,128 +20,45 @@
#include <asm/sections.h>
#include <asm/setup.h>
-enum kaslr_status {
- KASLR_ENABLED,
- KASLR_DISABLED_CMDLINE,
- KASLR_DISABLED_NO_SEED,
- KASLR_DISABLED_FDT_REMAP,
-};
-
-static enum kaslr_status __initdata kaslr_status;
u64 __ro_after_init module_alloc_base;
u16 __initdata memstart_offset_seed;
-static __init u64 get_kaslr_seed(void *fdt)
-{
- int node, len;
- fdt64_t *prop;
- u64 ret;
-
- node = fdt_path_offset(fdt, "/chosen");
- if (node < 0)
- return 0;
-
- prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len);
- if (!prop || len != sizeof(u64))
- return 0;
-
- ret = fdt64_to_cpu(*prop);
- *prop = 0;
- return ret;
-}
-
struct arm64_ftr_override kaslr_feature_override __initdata;
-/*
- * This routine will be executed with the kernel mapped at its default virtual
- * address, and if it returns successfully, the kernel will be remapped, and
- * start_kernel() will be executed from a randomized virtual offset. The
- * relocation will result in all absolute references (e.g., static variables
- * containing function pointers) to be reinitialized, and zero-initialized
- * .bss variables will be reset to 0.
- */
-u64 __init kaslr_early_init(void)
+static int __init kaslr_init(void)
{
- void *fdt;
- u64 seed, offset, mask, module_range;
- unsigned long raw;
+ u64 module_range;
+ u32 seed;
/*
* Set a reasonable default for module_alloc_base in case
* we end up running with module randomization disabled.
*/
module_alloc_base = (u64)_etext - MODULES_VSIZE;
- dcache_clean_inval_poc((unsigned long)&module_alloc_base,
- (unsigned long)&module_alloc_base +
- sizeof(module_alloc_base));
-
- /*
- * Try to map the FDT early. If this fails, we simply bail,
- * and proceed with KASLR disabled. We will make another
- * attempt at mapping the FDT in setup_machine()
- */
- fdt = get_early_fdt_ptr();
- if (!fdt) {
- kaslr_status = KASLR_DISABLED_FDT_REMAP;
- return 0;
- }
- /*
- * Retrieve (and wipe) the seed from the FDT
- */
- seed = get_kaslr_seed(fdt);
-
- /*
- * Check if 'nokaslr' appears on the command line, and
- * return 0 if that is the case.
- */
if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) {
- kaslr_status = KASLR_DISABLED_CMDLINE;
+ pr_info("KASLR disabled on command line\n");
return 0;
}
- /*
- * Mix in any entropy obtainable architecturally if enabled
- * and supported.
- */
-
- if (arch_get_random_seed_long_early(&raw))
- seed ^= raw;
-
- if (!seed) {
- kaslr_status = KASLR_DISABLED_NO_SEED;
+ if (!kaslr_offset()) {
+ pr_warn("KASLR disabled due to lack of seed\n");
return 0;
}
+ pr_info("KASLR enabled\n");
+
/*
- * OK, so we are proceeding with KASLR enabled. Calculate a suitable
- * kernel image offset from the seed. Let's place the kernel in the
- * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of
- * the lower and upper quarters to avoid colliding with other
- * allocations.
- * Even if we could randomize at page granularity for 16k and 64k pages,
- * let's always round to 2 MB so we don't interfere with the ability to
- * map using contiguous PTEs
+ * KASAN without KASAN_VMALLOC does not expect the module region to
+ * intersect the vmalloc region, since shadow memory is allocated for
+ * each module at load time, whereas the vmalloc region will already be
+ * shadowed by KASAN zero pages.
*/
- mask = ((1UL << (VA_BITS_MIN - 2)) - 1) & ~(SZ_2M - 1);
- offset = BIT(VA_BITS_MIN - 3) + (seed & mask);
+ BUILD_BUG_ON((IS_ENABLED(CONFIG_KASAN_GENERIC) ||
+ IS_ENABLED(CONFIG_KASAN_SW_TAGS)) &&
+ !IS_ENABLED(CONFIG_KASAN_VMALLOC));
- /* use the top 16 bits to randomize the linear region */
- memstart_offset_seed = seed >> 48;
-
- if (!IS_ENABLED(CONFIG_KASAN_VMALLOC) &&
- (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
- IS_ENABLED(CONFIG_KASAN_SW_TAGS)))
- /*
- * KASAN without KASAN_VMALLOC does not expect the module region
- * to intersect the vmalloc region, since shadow memory is
- * allocated for each module at load time, whereas the vmalloc
- * region is shadowed by KASAN zero pages. So keep modules
- * out of the vmalloc region if KASAN is enabled without
- * KASAN_VMALLOC, and put the kernel well within 4 GB of the
- * module region.
- */
- return offset % SZ_2G;
+ seed = get_random_u32();
if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
/*
@@ -154,8 +70,7 @@ u64 __init kaslr_early_init(void)
* resolved normally.)
*/
module_range = SZ_2G - (u64)(_end - _stext);
- module_alloc_base = max((u64)_end + offset - SZ_2G,
- (u64)MODULES_VADDR);
+ module_alloc_base = max((u64)_end - SZ_2G, (u64)MODULES_VADDR);
} else {
/*
* Randomize the module region by setting module_alloc_base to
@@ -167,40 +82,12 @@ u64 __init kaslr_early_init(void)
* when ARM64_MODULE_PLTS is enabled.
*/
module_range = MODULES_VSIZE - (u64)(_etext - _stext);
- module_alloc_base = (u64)_etext + offset - MODULES_VSIZE;
}
/* use the lower 21 bits to randomize the base of the module region */
module_alloc_base += (module_range * (seed & ((1 << 21) - 1))) >> 21;
module_alloc_base &= PAGE_MASK;
- dcache_clean_inval_poc((unsigned long)&module_alloc_base,
- (unsigned long)&module_alloc_base +
- sizeof(module_alloc_base));
- dcache_clean_inval_poc((unsigned long)&memstart_offset_seed,
- (unsigned long)&memstart_offset_seed +
- sizeof(memstart_offset_seed));
-
- return offset;
-}
-
-static int __init kaslr_init(void)
-{
- switch (kaslr_status) {
- case KASLR_ENABLED:
- pr_info("KASLR enabled\n");
- break;
- case KASLR_DISABLED_CMDLINE:
- pr_info("KASLR disabled on command line\n");
- break;
- case KASLR_DISABLED_NO_SEED:
- pr_warn("KASLR disabled due to lack of seed\n");
- break;
- case KASLR_DISABLED_FDT_REMAP:
- pr_warn("KASLR disabled due to FDT remapping failure\n");
- break;
- }
-
return 0;
}
-core_initcall(kaslr_init)
+subsys_initcall(kaslr_init)
diff --git a/arch/arm64/kernel/kuser32.S b/arch/arm64/kernel/kuser32.S
index 42bd8c0c60e0..692e9d2e31e5 100644
--- a/arch/arm64/kernel/kuser32.S
+++ b/arch/arm64/kernel/kuser32.S
@@ -15,6 +15,7 @@
#include <asm/unistd.h>
+ .section .rodata
.align 5
.globl __kuser_helper_start
__kuser_helper_start:
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index f6b00743c399..b2b730233274 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -48,15 +48,6 @@ static void mte_sync_page_tags(struct page *page, pte_t old_pte,
if (!pte_is_tagged)
return;
- page_kasan_tag_reset(page);
- /*
- * We need smp_wmb() in between setting the flags and clearing the
- * tags because if another thread reads page->flags and builds a
- * tagged address out of it, there is an actual dependency to the
- * memory access, but on the current thread we do not guarantee that
- * the new page->flags are visible before the tags were updated.
- */
- smp_wmb();
mte_clear_page_tags(page_address(page));
}
diff --git a/arch/arm64/kernel/pi/Makefile b/arch/arm64/kernel/pi/Makefile
new file mode 100644
index 000000000000..839291430cb3
--- /dev/null
+++ b/arch/arm64/kernel/pi/Makefile
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright 2022 Google LLC
+
+KBUILD_CFLAGS := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
+ -Os -DDISABLE_BRANCH_PROFILING $(DISABLE_STACKLEAK_PLUGIN) \
+ $(call cc-option,-mbranch-protection=none) \
+ -I$(srctree)/scripts/dtc/libfdt -fno-stack-protector \
+ -include $(srctree)/include/linux/hidden.h \
+ -D__DISABLE_EXPORTS -ffreestanding -D__NO_FORTIFY \
+ $(call cc-option,-fno-addrsig)
+
+# remove SCS flags from all objects in this directory
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_SCS), $(KBUILD_CFLAGS))
+# disable LTO
+KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS))
+
+GCOV_PROFILE := n
+KASAN_SANITIZE := n
+KCSAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+$(obj)/%.pi.o: OBJCOPYFLAGS := --prefix-symbols=__pi_ \
+ --remove-section=.note.gnu.property \
+ --prefix-alloc-sections=.init
+$(obj)/%.pi.o: $(obj)/%.o FORCE
+ $(call if_changed,objcopy)
+
+$(obj)/lib-%.o: $(srctree)/lib/%.c FORCE
+ $(call if_changed_rule,cc_o_c)
+
+obj-y := kaslr_early.pi.o lib-fdt.pi.o lib-fdt_ro.pi.o
+extra-y := $(patsubst %.pi.o,%.o,$(obj-y))
diff --git a/arch/arm64/kernel/pi/kaslr_early.c b/arch/arm64/kernel/pi/kaslr_early.c
new file mode 100644
index 000000000000..6c3855e69395
--- /dev/null
+++ b/arch/arm64/kernel/pi/kaslr_early.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright 2022 Google LLC
+// Author: Ard Biesheuvel <ardb@google.com>
+
+// NOTE: code in this file runs *very* early, and is not permitted to use
+// global variables or anything that relies on absolute addressing.
+
+#include <linux/libfdt.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/sizes.h>
+#include <linux/string.h>
+
+#include <asm/archrandom.h>
+#include <asm/memory.h>
+
+/* taken from lib/string.c */
+static char *__strstr(const char *s1, const char *s2)
+{
+ size_t l1, l2;
+
+ l2 = strlen(s2);
+ if (!l2)
+ return (char *)s1;
+ l1 = strlen(s1);
+ while (l1 >= l2) {
+ l1--;
+ if (!memcmp(s1, s2, l2))
+ return (char *)s1;
+ s1++;
+ }
+ return NULL;
+}
+static bool cmdline_contains_nokaslr(const u8 *cmdline)
+{
+ const u8 *str;
+
+ str = __strstr(cmdline, "nokaslr");
+ return str == cmdline || (str > cmdline && *(str - 1) == ' ');
+}
+
+static bool is_kaslr_disabled_cmdline(void *fdt)
+{
+ if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) {
+ int node;
+ const u8 *prop;
+
+ node = fdt_path_offset(fdt, "/chosen");
+ if (node < 0)
+ goto out;
+
+ prop = fdt_getprop(fdt, node, "bootargs", NULL);
+ if (!prop)
+ goto out;
+
+ if (cmdline_contains_nokaslr(prop))
+ return true;
+
+ if (IS_ENABLED(CONFIG_CMDLINE_EXTEND))
+ goto out;
+
+ return false;
+ }
+out:
+ return cmdline_contains_nokaslr(CONFIG_CMDLINE);
+}
+
+static u64 get_kaslr_seed(void *fdt)
+{
+ int node, len;
+ fdt64_t *prop;
+ u64 ret;
+
+ node = fdt_path_offset(fdt, "/chosen");
+ if (node < 0)
+ return 0;
+
+ prop = fdt_getprop_w(fdt, node, "kaslr-seed", &len);
+ if (!prop || len != sizeof(u64))
+ return 0;
+
+ ret = fdt64_to_cpu(*prop);
+ *prop = 0;
+ return ret;
+}
+
+asmlinkage u64 kaslr_early_init(void *fdt)
+{
+ u64 seed;
+
+ if (is_kaslr_disabled_cmdline(fdt))
+ return 0;
+
+ seed = get_kaslr_seed(fdt);
+ if (!seed) {
+#ifdef CONFIG_ARCH_RANDOM
+ if (!__early_cpu_has_rndr() ||
+ !__arm64_rndr((unsigned long *)&seed))
+#endif
+ return 0;
+ }
+
+ /*
+ * OK, so we are proceeding with KASLR enabled. Calculate a suitable
+ * kernel image offset from the seed. Let's place the kernel in the
+ * middle half of the VMALLOC area (VA_BITS_MIN - 2), and stay clear of
+ * the lower and upper quarters to avoid colliding with other
+ * allocations.
+ */
+ return BIT(VA_BITS_MIN - 3) + (seed & GENMASK(VA_BITS_MIN - 3, 0));
+}
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index b0980fbb6bc7..3e6d0352d7d3 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -280,6 +280,9 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user)
vl = task_get_sme_vl(current);
} else {
+ if (!system_supports_sve())
+ return -EINVAL;
+
vl = task_get_sve_vl(current);
}
@@ -342,9 +345,14 @@ fpsimd_only:
#else /* ! CONFIG_ARM64_SVE */
-/* Turn any non-optimised out attempts to use these into a link error: */
+static int restore_sve_fpsimd_context(struct user_ctxs *user)
+{
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
+
+/* Turn any non-optimised out attempts to use this into a link error: */
extern int preserve_sve_context(void __user *ctx);
-extern int restore_sve_fpsimd_context(struct user_ctxs *user);
#endif /* ! CONFIG_ARM64_SVE */
@@ -649,14 +657,10 @@ static int restore_sigframe(struct pt_regs *regs,
if (!user.fpsimd)
return -EINVAL;
- if (user.sve) {
- if (!system_supports_sve())
- return -EINVAL;
-
+ if (user.sve)
err = restore_sve_fpsimd_context(&user);
- } else {
+ else
err = restore_fpsimd_context(user.fpsimd);
- }
}
if (err == 0 && system_supports_sme() && user.za)
diff --git a/arch/arm64/kernel/sigreturn32.S b/arch/arm64/kernel/sigreturn32.S
index 475d30d471ac..ccbd4aab4ba4 100644
--- a/arch/arm64/kernel/sigreturn32.S
+++ b/arch/arm64/kernel/sigreturn32.S
@@ -15,6 +15,7 @@
#include <asm/unistd.h>
+ .section .rodata
.globl __aarch32_sigret_code_start
__aarch32_sigret_code_start:
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index 4ea9392f86e0..617f78ad43a1 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -100,10 +100,11 @@ SYM_FUNC_END(__cpu_suspend_enter)
.pushsection ".idmap.text", "awx"
SYM_CODE_START(cpu_resume)
bl init_kernel_el
- bl switch_to_vhe
+ bl finalise_el2
bl __cpu_setup
/* enable the MMU early - so we can access sleep_save_stash by va */
adrp x1, swapper_pg_dir
+ adrp x2, idmap_pg_dir
bl __enable_mmu
ldr x8, =_cpu_resume
br x8
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 0467cb79f080..fcaa151b81f1 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -38,6 +38,8 @@
* @kr_cur: When KRETPROBES is selected, holds the kretprobe instance
* associated with the most recently encountered replacement lr
* value.
+ *
+ * @task: The task being unwound.
*/
struct unwind_state {
unsigned long fp;
@@ -48,13 +50,13 @@ struct unwind_state {
#ifdef CONFIG_KRETPROBES
struct llist_node *kr_cur;
#endif
+ struct task_struct *task;
};
-static notrace void unwind_init(struct unwind_state *state, unsigned long fp,
- unsigned long pc)
+static void unwind_init_common(struct unwind_state *state,
+ struct task_struct *task)
{
- state->fp = fp;
- state->pc = pc;
+ state->task = task;
#ifdef CONFIG_KRETPROBES
state->kr_cur = NULL;
#endif
@@ -72,7 +74,57 @@ static notrace void unwind_init(struct unwind_state *state, unsigned long fp,
state->prev_fp = 0;
state->prev_type = STACK_TYPE_UNKNOWN;
}
-NOKPROBE_SYMBOL(unwind_init);
+
+/*
+ * Start an unwind from a pt_regs.
+ *
+ * The unwind will begin at the PC within the regs.
+ *
+ * The regs must be on a stack currently owned by the calling task.
+ */
+static inline void unwind_init_from_regs(struct unwind_state *state,
+ struct pt_regs *regs)
+{
+ unwind_init_common(state, current);
+
+ state->fp = regs->regs[29];
+ state->pc = regs->pc;
+}
+
+/*
+ * Start an unwind from a caller.
+ *
+ * The unwind will begin at the caller of whichever function this is inlined
+ * into.
+ *
+ * The function which invokes this must be noinline.
+ */
+static __always_inline void unwind_init_from_caller(struct unwind_state *state)
+{
+ unwind_init_common(state, current);
+
+ state->fp = (unsigned long)__builtin_frame_address(1);
+ state->pc = (unsigned long)__builtin_return_address(0);
+}
+
+/*
+ * Start an unwind from a blocked task.
+ *
+ * The unwind will begin at the blocked tasks saved PC (i.e. the caller of
+ * cpu_switch_to()).
+ *
+ * The caller should ensure the task is blocked in cpu_switch_to() for the
+ * duration of the unwind, or the unwind will be bogus. It is never valid to
+ * call this for the current task.
+ */
+static inline void unwind_init_from_task(struct unwind_state *state,
+ struct task_struct *task)
+{
+ unwind_init_common(state, task);
+
+ state->fp = thread_saved_fp(task);
+ state->pc = thread_saved_pc(task);
+}
/*
* Unwind from one frame record (A) to the next frame record (B).
@@ -81,9 +133,9 @@ NOKPROBE_SYMBOL(unwind_init);
* records (e.g. a cycle), determined based on the location and fp value of A
* and the location (but not the fp value) of B.
*/
-static int notrace unwind_next(struct task_struct *tsk,
- struct unwind_state *state)
+static int notrace unwind_next(struct unwind_state *state)
{
+ struct task_struct *tsk = state->task;
unsigned long fp = state->fp;
struct stack_info info;
@@ -117,15 +169,15 @@ static int notrace unwind_next(struct task_struct *tsk,
if (fp <= state->prev_fp)
return -EINVAL;
} else {
- set_bit(state->prev_type, state->stacks_done);
+ __set_bit(state->prev_type, state->stacks_done);
}
/*
* Record this frame record's values and location. The prev_fp and
* prev_type are only meaningful to the next unwind_next() invocation.
*/
- state->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp));
- state->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8));
+ state->fp = READ_ONCE(*(unsigned long *)(fp));
+ state->pc = READ_ONCE(*(unsigned long *)(fp + 8));
state->prev_fp = fp;
state->prev_type = info.type;
@@ -157,8 +209,7 @@ static int notrace unwind_next(struct task_struct *tsk,
}
NOKPROBE_SYMBOL(unwind_next);
-static void notrace unwind(struct task_struct *tsk,
- struct unwind_state *state,
+static void notrace unwind(struct unwind_state *state,
stack_trace_consume_fn consume_entry, void *cookie)
{
while (1) {
@@ -166,7 +217,7 @@ static void notrace unwind(struct task_struct *tsk,
if (!consume_entry(cookie, state->pc))
break;
- ret = unwind_next(tsk, state);
+ ret = unwind_next(state);
if (ret < 0)
break;
}
@@ -212,15 +263,15 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry,
{
struct unwind_state state;
- if (regs)
- unwind_init(&state, regs->regs[29], regs->pc);
- else if (task == current)
- unwind_init(&state,
- (unsigned long)__builtin_frame_address(1),
- (unsigned long)__builtin_return_address(0));
- else
- unwind_init(&state, thread_saved_fp(task),
- thread_saved_pc(task));
-
- unwind(task, &state, consume_entry, cookie);
+ if (regs) {
+ if (task != current)
+ return;
+ unwind_init_from_regs(&state, regs);
+ } else if (task == current) {
+ unwind_init_from_caller(&state);
+ } else {
+ unwind_init_from_task(&state, task);
+ }
+
+ unwind(&state, consume_entry, cookie);
}
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index 2b0887e58a7c..9135fe0f3df5 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -52,7 +52,7 @@ void notrace __cpu_suspend_exit(void)
/* Restore CnP bit in TTBR1_EL1 */
if (system_supports_cnp())
- cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+ cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir);
/*
* PSTATE was not saved over suspend/resume, re-enable any detected
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 9ac7a81b79be..b7fed33981f7 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -579,11 +579,11 @@ static void ctr_read_handler(unsigned long esr, struct pt_regs *regs)
if (cpus_have_const_cap(ARM64_WORKAROUND_1542419)) {
/* Hide DIC so that we can trap the unnecessary maintenance...*/
- val &= ~BIT(CTR_DIC_SHIFT);
+ val &= ~BIT(CTR_EL0_DIC_SHIFT);
/* ... and fake IminLine to reduce the number of traps. */
- val &= ~CTR_IMINLINE_MASK;
- val |= (PAGE_SHIFT - 2) & CTR_IMINLINE_MASK;
+ val &= ~CTR_EL0_IminLine_MASK;
+ val |= (PAGE_SHIFT - 2) & CTR_EL0_IminLine_MASK;
}
pt_regs_write_reg(regs, rt, val);
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index f6e25d7c346a..bafbf78fab77 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -24,7 +24,13 @@ btildflags-$(CONFIG_ARM64_BTI_KERNEL) += -z force-bti
# routines, as x86 does (see 6f121e548f83 ("x86, vdso: Reimplement vdso.so
# preparation in build-time C")).
ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \
- -Bsymbolic --build-id=sha1 -n $(btildflags-y) -T
+ -Bsymbolic --build-id=sha1 -n $(btildflags-y)
+
+ifdef CONFIG_LD_ORPHAN_WARN
+ ldflags-y += --orphan-handling=warn
+endif
+
+ldflags-y += -T
ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18
ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
index a5e61e09ea92..e69fb4aaaf3e 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -11,6 +11,7 @@
#include <linux/const.h>
#include <asm/page.h>
#include <asm/vdso.h>
+#include <asm-generic/vmlinux.lds.h>
OUTPUT_FORMAT("elf64-littleaarch64", "elf64-bigaarch64", "elf64-littleaarch64")
OUTPUT_ARCH(aarch64)
@@ -49,11 +50,24 @@ SECTIONS
.dynamic : { *(.dynamic) } :text :dynamic
- .rodata : { *(.rodata*) } :text
+ .rela.dyn : ALIGN(8) { *(.rela .rela*) }
+
+ .rodata : {
+ *(.rodata*)
+ *(.got)
+ *(.got.plt)
+ *(.plt)
+ *(.plt.*)
+ *(.iplt)
+ *(.igot .igot.plt)
+ } :text
_end = .;
PROVIDE(end = .);
+ DWARF_DEBUG
+ ELF_DETAILS
+
/DISCARD/ : {
*(.data .data.* .gnu.linkonce.d.* .sdata*)
*(.bss .sbss .dynbss .dynsbss)
diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile
index 05ba1aae1b6f..36c8f66cad25 100644
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -104,6 +104,7 @@ VDSO_AFLAGS += -D__ASSEMBLY__
VDSO_LDFLAGS += -Bsymbolic --no-undefined -soname=linux-vdso.so.1
VDSO_LDFLAGS += -z max-page-size=4096 -z common-page-size=4096
VDSO_LDFLAGS += -shared --hash-style=sysv --build-id=sha1
+VDSO_LDFLAGS += --orphan-handling=warn
# Borrow vdsomunge.c from the arm vDSO
diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S
index 3348ce5ea306..8d95d7d35057 100644
--- a/arch/arm64/kernel/vdso32/vdso.lds.S
+++ b/arch/arm64/kernel/vdso32/vdso.lds.S
@@ -11,6 +11,7 @@
#include <linux/const.h>
#include <asm/page.h>
#include <asm/vdso.h>
+#include <asm-generic/vmlinux.lds.h>
OUTPUT_FORMAT("elf32-littlearm", "elf32-bigarm", "elf32-littlearm")
OUTPUT_ARCH(arm)
@@ -35,12 +36,30 @@ SECTIONS
.dynamic : { *(.dynamic) } :text :dynamic
- .rodata : { *(.rodata*) } :text
+ .rodata : {
+ *(.rodata*)
+ *(.got)
+ *(.got.plt)
+ *(.plt)
+ *(.rel.iplt)
+ *(.iplt)
+ *(.igot.plt)
+ } :text
- .text : { *(.text*) } :text =0xe7f001f2
+ .text : {
+ *(.text*)
+ *(.glue_7)
+ *(.glue_7t)
+ *(.vfp11_veneer)
+ *(.v4_bx)
+ } :text =0xe7f001f2
- .got : { *(.got) }
- .rel.plt : { *(.rel.plt) }
+ .rel.dyn : { *(.rel*) }
+
+ .ARM.exidx : { *(.ARM.exidx*) }
+ DWARF_DEBUG
+ ELF_DETAILS
+ .ARM.attributes 0 : { *(.ARM.attributes) }
/DISCARD/ : {
*(.note.GNU-stack)
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 2d4a8f995175..45131e354e27 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -115,7 +115,8 @@ jiffies = jiffies_64;
__entry_tramp_text_start = .; \
*(.entry.tramp.text) \
. = ALIGN(PAGE_SIZE); \
- __entry_tramp_text_end = .;
+ __entry_tramp_text_end = .; \
+ *(.entry.tramp.rodata)
#else
#define TRAMP_TEXT
#endif
@@ -198,8 +199,7 @@ SECTIONS
}
idmap_pg_dir = .;
- . += IDMAP_DIR_SIZE;
- idmap_pg_end = .;
+ . += PAGE_SIZE;
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
tramp_pg_dir = .;
@@ -235,6 +235,10 @@ SECTIONS
__inittext_end = .;
__initdata_begin = .;
+ init_idmap_pg_dir = .;
+ . += INIT_IDMAP_DIR_SIZE;
+ init_idmap_pg_end = .;
+
.init.data : {
INIT_DATA
INIT_SETUP(16)
@@ -253,21 +257,17 @@ SECTIONS
HYPERVISOR_RELOC_SECTION
.rela.dyn : ALIGN(8) {
+ __rela_start = .;
*(.rela .rela*)
+ __rela_end = .;
}
- __rela_offset = ABSOLUTE(ADDR(.rela.dyn) - KIMAGE_VADDR);
- __rela_size = SIZEOF(.rela.dyn);
-
-#ifdef CONFIG_RELR
.relr.dyn : ALIGN(8) {
+ __relr_start = .;
*(.relr.dyn)
+ __relr_end = .;
}
- __relr_offset = ABSOLUTE(ADDR(.relr.dyn) - KIMAGE_VADDR);
- __relr_size = SIZEOF(.relr.dyn);
-#endif
-
. = ALIGN(SEGMENT_ALIGN);
__initdata_end = .;
__init_end = .;
diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h
index fd55014b3497..fa6e466ed57f 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h
@@ -176,25 +176,25 @@
)
#define PVM_ID_AA64ISAR1_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64ISAR1_DPB) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_JSCVT) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_FCMA) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_LRCPC) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_FRINTTS) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_SB) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_SPECRES) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_BF16) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_DGH) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR1_I8MM) \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \
)
#define PVM_ID_AA64ISAR2_ALLOW (\
- ARM64_FEATURE_MASK(ID_AA64ISAR2_GPA3) | \
- ARM64_FEATURE_MASK(ID_AA64ISAR2_APA3) \
+ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \
+ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) \
)
u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id);
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 35a4331ba5f3..6b94c3e6ff26 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -173,10 +173,10 @@ static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu)
u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW;
if (!vcpu_has_ptrauth(vcpu))
- allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_API) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI));
+ allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI));
return id_aa64isar1_el1_sys_val & allow_mask;
}
@@ -186,8 +186,8 @@ static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu)
u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW;
if (!vcpu_has_ptrauth(vcpu))
- allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_APA3) |
- ARM64_FEATURE_MASK(ID_AA64ISAR2_GPA3));
+ allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
return id_aa64isar2_el1_sys_val & allow_mask;
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index c06c0477fab5..c4fb3874b5e2 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1136,17 +1136,17 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu,
break;
case SYS_ID_AA64ISAR1_EL1:
if (!vcpu_has_ptrauth(vcpu))
- val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_API) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) |
- ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI));
+ val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI));
break;
case SYS_ID_AA64ISAR2_EL1:
if (!vcpu_has_ptrauth(vcpu))
- val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_APA3) |
- ARM64_FEATURE_MASK(ID_AA64ISAR2_GPA3));
+ val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) |
+ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3));
if (!cpus_have_final_cap(ARM64_HAS_WFXT))
- val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_WFXT);
+ val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT);
break;
case SYS_ID_AA64DFR0_EL1:
/* Limit debug to ARMv8.0 */
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index eeb9e45bcce8..1b7c93ae7e63 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -18,7 +18,7 @@
*/
.macro multitag_transfer_size, reg, tmp
mrs_s \reg, SYS_GMID_EL1
- ubfx \reg, \reg, #SYS_GMID_EL1_BS_SHIFT, #SYS_GMID_EL1_BS_SIZE
+ ubfx \reg, \reg, #GMID_EL1_BS_SHIFT, #GMID_EL1_BS_SIZE
mov \tmp, #4
lsl \reg, \tmp, \reg
.endm
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 21c907987080..081058d4e436 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -194,44 +194,3 @@ SYM_FUNC_START(__pi_dcache_clean_pop)
ret
SYM_FUNC_END(__pi_dcache_clean_pop)
SYM_FUNC_ALIAS(dcache_clean_pop, __pi_dcache_clean_pop)
-
-/*
- * __dma_flush_area(start, size)
- *
- * clean & invalidate D / U line
- *
- * - start - virtual start address of region
- * - size - size in question
- */
-SYM_FUNC_START(__pi___dma_flush_area)
- add x1, x0, x1
- dcache_by_line_op civac, sy, x0, x1, x2, x3
- ret
-SYM_FUNC_END(__pi___dma_flush_area)
-SYM_FUNC_ALIAS(__dma_flush_area, __pi___dma_flush_area)
-
-/*
- * __dma_map_area(start, size, dir)
- * - start - kernel virtual start address
- * - size - size of region
- * - dir - DMA direction
- */
-SYM_FUNC_START(__pi___dma_map_area)
- add x1, x0, x1
- b __pi_dcache_clean_poc
-SYM_FUNC_END(__pi___dma_map_area)
-SYM_FUNC_ALIAS(__dma_map_area, __pi___dma_map_area)
-
-/*
- * __dma_unmap_area(start, size, dir)
- * - start - kernel virtual start address
- * - size - size of region
- * - dir - DMA direction
- */
-SYM_FUNC_START(__pi___dma_unmap_area)
- add x1, x0, x1
- cmp w2, #DMA_TO_DEVICE
- b.ne __pi_dcache_inval_poc
- ret
-SYM_FUNC_END(__pi___dma_unmap_area)
-SYM_FUNC_ALIAS(__dma_unmap_area, __pi___dma_unmap_area)
diff --git a/arch/arm64/mm/copypage.c b/arch/arm64/mm/copypage.c
index 0dea80bf6de4..24913271e898 100644
--- a/arch/arm64/mm/copypage.c
+++ b/arch/arm64/mm/copypage.c
@@ -23,15 +23,6 @@ void copy_highpage(struct page *to, struct page *from)
if (system_supports_mte() && test_bit(PG_mte_tagged, &from->flags)) {
set_bit(PG_mte_tagged, &to->flags);
- page_kasan_tag_reset(to);
- /*
- * We need smp_wmb() in between setting the flags and clearing the
- * tags because if another thread reads page->flags and builds a
- * tagged address out of it, there is an actual dependency to the
- * memory access, but on the current thread we do not guarantee that
- * the new page->flags are visible before the tags were updated.
- */
- smp_wmb();
mte_copy_page_tags(kto, kfrom);
}
}
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 6099c81b9322..599cf81f5685 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -14,20 +14,29 @@
#include <asm/xen/xen-ops.h>
void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
- enum dma_data_direction dir)
+ enum dma_data_direction dir)
{
- __dma_map_area(phys_to_virt(paddr), size, dir);
+ unsigned long start = (unsigned long)phys_to_virt(paddr);
+
+ dcache_clean_poc(start, start + size);
}
void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
- enum dma_data_direction dir)
+ enum dma_data_direction dir)
{
- __dma_unmap_area(phys_to_virt(paddr), size, dir);
+ unsigned long start = (unsigned long)phys_to_virt(paddr);
+
+ if (dir == DMA_TO_DEVICE)
+ return;
+
+ dcache_inval_poc(start, start + size);
}
void arch_dma_prep_coherent(struct page *page, size_t size)
{
- __dma_flush_area(page_address(page), size);
+ unsigned long start = (unsigned long)page_address(page);
+
+ dcache_clean_inval_poc(start, start + size);
}
#ifdef CONFIG_IOMMU_DMA
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index 489455309695..228d681a8715 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -16,13 +16,6 @@ get_ex_fixup(const struct exception_table_entry *ex)
return ((unsigned long)&ex->fixup + ex->fixup);
}
-static bool ex_handler_fixup(const struct exception_table_entry *ex,
- struct pt_regs *regs)
-{
- regs->pc = get_ex_fixup(ex);
- return true;
-}
-
static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex,
struct pt_regs *regs)
{
@@ -72,11 +65,10 @@ bool fixup_exception(struct pt_regs *regs)
return false;
switch (ex->type) {
- case EX_TYPE_FIXUP:
- return ex_handler_fixup(ex, regs);
case EX_TYPE_BPF:
return ex_handler_bpf(ex, regs);
case EX_TYPE_UACCESS_ERR_ZERO:
+ case EX_TYPE_KACCESS_ERR_ZERO:
return ex_handler_uaccess_err_zero(ex, regs);
case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
return ex_handler_load_unaligned_zeropad(ex, regs);
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index c5e11768e5c1..cdf3ffa0c223 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -927,6 +927,5 @@ struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
void tag_clear_highpage(struct page *page)
{
mte_zero_clear_page_tags(page_address(page));
- page_kasan_tag_reset(page);
set_bit(PG_mte_tagged, &page->flags);
}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 3618ef3f6d81..5307ffdefb8d 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -100,16 +100,6 @@ int pud_huge(pud_t pud)
#endif
}
-/*
- * Select all bits except the pfn
- */
-static inline pgprot_t pte_pgprot(pte_t pte)
-{
- unsigned long pfn = pte_pfn(pte);
-
- return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte));
-}
-
static int find_num_contig(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, size_t *pgsize)
{
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 339ee84e5a61..b6ef26fc8ebe 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -389,7 +389,7 @@ void __init arm64_memblock_init(void)
early_init_fdt_scan_reserved_mem();
- if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
+ if (!defer_reserve_crashkernel())
reserve_crashkernel();
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
@@ -438,7 +438,7 @@ void __init bootmem_init(void)
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
- if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
+ if (defer_reserve_crashkernel())
reserve_crashkernel();
memblock_dump_all();
diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c
index b21f91cd830d..c5af103d4ad4 100644
--- a/arch/arm64/mm/ioremap.c
+++ b/arch/arm64/mm/ioremap.c
@@ -1,96 +1,22 @@
// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Based on arch/arm/mm/ioremap.c
- *
- * (C) Copyright 1995 1996 Linus Torvalds
- * Hacked for ARM by Phil Blundell <philb@gnu.org>
- * Hacked to allow all architectures to build, and various cleanups
- * by Russell King
- * Copyright (C) 2012 ARM Ltd.
- */
-#include <linux/export.h>
#include <linux/mm.h>
-#include <linux/vmalloc.h>
#include <linux/io.h>
-#include <asm/fixmap.h>
-#include <asm/tlbflush.h>
-
-static void __iomem *__ioremap_caller(phys_addr_t phys_addr, size_t size,
- pgprot_t prot, void *caller)
+bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot)
{
- unsigned long last_addr;
- unsigned long offset = phys_addr & ~PAGE_MASK;
- int err;
- unsigned long addr;
- struct vm_struct *area;
+ unsigned long last_addr = phys_addr + size - 1;
- /*
- * Page align the mapping address and size, taking account of any
- * offset.
- */
- phys_addr &= PAGE_MASK;
- size = PAGE_ALIGN(size + offset);
+ /* Don't allow outside PHYS_MASK */
+ if (last_addr & ~PHYS_MASK)
+ return false;
- /*
- * Don't allow wraparound, zero size or outside PHYS_MASK.
- */
- last_addr = phys_addr + size - 1;
- if (!size || last_addr < phys_addr || (last_addr & ~PHYS_MASK))
- return NULL;
-
- /*
- * Don't allow RAM to be mapped.
- */
+ /* Don't allow RAM to be mapped. */
if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr))))
- return NULL;
-
- area = get_vm_area_caller(size, VM_IOREMAP, caller);
- if (!area)
- return NULL;
- addr = (unsigned long)area->addr;
- area->phys_addr = phys_addr;
-
- err = ioremap_page_range(addr, addr + size, phys_addr, prot);
- if (err) {
- vunmap((void *)addr);
- return NULL;
- }
-
- return (void __iomem *)(offset + addr);
-}
-
-void __iomem *__ioremap(phys_addr_t phys_addr, size_t size, pgprot_t prot)
-{
- return __ioremap_caller(phys_addr, size, prot,
- __builtin_return_address(0));
-}
-EXPORT_SYMBOL(__ioremap);
-
-void iounmap(volatile void __iomem *io_addr)
-{
- unsigned long addr = (unsigned long)io_addr & PAGE_MASK;
-
- /*
- * We could get an address outside vmalloc range in case
- * of ioremap_cache() reusing a RAM mapping.
- */
- if (is_vmalloc_addr((void *)addr))
- vunmap((void *)addr);
-}
-EXPORT_SYMBOL(iounmap);
-
-void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size)
-{
- /* For normal memory we already have a cacheable mapping. */
- if (pfn_is_map_memory(__phys_to_pfn(phys_addr)))
- return (void __iomem *)__phys_to_virt(phys_addr);
+ return false;
- return __ioremap_caller(phys_addr, size, __pgprot(PROT_NORMAL),
- __builtin_return_address(0));
+ return true;
}
-EXPORT_SYMBOL(ioremap_cache);
/*
* Must be called after early_fixmap_init
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index c12cd700598f..e969e68de005 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -236,7 +236,7 @@ static void __init kasan_init_shadow(void)
*/
memcpy(tmp_pg_dir, swapper_pg_dir, sizeof(tmp_pg_dir));
dsb(ishst);
- cpu_replace_ttbr1(lm_alias(tmp_pg_dir));
+ cpu_replace_ttbr1(lm_alias(tmp_pg_dir), idmap_pg_dir);
clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -280,7 +280,7 @@ static void __init kasan_init_shadow(void)
PAGE_KERNEL_RO));
memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE);
- cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+ cpu_replace_ttbr1(lm_alias(swapper_pg_dir), idmap_pg_dir);
}
static void __init kasan_init_depth(void)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 626ec32873c6..db7c4e6ae57b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -43,15 +43,27 @@
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
-u64 idmap_t0sz = TCR_T0SZ(VA_BITS_MIN);
-u64 idmap_ptrs_per_pgd = PTRS_PER_PGD;
+int idmap_t0sz __ro_after_init;
-u64 __section(".mmuoff.data.write") vabits_actual;
+#if VA_BITS > 48
+u64 vabits_actual __ro_after_init = VA_BITS_MIN;
EXPORT_SYMBOL(vabits_actual);
+#endif
+
+u64 kimage_vaddr __ro_after_init = (u64)&_text;
+EXPORT_SYMBOL(kimage_vaddr);
u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);
+u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };
+
+/*
+ * The booting CPU updates the failed status @__early_cpu_boot_status,
+ * with MMU turned off.
+ */
+long __section(".mmuoff.data.write") __early_cpu_boot_status;
+
/*
* Empty_zero_page is a special page that is used for zero-initialized data
* and COW.
@@ -388,6 +400,13 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
} while (pgdp++, addr = next, addr != end);
}
+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+extern __alias(__create_pgd_mapping)
+void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
+ phys_addr_t size, pgprot_t prot,
+ phys_addr_t (*pgtable_alloc)(int), int flags);
+#endif
+
static phys_addr_t __pgd_pgtable_alloc(int shift)
{
void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL);
@@ -529,8 +548,7 @@ static void __init map_mem(pgd_t *pgdp)
#ifdef CONFIG_KEXEC_CORE
if (crash_mem_map) {
- if (IS_ENABLED(CONFIG_ZONE_DMA) ||
- IS_ENABLED(CONFIG_ZONE_DMA32))
+ if (defer_reserve_crashkernel())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
else if (crashk_res.end)
memblock_mark_nomap(crashk_res.start,
@@ -571,8 +589,7 @@ static void __init map_mem(pgd_t *pgdp)
* through /sys/kernel/kexec_crash_size interface.
*/
#ifdef CONFIG_KEXEC_CORE
- if (crash_mem_map &&
- !IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32)) {
+ if (crash_mem_map && !defer_reserve_crashkernel()) {
if (crashk_res.end) {
__map_memblock(pgdp, crashk_res.start,
crashk_res.end + 1,
@@ -665,13 +682,9 @@ static int __init map_entry_trampoline(void)
__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
pa_start + i * PAGE_SIZE, prot);
- if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
- extern char __entry_tramp_data_start[];
-
- __set_fixmap(FIX_ENTRY_TRAMP_DATA,
- __pa_symbol(__entry_tramp_data_start),
- PAGE_KERNEL_RO);
- }
+ if (IS_ENABLED(CONFIG_RELOCATABLE))
+ __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
+ pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO);
return 0;
}
@@ -762,22 +775,57 @@ static void __init map_kernel(pgd_t *pgdp)
kasan_copy_shadow(pgdp);
}
+static void __init create_idmap(void)
+{
+ u64 start = __pa_symbol(__idmap_text_start);
+ u64 size = __pa_symbol(__idmap_text_end) - start;
+ pgd_t *pgd = idmap_pg_dir;
+ u64 pgd_phys;
+
+ /* check if we need an additional level of translation */
+ if (VA_BITS < 48 && idmap_t0sz < (64 - VA_BITS_MIN)) {
+ pgd_phys = early_pgtable_alloc(PAGE_SHIFT);
+ set_pgd(&idmap_pg_dir[start >> VA_BITS],
+ __pgd(pgd_phys | P4D_TYPE_TABLE));
+ pgd = __va(pgd_phys);
+ }
+ __create_pgd_mapping(pgd, start, start, size, PAGE_KERNEL_ROX,
+ early_pgtable_alloc, 0);
+
+ if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) {
+ extern u32 __idmap_kpti_flag;
+ u64 pa = __pa_symbol(&__idmap_kpti_flag);
+
+ /*
+ * The KPTI G-to-nG conversion code needs a read-write mapping
+ * of its synchronization flag in the ID map.
+ */
+ __create_pgd_mapping(pgd, pa, pa, sizeof(u32), PAGE_KERNEL,
+ early_pgtable_alloc, 0);
+ }
+}
+
void __init paging_init(void)
{
pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
+ extern pgd_t init_idmap_pg_dir[];
+
+ idmap_t0sz = 63UL - __fls(__pa_symbol(_end) | GENMASK(VA_BITS_MIN - 1, 0));
map_kernel(pgdp);
map_mem(pgdp);
pgd_clear_fixmap();
- cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
+ cpu_replace_ttbr1(lm_alias(swapper_pg_dir), init_idmap_pg_dir);
init_mm.pgd = swapper_pg_dir;
memblock_phys_free(__pa_symbol(init_pg_dir),
__pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
memblock_allow_resize();
+
+ create_idmap();
}
/*
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
index a9e50e930484..4334dec93bd4 100644
--- a/arch/arm64/mm/mteswap.c
+++ b/arch/arm64/mm/mteswap.c
@@ -53,15 +53,6 @@ bool mte_restore_tags(swp_entry_t entry, struct page *page)
if (!tags)
return false;
- page_kasan_tag_reset(page);
- /*
- * We need smp_wmb() in between setting the flags and clearing the
- * tags because if another thread reads page->flags and builds a
- * tagged address out of it, there is an actual dependency to the
- * memory access, but on the current thread we do not guarantee that
- * the new page->flags are visible before the tags were updated.
- */
- smp_wmb();
mte_restore_page_tags(page_address(page), tags);
return true;
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index 50bbed947bec..7837a69524c5 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -14,6 +14,7 @@
#include <asm/asm-offsets.h>
#include <asm/asm_pointer_auth.h>
#include <asm/hwcap.h>
+#include <asm/kernel-pgtable.h>
#include <asm/pgtable-hwdef.h>
#include <asm/cpufeature.h>
#include <asm/alternative.h>
@@ -200,34 +201,64 @@ SYM_FUNC_END(idmap_cpu_replace_ttbr1)
.popsection
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+
+#define KPTI_NG_PTE_FLAGS (PTE_ATTRINDX(MT_NORMAL) | SWAPPER_PTE_FLAGS)
+
.pushsection ".idmap.text", "awx"
- .macro __idmap_kpti_get_pgtable_ent, type
- dc cvac, cur_\()\type\()p // Ensure any existing dirty
- dmb sy // lines are written back before
- ldr \type, [cur_\()\type\()p] // loading the entry
- tbz \type, #0, skip_\()\type // Skip invalid and
- tbnz \type, #11, skip_\()\type // non-global entries
+ .macro kpti_mk_tbl_ng, type, num_entries
+ add end_\type\()p, cur_\type\()p, #\num_entries * 8
+.Ldo_\type:
+ ldr \type, [cur_\type\()p] // Load the entry
+ tbz \type, #0, .Lnext_\type // Skip invalid and
+ tbnz \type, #11, .Lnext_\type // non-global entries
+ orr \type, \type, #PTE_NG // Same bit for blocks and pages
+ str \type, [cur_\type\()p] // Update the entry
+ .ifnc \type, pte
+ tbnz \type, #1, .Lderef_\type
+ .endif
+.Lnext_\type:
+ add cur_\type\()p, cur_\type\()p, #8
+ cmp cur_\type\()p, end_\type\()p
+ b.ne .Ldo_\type
.endm
- .macro __idmap_kpti_put_pgtable_ent_ng, type
- orr \type, \type, #PTE_NG // Same bit for blocks and pages
- str \type, [cur_\()\type\()p] // Update the entry and ensure
- dmb sy // that it is visible to all
- dc civac, cur_\()\type\()p // CPUs.
+ /*
+ * Dereference the current table entry and map it into the temporary
+ * fixmap slot associated with the current level.
+ */
+ .macro kpti_map_pgtbl, type, level
+ str xzr, [temp_pte, #8 * (\level + 1)] // break before make
+ dsb nshst
+ add pte, temp_pte, #PAGE_SIZE * (\level + 1)
+ lsr pte, pte, #12
+ tlbi vaae1, pte
+ dsb nsh
+ isb
+
+ phys_to_pte pte, cur_\type\()p
+ add cur_\type\()p, temp_pte, #PAGE_SIZE * (\level + 1)
+ orr pte, pte, pte_flags
+ str pte, [temp_pte, #8 * (\level + 1)]
+ dsb nshst
.endm
/*
- * void __kpti_install_ng_mappings(int cpu, int num_cpus, phys_addr_t swapper)
+ * void __kpti_install_ng_mappings(int cpu, int num_secondaries, phys_addr_t temp_pgd,
+ * unsigned long temp_pte_va)
*
* Called exactly once from stop_machine context by each CPU found during boot.
*/
-__idmap_kpti_flag:
- .long 1
+ .pushsection ".data", "aw", %progbits
+SYM_DATA(__idmap_kpti_flag, .long 1)
+ .popsection
+
SYM_FUNC_START(idmap_kpti_install_ng_mappings)
cpu .req w0
+ temp_pte .req x0
num_cpus .req w1
- swapper_pa .req x2
+ pte_flags .req x1
+ temp_pgd_phys .req x2
swapper_ttb .req x3
flag_ptr .req x4
cur_pgdp .req x5
@@ -235,17 +266,16 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
pgd .req x7
cur_pudp .req x8
end_pudp .req x9
- pud .req x10
cur_pmdp .req x11
end_pmdp .req x12
- pmd .req x13
cur_ptep .req x14
end_ptep .req x15
pte .req x16
+ valid .req x17
+ mov x5, x3 // preserve temp_pte arg
mrs swapper_ttb, ttbr1_el1
- restore_ttbr1 swapper_ttb
- adr flag_ptr, __idmap_kpti_flag
+ adr_l flag_ptr, __idmap_kpti_flag
cbnz cpu, __idmap_kpti_secondary
@@ -256,98 +286,71 @@ SYM_FUNC_START(idmap_kpti_install_ng_mappings)
eor w17, w17, num_cpus
cbnz w17, 1b
- /* We need to walk swapper, so turn off the MMU. */
- pre_disable_mmu_workaround
- mrs x17, sctlr_el1
- bic x17, x17, #SCTLR_ELx_M
- msr sctlr_el1, x17
+ /* Switch to the temporary page tables on this CPU only */
+ __idmap_cpu_set_reserved_ttbr1 x8, x9
+ offset_ttbr1 temp_pgd_phys, x8
+ msr ttbr1_el1, temp_pgd_phys
isb
+ mov temp_pte, x5
+ mov pte_flags, #KPTI_NG_PTE_FLAGS
+
/* Everybody is enjoying the idmap, so we can rewrite swapper. */
/* PGD */
- mov cur_pgdp, swapper_pa
- add end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8)
-do_pgd: __idmap_kpti_get_pgtable_ent pgd
- tbnz pgd, #1, walk_puds
-next_pgd:
- __idmap_kpti_put_pgtable_ent_ng pgd
-skip_pgd:
- add cur_pgdp, cur_pgdp, #8
- cmp cur_pgdp, end_pgdp
- b.ne do_pgd
-
- /* Publish the updated tables and nuke all the TLBs */
- dsb sy
- tlbi vmalle1is
- dsb ish
- isb
+ adrp cur_pgdp, swapper_pg_dir
+ kpti_map_pgtbl pgd, 0
+ kpti_mk_tbl_ng pgd, PTRS_PER_PGD
- /* We're done: fire up the MMU again */
- mrs x17, sctlr_el1
- orr x17, x17, #SCTLR_ELx_M
- set_sctlr_el1 x17
+ /* Ensure all the updated entries are visible to secondary CPUs */
+ dsb ishst
+
+ /* We're done: fire up swapper_pg_dir again */
+ __idmap_cpu_set_reserved_ttbr1 x8, x9
+ msr ttbr1_el1, swapper_ttb
+ isb
/* Set the flag to zero to indicate that we're all done */
str wzr, [flag_ptr]
ret
+.Lderef_pgd:
/* PUD */
-walk_puds:
- .if CONFIG_PGTABLE_LEVELS > 3
+ .if CONFIG_PGTABLE_LEVELS > 3
+ pud .req x10
pte_to_phys cur_pudp, pgd
- add end_pudp, cur_pudp, #(PTRS_PER_PUD * 8)
-do_pud: __idmap_kpti_get_pgtable_ent pud
- tbnz pud, #1, walk_pmds
-next_pud:
- __idmap_kpti_put_pgtable_ent_ng pud
-skip_pud:
- add cur_pudp, cur_pudp, 8
- cmp cur_pudp, end_pudp
- b.ne do_pud
- b next_pgd
- .else /* CONFIG_PGTABLE_LEVELS <= 3 */
- mov pud, pgd
- b walk_pmds
-next_pud:
- b next_pgd
+ kpti_map_pgtbl pud, 1
+ kpti_mk_tbl_ng pud, PTRS_PER_PUD
+ b .Lnext_pgd
+ .else /* CONFIG_PGTABLE_LEVELS <= 3 */
+ pud .req pgd
+ .set .Lnext_pud, .Lnext_pgd
.endif
+.Lderef_pud:
/* PMD */
-walk_pmds:
- .if CONFIG_PGTABLE_LEVELS > 2
+ .if CONFIG_PGTABLE_LEVELS > 2
+ pmd .req x13
pte_to_phys cur_pmdp, pud
- add end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8)
-do_pmd: __idmap_kpti_get_pgtable_ent pmd
- tbnz pmd, #1, walk_ptes
-next_pmd:
- __idmap_kpti_put_pgtable_ent_ng pmd
-skip_pmd:
- add cur_pmdp, cur_pmdp, #8
- cmp cur_pmdp, end_pmdp
- b.ne do_pmd
- b next_pud
- .else /* CONFIG_PGTABLE_LEVELS <= 2 */
- mov pmd, pud
- b walk_ptes
-next_pmd:
- b next_pud
+ kpti_map_pgtbl pmd, 2
+ kpti_mk_tbl_ng pmd, PTRS_PER_PMD
+ b .Lnext_pud
+ .else /* CONFIG_PGTABLE_LEVELS <= 2 */
+ pmd .req pgd
+ .set .Lnext_pmd, .Lnext_pgd
.endif
+.Lderef_pmd:
/* PTE */
-walk_ptes:
pte_to_phys cur_ptep, pmd
- add end_ptep, cur_ptep, #(PTRS_PER_PTE * 8)
-do_pte: __idmap_kpti_get_pgtable_ent pte
- __idmap_kpti_put_pgtable_ent_ng pte
-skip_pte:
- add cur_ptep, cur_ptep, #8
- cmp cur_ptep, end_ptep
- b.ne do_pte
- b next_pmd
+ kpti_map_pgtbl pte, 3
+ kpti_mk_tbl_ng pte, PTRS_PER_PTE
+ b .Lnext_pmd
.unreq cpu
+ .unreq temp_pte
.unreq num_cpus
- .unreq swapper_pa
+ .unreq pte_flags
+ .unreq temp_pgd_phys
.unreq cur_pgdp
.unreq end_pgdp
.unreq pgd
@@ -360,6 +363,7 @@ skip_pte:
.unreq cur_ptep
.unreq end_ptep
.unreq pte
+ .unreq valid
/* Secondary CPUs end up here */
__idmap_kpti_secondary:
@@ -379,7 +383,6 @@ __idmap_kpti_secondary:
cbnz w16, 1b
/* All done, act like nothing happened */
- offset_ttbr1 swapper_ttb, x16
msr ttbr1_el1, swapper_ttb
isb
ret
@@ -395,6 +398,8 @@ SYM_FUNC_END(idmap_kpti_install_ng_mappings)
*
* Initialise the processor for turning the MMU on.
*
+ * Input:
+ * x0 - actual number of VA bits (ignored unless VA_BITS > 48)
* Output:
* Return in x0 the value of the SCTLR_EL1 register.
*/
@@ -464,12 +469,11 @@ SYM_FUNC_START(__cpu_setup)
tcr_clear_errata_bits tcr, x9, x5
#ifdef CONFIG_ARM64_VA_BITS_52
- ldr_l x9, vabits_actual
- sub x9, xzr, x9
+ sub x9, xzr, x0
add x9, x9, #64
tcr_set_t1sz tcr, x9
#else
- ldr_l x9, idmap_t0sz
+ idmap_get_t0sz x9
#endif
tcr_set_t0sz tcr, x9
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 507b20373953..779653771507 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -36,6 +36,7 @@ HAS_RNG
HAS_SB
HAS_STAGE2_FWB
HAS_SYSREG_GIC_CPUIF
+HAS_TIDCP1
HAS_TLB_RANGE
HAS_VIRT_HOST_EXTN
HAS_WFXT
@@ -61,6 +62,7 @@ WORKAROUND_1418040
WORKAROUND_1463225
WORKAROUND_1508412
WORKAROUND_1542419
+WORKAROUND_1742098
WORKAROUND_1902691
WORKAROUND_2038923
WORKAROUND_2064142
diff --git a/arch/arm64/tools/gen-sysreg.awk b/arch/arm64/tools/gen-sysreg.awk
index 5c55509eb43f..db461921d256 100755
--- a/arch/arm64/tools/gen-sysreg.awk
+++ b/arch/arm64/tools/gen-sysreg.awk
@@ -88,7 +88,7 @@ END {
# skip blank lines and comment lines
/^$/ { next }
-/^#/ { next }
+/^[\t ]*#/ { next }
/^SysregFields/ {
change_block("SysregFields", "None", "SysregFields")
diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg
index ff5e552f7420..9ae483ec1e56 100644
--- a/arch/arm64/tools/sysreg
+++ b/arch/arm64/tools/sysreg
@@ -46,6 +46,89 @@
# feature that introduces them (eg, FEAT_LS64_ACCDATA introduces enumeration
# item ACCDATA) though it may be more taseful to do something else.
+Sysreg ID_AA64ZFR0_EL1 3 0 0 4 4
+Res0 63:60
+Enum 59:56 F64MM
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 55:52 F32MM
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Res0 51:48
+Enum 47:44 I8MM
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 43:40 SM4
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Res0 39:36
+Enum 35:32 SHA3
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Res0 31:24
+Enum 23:20 BF16
+ 0b0000 NI
+ 0b0001 IMP
+ 0b0010 EBF16
+EndEnum
+Enum 19:16 BitPerm
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Res0 15:8
+Enum 7:4 AES
+ 0b0000 NI
+ 0b0001 IMP
+ 0b0010 PMULL128
+EndEnum
+Enum 3:0 SVEver
+ 0b0000 IMP
+ 0b0001 SVE2
+EndEnum
+EndSysreg
+
+Sysreg ID_AA64SMFR0_EL1 3 0 0 4 5
+Enum 63 FA64
+ 0b0 NI
+ 0b1 IMP
+EndEnum
+Res0 62:60
+Field 59:56 SMEver
+Enum 55:52 I16I64
+ 0b0000 NI
+ 0b1111 IMP
+EndEnum
+Res0 51:49
+Enum 48 F64F64
+ 0b0 NI
+ 0b1 IMP
+EndEnum
+Res0 47:40
+Enum 39:36 I8I32
+ 0b0000 NI
+ 0b1111 IMP
+EndEnum
+Enum 35 F16F32
+ 0b0 NI
+ 0b1 IMP
+EndEnum
+Enum 34 B16F32
+ 0b0 NI
+ 0b1 IMP
+EndEnum
+Res0 33
+Enum 32 F32F32
+ 0b0 NI
+ 0b1 IMP
+EndEnum
+Res0 31:0
+EndSysreg
+
Sysreg ID_AA64ISAR0_EL1 3 0 0 6 0
Enum 63:60 RNDR
0b0000 NI
@@ -114,6 +197,122 @@ EndEnum
Res0 3:0
EndSysreg
+Sysreg ID_AA64ISAR1_EL1 3 0 0 6 1
+Enum 63:60 LS64
+ 0b0000 NI
+ 0b0001 LS64
+ 0b0010 LS64_V
+ 0b0011 LS64_ACCDATA
+EndEnum
+Enum 59:56 XS
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 55:52 I8MM
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 51:48 DGH
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 47:44 BF16
+ 0b0000 NI
+ 0b0001 IMP
+ 0b0010 EBF16
+EndEnum
+Enum 43:40 SPECRES
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 39:36 SB
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 35:32 FRINTTS
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 31:28 GPI
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 27:24 GPA
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 23:20 LRCPC
+ 0b0000 NI
+ 0b0001 IMP
+ 0b0010 LRCPC2
+EndEnum
+Enum 19:16 FCMA
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 15:12 JSCVT
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 11:8 API
+ 0b0000 NI
+ 0b0001 PAuth
+ 0b0010 EPAC
+ 0b0011 PAuth2
+ 0b0100 FPAC
+ 0b0101 FPACCOMBINE
+EndEnum
+Enum 7:4 APA
+ 0b0000 NI
+ 0b0001 PAuth
+ 0b0010 EPAC
+ 0b0011 PAuth2
+ 0b0100 FPAC
+ 0b0101 FPACCOMBINE
+EndEnum
+Enum 3:0 DPB
+ 0b0000 NI
+ 0b0001 IMP
+ 0b0010 DPB2
+EndEnum
+EndSysreg
+
+Sysreg ID_AA64ISAR2_EL1 3 0 0 6 2
+Res0 63:28
+Enum 27:24 PAC_frac
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 23:20 BC
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 19:16 MOPS
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 15:12 APA3
+ 0b0000 NI
+ 0b0001 PAuth
+ 0b0010 EPAC
+ 0b0011 PAuth2
+ 0b0100 FPAC
+ 0b0101 FPACCOMBINE
+EndEnum
+Enum 11:8 GPA3
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 7:4 RPRES
+ 0b0000 NI
+ 0b0001 IMP
+EndEnum
+Enum 3:0 WFxT
+ 0b0000 NI
+ 0b0010 IMP
+EndEnum
+EndSysreg
+
Sysreg SCTLR_EL1 3 0 1 0 0
Field 63 TIDCP
Field 62 SPINMASK
@@ -257,6 +456,11 @@ Field 5:3 Ctype2
Field 2:0 Ctype1
EndSysreg
+Sysreg GMID_EL1 3 1 0 0 4
+Res0 63:4
+Field 3:0 BS
+EndSysreg
+
Sysreg SMIDR_EL1 3 1 0 0 6
Res0 63:32
Field 31:24 IMPLEMENTER
@@ -273,6 +477,33 @@ Field 3:1 Level
Field 0 InD
EndSysreg
+Sysreg CTR_EL0 3 3 0 0 1
+Res0 63:38
+Field 37:32 TminLine
+Res1 31
+Res0 30
+Field 29 DIC
+Field 28 IDC
+Field 27:24 CWG
+Field 23:20 ERG
+Field 19:16 DminLine
+Enum 15:14 L1Ip
+ 0b00 VPIPT
+ # This is named as AIVIVT in the ARM but documented as reserved
+ 0b01 RESERVED
+ 0b10 VIPT
+ 0b11 PIPT
+EndEnum
+Res0 13:4
+Field 3:0 IminLine
+EndSysreg
+
+Sysreg DCZID_EL0 3 3 0 0 7
+Res0 63:5
+Field 4 DZP
+Field 3:0 BS
+EndSysreg
+
Sysreg SVCR 3 3 4 2 2
Res0 63:2
Field 1 ZA
@@ -367,3 +598,36 @@ EndSysreg
Sysreg TTBR1_EL1 3 0 2 0 1
Fields TTBRx_EL1
EndSysreg
+
+Sysreg LORSA_EL1 3 0 10 4 0
+Res0 63:52
+Field 51:16 SA
+Res0 15:1
+Field 0 Valid
+EndSysreg
+
+Sysreg LOREA_EL1 3 0 10 4 1
+Res0 63:52
+Field 51:48 EA_51_48
+Field 47:16 EA_47_16
+Res0 15:0
+EndSysreg
+
+Sysreg LORN_EL1 3 0 10 4 2
+Res0 63:8
+Field 7:0 Num
+EndSysreg
+
+Sysreg LORC_EL1 3 0 10 4 3
+Res0 63:10
+Field 9:2 DS
+Res0 1
+Field 0 EN
+EndSysreg
+
+Sysreg LORID_EL1 3 0 10 4 7
+Res0 63:24
+Field 23:16 LD
+Res0 15:8
+Field 7:0 LR
+EndSysreg
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index b57daee98b89..62b5b07fa4e1 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -69,7 +69,6 @@ config LOONGARCH
select GENERIC_TIME_VSYSCALL
select GPIOLIB
select HAVE_ARCH_AUDITSYSCALL
- select HAVE_ARCH_COMPILER_H
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
diff --git a/arch/loongarch/include/asm/asmmacro.h b/arch/loongarch/include/asm/asmmacro.h
index a1a04083bd67..be037a40580d 100644
--- a/arch/loongarch/include/asm/asmmacro.h
+++ b/arch/loongarch/include/asm/asmmacro.h
@@ -274,16 +274,4 @@
nor \dst, \src, zero
.endm
-.macro bgt r0 r1 label
- blt \r1, \r0, \label
-.endm
-
-.macro bltz r0 label
- blt \r0, zero, \label
-.endm
-
-.macro bgez r0 label
- bge \r0, zero, \label
-.endm
-
#endif /* _ASM_ASMMACRO_H */
diff --git a/arch/loongarch/include/asm/atomic.h b/arch/loongarch/include/asm/atomic.h
index 979367ad4e2c..6b9aca9ab6e9 100644
--- a/arch/loongarch/include/asm/atomic.h
+++ b/arch/loongarch/include/asm/atomic.h
@@ -10,7 +10,6 @@
#include <linux/types.h>
#include <asm/barrier.h>
#include <asm/cmpxchg.h>
-#include <asm/compiler.h>
#if __SIZEOF_LONG__ == 4
#define __LL "ll.w "
@@ -157,27 +156,25 @@ static inline int arch_atomic_sub_if_positive(int i, atomic_t *v)
__asm__ __volatile__(
"1: ll.w %1, %2 # atomic_sub_if_positive\n"
" addi.w %0, %1, %3 \n"
- " or %1, %0, $zero \n"
- " blt %0, $zero, 2f \n"
+ " move %1, %0 \n"
+ " bltz %0, 2f \n"
" sc.w %1, %2 \n"
- " beq $zero, %1, 1b \n"
+ " beqz %1, 1b \n"
"2: \n"
__WEAK_LLSC_MB
- : "=&r" (result), "=&r" (temp),
- "+" GCC_OFF_SMALL_ASM() (v->counter)
+ : "=&r" (result), "=&r" (temp), "+ZC" (v->counter)
: "I" (-i));
} else {
__asm__ __volatile__(
"1: ll.w %1, %2 # atomic_sub_if_positive\n"
" sub.w %0, %1, %3 \n"
- " or %1, %0, $zero \n"
- " blt %0, $zero, 2f \n"
+ " move %1, %0 \n"
+ " bltz %0, 2f \n"
" sc.w %1, %2 \n"
- " beq $zero, %1, 1b \n"
+ " beqz %1, 1b \n"
"2: \n"
__WEAK_LLSC_MB
- : "=&r" (result), "=&r" (temp),
- "+" GCC_OFF_SMALL_ASM() (v->counter)
+ : "=&r" (result), "=&r" (temp), "+ZC" (v->counter)
: "r" (i));
}
@@ -320,27 +317,25 @@ static inline long arch_atomic64_sub_if_positive(long i, atomic64_t *v)
__asm__ __volatile__(
"1: ll.d %1, %2 # atomic64_sub_if_positive \n"
" addi.d %0, %1, %3 \n"
- " or %1, %0, $zero \n"
- " blt %0, $zero, 2f \n"
+ " move %1, %0 \n"
+ " bltz %0, 2f \n"
" sc.d %1, %2 \n"
- " beq %1, $zero, 1b \n"
+ " beqz %1, 1b \n"
"2: \n"
__WEAK_LLSC_MB
- : "=&r" (result), "=&r" (temp),
- "+" GCC_OFF_SMALL_ASM() (v->counter)
+ : "=&r" (result), "=&r" (temp), "+ZC" (v->counter)
: "I" (-i));
} else {
__asm__ __volatile__(
"1: ll.d %1, %2 # atomic64_sub_if_positive \n"
" sub.d %0, %1, %3 \n"
- " or %1, %0, $zero \n"
- " blt %0, $zero, 2f \n"
+ " move %1, %0 \n"
+ " bltz %0, 2f \n"
" sc.d %1, %2 \n"
- " beq %1, $zero, 1b \n"
+ " beqz %1, 1b \n"
"2: \n"
__WEAK_LLSC_MB
- : "=&r" (result), "=&r" (temp),
- "+" GCC_OFF_SMALL_ASM() (v->counter)
+ : "=&r" (result), "=&r" (temp), "+ZC" (v->counter)
: "r" (i));
}
diff --git a/arch/loongarch/include/asm/barrier.h b/arch/loongarch/include/asm/barrier.h
index b6517eeeb141..cda977675854 100644
--- a/arch/loongarch/include/asm/barrier.h
+++ b/arch/loongarch/include/asm/barrier.h
@@ -48,9 +48,9 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
__asm__ __volatile__(
"sltu %0, %1, %2\n\t"
#if (__SIZEOF_LONG__ == 4)
- "sub.w %0, $r0, %0\n\t"
+ "sub.w %0, $zero, %0\n\t"
#elif (__SIZEOF_LONG__ == 8)
- "sub.d %0, $r0, %0\n\t"
+ "sub.d %0, $zero, %0\n\t"
#endif
: "=r" (mask)
: "r" (index), "r" (size)
diff --git a/arch/loongarch/include/asm/cmpxchg.h b/arch/loongarch/include/asm/cmpxchg.h
index 75b3a4478652..0a9b0fac1eee 100644
--- a/arch/loongarch/include/asm/cmpxchg.h
+++ b/arch/loongarch/include/asm/cmpxchg.h
@@ -55,9 +55,9 @@ static inline unsigned long __xchg(volatile void *ptr, unsigned long x,
__asm__ __volatile__( \
"1: " ld " %0, %2 # __cmpxchg_asm \n" \
" bne %0, %z3, 2f \n" \
- " or $t0, %z4, $zero \n" \
+ " move $t0, %z4 \n" \
" " st " $t0, %1 \n" \
- " beq $zero, $t0, 1b \n" \
+ " beqz $t0, 1b \n" \
"2: \n" \
__WEAK_LLSC_MB \
: "=&r" (__ret), "=ZB"(*m) \
diff --git a/arch/loongarch/include/asm/compiler.h b/arch/loongarch/include/asm/compiler.h
deleted file mode 100644
index 657cebe70ace..000000000000
--- a/arch/loongarch/include/asm/compiler.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
- */
-#ifndef _ASM_COMPILER_H
-#define _ASM_COMPILER_H
-
-#define GCC_OFF_SMALL_ASM() "ZC"
-
-#define LOONGARCH_ISA_LEVEL "loongarch"
-#define LOONGARCH_ISA_ARCH_LEVEL "arch=loongarch"
-#define LOONGARCH_ISA_LEVEL_RAW loongarch
-#define LOONGARCH_ISA_ARCH_LEVEL_RAW LOONGARCH_ISA_LEVEL_RAW
-
-#endif /* _ASM_COMPILER_H */
diff --git a/arch/loongarch/include/asm/elf.h b/arch/loongarch/include/asm/elf.h
index f3960b18a90e..5f3ff4781fda 100644
--- a/arch/loongarch/include/asm/elf.h
+++ b/arch/loongarch/include/asm/elf.h
@@ -288,8 +288,6 @@ struct arch_elf_state {
.interp_fp_abi = LOONGARCH_ABI_FP_ANY, \
}
-#define elf_read_implies_exec(ex, exec_stk) (exec_stk == EXSTACK_DEFAULT)
-
extern int arch_elf_pt_proc(void *ehdr, void *phdr, struct file *elf,
bool is_interp, struct arch_elf_state *state);
diff --git a/arch/loongarch/include/asm/futex.h b/arch/loongarch/include/asm/futex.h
index 9de8231694ec..feb6658c84ff 100644
--- a/arch/loongarch/include/asm/futex.h
+++ b/arch/loongarch/include/asm/futex.h
@@ -8,7 +8,6 @@
#include <linux/futex.h>
#include <linux/uaccess.h>
#include <asm/barrier.h>
-#include <asm/compiler.h>
#include <asm/errno.h>
#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
@@ -17,7 +16,7 @@
"1: ll.w %1, %4 # __futex_atomic_op\n" \
" " insn " \n" \
"2: sc.w $t0, %2 \n" \
- " beq $t0, $zero, 1b \n" \
+ " beqz $t0, 1b \n" \
"3: \n" \
" .section .fixup,\"ax\" \n" \
"4: li.w %0, %6 \n" \
@@ -82,9 +81,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newv
"# futex_atomic_cmpxchg_inatomic \n"
"1: ll.w %1, %3 \n"
" bne %1, %z4, 3f \n"
- " or $t0, %z5, $zero \n"
+ " move $t0, %z5 \n"
"2: sc.w $t0, %2 \n"
- " beq $zero, $t0, 1b \n"
+ " beqz $t0, 1b \n"
"3: \n"
__WEAK_LLSC_MB
" .section .fixup,\"ax\" \n"
@@ -95,8 +94,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newv
" "__UA_ADDR "\t1b, 4b \n"
" "__UA_ADDR "\t2b, 4b \n"
" .previous \n"
- : "+r" (ret), "=&r" (val), "=" GCC_OFF_SMALL_ASM() (*uaddr)
- : GCC_OFF_SMALL_ASM() (*uaddr), "Jr" (oldval), "Jr" (newval),
+ : "+r" (ret), "=&r" (val), "=ZC" (*uaddr)
+ : "ZC" (*uaddr), "Jr" (oldval), "Jr" (newval),
"i" (-EFAULT)
: "memory", "t0");
diff --git a/arch/loongarch/include/asm/irqflags.h b/arch/loongarch/include/asm/irqflags.h
index 52121cd791fe..319a8c616f1f 100644
--- a/arch/loongarch/include/asm/irqflags.h
+++ b/arch/loongarch/include/asm/irqflags.h
@@ -9,7 +9,6 @@
#include <linux/compiler.h>
#include <linux/stringify.h>
-#include <asm/compiler.h>
#include <asm/loongarch.h>
static inline void arch_local_irq_enable(void)
diff --git a/arch/loongarch/include/asm/local.h b/arch/loongarch/include/asm/local.h
index 2052a2267337..65fbbae9fc4d 100644
--- a/arch/loongarch/include/asm/local.h
+++ b/arch/loongarch/include/asm/local.h
@@ -9,7 +9,6 @@
#include <linux/bitops.h>
#include <linux/atomic.h>
#include <asm/cmpxchg.h>
-#include <asm/compiler.h>
typedef struct {
atomic_long_t a;
diff --git a/arch/loongarch/include/asm/loongson.h b/arch/loongarch/include/asm/loongson.h
index 6a8038725ba7..6e8f6972ceb6 100644
--- a/arch/loongarch/include/asm/loongson.h
+++ b/arch/loongarch/include/asm/loongson.h
@@ -39,18 +39,6 @@ extern const struct plat_smp_ops loongson3_smp_ops;
#define MAX_PACKAGES 16
-/* Chip Config register of each physical cpu package */
-extern u64 loongson_chipcfg[MAX_PACKAGES];
-#define LOONGSON_CHIPCFG(id) (*(volatile u32 *)(loongson_chipcfg[id]))
-
-/* Chip Temperature register of each physical cpu package */
-extern u64 loongson_chiptemp[MAX_PACKAGES];
-#define LOONGSON_CHIPTEMP(id) (*(volatile u32 *)(loongson_chiptemp[id]))
-
-/* Freq Control register of each physical cpu package */
-extern u64 loongson_freqctrl[MAX_PACKAGES];
-#define LOONGSON_FREQCTRL(id) (*(volatile u32 *)(loongson_freqctrl[id]))
-
#define xconf_readl(addr) readl(addr)
#define xconf_readq(addr) readq(addr)
@@ -58,7 +46,7 @@ static inline void xconf_writel(u32 val, volatile void __iomem *addr)
{
asm volatile (
" st.w %[v], %[hw], 0 \n"
- " ld.b $r0, %[hw], 0 \n"
+ " ld.b $zero, %[hw], 0 \n"
:
: [hw] "r" (addr), [v] "r" (val)
);
@@ -68,7 +56,7 @@ static inline void xconf_writeq(u64 val64, volatile void __iomem *addr)
{
asm volatile (
" st.d %[v], %[hw], 0 \n"
- " ld.b $r0, %[hw], 0 \n"
+ " ld.b $zero, %[hw], 0 \n"
:
: [hw] "r" (addr), [v] "r" (val64)
);
diff --git a/arch/loongarch/include/asm/stacktrace.h b/arch/loongarch/include/asm/stacktrace.h
index 26483e396ad1..6b5c2a7aa706 100644
--- a/arch/loongarch/include/asm/stacktrace.h
+++ b/arch/loongarch/include/asm/stacktrace.h
@@ -23,13 +23,13 @@
static __always_inline void prepare_frametrace(struct pt_regs *regs)
{
__asm__ __volatile__(
- /* Save $r1 */
+ /* Save $ra */
STORE_ONE_REG(1)
- /* Use $r1 to save PC */
- "pcaddi $r1, 0\n\t"
- STR_LONG_S " $r1, %0\n\t"
- /* Restore $r1 */
- STR_LONG_L " $r1, %1, "STR_LONGSIZE"\n\t"
+ /* Use $ra to save PC */
+ "pcaddi $ra, 0\n\t"
+ STR_LONG_S " $ra, %0\n\t"
+ /* Restore $ra */
+ STR_LONG_L " $ra, %1, "STR_LONGSIZE"\n\t"
STORE_ONE_REG(2)
STORE_ONE_REG(3)
STORE_ONE_REG(4)
diff --git a/arch/loongarch/include/asm/thread_info.h b/arch/loongarch/include/asm/thread_info.h
index 99beb11c2fa8..b7dd9f19a5a9 100644
--- a/arch/loongarch/include/asm/thread_info.h
+++ b/arch/loongarch/include/asm/thread_info.h
@@ -44,14 +44,14 @@ struct thread_info {
}
/* How to get the thread information struct from C. */
-register struct thread_info *__current_thread_info __asm__("$r2");
+register struct thread_info *__current_thread_info __asm__("$tp");
static inline struct thread_info *current_thread_info(void)
{
return __current_thread_info;
}
-register unsigned long current_stack_pointer __asm__("$r3");
+register unsigned long current_stack_pointer __asm__("$sp");
#endif /* !__ASSEMBLY__ */
diff --git a/arch/loongarch/include/asm/uaccess.h b/arch/loongarch/include/asm/uaccess.h
index 217c6a3727b1..2b44edc604a2 100644
--- a/arch/loongarch/include/asm/uaccess.h
+++ b/arch/loongarch/include/asm/uaccess.h
@@ -162,7 +162,7 @@ do { \
"2: \n" \
" .section .fixup,\"ax\" \n" \
"3: li.w %0, %3 \n" \
- " or %1, $r0, $r0 \n" \
+ " move %1, $zero \n" \
" b 2b \n" \
" .previous \n" \
" .section __ex_table,\"a\" \n" \
diff --git a/arch/loongarch/kernel/cacheinfo.c b/arch/loongarch/kernel/cacheinfo.c
index b38f5489d094..4662b06269f4 100644
--- a/arch/loongarch/kernel/cacheinfo.c
+++ b/arch/loongarch/kernel/cacheinfo.c
@@ -4,8 +4,9 @@
*
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
*/
-#include <asm/cpu-info.h>
#include <linux/cacheinfo.h>
+#include <asm/bootinfo.h>
+#include <asm/cpu-info.h>
/* Populates leaf and increments to next leaf */
#define populate_cache(cache, leaf, c_level, c_type) \
@@ -17,6 +18,8 @@ do { \
leaf->ways_of_associativity = c->cache.ways; \
leaf->size = c->cache.linesz * c->cache.sets * \
c->cache.ways; \
+ if (leaf->level > 2) \
+ leaf->size *= nodes_per_package; \
leaf++; \
} while (0)
@@ -95,11 +98,15 @@ static void cache_cpumap_setup(unsigned int cpu)
int populate_cache_leaves(unsigned int cpu)
{
- int level = 1;
+ int level = 1, nodes_per_package = 1;
struct cpuinfo_loongarch *c = &current_cpu_data;
struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+ if (loongson_sysconf.nr_nodes > 1)
+ nodes_per_package = loongson_sysconf.cores_per_package
+ / loongson_sysconf.cores_per_node;
+
if (c->icache.waysize) {
populate_cache(dcache, this_leaf, level, CACHE_TYPE_DATA);
populate_cache(icache, this_leaf, level++, CACHE_TYPE_INST);
diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S
index d5b3dbcf5425..d53b631c9022 100644
--- a/arch/loongarch/kernel/entry.S
+++ b/arch/loongarch/kernel/entry.S
@@ -27,7 +27,7 @@ SYM_FUNC_START(handle_syscall)
addi.d sp, sp, -PT_SIZE
cfi_st t2, PT_R3
- cfi_rel_offset sp, PT_R3
+ cfi_rel_offset sp, PT_R3
st.d zero, sp, PT_R0
csrrd t2, LOONGARCH_CSR_PRMD
st.d t2, sp, PT_PRMD
@@ -50,7 +50,7 @@ SYM_FUNC_START(handle_syscall)
cfi_st a7, PT_R11
csrrd ra, LOONGARCH_CSR_ERA
st.d ra, sp, PT_ERA
- cfi_rel_offset ra, PT_ERA
+ cfi_rel_offset ra, PT_ERA
cfi_st tp, PT_R2
cfi_st u0, PT_R21
diff --git a/arch/loongarch/kernel/env.c b/arch/loongarch/kernel/env.c
index 467946ecf451..82b478a5c665 100644
--- a/arch/loongarch/kernel/env.c
+++ b/arch/loongarch/kernel/env.c
@@ -17,21 +17,6 @@ u64 efi_system_table;
struct loongson_system_configuration loongson_sysconf;
EXPORT_SYMBOL(loongson_sysconf);
-u64 loongson_chipcfg[MAX_PACKAGES];
-u64 loongson_chiptemp[MAX_PACKAGES];
-u64 loongson_freqctrl[MAX_PACKAGES];
-unsigned long long smp_group[MAX_PACKAGES];
-
-static void __init register_addrs_set(u64 *registers, const u64 addr, int num)
-{
- u64 i;
-
- for (i = 0; i < num; i++) {
- *registers = (i << 44) | addr;
- registers++;
- }
-}
-
void __init init_environ(void)
{
int efi_boot = fw_arg0;
@@ -50,11 +35,6 @@ void __init init_environ(void)
efi_memmap_init_early(&data);
memblock_reserve(data.phys_map & PAGE_MASK,
PAGE_ALIGN(data.size + (data.phys_map & ~PAGE_MASK)));
-
- register_addrs_set(smp_group, TO_UNCACHE(0x1fe01000), 16);
- register_addrs_set(loongson_chipcfg, TO_UNCACHE(0x1fe00180), 16);
- register_addrs_set(loongson_chiptemp, TO_UNCACHE(0x1fe0019c), 16);
- register_addrs_set(loongson_freqctrl, TO_UNCACHE(0x1fe001d0), 16);
}
static int __init init_cpu_fullname(void)
diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S
index a631a7137667..576b3370a296 100644
--- a/arch/loongarch/kernel/fpu.S
+++ b/arch/loongarch/kernel/fpu.S
@@ -27,78 +27,78 @@
.endm
.macro sc_save_fp base
- EX fst.d $f0, \base, (0 * FPU_REG_WIDTH)
- EX fst.d $f1, \base, (1 * FPU_REG_WIDTH)
- EX fst.d $f2, \base, (2 * FPU_REG_WIDTH)
- EX fst.d $f3, \base, (3 * FPU_REG_WIDTH)
- EX fst.d $f4, \base, (4 * FPU_REG_WIDTH)
- EX fst.d $f5, \base, (5 * FPU_REG_WIDTH)
- EX fst.d $f6, \base, (6 * FPU_REG_WIDTH)
- EX fst.d $f7, \base, (7 * FPU_REG_WIDTH)
- EX fst.d $f8, \base, (8 * FPU_REG_WIDTH)
- EX fst.d $f9, \base, (9 * FPU_REG_WIDTH)
- EX fst.d $f10, \base, (10 * FPU_REG_WIDTH)
- EX fst.d $f11, \base, (11 * FPU_REG_WIDTH)
- EX fst.d $f12, \base, (12 * FPU_REG_WIDTH)
- EX fst.d $f13, \base, (13 * FPU_REG_WIDTH)
- EX fst.d $f14, \base, (14 * FPU_REG_WIDTH)
- EX fst.d $f15, \base, (15 * FPU_REG_WIDTH)
- EX fst.d $f16, \base, (16 * FPU_REG_WIDTH)
- EX fst.d $f17, \base, (17 * FPU_REG_WIDTH)
- EX fst.d $f18, \base, (18 * FPU_REG_WIDTH)
- EX fst.d $f19, \base, (19 * FPU_REG_WIDTH)
- EX fst.d $f20, \base, (20 * FPU_REG_WIDTH)
- EX fst.d $f21, \base, (21 * FPU_REG_WIDTH)
- EX fst.d $f22, \base, (22 * FPU_REG_WIDTH)
- EX fst.d $f23, \base, (23 * FPU_REG_WIDTH)
- EX fst.d $f24, \base, (24 * FPU_REG_WIDTH)
- EX fst.d $f25, \base, (25 * FPU_REG_WIDTH)
- EX fst.d $f26, \base, (26 * FPU_REG_WIDTH)
- EX fst.d $f27, \base, (27 * FPU_REG_WIDTH)
- EX fst.d $f28, \base, (28 * FPU_REG_WIDTH)
- EX fst.d $f29, \base, (29 * FPU_REG_WIDTH)
- EX fst.d $f30, \base, (30 * FPU_REG_WIDTH)
- EX fst.d $f31, \base, (31 * FPU_REG_WIDTH)
+ EX fst.d $f0, \base, (0 * FPU_REG_WIDTH)
+ EX fst.d $f1, \base, (1 * FPU_REG_WIDTH)
+ EX fst.d $f2, \base, (2 * FPU_REG_WIDTH)
+ EX fst.d $f3, \base, (3 * FPU_REG_WIDTH)
+ EX fst.d $f4, \base, (4 * FPU_REG_WIDTH)
+ EX fst.d $f5, \base, (5 * FPU_REG_WIDTH)
+ EX fst.d $f6, \base, (6 * FPU_REG_WIDTH)
+ EX fst.d $f7, \base, (7 * FPU_REG_WIDTH)
+ EX fst.d $f8, \base, (8 * FPU_REG_WIDTH)
+ EX fst.d $f9, \base, (9 * FPU_REG_WIDTH)
+ EX fst.d $f10, \base, (10 * FPU_REG_WIDTH)
+ EX fst.d $f11, \base, (11 * FPU_REG_WIDTH)
+ EX fst.d $f12, \base, (12 * FPU_REG_WIDTH)
+ EX fst.d $f13, \base, (13 * FPU_REG_WIDTH)
+ EX fst.d $f14, \base, (14 * FPU_REG_WIDTH)
+ EX fst.d $f15, \base, (15 * FPU_REG_WIDTH)
+ EX fst.d $f16, \base, (16 * FPU_REG_WIDTH)
+ EX fst.d $f17, \base, (17 * FPU_REG_WIDTH)
+ EX fst.d $f18, \base, (18 * FPU_REG_WIDTH)
+ EX fst.d $f19, \base, (19 * FPU_REG_WIDTH)
+ EX fst.d $f20, \base, (20 * FPU_REG_WIDTH)
+ EX fst.d $f21, \base, (21 * FPU_REG_WIDTH)
+ EX fst.d $f22, \base, (22 * FPU_REG_WIDTH)
+ EX fst.d $f23, \base, (23 * FPU_REG_WIDTH)
+ EX fst.d $f24, \base, (24 * FPU_REG_WIDTH)
+ EX fst.d $f25, \base, (25 * FPU_REG_WIDTH)
+ EX fst.d $f26, \base, (26 * FPU_REG_WIDTH)
+ EX fst.d $f27, \base, (27 * FPU_REG_WIDTH)
+ EX fst.d $f28, \base, (28 * FPU_REG_WIDTH)
+ EX fst.d $f29, \base, (29 * FPU_REG_WIDTH)
+ EX fst.d $f30, \base, (30 * FPU_REG_WIDTH)
+ EX fst.d $f31, \base, (31 * FPU_REG_WIDTH)
.endm
.macro sc_restore_fp base
- EX fld.d $f0, \base, (0 * FPU_REG_WIDTH)
- EX fld.d $f1, \base, (1 * FPU_REG_WIDTH)
- EX fld.d $f2, \base, (2 * FPU_REG_WIDTH)
- EX fld.d $f3, \base, (3 * FPU_REG_WIDTH)
- EX fld.d $f4, \base, (4 * FPU_REG_WIDTH)
- EX fld.d $f5, \base, (5 * FPU_REG_WIDTH)
- EX fld.d $f6, \base, (6 * FPU_REG_WIDTH)
- EX fld.d $f7, \base, (7 * FPU_REG_WIDTH)
- EX fld.d $f8, \base, (8 * FPU_REG_WIDTH)
- EX fld.d $f9, \base, (9 * FPU_REG_WIDTH)
- EX fld.d $f10, \base, (10 * FPU_REG_WIDTH)
- EX fld.d $f11, \base, (11 * FPU_REG_WIDTH)
- EX fld.d $f12, \base, (12 * FPU_REG_WIDTH)
- EX fld.d $f13, \base, (13 * FPU_REG_WIDTH)
- EX fld.d $f14, \base, (14 * FPU_REG_WIDTH)
- EX fld.d $f15, \base, (15 * FPU_REG_WIDTH)
- EX fld.d $f16, \base, (16 * FPU_REG_WIDTH)
- EX fld.d $f17, \base, (17 * FPU_REG_WIDTH)
- EX fld.d $f18, \base, (18 * FPU_REG_WIDTH)
- EX fld.d $f19, \base, (19 * FPU_REG_WIDTH)
- EX fld.d $f20, \base, (20 * FPU_REG_WIDTH)
- EX fld.d $f21, \base, (21 * FPU_REG_WIDTH)
- EX fld.d $f22, \base, (22 * FPU_REG_WIDTH)
- EX fld.d $f23, \base, (23 * FPU_REG_WIDTH)
- EX fld.d $f24, \base, (24 * FPU_REG_WIDTH)
- EX fld.d $f25, \base, (25 * FPU_REG_WIDTH)
- EX fld.d $f26, \base, (26 * FPU_REG_WIDTH)
- EX fld.d $f27, \base, (27 * FPU_REG_WIDTH)
- EX fld.d $f28, \base, (28 * FPU_REG_WIDTH)
- EX fld.d $f29, \base, (29 * FPU_REG_WIDTH)
- EX fld.d $f30, \base, (30 * FPU_REG_WIDTH)
- EX fld.d $f31, \base, (31 * FPU_REG_WIDTH)
+ EX fld.d $f0, \base, (0 * FPU_REG_WIDTH)
+ EX fld.d $f1, \base, (1 * FPU_REG_WIDTH)
+ EX fld.d $f2, \base, (2 * FPU_REG_WIDTH)
+ EX fld.d $f3, \base, (3 * FPU_REG_WIDTH)
+ EX fld.d $f4, \base, (4 * FPU_REG_WIDTH)
+ EX fld.d $f5, \base, (5 * FPU_REG_WIDTH)
+ EX fld.d $f6, \base, (6 * FPU_REG_WIDTH)
+ EX fld.d $f7, \base, (7 * FPU_REG_WIDTH)
+ EX fld.d $f8, \base, (8 * FPU_REG_WIDTH)
+ EX fld.d $f9, \base, (9 * FPU_REG_WIDTH)
+ EX fld.d $f10, \base, (10 * FPU_REG_WIDTH)
+ EX fld.d $f11, \base, (11 * FPU_REG_WIDTH)
+ EX fld.d $f12, \base, (12 * FPU_REG_WIDTH)
+ EX fld.d $f13, \base, (13 * FPU_REG_WIDTH)
+ EX fld.d $f14, \base, (14 * FPU_REG_WIDTH)
+ EX fld.d $f15, \base, (15 * FPU_REG_WIDTH)
+ EX fld.d $f16, \base, (16 * FPU_REG_WIDTH)
+ EX fld.d $f17, \base, (17 * FPU_REG_WIDTH)
+ EX fld.d $f18, \base, (18 * FPU_REG_WIDTH)
+ EX fld.d $f19, \base, (19 * FPU_REG_WIDTH)
+ EX fld.d $f20, \base, (20 * FPU_REG_WIDTH)
+ EX fld.d $f21, \base, (21 * FPU_REG_WIDTH)
+ EX fld.d $f22, \base, (22 * FPU_REG_WIDTH)
+ EX fld.d $f23, \base, (23 * FPU_REG_WIDTH)
+ EX fld.d $f24, \base, (24 * FPU_REG_WIDTH)
+ EX fld.d $f25, \base, (25 * FPU_REG_WIDTH)
+ EX fld.d $f26, \base, (26 * FPU_REG_WIDTH)
+ EX fld.d $f27, \base, (27 * FPU_REG_WIDTH)
+ EX fld.d $f28, \base, (28 * FPU_REG_WIDTH)
+ EX fld.d $f29, \base, (29 * FPU_REG_WIDTH)
+ EX fld.d $f30, \base, (30 * FPU_REG_WIDTH)
+ EX fld.d $f31, \base, (31 * FPU_REG_WIDTH)
.endm
.macro sc_save_fcc base, tmp0, tmp1
movcf2gr \tmp0, $fcc0
- move \tmp1, \tmp0
+ move \tmp1, \tmp0
movcf2gr \tmp0, $fcc1
bstrins.d \tmp1, \tmp0, 15, 8
movcf2gr \tmp0, $fcc2
@@ -113,11 +113,11 @@
bstrins.d \tmp1, \tmp0, 55, 48
movcf2gr \tmp0, $fcc7
bstrins.d \tmp1, \tmp0, 63, 56
- EX st.d \tmp1, \base, 0
+ EX st.d \tmp1, \base, 0
.endm
.macro sc_restore_fcc base, tmp0, tmp1
- EX ld.d \tmp0, \base, 0
+ EX ld.d \tmp0, \base, 0
bstrpick.d \tmp1, \tmp0, 7, 0
movgr2cf $fcc0, \tmp1
bstrpick.d \tmp1, \tmp0, 15, 8
@@ -138,11 +138,11 @@
.macro sc_save_fcsr base, tmp0
movfcsr2gr \tmp0, fcsr0
- EX st.w \tmp0, \base, 0
+ EX st.w \tmp0, \base, 0
.endm
.macro sc_restore_fcsr base, tmp0
- EX ld.w \tmp0, \base, 0
+ EX ld.w \tmp0, \base, 0
movgr2fcsr fcsr0, \tmp0
.endm
@@ -151,9 +151,9 @@
*/
SYM_FUNC_START(_save_fp)
fpu_save_csr a0 t1
- fpu_save_double a0 t1 # clobbers t1
+ fpu_save_double a0 t1 # clobbers t1
fpu_save_cc a0 t1 t2 # clobbers t1, t2
- jirl zero, ra, 0
+ jr ra
SYM_FUNC_END(_save_fp)
EXPORT_SYMBOL(_save_fp)
@@ -161,10 +161,10 @@ EXPORT_SYMBOL(_save_fp)
* Restore a thread's fp context.
*/
SYM_FUNC_START(_restore_fp)
- fpu_restore_double a0 t1 # clobbers t1
- fpu_restore_csr a0 t1
- fpu_restore_cc a0 t1 t2 # clobbers t1, t2
- jirl zero, ra, 0
+ fpu_restore_double a0 t1 # clobbers t1
+ fpu_restore_csr a0 t1
+ fpu_restore_cc a0 t1 t2 # clobbers t1, t2
+ jr ra
SYM_FUNC_END(_restore_fp)
/*
@@ -216,7 +216,7 @@ SYM_FUNC_START(_init_fpu)
movgr2fr.d $f30, t1
movgr2fr.d $f31, t1
- jirl zero, ra, 0
+ jr ra
SYM_FUNC_END(_init_fpu)
/*
@@ -225,11 +225,11 @@ SYM_FUNC_END(_init_fpu)
* a2: fcsr
*/
SYM_FUNC_START(_save_fp_context)
- sc_save_fcc a1 t1 t2
- sc_save_fcsr a2 t1
- sc_save_fp a0
- li.w a0, 0 # success
- jirl zero, ra, 0
+ sc_save_fcc a1 t1 t2
+ sc_save_fcsr a2 t1
+ sc_save_fp a0
+ li.w a0, 0 # success
+ jr ra
SYM_FUNC_END(_save_fp_context)
/*
@@ -238,14 +238,14 @@ SYM_FUNC_END(_save_fp_context)
* a2: fcsr
*/
SYM_FUNC_START(_restore_fp_context)
- sc_restore_fp a0
- sc_restore_fcc a1 t1 t2
- sc_restore_fcsr a2 t1
- li.w a0, 0 # success
- jirl zero, ra, 0
+ sc_restore_fp a0
+ sc_restore_fcc a1 t1 t2
+ sc_restore_fcsr a2 t1
+ li.w a0, 0 # success
+ jr ra
SYM_FUNC_END(_restore_fp_context)
SYM_FUNC_START(fault)
li.w a0, -EFAULT # failure
- jirl zero, ra, 0
+ jr ra
SYM_FUNC_END(fault)
diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S
index 93496852b3cc..75e5be807a0d 100644
--- a/arch/loongarch/kernel/genex.S
+++ b/arch/loongarch/kernel/genex.S
@@ -28,23 +28,23 @@ SYM_FUNC_START(__arch_cpu_idle)
nop
idle 0
/* end of rollback region */
-1: jirl zero, ra, 0
+1: jr ra
SYM_FUNC_END(__arch_cpu_idle)
SYM_FUNC_START(handle_vint)
BACKUP_T0T1
SAVE_ALL
la.abs t1, __arch_cpu_idle
- LONG_L t0, sp, PT_ERA
+ LONG_L t0, sp, PT_ERA
/* 32 byte rollback region */
ori t0, t0, 0x1f
xori t0, t0, 0x1f
bne t0, t1, 1f
- LONG_S t0, sp, PT_ERA
+ LONG_S t0, sp, PT_ERA
1: move a0, sp
move a1, sp
la.abs t0, do_vint
- jirl ra, t0, 0
+ jirl ra, t0, 0
RESTORE_ALL_AND_RET
SYM_FUNC_END(handle_vint)
@@ -72,7 +72,7 @@ SYM_FUNC_END(except_vec_cex)
build_prep_\prep
move a0, sp
la.abs t0, do_\handler
- jirl ra, t0, 0
+ jirl ra, t0, 0
RESTORE_ALL_AND_RET
SYM_FUNC_END(handle_\exception)
.endm
@@ -91,5 +91,5 @@ SYM_FUNC_END(except_vec_cex)
SYM_FUNC_START(handle_sys)
la.abs t0, handle_syscall
- jirl zero, t0, 0
+ jr t0
SYM_FUNC_END(handle_sys)
diff --git a/arch/loongarch/kernel/head.S b/arch/loongarch/kernel/head.S
index d01e62dd414f..7062cdf0e33e 100644
--- a/arch/loongarch/kernel/head.S
+++ b/arch/loongarch/kernel/head.S
@@ -32,7 +32,7 @@ SYM_CODE_START(kernel_entry) # kernel entry point
/* We might not get launched at the address the kernel is linked to,
so we jump there. */
la.abs t0, 0f
- jirl zero, t0, 0
+ jr t0
0:
la t0, __bss_start # clear .bss
st.d zero, t0, 0
@@ -50,7 +50,7 @@ SYM_CODE_START(kernel_entry) # kernel entry point
/* KSave3 used for percpu base, initialized as 0 */
csrwr zero, PERCPU_BASE_KS
/* GPR21 used for percpu base (runtime), initialized as 0 */
- or u0, zero, zero
+ move u0, zero
la tp, init_thread_union
/* Set the SP after an empty pt_regs. */
@@ -85,8 +85,8 @@ SYM_CODE_START(smpboot_entry)
ld.d sp, t0, CPU_BOOT_STACK
ld.d tp, t0, CPU_BOOT_TINFO
- la.abs t0, 0f
- jirl zero, t0, 0
+ la.abs t0, 0f
+ jr t0
0:
bl start_secondary
SYM_CODE_END(smpboot_entry)
diff --git a/arch/loongarch/kernel/ptrace.c b/arch/loongarch/kernel/ptrace.c
index e6ab87948e1d..dc2b82ea894c 100644
--- a/arch/loongarch/kernel/ptrace.c
+++ b/arch/loongarch/kernel/ptrace.c
@@ -193,7 +193,7 @@ static int fpr_set(struct task_struct *target,
const void *kbuf, const void __user *ubuf)
{
const int fcc_start = NUM_FPU_REGS * sizeof(elf_fpreg_t);
- const int fcc_end = fcc_start + sizeof(u64);
+ const int fcsr_start = fcc_start + sizeof(u64);
int err;
BUG_ON(count % sizeof(elf_fpreg_t));
@@ -209,10 +209,12 @@ static int fpr_set(struct task_struct *target,
if (err)
return err;
- if (count > 0)
- err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.fcc,
- fcc_start, fcc_end);
+ err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &target->thread.fpu.fcc, fcc_start,
+ fcc_start + sizeof(u64));
+ err |= user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &target->thread.fpu.fcsr, fcsr_start,
+ fcsr_start + sizeof(u32));
return err;
}
diff --git a/arch/loongarch/kernel/reset.c b/arch/loongarch/kernel/reset.c
index 2b86469e4718..800c965a17ea 100644
--- a/arch/loongarch/kernel/reset.c
+++ b/arch/loongarch/kernel/reset.c
@@ -13,7 +13,6 @@
#include <linux/console.h>
#include <acpi/reboot.h>
-#include <asm/compiler.h>
#include <asm/idle.h>
#include <asm/loongarch.h>
#include <asm/reboot.h>
diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c
index c74860b53375..8f5c2f9a1a83 100644
--- a/arch/loongarch/kernel/setup.c
+++ b/arch/loongarch/kernel/setup.c
@@ -126,7 +126,7 @@ static void __init parse_bios_table(const struct dmi_header *dm)
char *dmi_data = (char *)dm;
bios_extern = *(dmi_data + SMBIOS_BIOSEXTERN_OFFSET);
- b_info.bios_size = *(dmi_data + SMBIOS_BIOSSIZE_OFFSET);
+ b_info.bios_size = (*(dmi_data + SMBIOS_BIOSSIZE_OFFSET) + 1) << 6;
if (bios_extern & LOONGSON_EFI_ENABLE)
set_bit(EFI_BOOT, &efi.flags);
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index 73cec62504fb..09743103d9b3 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -278,116 +278,29 @@ void loongson3_cpu_die(unsigned int cpu)
mb();
}
-/*
- * The target CPU should go to XKPRANGE (uncached area) and flush
- * ICache/DCache/VCache before the control CPU can safely disable its clock.
- */
-static void loongson3_play_dead(int *state_addr)
+void play_dead(void)
{
- register int val;
- register void *addr;
+ register uint64_t addr;
register void (*init_fn)(void);
- __asm__ __volatile__(
- " li.d %[addr], 0x8000000000000000\n"
- "1: cacop 0x8, %[addr], 0 \n" /* flush ICache */
- " cacop 0x8, %[addr], 1 \n"
- " cacop 0x8, %[addr], 2 \n"
- " cacop 0x8, %[addr], 3 \n"
- " cacop 0x9, %[addr], 0 \n" /* flush DCache */
- " cacop 0x9, %[addr], 1 \n"
- " cacop 0x9, %[addr], 2 \n"
- " cacop 0x9, %[addr], 3 \n"
- " addi.w %[sets], %[sets], -1 \n"
- " addi.d %[addr], %[addr], 0x40 \n"
- " bnez %[sets], 1b \n"
- " li.d %[addr], 0x8000000000000000\n"
- "2: cacop 0xa, %[addr], 0 \n" /* flush VCache */
- " cacop 0xa, %[addr], 1 \n"
- " cacop 0xa, %[addr], 2 \n"
- " cacop 0xa, %[addr], 3 \n"
- " cacop 0xa, %[addr], 4 \n"
- " cacop 0xa, %[addr], 5 \n"
- " cacop 0xa, %[addr], 6 \n"
- " cacop 0xa, %[addr], 7 \n"
- " cacop 0xa, %[addr], 8 \n"
- " cacop 0xa, %[addr], 9 \n"
- " cacop 0xa, %[addr], 10 \n"
- " cacop 0xa, %[addr], 11 \n"
- " cacop 0xa, %[addr], 12 \n"
- " cacop 0xa, %[addr], 13 \n"
- " cacop 0xa, %[addr], 14 \n"
- " cacop 0xa, %[addr], 15 \n"
- " addi.w %[vsets], %[vsets], -1 \n"
- " addi.d %[addr], %[addr], 0x40 \n"
- " bnez %[vsets], 2b \n"
- " li.w %[val], 0x7 \n" /* *state_addr = CPU_DEAD; */
- " st.w %[val], %[state_addr], 0 \n"
- " dbar 0 \n"
- " cacop 0x11, %[state_addr], 0 \n" /* flush entry of *state_addr */
- : [addr] "=&r" (addr), [val] "=&r" (val)
- : [state_addr] "r" (state_addr),
- [sets] "r" (cpu_data[smp_processor_id()].dcache.sets),
- [vsets] "r" (cpu_data[smp_processor_id()].vcache.sets));
-
+ idle_task_exit();
local_irq_enable();
- change_csr_ecfg(ECFG0_IM, ECFGF_IPI);
+ set_csr_ecfg(ECFGF_IPI);
+ __this_cpu_write(cpu_state, CPU_DEAD);
+
+ __smp_mb();
+ do {
+ __asm__ __volatile__("idle 0\n\t");
+ addr = iocsr_read64(LOONGARCH_IOCSR_MBUF0);
+ } while (addr == 0);
- __asm__ __volatile__(
- " idle 0 \n"
- " li.w $t0, 0x1020 \n"
- " iocsrrd.d %[init_fn], $t0 \n" /* Get init PC */
- : [init_fn] "=&r" (addr)
- : /* No Input */
- : "a0");
- init_fn = __va(addr);
+ init_fn = (void *)TO_CACHE(addr);
+ iocsr_write32(0xffffffff, LOONGARCH_IOCSR_IPI_CLEAR);
init_fn();
unreachable();
}
-void play_dead(void)
-{
- int *state_addr;
- unsigned int cpu = smp_processor_id();
- void (*play_dead_uncached)(int *s);
-
- idle_task_exit();
- play_dead_uncached = (void *)TO_UNCACHE(__pa((unsigned long)loongson3_play_dead));
- state_addr = &per_cpu(cpu_state, cpu);
- mb();
- play_dead_uncached(state_addr);
-}
-
-static int loongson3_enable_clock(unsigned int cpu)
-{
- uint64_t core_id = cpu_data[cpu].core;
- uint64_t package_id = cpu_data[cpu].package;
-
- LOONGSON_FREQCTRL(package_id) |= 1 << (core_id * 4 + 3);
-
- return 0;
-}
-
-static int loongson3_disable_clock(unsigned int cpu)
-{
- uint64_t core_id = cpu_data[cpu].core;
- uint64_t package_id = cpu_data[cpu].package;
-
- LOONGSON_FREQCTRL(package_id) &= ~(1 << (core_id * 4 + 3));
-
- return 0;
-}
-
-static int register_loongson3_notifier(void)
-{
- return cpuhp_setup_state_nocalls(CPUHP_LOONGARCH_SOC_PREPARE,
- "loongarch/loongson:prepare",
- loongson3_enable_clock,
- loongson3_disable_clock);
-}
-early_initcall(register_loongson3_notifier);
-
#endif
/*
diff --git a/arch/loongarch/kernel/switch.S b/arch/loongarch/kernel/switch.S
index 53e2fa8e580e..37e84ac8ffc2 100644
--- a/arch/loongarch/kernel/switch.S
+++ b/arch/loongarch/kernel/switch.S
@@ -24,8 +24,8 @@ SYM_FUNC_START(__switch_to)
move tp, a2
cpu_restore_nonscratch a1
- li.w t0, _THREAD_SIZE - 32
- PTR_ADD t0, t0, tp
+ li.w t0, _THREAD_SIZE - 32
+ PTR_ADD t0, t0, tp
set_saved_sp t0, t1, t2
ldptr.d t1, a1, THREAD_CSRPRMD
diff --git a/arch/loongarch/lib/clear_user.S b/arch/loongarch/lib/clear_user.S
index 25d9be5fbb19..16ba2b8dd68a 100644
--- a/arch/loongarch/lib/clear_user.S
+++ b/arch/loongarch/lib/clear_user.S
@@ -32,7 +32,7 @@ SYM_FUNC_START(__clear_user)
1: st.b zero, a0, 0
addi.d a0, a0, 1
addi.d a1, a1, -1
- bgt a1, zero, 1b
+ bgtz a1, 1b
2: move a0, a1
jr ra
diff --git a/arch/loongarch/lib/copy_user.S b/arch/loongarch/lib/copy_user.S
index 9ae507f851b5..97d20327a69e 100644
--- a/arch/loongarch/lib/copy_user.S
+++ b/arch/loongarch/lib/copy_user.S
@@ -35,7 +35,7 @@ SYM_FUNC_START(__copy_user)
addi.d a0, a0, 1
addi.d a1, a1, 1
addi.d a2, a2, -1
- bgt a2, zero, 1b
+ bgtz a2, 1b
3: move a0, a2
jr ra
diff --git a/arch/loongarch/lib/delay.c b/arch/loongarch/lib/delay.c
index 5d856694fcfe..831d4761f385 100644
--- a/arch/loongarch/lib/delay.c
+++ b/arch/loongarch/lib/delay.c
@@ -7,7 +7,6 @@
#include <linux/smp.h>
#include <linux/timex.h>
-#include <asm/compiler.h>
#include <asm/processor.h>
void __delay(unsigned long cycles)
diff --git a/arch/loongarch/mm/page.S b/arch/loongarch/mm/page.S
index ddc78ab33c7b..4c874a7af0ad 100644
--- a/arch/loongarch/mm/page.S
+++ b/arch/loongarch/mm/page.S
@@ -10,75 +10,75 @@
.align 5
SYM_FUNC_START(clear_page)
- lu12i.w t0, 1 << (PAGE_SHIFT - 12)
- add.d t0, t0, a0
+ lu12i.w t0, 1 << (PAGE_SHIFT - 12)
+ add.d t0, t0, a0
1:
- st.d zero, a0, 0
- st.d zero, a0, 8
- st.d zero, a0, 16
- st.d zero, a0, 24
- st.d zero, a0, 32
- st.d zero, a0, 40
- st.d zero, a0, 48
- st.d zero, a0, 56
- addi.d a0, a0, 128
- st.d zero, a0, -64
- st.d zero, a0, -56
- st.d zero, a0, -48
- st.d zero, a0, -40
- st.d zero, a0, -32
- st.d zero, a0, -24
- st.d zero, a0, -16
- st.d zero, a0, -8
- bne t0, a0, 1b
+ st.d zero, a0, 0
+ st.d zero, a0, 8
+ st.d zero, a0, 16
+ st.d zero, a0, 24
+ st.d zero, a0, 32
+ st.d zero, a0, 40
+ st.d zero, a0, 48
+ st.d zero, a0, 56
+ addi.d a0, a0, 128
+ st.d zero, a0, -64
+ st.d zero, a0, -56
+ st.d zero, a0, -48
+ st.d zero, a0, -40
+ st.d zero, a0, -32
+ st.d zero, a0, -24
+ st.d zero, a0, -16
+ st.d zero, a0, -8
+ bne t0, a0, 1b
- jirl $r0, ra, 0
+ jr ra
SYM_FUNC_END(clear_page)
EXPORT_SYMBOL(clear_page)
.align 5
SYM_FUNC_START(copy_page)
- lu12i.w t8, 1 << (PAGE_SHIFT - 12)
- add.d t8, t8, a0
+ lu12i.w t8, 1 << (PAGE_SHIFT - 12)
+ add.d t8, t8, a0
1:
- ld.d t0, a1, 0
- ld.d t1, a1, 8
- ld.d t2, a1, 16
- ld.d t3, a1, 24
- ld.d t4, a1, 32
- ld.d t5, a1, 40
- ld.d t6, a1, 48
- ld.d t7, a1, 56
+ ld.d t0, a1, 0
+ ld.d t1, a1, 8
+ ld.d t2, a1, 16
+ ld.d t3, a1, 24
+ ld.d t4, a1, 32
+ ld.d t5, a1, 40
+ ld.d t6, a1, 48
+ ld.d t7, a1, 56
- st.d t0, a0, 0
- st.d t1, a0, 8
- ld.d t0, a1, 64
- ld.d t1, a1, 72
- st.d t2, a0, 16
- st.d t3, a0, 24
- ld.d t2, a1, 80
- ld.d t3, a1, 88
- st.d t4, a0, 32
- st.d t5, a0, 40
- ld.d t4, a1, 96
- ld.d t5, a1, 104
- st.d t6, a0, 48
- st.d t7, a0, 56
- ld.d t6, a1, 112
- ld.d t7, a1, 120
- addi.d a0, a0, 128
- addi.d a1, a1, 128
+ st.d t0, a0, 0
+ st.d t1, a0, 8
+ ld.d t0, a1, 64
+ ld.d t1, a1, 72
+ st.d t2, a0, 16
+ st.d t3, a0, 24
+ ld.d t2, a1, 80
+ ld.d t3, a1, 88
+ st.d t4, a0, 32
+ st.d t5, a0, 40
+ ld.d t4, a1, 96
+ ld.d t5, a1, 104
+ st.d t6, a0, 48
+ st.d t7, a0, 56
+ ld.d t6, a1, 112
+ ld.d t7, a1, 120
+ addi.d a0, a0, 128
+ addi.d a1, a1, 128
- st.d t0, a0, -64
- st.d t1, a0, -56
- st.d t2, a0, -48
- st.d t3, a0, -40
- st.d t4, a0, -32
- st.d t5, a0, -24
- st.d t6, a0, -16
- st.d t7, a0, -8
+ st.d t0, a0, -64
+ st.d t1, a0, -56
+ st.d t2, a0, -48
+ st.d t3, a0, -40
+ st.d t4, a0, -32
+ st.d t5, a0, -24
+ st.d t6, a0, -16
+ st.d t7, a0, -8
- bne t8, a0, 1b
- jirl $r0, ra, 0
+ bne t8, a0, 1b
+ jr ra
SYM_FUNC_END(copy_page)
EXPORT_SYMBOL(copy_page)
diff --git a/arch/loongarch/mm/tlbex.S b/arch/loongarch/mm/tlbex.S
index 7eee40271577..de19fa2d7f0d 100644
--- a/arch/loongarch/mm/tlbex.S
+++ b/arch/loongarch/mm/tlbex.S
@@ -18,7 +18,7 @@
REG_S a2, sp, PT_BVADDR
li.w a1, \write
la.abs t0, do_page_fault
- jirl ra, t0, 0
+ jirl ra, t0, 0
RESTORE_ALL_AND_RET
SYM_FUNC_END(tlb_do_page_fault_\write)
.endm
@@ -34,7 +34,7 @@ SYM_FUNC_START(handle_tlb_protect)
csrrd a2, LOONGARCH_CSR_BADV
REG_S a2, sp, PT_BVADDR
la.abs t0, do_page_fault
- jirl ra, t0, 0
+ jirl ra, t0, 0
RESTORE_ALL_AND_RET
SYM_FUNC_END(handle_tlb_protect)
@@ -47,7 +47,7 @@ SYM_FUNC_START(handle_tlb_load)
* The vmalloc handling is not in the hotpath.
*/
csrrd t0, LOONGARCH_CSR_BADV
- blt t0, $r0, vmalloc_load
+ bltz t0, vmalloc_load
csrrd t1, LOONGARCH_CSR_PGDL
vmalloc_done_load:
@@ -80,7 +80,7 @@ vmalloc_done_load:
* see if we need to jump to huge tlb processing.
*/
andi t0, ra, _PAGE_HUGE
- bne t0, $r0, tlb_huge_update_load
+ bnez t0, tlb_huge_update_load
csrrd t0, LOONGARCH_CSR_BADV
srli.d t0, t0, (PAGE_SHIFT + PTE_ORDER)
@@ -100,12 +100,12 @@ smp_pgtable_change_load:
srli.d ra, t0, _PAGE_PRESENT_SHIFT
andi ra, ra, 1
- beq ra, $r0, nopage_tlb_load
+ beqz ra, nopage_tlb_load
ori t0, t0, _PAGE_VALID
#ifdef CONFIG_SMP
sc.d t0, t1, 0
- beq t0, $r0, smp_pgtable_change_load
+ beqz t0, smp_pgtable_change_load
#else
st.d t0, t1, 0
#endif
@@ -139,23 +139,23 @@ tlb_huge_update_load:
#endif
srli.d ra, t0, _PAGE_PRESENT_SHIFT
andi ra, ra, 1
- beq ra, $r0, nopage_tlb_load
+ beqz ra, nopage_tlb_load
tlbsrch
ori t0, t0, _PAGE_VALID
#ifdef CONFIG_SMP
sc.d t0, t1, 0
- beq t0, $r0, tlb_huge_update_load
+ beqz t0, tlb_huge_update_load
ld.d t0, t1, 0
#else
st.d t0, t1, 0
#endif
- addu16i.d t1, $r0, -(CSR_TLBIDX_EHINV >> 16)
- addi.d ra, t1, 0
- csrxchg ra, t1, LOONGARCH_CSR_TLBIDX
+ addu16i.d t1, zero, -(CSR_TLBIDX_EHINV >> 16)
+ addi.d ra, t1, 0
+ csrxchg ra, t1, LOONGARCH_CSR_TLBIDX
tlbwr
- csrxchg $r0, t1, LOONGARCH_CSR_TLBIDX
+ csrxchg zero, t1, LOONGARCH_CSR_TLBIDX
/*
* A huge PTE describes an area the size of the
@@ -178,27 +178,27 @@ tlb_huge_update_load:
addi.d t0, ra, 0
/* Convert to entrylo1 */
- addi.d t1, $r0, 1
+ addi.d t1, zero, 1
slli.d t1, t1, (HPAGE_SHIFT - 1)
add.d t0, t0, t1
csrwr t0, LOONGARCH_CSR_TLBELO1
/* Set huge page tlb entry size */
- addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16)
- addu16i.d t1, $r0, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
+ addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16)
+ addu16i.d t1, zero, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
tlbfill
- addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16)
- addu16i.d t1, $r0, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
+ addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16)
+ addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
nopage_tlb_load:
dbar 0
csrrd ra, EXCEPTION_KS2
la.abs t0, tlb_do_page_fault_0
- jirl $r0, t0, 0
+ jr t0
SYM_FUNC_END(handle_tlb_load)
SYM_FUNC_START(handle_tlb_store)
@@ -210,7 +210,7 @@ SYM_FUNC_START(handle_tlb_store)
* The vmalloc handling is not in the hotpath.
*/
csrrd t0, LOONGARCH_CSR_BADV
- blt t0, $r0, vmalloc_store
+ bltz t0, vmalloc_store
csrrd t1, LOONGARCH_CSR_PGDL
vmalloc_done_store:
@@ -244,7 +244,7 @@ vmalloc_done_store:
* see if we need to jump to huge tlb processing.
*/
andi t0, ra, _PAGE_HUGE
- bne t0, $r0, tlb_huge_update_store
+ bnez t0, tlb_huge_update_store
csrrd t0, LOONGARCH_CSR_BADV
srli.d t0, t0, (PAGE_SHIFT + PTE_ORDER)
@@ -265,12 +265,12 @@ smp_pgtable_change_store:
srli.d ra, t0, _PAGE_PRESENT_SHIFT
andi ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
xori ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
- bne ra, $r0, nopage_tlb_store
+ bnez ra, nopage_tlb_store
ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
#ifdef CONFIG_SMP
sc.d t0, t1, 0
- beq t0, $r0, smp_pgtable_change_store
+ beqz t0, smp_pgtable_change_store
#else
st.d t0, t1, 0
#endif
@@ -306,24 +306,24 @@ tlb_huge_update_store:
srli.d ra, t0, _PAGE_PRESENT_SHIFT
andi ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
xori ra, ra, ((_PAGE_PRESENT | _PAGE_WRITE) >> _PAGE_PRESENT_SHIFT)
- bne ra, $r0, nopage_tlb_store
+ bnez ra, nopage_tlb_store
tlbsrch
ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
#ifdef CONFIG_SMP
sc.d t0, t1, 0
- beq t0, $r0, tlb_huge_update_store
+ beqz t0, tlb_huge_update_store
ld.d t0, t1, 0
#else
st.d t0, t1, 0
#endif
- addu16i.d t1, $r0, -(CSR_TLBIDX_EHINV >> 16)
- addi.d ra, t1, 0
- csrxchg ra, t1, LOONGARCH_CSR_TLBIDX
+ addu16i.d t1, zero, -(CSR_TLBIDX_EHINV >> 16)
+ addi.d ra, t1, 0
+ csrxchg ra, t1, LOONGARCH_CSR_TLBIDX
tlbwr
- csrxchg $r0, t1, LOONGARCH_CSR_TLBIDX
+ csrxchg zero, t1, LOONGARCH_CSR_TLBIDX
/*
* A huge PTE describes an area the size of the
* configured huge page size. This is twice the
@@ -345,28 +345,28 @@ tlb_huge_update_store:
addi.d t0, ra, 0
/* Convert to entrylo1 */
- addi.d t1, $r0, 1
+ addi.d t1, zero, 1
slli.d t1, t1, (HPAGE_SHIFT - 1)
add.d t0, t0, t1
csrwr t0, LOONGARCH_CSR_TLBELO1
/* Set huge page tlb entry size */
- addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16)
- addu16i.d t1, $r0, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
+ addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16)
+ addu16i.d t1, zero, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
tlbfill
/* Reset default page size */
- addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16)
- addu16i.d t1, $r0, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
+ addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16)
+ addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
nopage_tlb_store:
dbar 0
csrrd ra, EXCEPTION_KS2
la.abs t0, tlb_do_page_fault_1
- jirl $r0, t0, 0
+ jr t0
SYM_FUNC_END(handle_tlb_store)
SYM_FUNC_START(handle_tlb_modify)
@@ -378,7 +378,7 @@ SYM_FUNC_START(handle_tlb_modify)
* The vmalloc handling is not in the hotpath.
*/
csrrd t0, LOONGARCH_CSR_BADV
- blt t0, $r0, vmalloc_modify
+ bltz t0, vmalloc_modify
csrrd t1, LOONGARCH_CSR_PGDL
vmalloc_done_modify:
@@ -411,7 +411,7 @@ vmalloc_done_modify:
* see if we need to jump to huge tlb processing.
*/
andi t0, ra, _PAGE_HUGE
- bne t0, $r0, tlb_huge_update_modify
+ bnez t0, tlb_huge_update_modify
csrrd t0, LOONGARCH_CSR_BADV
srli.d t0, t0, (PAGE_SHIFT + PTE_ORDER)
@@ -431,12 +431,12 @@ smp_pgtable_change_modify:
srli.d ra, t0, _PAGE_WRITE_SHIFT
andi ra, ra, 1
- beq ra, $r0, nopage_tlb_modify
+ beqz ra, nopage_tlb_modify
ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
#ifdef CONFIG_SMP
sc.d t0, t1, 0
- beq t0, $r0, smp_pgtable_change_modify
+ beqz t0, smp_pgtable_change_modify
#else
st.d t0, t1, 0
#endif
@@ -454,7 +454,7 @@ leave_modify:
ertn
#ifdef CONFIG_64BIT
vmalloc_modify:
- la.abs t1, swapper_pg_dir
+ la.abs t1, swapper_pg_dir
b vmalloc_done_modify
#endif
@@ -471,14 +471,14 @@ tlb_huge_update_modify:
srli.d ra, t0, _PAGE_WRITE_SHIFT
andi ra, ra, 1
- beq ra, $r0, nopage_tlb_modify
+ beqz ra, nopage_tlb_modify
tlbsrch
ori t0, t0, (_PAGE_VALID | _PAGE_DIRTY | _PAGE_MODIFIED)
#ifdef CONFIG_SMP
sc.d t0, t1, 0
- beq t0, $r0, tlb_huge_update_modify
+ beqz t0, tlb_huge_update_modify
ld.d t0, t1, 0
#else
st.d t0, t1, 0
@@ -504,28 +504,28 @@ tlb_huge_update_modify:
addi.d t0, ra, 0
/* Convert to entrylo1 */
- addi.d t1, $r0, 1
+ addi.d t1, zero, 1
slli.d t1, t1, (HPAGE_SHIFT - 1)
add.d t0, t0, t1
csrwr t0, LOONGARCH_CSR_TLBELO1
/* Set huge page tlb entry size */
- addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16)
- addu16i.d t1, $r0, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
- csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
+ addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16)
+ addu16i.d t1, zero, (PS_HUGE_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
+ csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
tlbwr
/* Reset default page size */
- addu16i.d t0, $r0, (CSR_TLBIDX_PS >> 16)
- addu16i.d t1, $r0, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
- csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
+ addu16i.d t0, zero, (CSR_TLBIDX_PS >> 16)
+ addu16i.d t1, zero, (PS_DEFAULT_SIZE << (CSR_TLBIDX_PS_SHIFT - 16))
+ csrxchg t1, t0, LOONGARCH_CSR_TLBIDX
nopage_tlb_modify:
dbar 0
csrrd ra, EXCEPTION_KS2
la.abs t0, tlb_do_page_fault_1
- jirl $r0, t0, 0
+ jr t0
SYM_FUNC_END(handle_tlb_modify)
SYM_FUNC_START(handle_tlb_refill)
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index f3aa44156969..e0e9e31339c1 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -155,7 +155,7 @@ config M520x
select COLDFIRE_PIT_TIMER
select HAVE_CACHE_SPLIT
help
- Freescale Coldfire 5207/5208 processor support.
+ Freescale Coldfire 5207/5208 processor support.
config M523x
bool "MCF523x"
@@ -322,7 +322,6 @@ config COLDFIRE_SLTIMERS
endif # COLDFIRE
-
comment "Processor Specific Options"
config M68KFPU_EMU
@@ -522,7 +521,7 @@ config CACHE_BOTH
Split the ColdFire CPU cache, and use half as an instruction cache
and half as a data cache.
endchoice
-endif
+endif # HAVE_CACHE_SPLIT
if HAVE_CACHE_CB
choice
@@ -539,4 +538,4 @@ config CACHE_COPYBACK
help
The ColdFire CPU cache is set into Copy-back mode.
endchoice
-endif
+endif # HAVE_CACHE_CB
diff --git a/arch/m68k/Kconfig.debug b/arch/m68k/Kconfig.debug
index 11b306bdd788..465e28be0ce4 100644
--- a/arch/m68k/Kconfig.debug
+++ b/arch/m68k/Kconfig.debug
@@ -1,11 +1,11 @@
# SPDX-License-Identifier: GPL-2.0
config BOOTPARAM
- bool 'Compiled-in Kernel Boot Parameter'
+ bool "Compiled-in Kernel Boot Parameter"
config BOOTPARAM_STRING
- string 'Kernel Boot Parameter'
- default 'console=ttyS0,19200'
+ string "Kernel Boot Parameter"
+ default "console=ttyS0,19200"
depends on BOOTPARAM
config EARLY_PRINTK
diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine
index a1042568b9ad..53c45ccda564 100644
--- a/arch/m68k/Kconfig.machine
+++ b/arch/m68k/Kconfig.machine
@@ -161,10 +161,11 @@ config VIRT
select RTC_CLASS
select RTC_DRV_GOLDFISH
select TTY
+ select VIRTIO_MENU
select VIRTIO_MMIO
help
This options enable a pure virtual machine based on m68k,
- VIRTIO MMIO devices and GOLDFISH interfaces (TTY, RTC, PIC)
+ VIRTIO MMIO devices and GOLDFISH interfaces (TTY, RTC, PIC).
config PILOT
bool
@@ -492,4 +493,4 @@ config ROMKERNEL
endchoice
-endif
+endif # !MMU || COLDFIRE
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index c181030218bf..a6a886a89be2 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68020=y
@@ -43,8 +41,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -310,6 +309,7 @@ CONFIG_PARPORT_MFC3=m
CONFIG_PARPORT_1284=y
CONFIG_AMIGA_FLOPPY=y
CONFIG_AMIGA_Z2RAM=y
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -580,7 +580,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -595,7 +595,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -648,11 +648,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 40755648fb6c..bffd24c2755e 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68020=y
@@ -39,8 +37,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -300,6 +299,7 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
CONFIG_CONNECTOR=m
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -537,7 +537,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -552,7 +552,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -604,11 +604,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index be0d9155fc5b..0013425b1e08 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68020=y
@@ -46,8 +44,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -311,6 +310,7 @@ CONFIG_PARPORT=m
CONFIG_PARPORT_ATARI=m
CONFIG_PARPORT_1284=y
CONFIG_ATARI_FLOPPY=y
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -557,7 +557,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -572,7 +572,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -625,11 +625,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 9af0e2d0d153..42d969697f7f 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68040=y
@@ -36,8 +34,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -297,6 +296,7 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
CONFIG_CONNECTOR=m
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -529,7 +529,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -544,7 +544,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -596,11 +596,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 49341d66feb6..97d6d9acb395 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68020=y
@@ -38,8 +36,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -299,6 +298,7 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
CONFIG_CONNECTOR=m
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -539,7 +539,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -554,7 +554,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -606,11 +606,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 92b33d5ffab1..8cbfc1c659a3 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68020=y
@@ -37,8 +35,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -302,6 +301,7 @@ CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
CONFIG_CONNECTOR=m
CONFIG_BLK_DEV_SWIM=m
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -559,7 +559,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -574,7 +574,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -627,11 +627,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 6aaa947bc849..9f45fe60757f 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68020=y
@@ -57,8 +55,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -331,6 +330,7 @@ CONFIG_AMIGA_FLOPPY=y
CONFIG_ATARI_FLOPPY=y
CONFIG_BLK_DEV_SWIM=m
CONFIG_AMIGA_Z2RAM=y
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -645,7 +645,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -660,7 +660,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -713,11 +713,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index b62d65e59938..4736cfacf6a2 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68030=y
@@ -35,8 +33,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -296,6 +295,7 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
CONFIG_CONNECTOR=m
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -528,7 +528,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -543,7 +543,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -595,11 +595,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 8ecf261487d4..638cd38aa7d2 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68040=y
@@ -36,8 +34,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -297,6 +296,7 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
CONFIG_CONNECTOR=m
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -529,7 +529,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -544,7 +544,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -596,11 +596,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 7540d908897b..ec8b6bb70ebd 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_M68040=y
@@ -37,8 +35,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -301,6 +300,7 @@ CONFIG_CONNECTOR=m
CONFIG_PARPORT=m
CONFIG_PARPORT_PC=m
CONFIG_PARPORT_1284=y
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -546,7 +546,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -561,7 +561,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -614,11 +614,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index 832b45944617..7d8dc578d59c 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_SUN3=y
@@ -33,8 +31,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -294,6 +293,7 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
CONFIG_CONNECTOR=m
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -528,7 +528,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -543,7 +543,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -594,11 +594,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 9171b687e565..96290aee5302 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -10,8 +10,6 @@ CONFIG_LOG_BUF_SHIFT=16
# CONFIG_NET_NS is not set
CONFIG_BLK_DEV_INITRD=y
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
-CONFIG_USERFAULTFD=y
-CONFIG_SLAB=y
CONFIG_KEXEC=y
CONFIG_BOOTINFO_PROC=y
CONFIG_SUN3X=y
@@ -33,8 +31,9 @@ CONFIG_MQ_IOSCHED_KYBER=m
CONFIG_IOSCHED_BFQ=m
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
CONFIG_BINFMT_MISC=m
+CONFIG_SLAB=y
# CONFIG_COMPACTION is not set
-CONFIG_ZPOOL=m
+CONFIG_USERFAULTFD=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -294,6 +293,7 @@ CONFIG_DEVTMPFS=y
CONFIG_DEVTMPFS_MOUNT=y
CONFIG_TEST_ASYNC_DRIVER_PROBE=m
CONFIG_CONNECTOR=m
+CONFIG_ZRAM=m
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_DRBD=m
CONFIG_BLK_DEV_NBD=m
@@ -527,7 +527,7 @@ CONFIG_CRYPTO_MD4=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_RMD160=m
CONFIG_CRYPTO_SHA3=m
-CONFIG_CRYPTO_SM3=m
+CONFIG_CRYPTO_SM3_GENERIC=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_AES=y
CONFIG_CRYPTO_AES_TI=m
@@ -542,7 +542,7 @@ CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_SM4=m
+CONFIG_CRYPTO_SM4_GENERIC=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_LZO=m
@@ -594,11 +594,7 @@ CONFIG_TEST_BLACKHOLE_DEV=m
CONFIG_FIND_BIT_BENCHMARK=m
CONFIG_TEST_FIRMWARE=m
CONFIG_TEST_SYSCTL=m
-CONFIG_BITFIELD_KUNIT=m
-CONFIG_RESOURCE_KUNIT_TEST=m
CONFIG_LINEAR_RANGES_TEST=m
-CONFIG_CMDLINE_KUNIT_TEST=m
-CONFIG_BITS_TEST=m
CONFIG_TEST_UDELAY=m
CONFIG_TEST_STATIC_KEYS=m
CONFIG_TEST_KMOD=m
diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index 51283db53667..87c2cd66a9ce 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -510,7 +510,7 @@ static inline int fls(unsigned int x)
return 32 - cnt;
}
-static inline int __fls(int x)
+static inline unsigned long __fls(unsigned long x)
{
return fls(x) - 1;
}
diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h
index ffeda9aa526a..d86b4009880b 100644
--- a/arch/m68k/include/asm/processor.h
+++ b/arch/m68k/include/asm/processor.h
@@ -151,6 +151,7 @@ static inline void release_thread(struct task_struct *dead_task)
}
unsigned long __get_wchan(struct task_struct *p);
+void show_registers(struct pt_regs *regs);
#define KSTK_EIP(tsk) \
({ \
diff --git a/arch/m68k/include/uapi/asm/bootinfo-virt.h b/arch/m68k/include/uapi/asm/bootinfo-virt.h
index e4db7e2213ab..b091ee9b06e0 100644
--- a/arch/m68k/include/uapi/asm/bootinfo-virt.h
+++ b/arch/m68k/include/uapi/asm/bootinfo-virt.h
@@ -13,6 +13,14 @@
#define BI_VIRT_VIRTIO_BASE 0x8004
#define BI_VIRT_CTRL_BASE 0x8005
+/*
+ * A random seed used to initialize the RNG. Record format:
+ *
+ * - length [ 2 bytes, 16-bit big endian ]
+ * - seed data [ `length` bytes, padded to preserve 2-byte alignment ]
+ */
+#define BI_VIRT_RNG_SEED 0x8006
+
#define VIRT_BOOTI_VERSION MK_BI_VERSION(2, 0)
#endif /* _UAPI_ASM_M68K_BOOTINFO_MAC_H */
diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c
index 59fc63feb0dc..5c8cba0efc63 100644
--- a/arch/m68k/kernel/traps.c
+++ b/arch/m68k/kernel/traps.c
@@ -36,6 +36,7 @@
#include <linux/uaccess.h>
#include <asm/traps.h>
#include <asm/machdep.h>
+#include <asm/processor.h>
#include <asm/siginfo.h>
#include <asm/tlbflush.h>
diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c
index de156a027f5b..010b3b5ae8e8 100644
--- a/arch/m68k/mac/iop.c
+++ b/arch/m68k/mac/iop.c
@@ -25,7 +25,7 @@
* check this.)
* 990605 (jmt) - Rearranged things a bit wrt IOP detection; iop_present is
* gone, IOP base addresses are now in an array and the
- * globally-visible functions take an IOP number instead of an
+ * globally-visible functions take an IOP number instead of
* an actual base address.
* 990610 (jmt) - Finished the message passing framework and it seems to work.
* Sending _definitely_ works; my adb-bus.c mods can send
@@ -66,7 +66,7 @@
* a shared memory area in the IOP RAM. Each IOP has seven "channels"; each
* channel is connected to a specific software driver on the IOP. For example
* on the SCC IOP there is one channel for each serial port. Each channel has
- * an incoming and and outgoing message queue with a depth of one.
+ * an incoming and an outgoing message queue with a depth of one.
*
* A message is 32 bytes plus a state byte for the channel (MSG_IDLE, MSG_NEW,
* MSG_RCVD, MSG_COMPLETE). To send a message you copy the message into the
diff --git a/arch/m68k/mac/macints.c b/arch/m68k/mac/macints.c
index e3575388cd05..5cbaf6e9497c 100644
--- a/arch/m68k/mac/macints.c
+++ b/arch/m68k/mac/macints.c
@@ -126,10 +126,7 @@
#include <asm/mac_baboon.h>
#include <asm/hwtest.h>
#include <asm/irq_regs.h>
-
-extern void show_registers(struct pt_regs *);
-
-irqreturn_t mac_nmi_handler(int, void *);
+#include <asm/processor.h>
static unsigned int mac_irq_startup(struct irq_data *);
static void mac_irq_shutdown(struct irq_data *);
@@ -142,6 +139,21 @@ static struct irq_chip mac_irq_chip = {
.irq_shutdown = mac_irq_shutdown,
};
+static irqreturn_t mac_nmi_handler(int irq, void *dev_id)
+{
+ static volatile int in_nmi;
+
+ if (in_nmi)
+ return IRQ_HANDLED;
+ in_nmi = 1;
+
+ pr_info("Non-Maskable Interrupt\n");
+ show_registers(get_irq_regs());
+
+ in_nmi = 0;
+ return IRQ_HANDLED;
+}
+
void __init mac_init_IRQ(void)
{
m68k_setup_irq_controller(&mac_irq_chip, handle_simple_irq, IRQ_USER,
@@ -254,18 +266,3 @@ static void mac_irq_shutdown(struct irq_data *data)
else
mac_irq_disable(data);
}
-
-static volatile int in_nmi;
-
-irqreturn_t mac_nmi_handler(int irq, void *dev_id)
-{
- if (in_nmi)
- return IRQ_HANDLED;
- in_nmi = 1;
-
- pr_info("Non-Maskable Interrupt\n");
- show_registers(get_irq_regs());
-
- in_nmi = 0;
- return IRQ_HANDLED;
-}
diff --git a/arch/m68k/q40/q40ints.c b/arch/m68k/q40/q40ints.c
index 6886a5d0007b..d15057d34e56 100644
--- a/arch/m68k/q40/q40ints.c
+++ b/arch/m68k/q40/q40ints.c
@@ -32,7 +32,7 @@
* 33 : frame int (50/200 Hz periodic timer)
* 34 : sample int (10/20 KHz periodic timer)
*
-*/
+ */
static void q40_irq_handler(unsigned int, struct pt_regs *fp);
static void q40_irq_enable(struct irq_data *data);
diff --git a/arch/m68k/sun3/mmu_emu.c b/arch/m68k/sun3/mmu_emu.c
index 7ec20817c0c9..7321b3b76283 100644
--- a/arch/m68k/sun3/mmu_emu.c
+++ b/arch/m68k/sun3/mmu_emu.c
@@ -211,7 +211,7 @@ void clear_context(unsigned long context)
if(context) {
if(!ctx_alloc[context])
- panic("clear_context: context not allocated\n");
+ panic("%s: context not allocated\n", __func__);
ctx_alloc[context]->context = SUN3_INVALID_CONTEXT;
ctx_alloc[context] = (struct mm_struct *)0;
@@ -261,7 +261,7 @@ unsigned long get_free_context(struct mm_struct *mm)
}
// check to make sure one was really free...
if(new == CONTEXTS_NUM)
- panic("get_free_context: failed to find free context");
+ panic("%s: failed to find free context", __func__);
}
ctx_alloc[new] = mm;
@@ -369,16 +369,15 @@ int mmu_emu_handle_fault (unsigned long vaddr, int read_flag, int kernel_fault)
}
#ifdef DEBUG_MMU_EMU
- pr_info("mmu_emu_handle_fault: vaddr=%lx type=%s crp=%p\n",
- vaddr, read_flag ? "read" : "write", crp);
+ pr_info("%s: vaddr=%lx type=%s crp=%p\n", __func__, vaddr,
+ read_flag ? "read" : "write", crp);
#endif
segment = (vaddr >> SUN3_PMEG_SIZE_BITS) & 0x7FF;
offset = (vaddr >> SUN3_PTE_SIZE_BITS) & 0xF;
#ifdef DEBUG_MMU_EMU
- pr_info("mmu_emu_handle_fault: segment=%lx offset=%lx\n", segment,
- offset);
+ pr_info("%s: segment=%lx offset=%lx\n", __func__, segment, offset);
#endif
pte = (pte_t *) pgd_val (*(crp + segment));
diff --git a/arch/m68k/virt/config.c b/arch/m68k/virt/config.c
index 632ba200ad42..4ab22946ff68 100644
--- a/arch/m68k/virt/config.c
+++ b/arch/m68k/virt/config.c
@@ -2,6 +2,7 @@
#include <linux/reboot.h>
#include <linux/serial_core.h>
+#include <linux/random.h>
#include <clocksource/timer-goldfish.h>
#include <asm/bootinfo.h>
@@ -92,6 +93,16 @@ int __init virt_parse_bootinfo(const struct bi_record *record)
data += 4;
virt_bi_data.virtio.irq = be32_to_cpup(data);
break;
+ case BI_VIRT_RNG_SEED: {
+ u16 len = be16_to_cpup(data);
+ add_bootloader_randomness(data + 2, len);
+ /*
+ * Zero the data to preserve forward secrecy, and zero the
+ * length to prevent kexec from using it.
+ */
+ memzero_explicit((void *)data, len + 2);
+ break;
+ }
default:
unknown = 1;
break;
diff --git a/arch/m68k/virt/ints.c b/arch/m68k/virt/ints.c
index 95818f901ebe..896aa6eb8bcc 100644
--- a/arch/m68k/virt/ints.c
+++ b/arch/m68k/virt/ints.c
@@ -12,6 +12,7 @@
#include <asm/hwtest.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
+#include <asm/processor.h>
#include <asm/virt.h>
#define GFPIC_REG_IRQ_PENDING 0x04
@@ -19,8 +20,6 @@
#define GFPIC_REG_IRQ_DISABLE 0x0c
#define GFPIC_REG_IRQ_ENABLE 0x10
-extern void show_registers(struct pt_regs *regs);
-
static struct resource picres[6];
static const char *picname[6] = {
"goldfish_pic.0",
diff --git a/arch/m68k/virt/platform.c b/arch/m68k/virt/platform.c
index cb820f19a221..1560c4140ab9 100644
--- a/arch/m68k/virt/platform.c
+++ b/arch/m68k/virt/platform.c
@@ -8,20 +8,15 @@
#define VIRTIO_BUS_NB 128
-static int __init virt_virtio_init(unsigned int id)
+static struct platform_device * __init virt_virtio_init(unsigned int id)
{
const struct resource res[] = {
DEFINE_RES_MEM(virt_bi_data.virtio.mmio + id * 0x200, 0x200),
DEFINE_RES_IRQ(virt_bi_data.virtio.irq + id),
};
- struct platform_device *pdev;
- pdev = platform_device_register_simple("virtio-mmio", id,
+ return platform_device_register_simple("virtio-mmio", id,
res, ARRAY_SIZE(res));
- if (IS_ERR(pdev))
- return PTR_ERR(pdev);
-
- return 0;
}
static int __init virt_platform_init(void)
@@ -35,8 +30,10 @@ static int __init virt_platform_init(void)
DEFINE_RES_MEM(virt_bi_data.rtc.mmio + 0x1000, 0x1000),
DEFINE_RES_IRQ(virt_bi_data.rtc.irq + 1),
};
- struct platform_device *pdev;
+ struct platform_device *pdev1, *pdev2;
+ struct platform_device *pdevs[VIRTIO_BUS_NB];
unsigned int i;
+ int ret = 0;
if (!MACH_IS_VIRT)
return -ENODEV;
@@ -44,29 +41,40 @@ static int __init virt_platform_init(void)
/* We need this to have DMA'able memory provided to goldfish-tty */
min_low_pfn = 0;
- pdev = platform_device_register_simple("goldfish_tty",
- PLATFORM_DEVID_NONE,
- goldfish_tty_res,
- ARRAY_SIZE(goldfish_tty_res));
- if (IS_ERR(pdev))
- return PTR_ERR(pdev);
+ pdev1 = platform_device_register_simple("goldfish_tty",
+ PLATFORM_DEVID_NONE,
+ goldfish_tty_res,
+ ARRAY_SIZE(goldfish_tty_res));
+ if (IS_ERR(pdev1))
+ return PTR_ERR(pdev1);
- pdev = platform_device_register_simple("goldfish_rtc",
- PLATFORM_DEVID_NONE,
- goldfish_rtc_res,
- ARRAY_SIZE(goldfish_rtc_res));
- if (IS_ERR(pdev))
- return PTR_ERR(pdev);
+ pdev2 = platform_device_register_simple("goldfish_rtc",
+ PLATFORM_DEVID_NONE,
+ goldfish_rtc_res,
+ ARRAY_SIZE(goldfish_rtc_res));
+ if (IS_ERR(pdev2)) {
+ ret = PTR_ERR(pdev2);
+ goto err_unregister_tty;
+ }
for (i = 0; i < VIRTIO_BUS_NB; i++) {
- int err;
-
- err = virt_virtio_init(i);
- if (err)
- return err;
+ pdevs[i] = virt_virtio_init(i);
+ if (IS_ERR(pdevs[i])) {
+ ret = PTR_ERR(pdevs[i]);
+ goto err_unregister_rtc_virtio;
+ }
}
return 0;
+
+err_unregister_rtc_virtio:
+ while (i > 0)
+ platform_device_unregister(pdevs[--i]);
+ platform_device_unregister(pdev2);
+err_unregister_tty:
+ platform_device_unregister(pdev1);
+
+ return ret;
}
arch_initcall(virt_platform_init);
diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h
index 3185fd3220ec..c5c6864e64bc 100644
--- a/arch/mips/include/asm/jump_label.h
+++ b/arch/mips/include/asm/jump_label.h
@@ -8,6 +8,8 @@
#ifndef _ASM_MIPS_JUMP_LABEL_H
#define _ASM_MIPS_JUMP_LABEL_H
+#define arch_jump_label_transform_static arch_jump_label_transform
+
#ifndef __ASSEMBLY__
#include <linux/types.h>
diff --git a/arch/mips/kernel/jump_label.c b/arch/mips/kernel/jump_label.c
index 662c8db9f45b..71a882c8c6eb 100644
--- a/arch/mips/kernel/jump_label.c
+++ b/arch/mips/kernel/jump_label.c
@@ -88,3 +88,22 @@ void arch_jump_label_transform(struct jump_entry *e,
mutex_unlock(&text_mutex);
}
+
+#ifdef CONFIG_MODULES
+void jump_label_apply_nops(struct module *mod)
+{
+ struct jump_entry *iter_start = mod->jump_entries;
+ struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
+ struct jump_entry *iter;
+
+ /* if the module doesn't have jump label entries, just return */
+ if (iter_start == iter_stop)
+ return;
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ /* Only write NOPs for arch_branch_static(). */
+ if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform(iter, JUMP_LABEL_NOP);
+ }
+}
+#endif
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 14f46d17500a..0c936cbf20c5 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -21,6 +21,7 @@
#include <linux/spinlock.h>
#include <linux/jump_label.h>
+extern void jump_label_apply_nops(struct module *mod);
struct mips_hi16 {
struct mips_hi16 *next;
@@ -428,8 +429,8 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *s;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
- /* Make jump label nops. */
- jump_label_apply_nops(me);
+ if (IS_ENABLED(CONFIG_JUMP_LABEL))
+ jump_label_apply_nops(me);
INIT_LIST_HEAD(&me->arch.dbe_list);
for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
diff --git a/arch/parisc/kernel/jump_label.c b/arch/parisc/kernel/jump_label.c
index d2f3cb12e282..e253b134500d 100644
--- a/arch/parisc/kernel/jump_label.c
+++ b/arch/parisc/kernel/jump_label.c
@@ -42,14 +42,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
patch_text(addr, insn);
}
-
-void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
-{
- /*
- * We use the architected NOP in arch_static_branch, so there's no
- * need to patch an identical NOP over the top of it here. The core
- * will call arch_jump_label_transform from a module notifier if the
- * NOP needs to be replaced by a branch.
- */
-}
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c235648fae23..4d8f26c1399b 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -282,6 +282,10 @@ config PPC
# Please keep this list sorted alphabetically.
#
+config PPC_LONG_DOUBLE_128
+ depends on PPC64
+ def_bool $(success,test "$(shell,echo __LONG_DOUBLE_128__ | $(CC) -E -P -)" = 1)
+
config PPC_BARRIER_NOSPEC
bool
default y
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index f91f0f29a566..c8cf924bf9c0 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -20,6 +20,7 @@ CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
CFLAGS_prom_init.o += -fno-stack-protector
CFLAGS_prom_init.o += -DDISABLE_BRANCH_PROFILING
CFLAGS_prom_init.o += -ffreestanding
+CFLAGS_prom_init.o += $(call cc-option, -ftrivial-auto-var-init=uninitialized)
ifdef CONFIG_FUNCTION_TRACER
# Do not trace early boot code
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index a4c46a03d2e2..81029d40a672 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -111,7 +111,7 @@ PHONY += vdso_install
vdso_install:
$(Q)$(MAKE) $(build)=arch/riscv/kernel/vdso $@
$(if $(CONFIG_COMPAT),$(Q)$(MAKE) \
- $(build)=arch/riscv/kernel/compat_vdso $@)
+ $(build)=arch/riscv/kernel/compat_vdso compat_$@)
ifeq ($(KBUILD_EXTMOD),)
ifeq ($(CONFIG_MMU),y)
diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c
index 20e09056d141..e6694759dbd0 100644
--- a/arch/riscv/kernel/jump_label.c
+++ b/arch/riscv/kernel/jump_label.c
@@ -39,15 +39,3 @@ void arch_jump_label_transform(struct jump_entry *entry,
patch_text_nosync(addr, &insn, sizeof(insn));
mutex_unlock(&text_mutex);
}
-
-void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
-{
- /*
- * We use the same instructions in the arch_static_branch and
- * arch_static_branch_jump inline functions, so there's no
- * need to patch them up here.
- * The core will call arch_jump_label_transform when those
- * instructions need to be replaced.
- */
-}
diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h
index 2c6e1c6ecbe7..4120c428dc37 100644
--- a/arch/s390/include/asm/archrandom.h
+++ b/arch/s390/include/asm/archrandom.h
@@ -2,7 +2,7 @@
/*
* Kernel interface for the s390 arch_random_* functions
*
- * Copyright IBM Corp. 2017, 2020
+ * Copyright IBM Corp. 2017, 2022
*
* Author: Harald Freudenberger <freude@de.ibm.com>
*
@@ -14,6 +14,7 @@
#ifdef CONFIG_ARCH_RANDOM
#include <linux/static_key.h>
+#include <linux/preempt.h>
#include <linux/atomic.h>
#include <asm/cpacf.h>
@@ -32,7 +33,8 @@ static inline bool __must_check arch_get_random_int(unsigned int *v)
static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
{
- if (static_branch_likely(&s390_arch_random_available)) {
+ if (static_branch_likely(&s390_arch_random_available) &&
+ in_task()) {
cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v));
atomic64_add(sizeof(*v), &s390_arch_random_counter);
return true;
@@ -42,7 +44,8 @@ static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
{
- if (static_branch_likely(&s390_arch_random_available)) {
+ if (static_branch_likely(&s390_arch_random_available) &&
+ in_task()) {
cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v));
atomic64_add(sizeof(*v), &s390_arch_random_counter);
return true;
diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h
index 916cfcb36d8a..895f774bbcc5 100644
--- a/arch/s390/include/asm/jump_label.h
+++ b/arch/s390/include/asm/jump_label.h
@@ -10,7 +10,6 @@
#include <linux/stringify.h>
#define JUMP_LABEL_NOP_SIZE 6
-#define JUMP_LABEL_NOP_OFFSET 2
#ifdef CONFIG_CC_IS_CLANG
#define JUMP_LABEL_STATIC_KEY_CONSTRAINT "i"
@@ -21,12 +20,12 @@
#endif
/*
- * We use a brcl 0,2 instruction for jump labels at compile time so it
+ * We use a brcl 0,<offset> instruction for jump labels so it
* can be easily distinguished from a hotpatch generated instruction.
*/
static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
- asm_volatile_goto("0: brcl 0,"__stringify(JUMP_LABEL_NOP_OFFSET)"\n"
+ asm_volatile_goto("0: brcl 0,%l[label]\n"
".pushsection __jump_table,\"aw\"\n"
".balign 8\n"
".long 0b-.,%l[label]-.\n"
diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c
index 6bec000c6c1c..e808bb8bc0da 100644
--- a/arch/s390/kernel/jump_label.c
+++ b/arch/s390/kernel/jump_label.c
@@ -44,14 +44,8 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected,
panic("Corrupted kernel text");
}
-static struct insn orignop = {
- .opcode = 0xc004,
- .offset = JUMP_LABEL_NOP_OFFSET >> 1,
-};
-
static void jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- int init)
+ enum jump_label_type type)
{
void *code = (void *)jump_entry_code(entry);
struct insn old, new;
@@ -63,27 +57,22 @@ static void jump_label_transform(struct jump_entry *entry,
jump_label_make_branch(entry, &old);
jump_label_make_nop(entry, &new);
}
- if (init) {
- if (memcmp(code, &orignop, sizeof(orignop)))
- jump_label_bug(entry, &orignop, &new);
- } else {
- if (memcmp(code, &old, sizeof(old)))
- jump_label_bug(entry, &old, &new);
- }
+ if (memcmp(code, &old, sizeof(old)))
+ jump_label_bug(entry, &old, &new);
s390_kernel_write(code, &new, sizeof(new));
}
void arch_jump_label_transform(struct jump_entry *entry,
enum jump_label_type type)
{
- jump_label_transform(entry, type, 0);
+ jump_label_transform(entry, type);
text_poke_sync();
}
bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type)
{
- jump_label_transform(entry, type, 0);
+ jump_label_transform(entry, type);
return true;
}
@@ -91,10 +80,3 @@ void arch_jump_label_transform_apply(void)
{
text_poke_sync();
}
-
-void __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
-{
- jump_label_transform(entry, type, 1);
- text_poke_sync();
-}
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 26125a9c436d..2d159b32885b 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -548,6 +548,5 @@ int module_finalize(const Elf_Ehdr *hdr,
#endif /* CONFIG_FUNCTION_TRACER */
}
- jump_label_apply_nops(me);
return 0;
}
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index df39580f398d..66c45a2764bc 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -208,9 +208,6 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- /* make jump label nops */
- jump_label_apply_nops(me);
-
do_patch_sections(hdr, sechdrs);
/* Cheetah's I-cache is fully coherent. */
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
index 677111acbaa3..f2e1d6c347fb 100644
--- a/arch/x86/.gitignore
+++ b/arch/x86/.gitignore
@@ -3,6 +3,4 @@ boot/compressed/vmlinux
tools/test_get_len
tools/insn_sanity
tools/insn_decoder_test
-purgatory/kexec-purgatory.c
purgatory/purgatory.ro
-
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 52a7f91527fe..5aa4c2ecf5c7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -278,6 +278,7 @@ config X86
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
select TRACE_IRQFLAGS_SUPPORT
+ select TRACE_IRQFLAGS_NMI_SUPPORT
select USER_STACKTRACE_SUPPORT
select VIRT_TO_BUS
select HAVE_ARCH_KCSAN if X86_64
@@ -392,8 +393,8 @@ config PGTABLE_LEVELS
config CC_HAS_SANE_STACKPROTECTOR
bool
- default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC)) if 64BIT
- default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC))
+ default $(success,$(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) $(CLANG_FLAGS)) if 64BIT
+ default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC) $(CLANG_FLAGS))
help
We have to make sure stack protector is unconditionally disabled if
the compiler produces broken code or if it does not let us control
@@ -2010,7 +2011,7 @@ config KEXEC
config KEXEC_FILE
bool "kexec file based system call"
select KEXEC_CORE
- select BUILD_BIN2C
+ select HAVE_IMA_KEXEC if IMA
depends on X86_64
depends on CRYPTO=y
depends on CRYPTO_SHA256=y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 340399f69954..bdfe08f1a930 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -1,8 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-config TRACE_IRQFLAGS_NMI_SUPPORT
- def_bool y
-
config EARLY_PRINTK_USB
bool
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 0d04414b97d2..d568afc705d2 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -21,7 +21,6 @@
#define NUM_COUNTERS_NB 4
#define NUM_COUNTERS_L2 4
#define NUM_COUNTERS_L3 6
-#define MAX_COUNTERS 6
#define RDPMC_BASE_NB 6
#define RDPMC_BASE_LLC 10
@@ -31,6 +30,7 @@
#undef pr_fmt
#define pr_fmt(fmt) "amd_uncore: " fmt
+static int pmu_version;
static int num_counters_llc;
static int num_counters_nb;
static bool l3_mask;
@@ -46,7 +46,7 @@ struct amd_uncore {
u32 msr_base;
cpumask_t *active_mask;
struct pmu *pmu;
- struct perf_event *events[MAX_COUNTERS];
+ struct perf_event **events;
struct hlist_node node;
};
@@ -158,6 +158,16 @@ out:
hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx;
hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+ /*
+ * The first four DF counters are accessible via RDPMC index 6 to 9
+ * followed by the L3 counters from index 10 to 15. For processors
+ * with more than four DF counters, the DF RDPMC assignments become
+ * discontiguous as the additional counters are accessible starting
+ * from index 16.
+ */
+ if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB)
+ hwc->event_base_rdpmc += NUM_COUNTERS_L3;
+
if (flags & PERF_EF_START)
amd_uncore_start(event, PERF_EF_RELOAD);
@@ -209,10 +219,14 @@ static int amd_uncore_event_init(struct perf_event *event)
{
struct amd_uncore *uncore;
struct hw_perf_event *hwc = &event->hw;
+ u64 event_mask = AMD64_RAW_EVENT_MASK_NB;
if (event->attr.type != event->pmu->type)
return -ENOENT;
+ if (pmu_version >= 2 && is_nb_event(event))
+ event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB;
+
/*
* NB and Last level cache counters (MSRs) are shared across all cores
* that share the same NB / Last level cache. On family 16h and below,
@@ -221,7 +235,7 @@ static int amd_uncore_event_init(struct perf_event *event)
* out. So we do not support sampling and per-thread events via
* CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts:
*/
- hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
+ hwc->config = event->attr.config & event_mask;
hwc->idx = -1;
if (event->cpu < 0)
@@ -247,6 +261,19 @@ static int amd_uncore_event_init(struct perf_event *event)
return 0;
}
+static umode_t
+amd_f17h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return boot_cpu_data.x86 >= 0x17 && boot_cpu_data.x86 < 0x19 ?
+ attr->mode : 0;
+}
+
+static umode_t
+amd_f19h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return boot_cpu_data.x86 >= 0x19 ? attr->mode : 0;
+}
+
static ssize_t amd_uncore_attr_show_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -287,8 +314,10 @@ static struct device_attribute format_attr_##_var = \
DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35");
DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */
+DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */
DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */
-DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); /* PerfMonV2 DF */
DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */
DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */
DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */
@@ -297,20 +326,33 @@ DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3
DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */
DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */
+/* Common DF and NB attributes */
static struct attribute *amd_uncore_df_format_attr[] = {
- &format_attr_event12.attr, /* event14 if F17h+ */
- &format_attr_umask.attr,
+ &format_attr_event12.attr, /* event */
+ &format_attr_umask8.attr, /* umask */
NULL,
};
+/* Common L2 and L3 attributes */
static struct attribute *amd_uncore_l3_format_attr[] = {
- &format_attr_event12.attr, /* event8 if F17h+ */
- &format_attr_umask.attr,
- NULL, /* slicemask if F17h, coreid if F19h */
- NULL, /* threadmask8 if F17h, enallslices if F19h */
- NULL, /* enallcores if F19h */
- NULL, /* sliceid if F19h */
- NULL, /* threadmask2 if F19h */
+ &format_attr_event12.attr, /* event */
+ &format_attr_umask8.attr, /* umask */
+ NULL, /* threadmask */
+ NULL,
+};
+
+/* F17h unique L3 attributes */
+static struct attribute *amd_f17h_uncore_l3_format_attr[] = {
+ &format_attr_slicemask.attr, /* slicemask */
+ NULL,
+};
+
+/* F19h unique L3 attributes */
+static struct attribute *amd_f19h_uncore_l3_format_attr[] = {
+ &format_attr_coreid.attr, /* coreid */
+ &format_attr_enallslices.attr, /* enallslices */
+ &format_attr_enallcores.attr, /* enallcores */
+ &format_attr_sliceid.attr, /* sliceid */
NULL,
};
@@ -324,6 +366,18 @@ static struct attribute_group amd_uncore_l3_format_group = {
.attrs = amd_uncore_l3_format_attr,
};
+static struct attribute_group amd_f17h_uncore_l3_format_group = {
+ .name = "format",
+ .attrs = amd_f17h_uncore_l3_format_attr,
+ .is_visible = amd_f17h_uncore_is_visible,
+};
+
+static struct attribute_group amd_f19h_uncore_l3_format_group = {
+ .name = "format",
+ .attrs = amd_f19h_uncore_l3_format_attr,
+ .is_visible = amd_f19h_uncore_is_visible,
+};
+
static const struct attribute_group *amd_uncore_df_attr_groups[] = {
&amd_uncore_attr_group,
&amd_uncore_df_format_group,
@@ -336,6 +390,12 @@ static const struct attribute_group *amd_uncore_l3_attr_groups[] = {
NULL,
};
+static const struct attribute_group *amd_uncore_l3_attr_update[] = {
+ &amd_f17h_uncore_l3_format_group,
+ &amd_f19h_uncore_l3_format_group,
+ NULL,
+};
+
static struct pmu amd_nb_pmu = {
.task_ctx_nr = perf_invalid_context,
.attr_groups = amd_uncore_df_attr_groups,
@@ -353,6 +413,7 @@ static struct pmu amd_nb_pmu = {
static struct pmu amd_llc_pmu = {
.task_ctx_nr = perf_invalid_context,
.attr_groups = amd_uncore_l3_attr_groups,
+ .attr_update = amd_uncore_l3_attr_update,
.name = "amd_l2",
.event_init = amd_uncore_event_init,
.add = amd_uncore_add,
@@ -370,11 +431,19 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
cpu_to_node(cpu));
}
+static inline struct perf_event **
+amd_uncore_events_alloc(unsigned int num, unsigned int cpu)
+{
+ return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL,
+ cpu_to_node(cpu));
+}
+
static int amd_uncore_cpu_up_prepare(unsigned int cpu)
{
- struct amd_uncore *uncore_nb = NULL, *uncore_llc;
+ struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL;
if (amd_uncore_nb) {
+ *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
uncore_nb = amd_uncore_alloc(cpu);
if (!uncore_nb)
goto fail;
@@ -384,11 +453,15 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL;
uncore_nb->active_mask = &amd_nb_active_mask;
uncore_nb->pmu = &amd_nb_pmu;
+ uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu);
+ if (!uncore_nb->events)
+ goto fail;
uncore_nb->id = -1;
*per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb;
}
if (amd_uncore_llc) {
+ *per_cpu_ptr(amd_uncore_llc, cpu) = NULL;
uncore_llc = amd_uncore_alloc(cpu);
if (!uncore_llc)
goto fail;
@@ -398,6 +471,9 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL;
uncore_llc->active_mask = &amd_llc_active_mask;
uncore_llc->pmu = &amd_llc_pmu;
+ uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu);
+ if (!uncore_llc->events)
+ goto fail;
uncore_llc->id = -1;
*per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc;
}
@@ -405,9 +481,16 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu)
return 0;
fail:
- if (amd_uncore_nb)
- *per_cpu_ptr(amd_uncore_nb, cpu) = NULL;
- kfree(uncore_nb);
+ if (uncore_nb) {
+ kfree(uncore_nb->events);
+ kfree(uncore_nb);
+ }
+
+ if (uncore_llc) {
+ kfree(uncore_llc->events);
+ kfree(uncore_llc);
+ }
+
return -ENOMEM;
}
@@ -540,8 +623,11 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores)
if (cpu == uncore->cpu)
cpumask_clear_cpu(cpu, uncore->active_mask);
- if (!--uncore->refcnt)
+ if (!--uncore->refcnt) {
+ kfree(uncore->events);
kfree(uncore);
+ }
+
*per_cpu_ptr(uncores, cpu) = NULL;
}
@@ -560,6 +646,7 @@ static int __init amd_uncore_init(void)
{
struct attribute **df_attr = amd_uncore_df_format_attr;
struct attribute **l3_attr = amd_uncore_l3_format_attr;
+ union cpuid_0x80000022_ebx ebx;
int ret = -ENODEV;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
@@ -569,6 +656,9 @@ static int __init amd_uncore_init(void)
if (!boot_cpu_has(X86_FEATURE_TOPOEXT))
return -ENODEV;
+ if (boot_cpu_has(X86_FEATURE_PERFMON_V2))
+ pmu_version = 2;
+
num_counters_nb = NUM_COUNTERS_NB;
num_counters_llc = NUM_COUNTERS_L2;
if (boot_cpu_data.x86 >= 0x17) {
@@ -585,8 +675,12 @@ static int __init amd_uncore_init(void)
}
if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) {
- if (boot_cpu_data.x86 >= 0x17)
+ if (pmu_version >= 2) {
+ *df_attr++ = &format_attr_event14v2.attr;
+ *df_attr++ = &format_attr_umask12.attr;
+ } else if (boot_cpu_data.x86 >= 0x17) {
*df_attr = &format_attr_event14.attr;
+ }
amd_uncore_nb = alloc_percpu(struct amd_uncore *);
if (!amd_uncore_nb) {
@@ -597,6 +691,11 @@ static int __init amd_uncore_init(void)
if (ret)
goto fail_nb;
+ if (pmu_version >= 2) {
+ ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
+ num_counters_nb = ebx.split.num_df_pmc;
+ }
+
pr_info("%d %s %s counters detected\n", num_counters_nb,
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "",
amd_nb_pmu.name);
@@ -607,16 +706,11 @@ static int __init amd_uncore_init(void)
if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) {
if (boot_cpu_data.x86 >= 0x19) {
*l3_attr++ = &format_attr_event8.attr;
- *l3_attr++ = &format_attr_umask.attr;
- *l3_attr++ = &format_attr_coreid.attr;
- *l3_attr++ = &format_attr_enallslices.attr;
- *l3_attr++ = &format_attr_enallcores.attr;
- *l3_attr++ = &format_attr_sliceid.attr;
+ *l3_attr++ = &format_attr_umask8.attr;
*l3_attr++ = &format_attr_threadmask2.attr;
} else if (boot_cpu_data.x86 >= 0x17) {
*l3_attr++ = &format_attr_event8.attr;
- *l3_attr++ = &format_attr_umask.attr;
- *l3_attr++ = &format_attr_slicemask.attr;
+ *l3_attr++ = &format_attr_umask8.attr;
*l3_attr++ = &format_attr_threadmask8.attr;
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 45024abd929f..bd8b98857609 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4141,6 +4141,8 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
{
struct event_constraint *c;
+ c = intel_get_event_constraints(cpuc, idx, event);
+
/*
* :ppp means to do reduced skid PEBS,
* which is available on PMC0 and fixed counter 0.
@@ -4153,8 +4155,6 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
return &counter0_constraint;
}
- c = intel_get_event_constraints(cpuc, idx, event);
-
return c;
}
@@ -6241,7 +6241,8 @@ __init int intel_pmu_init(void)
x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
x86_pmu.lbr_pt_coexist = true;
- intel_pmu_pebs_data_source_skl(false);
+ intel_pmu_pebs_data_source_adl();
+ x86_pmu.pebs_latency_data = adl_latency_data_small;
x86_pmu.num_topdown_events = 8;
x86_pmu.update_topdown_event = adl_update_topdown_event;
x86_pmu.set_topdown_event_period = adl_set_topdown_event_period;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 376cc3d66094..ba60427caa6d 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -94,15 +94,40 @@ void __init intel_pmu_pebs_data_source_nhm(void)
pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
}
-void __init intel_pmu_pebs_data_source_skl(bool pmem)
+static void __init __intel_pmu_pebs_data_source_skl(bool pmem, u64 *data_source)
{
u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
- pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
- pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
- pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
- pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
- pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
+ data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
+ data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
+ data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
+ data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
+ data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
+}
+
+void __init intel_pmu_pebs_data_source_skl(bool pmem)
+{
+ __intel_pmu_pebs_data_source_skl(pmem, pebs_data_source);
+}
+
+static void __init intel_pmu_pebs_data_source_grt(u64 *data_source)
+{
+ data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
+ data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
+ data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
+}
+
+void __init intel_pmu_pebs_data_source_adl(void)
+{
+ u64 *data_source;
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ __intel_pmu_pebs_data_source_skl(false, data_source);
+
+ data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
+ memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
+ intel_pmu_pebs_data_source_grt(data_source);
}
static u64 precise_store_data(u64 status)
@@ -171,7 +196,50 @@ static u64 precise_datala_hsw(struct perf_event *event, u64 status)
return dse.val;
}
-static u64 load_latency_data(u64 status)
+static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock)
+{
+ /*
+ * TLB access
+ * 0 = did not miss 2nd level TLB
+ * 1 = missed 2nd level TLB
+ */
+ if (tlb)
+ *val |= P(TLB, MISS) | P(TLB, L2);
+ else
+ *val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
+
+ /* locked prefix */
+ if (lock)
+ *val |= P(LOCK, LOCKED);
+}
+
+/* Retrieve the latency data for e-core of ADL */
+u64 adl_latency_data_small(struct perf_event *event, u64 status)
+{
+ union intel_x86_pebs_dse dse;
+ u64 val;
+
+ WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big);
+
+ dse.val = status;
+
+ val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
+
+ /*
+ * For the atom core on ADL,
+ * bit 4: lock, bit 5: TLB access.
+ */
+ pebs_set_tlb_lock(&val, dse.ld_locked, dse.ld_stlb_miss);
+
+ if (dse.ld_data_blk)
+ val |= P(BLK, DATA);
+ else
+ val |= P(BLK, NA);
+
+ return val;
+}
+
+static u64 load_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
u64 val;
@@ -181,7 +249,7 @@ static u64 load_latency_data(u64 status)
/*
* use the mapping table for bit 0-3
*/
- val = pebs_data_source[dse.ld_dse];
+ val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
/*
* Nehalem models do not support TLB, Lock infos
@@ -190,21 +258,8 @@ static u64 load_latency_data(u64 status)
val |= P(TLB, NA) | P(LOCK, NA);
return val;
}
- /*
- * bit 4: TLB access
- * 0 = did not miss 2nd level TLB
- * 1 = missed 2nd level TLB
- */
- if (dse.ld_stlb_miss)
- val |= P(TLB, MISS) | P(TLB, L2);
- else
- val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
- /*
- * bit 5: locked prefix
- */
- if (dse.ld_locked)
- val |= P(LOCK, LOCKED);
+ pebs_set_tlb_lock(&val, dse.ld_stlb_miss, dse.ld_locked);
/*
* Ice Lake and earlier models do not support block infos.
@@ -233,7 +288,7 @@ static u64 load_latency_data(u64 status)
return val;
}
-static u64 store_latency_data(u64 status)
+static u64 store_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
u64 val;
@@ -243,23 +298,9 @@ static u64 store_latency_data(u64 status)
/*
* use the mapping table for bit 0-3
*/
- val = pebs_data_source[dse.st_lat_dse];
+ val = hybrid_var(event->pmu, pebs_data_source)[dse.st_lat_dse];
- /*
- * bit 4: TLB access
- * 0 = did not miss 2nd level TLB
- * 1 = missed 2nd level TLB
- */
- if (dse.st_lat_stlb_miss)
- val |= P(TLB, MISS) | P(TLB, L2);
- else
- val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
-
- /*
- * bit 5: locked prefix
- */
- if (dse.st_lat_locked)
- val |= P(LOCK, LOCKED);
+ pebs_set_tlb_lock(&val, dse.st_lat_stlb_miss, dse.st_lat_locked);
val |= P(BLK, NA);
@@ -781,8 +822,8 @@ struct event_constraint intel_glm_pebs_event_constraints[] = {
struct event_constraint intel_grt_pebs_event_constraints[] = {
/* Allow all events as PEBS with no flags */
- INTEL_PLD_CONSTRAINT(0x5d0, 0xf),
- INTEL_PSD_CONSTRAINT(0x6d0, 0xf),
+ INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xf),
+ INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf),
EVENT_CONSTRAINT_END
};
@@ -1443,9 +1484,11 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
if (fl & PERF_X86_EVENT_PEBS_LDLAT)
- val = load_latency_data(aux);
+ val = load_latency_data(event, aux);
else if (fl & PERF_X86_EVENT_PEBS_STLAT)
- val = store_latency_data(aux);
+ val = store_latency_data(event, aux);
+ else if (fl & PERF_X86_EVENT_PEBS_LAT_HYBRID)
+ val = x86_pmu.pebs_latency_data(event, aux);
else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
val = precise_datala_hsw(event, aux);
else if (fst)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 21a5482bcf84..ca2f8bfe6ff1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -84,6 +84,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */
#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */
#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */
+#define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */
static inline bool is_topdown_count(struct perf_event *event)
{
@@ -136,7 +137,8 @@ struct amd_nb {
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
- PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE)
+ PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE | \
+ PERF_SAMPLE_WEIGHT_TYPE)
#define PEBS_GP_REGS \
((1ULL << PERF_REG_X86_AX) | \
@@ -460,6 +462,10 @@ struct cpu_hw_events {
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
+#define INTEL_HYBRID_LAT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID)
+
/* Event constraint, but match on all event flags too. */
#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)
@@ -638,6 +644,8 @@ enum {
x86_lbr_exclusive_max,
};
+#define PERF_PEBS_DATA_SOURCE_MAX 0x10
+
struct x86_hybrid_pmu {
struct pmu pmu;
const char *name;
@@ -665,6 +673,8 @@ struct x86_hybrid_pmu {
unsigned int late_ack :1,
mid_ack :1,
enabled_ack :1;
+
+ u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX];
};
static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
@@ -825,6 +835,7 @@ struct x86_pmu {
void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
+ u64 (*pebs_latency_data)(struct perf_event *event, u64 status);
unsigned long large_pebs_flags;
u64 rtm_abort_event;
@@ -1392,6 +1403,8 @@ void intel_pmu_disable_bts(void);
int intel_pmu_drain_bts_buffer(void);
+u64 adl_latency_data_small(struct perf_event *event, u64 status);
+
extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];
@@ -1499,6 +1512,8 @@ void intel_pmu_pebs_data_source_nhm(void);
void intel_pmu_pebs_data_source_skl(bool pmem);
+void intel_pmu_pebs_data_source_adl(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h
index aabdbb5ab920..f3eb098d63d4 100644
--- a/arch/x86/include/asm/amd-ibs.h
+++ b/arch/x86/include/asm/amd-ibs.h
@@ -29,7 +29,10 @@ union ibs_fetch_ctl {
rand_en:1, /* 57: random tagging enable */
fetch_l2_miss:1,/* 58: L2 miss for sampled fetch
* (needs IbsFetchComp) */
- reserved:5; /* 59-63: reserved */
+ l3_miss_only:1, /* 59: Collect L3 miss samples only */
+ fetch_oc_miss:1,/* 60: Op cache miss for the sampled fetch */
+ fetch_l3_miss:1,/* 61: L3 cache miss for the sampled fetch */
+ reserved:2; /* 62-63: reserved */
};
};
@@ -38,14 +41,14 @@ union ibs_op_ctl {
__u64 val;
struct {
__u64 opmaxcnt:16, /* 0-15: periodic op max. count */
- reserved0:1, /* 16: reserved */
+ l3_miss_only:1, /* 16: Collect L3 miss samples only */
op_en:1, /* 17: op sampling enable */
op_val:1, /* 18: op sample valid */
cnt_ctl:1, /* 19: periodic op counter control */
opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */
- reserved1:5, /* 27-31: reserved */
+ reserved0:5, /* 27-31: reserved */
opcurcnt:27, /* 32-58: periodic op counter current count */
- reserved2:5; /* 59-63: reserved */
+ reserved1:5; /* 59-63: reserved */
};
};
@@ -71,11 +74,12 @@ union ibs_op_data {
union ibs_op_data2 {
__u64 val;
struct {
- __u64 data_src:3, /* 0-2: data source */
+ __u64 data_src_lo:3, /* 0-2: data source low */
reserved0:1, /* 3: reserved */
rmt_node:1, /* 4: destination node */
cache_hit_st:1, /* 5: cache hit state */
- reserved1:57; /* 5-63: reserved */
+ data_src_hi:2, /* 6-7: data source high */
+ reserved1:56; /* 8-63: reserved */
};
};
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index a77b915d36a8..5fe7f6c8a7a4 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -219,7 +219,7 @@
#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */
#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */
#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */
-#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 or above (Zen) */
+#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */
#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 6b0f31fb53f7..503a577814b2 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -164,4 +164,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu)
/* prctl */
extern long fpu_xstate_prctl(int option, unsigned long arg2);
+extern void fpu_idle_fpregs(void);
+
#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 29dd27b5a339..3a8fdf881313 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -13,6 +13,7 @@
#define MWAIT_SUBSTATE_SIZE 4
#define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK)
#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK)
+#define MWAIT_C1_SUBSTATE_MASK 0xf0
#define CPUID_MWAIT_LEAF 5
#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 38a3e86e665e..cba942006ffe 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -94,25 +94,37 @@
.endm
/*
+ * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call
+ * to the retpoline thunk with a CS prefix when the register requires
+ * a RAX prefix byte to encode. Also see apply_retpolines().
+ */
+.macro __CS_PREFIX reg:req
+ .irp rs,r8,r9,r10,r11,r12,r13,r14,r15
+ .ifc \reg,\rs
+ .byte 0x2e
+ .endif
+ .endr
+.endm
+
+/*
* JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
* indirect jmp/call which may be susceptible to the Spectre variant 2
* attack.
*/
.macro JMP_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
- ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
- __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_LFENCE
+ __CS_PREFIX \reg
+ jmp __x86_indirect_thunk_\reg
#else
jmp *%\reg
+ int3
#endif
.endm
.macro CALL_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
- ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
- __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
- __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_LFENCE
+ __CS_PREFIX \reg
+ call __x86_indirect_thunk_\reg
#else
call *%\reg
#endif
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 409725e86f42..34348ae41cdb 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -89,6 +89,19 @@
#define AMD64_RAW_EVENT_MASK_NB \
(AMD64_EVENTSEL_EVENT | \
ARCH_PERFMON_EVENTSEL_UMASK)
+
+#define AMD64_PERFMON_V2_EVENTSEL_EVENT_NB \
+ (AMD64_EVENTSEL_EVENT | \
+ GENMASK_ULL(37, 36))
+
+#define AMD64_PERFMON_V2_EVENTSEL_UMASK_NB \
+ (ARCH_PERFMON_EVENTSEL_UMASK | \
+ GENMASK_ULL(27, 24))
+
+#define AMD64_PERFMON_V2_RAW_EVENT_MASK_NB \
+ (AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \
+ AMD64_PERFMON_V2_EVENTSEL_UMASK_NB)
+
#define AMD64_NUM_COUNTERS 4
#define AMD64_NUM_COUNTERS_CORE 6
#define AMD64_NUM_COUNTERS_NB 4
@@ -194,6 +207,9 @@ union cpuid_0x80000022_ebx {
struct {
/* Number of Core Performance Counters */
unsigned int num_core_pmc:4;
+ unsigned int reserved:6;
+ /* Number of Data Fabric Counters */
+ unsigned int num_df_pmc:6;
} split;
unsigned int full;
};
diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h
index 19514524f0f8..4a23e52fe0ee 100644
--- a/arch/x86/include/asm/sev.h
+++ b/arch/x86/include/asm/sev.h
@@ -72,7 +72,6 @@ static inline u64 lower_bits(u64 val, unsigned int bits)
struct real_mode_header;
enum stack_type;
-struct ghcb;
/* Early IDT entry points for #VC handler */
extern void vc_no_ghcb(void);
@@ -156,11 +155,7 @@ static __always_inline void sev_es_nmi_complete(void)
__sev_es_nmi_complete();
}
extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
-extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
- bool set_ghcb_msr,
- struct es_em_ctxt *ctxt,
- u64 exit_code, u64 exit_info_1,
- u64 exit_info_2);
+
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs)
{
int rc;
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 45b18eb94fa1..35f709f619fb 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -295,6 +295,15 @@ static inline int enqcmds(void __iomem *dst, const void *src)
return 0;
}
+static inline void tile_release(void)
+{
+ /*
+ * Instruction opcode for TILERELEASE; supported in binutils
+ * version >= 2.36.
+ */
+ asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0");
+}
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_SPECIAL_INSNS_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4af5579c7ef7..cda3118f3b27 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -16,6 +16,7 @@
void __flush_tlb_all(void);
#define TLB_FLUSH_ALL -1UL
+#define TLB_GENERATION_INVALID 0
void cr4_update_irqsoff(unsigned long set, unsigned long clear);
unsigned long cr4_read_shadow(void);
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index e02a8a8ef23c..342290624040 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -11,11 +11,12 @@
#define SETUP_APPLE_PROPERTIES 5
#define SETUP_JAILHOUSE 6
#define SETUP_CC_BLOB 7
+#define SETUP_IMA 8
+#define SETUP_RNG_SEED 9
+#define SETUP_ENUM_MAX SETUP_RNG_SEED
#define SETUP_INDIRECT (1<<31)
-
-/* SETUP_INDIRECT | max(SETUP_*) */
-#define SETUP_TYPE_MAX (SETUP_INDIRECT | SETUP_CC_BLOB)
+#define SETUP_TYPE_MAX (SETUP_ENUM_MAX | SETUP_INDIRECT)
/* ram_size flags */
#define RAMDISK_IMAGE_START_MASK 0x07FF
@@ -172,6 +173,14 @@ struct jailhouse_setup_data {
} __attribute__((packed)) v2;
} __attribute__((packed));
+/*
+ * IMA buffer setup data information from the previous kernel during kexec
+ */
+struct ima_setup_data {
+ __u64 addr;
+ __u64 size;
+} __attribute__((packed));
+
/* The so-called "zeropage" */
struct boot_params {
struct screen_info screen_info; /* 0x000 */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c8b6ae802ac..a20a5ebfacd7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -34,8 +34,6 @@ KASAN_SANITIZE_sev.o := n
# by several compilation units. To be safe, disable all instrumentation.
KCSAN_SANITIZE := n
-OBJECT_FILES_NON_STANDARD_test_nx.o := y
-
# If instrumentation of this dir is enabled, boot hangs during first second.
# Probably could be more selective here, but note that files related to irqs,
# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 190e0f763375..4266b64631a4 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -19,17 +19,23 @@
#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
+#define PCI_DEVICE_ID_AMD_17H_MA0H_ROOT 0x14b5
#define PCI_DEVICE_ID_AMD_19H_M10H_ROOT 0x14a4
+#define PCI_DEVICE_ID_AMD_19H_M60H_ROOT 0x14d8
+#define PCI_DEVICE_ID_AMD_19H_M70H_ROOT 0x14e8
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
+#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4 0x1728
#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F4 0x14b1
#define PCI_DEVICE_ID_AMD_19H_M40H_ROOT 0x14b5
#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
+#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4
+#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4
/* Protect the PCI config register pairs used for SMN. */
static DEFINE_MUTEX(smn_mutex);
@@ -41,8 +47,11 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_ROOT) },
{}
};
@@ -61,12 +70,15 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
{}
};
@@ -81,6 +93,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 6454bc767f0f..6761668100b9 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -1520,6 +1520,7 @@ static void __init spectre_v2_select_mitigation(void)
* enable IBRS around firmware calls.
*/
if (boot_cpu_has_bug(X86_BUG_RETBLEED) &&
+ boot_cpu_has(X86_FEATURE_IBPB) &&
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) {
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fd5dead8371c..663f6e6dd288 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -682,9 +682,9 @@ static void init_intel(struct cpuinfo_x86 *c)
unsigned int l1, l2;
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
- if (!(l1 & (1<<11)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_BTS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_BTS);
- if (!(l1 & (1<<12)))
+ if (!(l1 & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
set_cpu_cap(c, X86_FEATURE_PEBS);
}
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 5fbd7ffb3233..12cf2e7ca33c 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -33,6 +33,8 @@
#include "internal.h"
+static bool hw_injection_possible = true;
+
/*
* Collect all the MCi_XXX settings
*/
@@ -339,6 +341,8 @@ static int __set_inj(const char *buf)
for (i = 0; i < N_INJ_TYPES; i++) {
if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) {
+ if (i > SW_INJ && !hw_injection_possible)
+ continue;
inj_type = i;
return 0;
}
@@ -717,11 +721,54 @@ static void __init debugfs_init(void)
&i_mce, dfs_fls[i].fops);
}
+static void check_hw_inj_possible(void)
+{
+ int cpu;
+ u8 bank;
+
+ /*
+ * This behavior exists only on SMCA systems though its not directly
+ * related to SMCA.
+ */
+ if (!cpu_feature_enabled(X86_FEATURE_SMCA))
+ return;
+
+ cpu = get_cpu();
+
+ for (bank = 0; bank < MAX_NR_BANKS; ++bank) {
+ u64 status = MCI_STATUS_VAL, ipid;
+
+ /* Check whether bank is populated */
+ rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid);
+ if (!ipid)
+ continue;
+
+ toggle_hw_mce_inject(cpu, true);
+
+ wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status);
+ rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status);
+
+ if (!status) {
+ hw_injection_possible = false;
+ pr_warn("Platform does not allow *hardware* error injection."
+ "Try using APEI EINJ instead.\n");
+ }
+
+ toggle_hw_mce_inject(cpu, false);
+
+ break;
+ }
+
+ put_cpu();
+}
+
static int __init inject_init(void)
{
if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
return -ENOMEM;
+ check_hw_inj_possible();
+
debugfs_init();
register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, "mce_notify");
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 4ae0e603f7fa..7e03f5b7f6bd 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -211,7 +211,7 @@ noinstr u64 mce_rdmsrl(u32 msr);
static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
{
- if (mce_flags.smca) {
+ if (cpu_feature_enabled(X86_FEATURE_SMCA)) {
switch (reg) {
case MCA_CTL: return MSR_AMD64_SMCA_MCx_CTL(bank);
case MCA_ADDR: return MSR_AMD64_SMCA_MCx_ADDR(bank);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index c04b933f48d3..02039ec3597d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -476,8 +476,8 @@ static bool __init vmware_legacy_x2apic_available(void)
{
uint32_t eax, ebx, ecx, edx;
VMWARE_CMD(GETVCPU_INFO, eax, ebx, ecx, edx);
- return (eax & (1 << VMWARE_CMD_VCPU_RESERVED)) == 0 &&
- (eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0;
+ return !(eax & BIT(VMWARE_CMD_VCPU_RESERVED)) &&
+ (eax & BIT(VMWARE_CMD_LEGACY_X2APIC));
}
#ifdef CONFIG_AMD_MEM_ENCRYPT
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index f267205f2d5a..9dac24680ff8 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1017,10 +1017,10 @@ void __init e820__reserve_setup_data(void)
e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
/*
- * SETUP_EFI is supplied by kexec and does not need to be
- * reserved.
+ * SETUP_EFI and SETUP_IMA are supplied by kexec and do not need
+ * to be reserved.
*/
- if (data->type != SETUP_EFI)
+ if (data->type != SETUP_EFI && data->type != SETUP_IMA)
e820__range_update_kexec(pa_data,
sizeof(*data) + data->len,
E820_TYPE_RAM, E820_TYPE_RESERVED_KERN);
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 0531d6a06df5..3b28c5b25e12 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -851,3 +851,17 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
*/
return 0;
}
+
+/*
+ * Initialize register state that may prevent from entering low-power idle.
+ * This function will be invoked from the cpuidle driver only when needed.
+ */
+void fpu_idle_fpregs(void)
+{
+ /* Note: AMX_TILE being enabled implies XGETBV1 support */
+ if (cpu_feature_enabled(X86_FEATURE_AMX_TILE) &&
+ (xfeatures_in_use() & XFEATURE_MASK_XTILE)) {
+ tile_release();
+ fpregs_deactivate(&current->thread.fpu);
+ }
+}
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 68f091ba8443..f5b8ef02d172 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -146,16 +146,3 @@ void arch_jump_label_transform_apply(void)
text_poke_finish();
mutex_unlock(&text_mutex);
}
-
-static enum {
- JL_STATE_START,
- JL_STATE_NO_UPDATE,
- JL_STATE_UPDATE,
-} jlstate __initdata_or_module = JL_STATE_START;
-
-__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
-{
- if (jlstate == JL_STATE_UPDATE)
- jump_label_transform(entry, type, 1);
-}
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 170d0fd68b1f..b9bdb40364a6 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/efi.h>
#include <linux/verification.h>
+#include <linux/random.h>
#include <asm/bootparam.h>
#include <asm/setup.h>
@@ -110,6 +111,26 @@ static int setup_e820_entries(struct boot_params *params)
return 0;
}
+enum { RNG_SEED_LENGTH = 32 };
+
+static void
+setup_rng_seed(struct boot_params *params, unsigned long params_load_addr,
+ unsigned int rng_seed_setup_data_offset)
+{
+ struct setup_data *sd = (void *)params + rng_seed_setup_data_offset;
+ unsigned long setup_data_phys;
+
+ if (!rng_is_initialized())
+ return;
+
+ sd->type = SETUP_RNG_SEED;
+ sd->len = RNG_SEED_LENGTH;
+ get_random_bytes(sd->data, RNG_SEED_LENGTH);
+ setup_data_phys = params_load_addr + rng_seed_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+}
+
#ifdef CONFIG_EFI
static int setup_efi_info_memmap(struct boot_params *params,
unsigned long params_load_addr,
@@ -186,11 +207,38 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
}
#endif /* CONFIG_EFI */
+static void
+setup_ima_state(const struct kimage *image, struct boot_params *params,
+ unsigned long params_load_addr,
+ unsigned int ima_setup_data_offset)
+{
+#ifdef CONFIG_IMA_KEXEC
+ struct setup_data *sd = (void *)params + ima_setup_data_offset;
+ unsigned long setup_data_phys;
+ struct ima_setup_data *ima;
+
+ if (!image->ima_buffer_size)
+ return;
+
+ sd->type = SETUP_IMA;
+ sd->len = sizeof(*ima);
+
+ ima = (void *)sd + sizeof(struct setup_data);
+ ima->addr = image->ima_buffer_addr;
+ ima->size = image->ima_buffer_size;
+
+ /* Add setup data */
+ setup_data_phys = params_load_addr + ima_setup_data_offset;
+ sd->next = params->hdr.setup_data;
+ params->hdr.setup_data = setup_data_phys;
+#endif /* CONFIG_IMA_KEXEC */
+}
+
static int
setup_boot_parameters(struct kimage *image, struct boot_params *params,
unsigned long params_load_addr,
unsigned int efi_map_offset, unsigned int efi_map_sz,
- unsigned int efi_setup_data_offset)
+ unsigned int setup_data_offset)
{
unsigned int nr_e820_entries;
unsigned long long mem_k, start, end;
@@ -245,8 +293,22 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
#ifdef CONFIG_EFI
/* Setup EFI state */
setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz,
- efi_setup_data_offset);
+ setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct efi_setup_data);
#endif
+
+ if (IS_ENABLED(CONFIG_IMA_KEXEC)) {
+ /* Setup IMA log buffer state */
+ setup_ima_state(image, params, params_load_addr,
+ setup_data_offset);
+ setup_data_offset += sizeof(struct setup_data) +
+ sizeof(struct ima_setup_data);
+ }
+
+ /* Setup RNG seed */
+ setup_rng_seed(params, params_load_addr, setup_data_offset);
+
/* Setup EDD info */
memcpy(params->eddbuf, boot_params.eddbuf,
EDDMAXNR * sizeof(struct edd_info));
@@ -401,7 +463,13 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
kbuf.bufsz = params_cmdline_sz + ALIGN(efi_map_sz, 16) +
sizeof(struct setup_data) +
- sizeof(struct efi_setup_data);
+ sizeof(struct efi_setup_data) +
+ sizeof(struct setup_data) +
+ RNG_SEED_LENGTH;
+
+ if (IS_ENABLED(CONFIG_IMA_KEXEC))
+ kbuf.bufsz += sizeof(struct setup_data) +
+ sizeof(struct ima_setup_data);
params = kzalloc(kbuf.bufsz, GFP_KERNEL);
if (!params)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 67828d973389..b1abf663417c 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -310,9 +310,6 @@ int module_finalize(const Elf_Ehdr *hdr,
tseg, tseg + text->sh_size);
}
- /* make jump label nops */
- jump_label_apply_nops(me);
-
if (orc && orc_ip)
unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size,
(void *)orc->sh_addr, orc->sh_size);
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 6b07faaa1579..23154d24b117 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -27,6 +27,11 @@ static __init int register_e820_pmem(void)
* simply here to trigger the module to load on demand.
*/
pdev = platform_device_alloc("e820_pmem", -1);
- return platform_device_add(pdev);
+
+ rc = platform_device_add(pdev);
+ if (rc)
+ platform_device_put(pdev);
+
+ return rc;
}
device_initcall(register_e820_pmem);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d456ce21c255..58a6ea472db9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -810,24 +810,43 @@ static void amd_e400_idle(void)
}
/*
- * Intel Core2 and older machines prefer MWAIT over HALT for C1.
- * We can't rely on cpuidle installing MWAIT, because it will not load
- * on systems that support only C1 -- so the boot default must be MWAIT.
+ * Prefer MWAIT over HALT if MWAIT is supported, MWAIT_CPUID leaf
+ * exists and whenever MONITOR/MWAIT extensions are present there is at
+ * least one C1 substate.
*
- * Some AMD machines are the opposite, they depend on using HALT.
- *
- * So for default C1, which is used during boot until cpuidle loads,
- * use MWAIT-C1 on Intel HW that has it, else use HALT.
+ * Do not prefer MWAIT if MONITOR instruction has a bug or idle=nomwait
+ * is passed to kernel commandline parameter.
*/
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
- if (c->x86_vendor != X86_VENDOR_INTEL)
+ u32 eax, ebx, ecx, edx;
+
+ /* User has disallowed the use of MWAIT. Fallback to HALT */
+ if (boot_option_idle_override == IDLE_NOMWAIT)
return 0;
- if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR))
+ /* MWAIT is not supported on this platform. Fallback to HALT */
+ if (!cpu_has(c, X86_FEATURE_MWAIT))
return 0;
- return 1;
+ /* Monitor has a bug. Fallback to HALT */
+ if (boot_cpu_has_bug(X86_BUG_MONITOR))
+ return 0;
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
+
+ /*
+ * If MWAIT extensions are not available, it is safe to use MWAIT
+ * with EAX=0, ECX=0.
+ */
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED))
+ return 1;
+
+ /*
+ * If MWAIT extensions are available, there should be at least one
+ * MWAIT C1 substate present.
+ */
+ return (edx & MWAIT_C1_SUBSTATE_MASK);
}
/*
@@ -932,9 +951,8 @@ static int __init idle_setup(char *str)
} else if (!strcmp(str, "nomwait")) {
/*
* If the boot option of "idle=nomwait" is added,
- * it means that mwait will be disabled for CPU C2/C3
- * states. In such case it won't touch the variable
- * of boot_option_idle_override.
+ * it means that mwait will be disabled for CPU C1/C2/C3
+ * states.
*/
boot_option_idle_override = IDLE_NOMWAIT;
} else
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bd6c6fd373ae..216fee7144ee 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -11,6 +11,7 @@
#include <linux/dma-map-ops.h>
#include <linux/dmi.h>
#include <linux/efi.h>
+#include <linux/ima.h>
#include <linux/init_ohci1394_dma.h>
#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
@@ -23,6 +24,7 @@
#include <linux/usb/xhci-dbgp.h>
#include <linux/static_call.h>
#include <linux/swiotlb.h>
+#include <linux/random.h>
#include <uapi/linux/mount.h>
@@ -140,6 +142,11 @@ __visible unsigned long mmu_cr4_features __ro_after_init;
__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE;
#endif
+#ifdef CONFIG_IMA
+static phys_addr_t ima_kexec_buffer_phys;
+static size_t ima_kexec_buffer_size;
+#endif
+
/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
int bootloader_type, bootloader_version;
@@ -330,6 +337,60 @@ static void __init reserve_initrd(void)
}
#endif /* CONFIG_BLK_DEV_INITRD */
+static void __init add_early_ima_buffer(u64 phys_addr)
+{
+#ifdef CONFIG_IMA
+ struct ima_setup_data *data;
+
+ data = early_memremap(phys_addr + sizeof(struct setup_data), sizeof(*data));
+ if (!data) {
+ pr_warn("setup: failed to memremap ima_setup_data entry\n");
+ return;
+ }
+
+ if (data->size) {
+ memblock_reserve(data->addr, data->size);
+ ima_kexec_buffer_phys = data->addr;
+ ima_kexec_buffer_size = data->size;
+ }
+
+ early_memunmap(data, sizeof(*data));
+#else
+ pr_warn("Passed IMA kexec data, but CONFIG_IMA not set. Ignoring.\n");
+#endif
+}
+
+#if defined(CONFIG_HAVE_IMA_KEXEC) && !defined(CONFIG_OF_FLATTREE)
+int __init ima_free_kexec_buffer(void)
+{
+ int rc;
+
+ if (!ima_kexec_buffer_size)
+ return -ENOENT;
+
+ rc = memblock_phys_free(ima_kexec_buffer_phys,
+ ima_kexec_buffer_size);
+ if (rc)
+ return rc;
+
+ ima_kexec_buffer_phys = 0;
+ ima_kexec_buffer_size = 0;
+
+ return 0;
+}
+
+int __init ima_get_kexec_buffer(void **addr, size_t *size)
+{
+ if (!ima_kexec_buffer_size)
+ return -ENOENT;
+
+ *addr = __va(ima_kexec_buffer_phys);
+ *size = ima_kexec_buffer_size;
+
+ return 0;
+}
+#endif
+
static void __init parse_setup_data(void)
{
struct setup_data *data;
@@ -355,6 +416,18 @@ static void __init parse_setup_data(void)
case SETUP_EFI:
parse_efi_setup(pa_data, data_len);
break;
+ case SETUP_IMA:
+ add_early_ima_buffer(pa_data);
+ break;
+ case SETUP_RNG_SEED:
+ data = early_memremap(pa_data, data_len);
+ add_bootloader_randomness(data->data, data->len);
+ /* Zero seed for forward secrecy. */
+ memzero_explicit(data->data, data->len);
+ /* Zero length in case we find ourselves back here by accident. */
+ memzero_explicit(&data->len, sizeof(data->len));
+ early_memunmap(data, data_len);
+ break;
default:
break;
}
diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c
index b478edf43bec..3a5b0c9c4fcc 100644
--- a/arch/x86/kernel/sev-shared.c
+++ b/arch/x86/kernel/sev-shared.c
@@ -219,9 +219,10 @@ static enum es_result verify_exception_info(struct ghcb *ghcb, struct es_em_ctxt
return ES_VMM_ERROR;
}
-enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr,
- struct es_em_ctxt *ctxt, u64 exit_code,
- u64 exit_info_1, u64 exit_info_2)
+static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2)
{
/* Fill in protocol and format specifiers */
ghcb->protocol_version = ghcb_version;
@@ -231,14 +232,7 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr,
ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
- /*
- * Hyper-V unenlightened guests use a paravisor for communicating and
- * GHCB pages are being allocated and set up by that paravisor. Linux
- * should not change the GHCB page's physical address.
- */
- if (set_ghcb_msr)
- sev_es_wr_ghcb_msr(__pa(ghcb));
-
+ sev_es_wr_ghcb_msr(__pa(ghcb));
VMGEXIT();
return verify_exception_info(ghcb, ctxt);
@@ -795,7 +789,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
*/
sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
ghcb_set_sw_scratch(ghcb, sw_scratch);
- ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_IOIO,
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
exit_info_1, exit_info_2);
if (ret != ES_OK)
return ret;
@@ -837,8 +831,7 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
ghcb_set_rax(ghcb, rax);
- ret = sev_es_ghcb_hv_call(ghcb, true, ctxt,
- SVM_EXIT_IOIO, exit_info_1, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
if (ret != ES_OK)
return ret;
@@ -894,7 +887,7 @@ static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
/* xgetbv will cause #GP - use reset value for xcr0 */
ghcb_set_xcr0(ghcb, 1);
- ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_CPUID, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
if (ret != ES_OK)
return ret;
@@ -919,7 +912,7 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
enum es_result ret;
- ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
if (ret != ES_OK)
return ret;
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index c05f0124c410..63dc626627a0 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -786,7 +786,7 @@ static int vmgexit_psc(struct snp_psc_desc *desc)
ghcb_set_sw_scratch(ghcb, (u64)__pa(data));
/* This will advance the shared buffer data points to. */
- ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, &ctxt, SVM_VMGEXIT_PSC, 0, 0);
/*
* Page State Change VMGEXIT can pass error code through
@@ -1212,8 +1212,7 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
ghcb_set_rdx(ghcb, regs->dx);
}
- ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_MSR,
- exit_info_1, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0);
if ((ret == ES_OK) && (!exit_info_1)) {
regs->ax = ghcb->save.rax;
@@ -1452,7 +1451,7 @@ static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
- return sev_es_ghcb_hv_call(ghcb, true, ctxt, exit_code, exit_info_1, exit_info_2);
+ return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
}
/*
@@ -1628,7 +1627,7 @@ static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
/* Using a value of 0 for ExitInfo1 means RAX holds the value */
ghcb_set_rax(ghcb, val);
- ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
if (ret != ES_OK)
return ret;
@@ -1658,7 +1657,7 @@ static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
struct es_em_ctxt *ctxt)
{
- return sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_WBINVD, 0, 0);
+ return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
}
static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
@@ -1667,7 +1666,7 @@ static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt
ghcb_set_rcx(ghcb, ctxt->regs->cx);
- ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_RDPMC, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
if (ret != ES_OK)
return ret;
@@ -1708,7 +1707,7 @@ static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
if (x86_platform.hyper.sev_es_hcall_prepare)
x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
- ret = sev_es_ghcb_hv_call(ghcb, true, ctxt, SVM_EXIT_VMMCALL, 0, 0);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
if (ret != ES_OK)
return ret;
@@ -2197,7 +2196,7 @@ int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned
ghcb_set_rbx(ghcb, input->data_npages);
}
- ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa);
+ ret = sev_es_ghcb_hv_call(ghcb, &ctxt, exit_code, input->req_gpa, input->resp_gpa);
if (ret)
goto e_put;
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index dba2197c05c3..331310c29349 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -94,16 +94,18 @@ static bool ex_handler_copy(const struct exception_table_entry *fixup,
static bool ex_handler_msr(const struct exception_table_entry *fixup,
struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
- if (!safe && wrmsr &&
- pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, (unsigned int)regs->dx,
- (unsigned int)regs->ax, regs->ip, (void *)regs->ip))
+ if (__ONCE_LITE_IF(!safe && wrmsr)) {
+ pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx,
+ (unsigned int)regs->ax, regs->ip, (void *)regs->ip);
show_stack_regs(regs);
+ }
- if (!safe && !wrmsr &&
- pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
- (unsigned int)regs->cx, regs->ip, (void *)regs->ip))
+ if (__ONCE_LITE_IF(!safe && !wrmsr)) {
+ pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
show_stack_regs(regs);
+ }
if (!wrmsr) {
/* Pretend that the read succeeded and returned 0. */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 57ba5502aecf..82a042c03824 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -856,7 +856,7 @@ int devmem_is_allowed(unsigned long pagenr)
/*
* This must follow RAM test, since System RAM is considered a
- * restricted resource under CONFIG_STRICT_IOMEM.
+ * restricted resource under CONFIG_STRICT_DEVMEM.
*/
if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
/* Low 1MB bypasses iomem restrictions. */
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index e44e938885b7..7418c367e328 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -110,7 +110,7 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
return vma_pkey(vma);
}
-#define PKRU_AD_KEY(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
+#define PKRU_AD_MASK(pkey) (PKRU_AD_BIT << ((pkey) * PKRU_BITS_PER_PKEY))
/*
* Make the default PKRU value (at execve() time) as restrictive
@@ -118,11 +118,14 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
* in the process's lifetime will not accidentally get access
* to data which is pkey-protected later on.
*/
-u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
- PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) |
- PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) |
- PKRU_AD_KEY(10) | PKRU_AD_KEY(11) | PKRU_AD_KEY(12) |
- PKRU_AD_KEY(13) | PKRU_AD_KEY(14) | PKRU_AD_KEY(15);
+u32 init_pkru_value = PKRU_AD_MASK( 1) | PKRU_AD_MASK( 2) |
+ PKRU_AD_MASK( 3) | PKRU_AD_MASK( 4) |
+ PKRU_AD_MASK( 5) | PKRU_AD_MASK( 6) |
+ PKRU_AD_MASK( 7) | PKRU_AD_MASK( 8) |
+ PKRU_AD_MASK( 9) | PKRU_AD_MASK(10) |
+ PKRU_AD_MASK(11) | PKRU_AD_MASK(12) |
+ PKRU_AD_MASK(13) | PKRU_AD_MASK(14) |
+ PKRU_AD_MASK(15);
static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index d400b6d9d246..c1e31e9a85d7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -734,10 +734,10 @@ static void flush_tlb_func(void *info)
const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
bool local = smp_processor_id() == f->initiating_cpu;
unsigned long nr_invalidate = 0;
+ u64 mm_tlb_gen;
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
@@ -771,6 +771,23 @@ static void flush_tlb_func(void *info)
return;
}
+ if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID &&
+ f->new_tlb_gen <= local_tlb_gen)) {
+ /*
+ * The TLB is already up to date in respect to f->new_tlb_gen.
+ * While the core might be still behind mm_tlb_gen, checking
+ * mm_tlb_gen unnecessarily would have negative caching effects
+ * so avoid it.
+ */
+ return;
+ }
+
+ /*
+ * Defer mm_tlb_gen reading as long as possible to avoid cache
+ * contention.
+ */
+ mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
+
if (unlikely(local_tlb_gen == mm_tlb_gen)) {
/*
* There's nothing to do: we're already up to date. This can
@@ -827,6 +844,12 @@ static void flush_tlb_func(void *info)
/* Partial flush */
unsigned long addr = f->start;
+ /* Partial flush cannot have invalid generations */
+ VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID);
+
+ /* Partial flush must have valid mm */
+ VM_WARN_ON(f->mm == NULL);
+
nr_invalidate = (f->end - f->start) >> f->stride_shift;
while (addr < f->end) {
@@ -1029,7 +1052,8 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
struct flush_tlb_info *info;
preempt_disable();
- info = get_flush_tlb_info(NULL, start, end, 0, false, 0);
+ info = get_flush_tlb_info(NULL, start, end, 0, false,
+ TLB_GENERATION_INVALID);
on_each_cpu(do_kernel_range_flush, info, 1);
@@ -1198,7 +1222,8 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
int cpu = get_cpu();
- info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
+ info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false,
+ TLB_GENERATION_INVALID);
/*
* flush_tlb_multi() is not optimized for the common case in which only
* a local TLB flush is needed. Optimize this use-case by calling
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index ae53d54d7959..31c634a22818 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -73,12 +73,6 @@ $(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE
$(obj)/purgatory.chk: $(obj)/purgatory.ro FORCE
$(call if_changed,ld)
-targets += kexec-purgatory.c
+$(obj)/kexec-purgatory.o: $(obj)/purgatory.ro $(obj)/purgatory.chk
-quiet_cmd_bin2c = BIN2C $@
- cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@
-
-$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro $(obj)/purgatory.chk FORCE
- $(call if_changed,bin2c)
-
-obj-$(CONFIG_KEXEC_FILE) += kexec-purgatory.o
+obj-y += kexec-purgatory.o
diff --git a/arch/x86/purgatory/kexec-purgatory.S b/arch/x86/purgatory/kexec-purgatory.S
new file mode 100644
index 000000000000..8530fe93b718
--- /dev/null
+++ b/arch/x86/purgatory/kexec-purgatory.S
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+ .section .rodata, "a"
+
+ .align 8
+kexec_purgatory:
+ .globl kexec_purgatory
+ .incbin "arch/x86/purgatory/purgatory.ro"
+.Lkexec_purgatory_end:
+
+ .align 8
+kexec_purgatory_size:
+ .globl kexec_purgatory_size
+ .quad .Lkexec_purgatory_end - kexec_purgatory
diff --git a/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c b/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c
index 29a8c710ae06..b7962e5149a5 100644
--- a/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c
+++ b/drivers/clk/sunxi-ng/ccu-sun50i-h6-r.c
@@ -138,6 +138,7 @@ static struct ccu_common *sun50i_h6_r_ccu_clks[] = {
&r_apb2_rsb_clk.common,
&r_apb1_ir_clk.common,
&r_apb1_w1_clk.common,
+ &r_apb1_rtc_clk.common,
&ir_clk.common,
&w1_clk.common,
};
diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm
index be7f512109f7..747aa537389b 100644
--- a/drivers/cpuidle/Kconfig.arm
+++ b/drivers/cpuidle/Kconfig.arm
@@ -3,7 +3,8 @@
# ARM CPU Idle drivers
#
config ARM_CPUIDLE
- bool "Generic ARM/ARM64 CPU idle Driver"
+ bool "Generic ARM CPU idle Driver"
+ depends on ARM
select DT_IDLE_STATES
select CPU_IDLE_MULTIPLE_DRIVERS
help
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 59b0bedc9c24..c8fa7dcfdbd0 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -103,9 +103,14 @@ static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
dmi_memdev_name(handle, &bank, &device);
- /* both strings must be non-zero */
- if (bank && *bank && device && *device)
- snprintf(dimm->label, sizeof(dimm->label), "%s %s", bank, device);
+ /*
+ * Set to a NULL string when both bank and device are zero. In this case,
+ * the label assigned by default will be preserved.
+ */
+ snprintf(dimm->label, sizeof(dimm->label), "%s%s%s",
+ (bank && *bank) ? bank : "",
+ (bank && *bank && device && *device) ? " " : "",
+ (device && *device) ? device : "");
}
static void assign_dmi_dimm_info(struct dimm_info *dimm, struct memdev_dmi_entry *entry)
diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c
index 1cee64b80a7e..f7d37c282819 100644
--- a/drivers/edac/synopsys_edac.c
+++ b/drivers/edac/synopsys_edac.c
@@ -514,6 +514,28 @@ static void handle_error(struct mem_ctl_info *mci, struct synps_ecc_status *p)
memset(p, 0, sizeof(*p));
}
+static void enable_intr(struct synps_edac_priv *priv)
+{
+ /* Enable UE/CE Interrupts */
+ if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR)
+ writel(DDR_UE_MASK | DDR_CE_MASK,
+ priv->baseaddr + ECC_CLR_OFST);
+ else
+ writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK,
+ priv->baseaddr + DDR_QOS_IRQ_EN_OFST);
+
+}
+
+static void disable_intr(struct synps_edac_priv *priv)
+{
+ /* Disable UE/CE Interrupts */
+ if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR)
+ writel(0x0, priv->baseaddr + ECC_CLR_OFST);
+ else
+ writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK,
+ priv->baseaddr + DDR_QOS_IRQ_DB_OFST);
+}
+
/**
* intr_handler - Interrupt Handler for ECC interrupts.
* @irq: IRQ number.
@@ -555,6 +577,9 @@ static irqreturn_t intr_handler(int irq, void *dev_id)
/* v3.0 of the controller does not have this register */
if (!(priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR))
writel(regval, priv->baseaddr + DDR_QOS_IRQ_STAT_OFST);
+ else
+ enable_intr(priv);
+
return IRQ_HANDLED;
}
@@ -837,25 +862,6 @@ static void mc_init(struct mem_ctl_info *mci, struct platform_device *pdev)
init_csrows(mci);
}
-static void enable_intr(struct synps_edac_priv *priv)
-{
- /* Enable UE/CE Interrupts */
- if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR)
- writel(DDR_UE_MASK | DDR_CE_MASK,
- priv->baseaddr + ECC_CLR_OFST);
- else
- writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK,
- priv->baseaddr + DDR_QOS_IRQ_EN_OFST);
-
-}
-
-static void disable_intr(struct synps_edac_priv *priv)
-{
- /* Disable UE/CE Interrupts */
- writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK,
- priv->baseaddr + DDR_QOS_IRQ_DB_OFST);
-}
-
static int setup_irq(struct mem_ctl_info *mci,
struct platform_device *pdev)
{
diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig
index 0ba0598eba20..ec6771e87e73 100644
--- a/drivers/gpu/drm/amd/display/Kconfig
+++ b/drivers/gpu/drm/amd/display/Kconfig
@@ -6,7 +6,7 @@ config DRM_AMD_DC
bool "AMD DC - Enable new display engine"
default y
select SND_HDA_COMPONENT if SND_HDA_CORE
- select DRM_AMD_DC_DCN if X86 && !(KCOV_INSTRUMENT_ALL && KCOV_ENABLE_COMPARISONS)
+ select DRM_AMD_DC_DCN if (X86 || PPC_LONG_DOUBLE_128) && !(KCOV_INSTRUMENT_ALL && KCOV_ENABLE_COMPARISONS)
help
Choose this option if you want to use the new display engine
support for AMDGPU. This adds required support for Vega and
diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index 1431f1e9dbee..04e435bce79b 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -201,6 +201,8 @@ int intel_ring_submission_setup(struct intel_engine_cs *engine);
int intel_engine_stop_cs(struct intel_engine_cs *engine);
void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine);
+void intel_engine_wait_for_pending_mi_fw(struct intel_engine_cs *engine);
+
void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask);
u64 intel_engine_get_active_head(const struct intel_engine_cs *engine);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 14c6ddbbfde8..5b6ce10cb158 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1282,10 +1282,10 @@ static int __intel_engine_stop_cs(struct intel_engine_cs *engine,
intel_uncore_write_fw(uncore, mode, _MASKED_BIT_ENABLE(STOP_RING));
/*
- * Wa_22011802037 : gen12, Prior to doing a reset, ensure CS is
+ * Wa_22011802037 : gen11, gen12, Prior to doing a reset, ensure CS is
* stopped, set ring stop bit and prefetch disable bit to halt CS
*/
- if (GRAPHICS_VER(engine->i915) == 12)
+ if (IS_GRAPHICS_VER(engine->i915, 11, 12))
intel_uncore_write_fw(uncore, RING_MODE_GEN7(engine->mmio_base),
_MASKED_BIT_ENABLE(GEN12_GFX_PREFETCH_DISABLE));
@@ -1308,6 +1308,18 @@ int intel_engine_stop_cs(struct intel_engine_cs *engine)
return -ENODEV;
ENGINE_TRACE(engine, "\n");
+ /*
+ * TODO: Find out why occasionally stopping the CS times out. Seen
+ * especially with gem_eio tests.
+ *
+ * Occasionally trying to stop the cs times out, but does not adversely
+ * affect functionality. The timeout is set as a config parameter that
+ * defaults to 100ms. In most cases the follow up operation is to wait
+ * for pending MI_FORCE_WAKES. The assumption is that this timeout is
+ * sufficient for any pending MI_FORCEWAKEs to complete. Once root
+ * caused, the caller must check and handle the return from this
+ * function.
+ */
if (__intel_engine_stop_cs(engine, 1000, stop_timeout(engine))) {
ENGINE_TRACE(engine,
"timed out on STOP_RING -> IDLE; HEAD:%04x, TAIL:%04x\n",
@@ -1334,6 +1346,78 @@ void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine)
ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
}
+static u32 __cs_pending_mi_force_wakes(struct intel_engine_cs *engine)
+{
+ static const i915_reg_t _reg[I915_NUM_ENGINES] = {
+ [RCS0] = MSG_IDLE_CS,
+ [BCS0] = MSG_IDLE_BCS,
+ [VCS0] = MSG_IDLE_VCS0,
+ [VCS1] = MSG_IDLE_VCS1,
+ [VCS2] = MSG_IDLE_VCS2,
+ [VCS3] = MSG_IDLE_VCS3,
+ [VCS4] = MSG_IDLE_VCS4,
+ [VCS5] = MSG_IDLE_VCS5,
+ [VCS6] = MSG_IDLE_VCS6,
+ [VCS7] = MSG_IDLE_VCS7,
+ [VECS0] = MSG_IDLE_VECS0,
+ [VECS1] = MSG_IDLE_VECS1,
+ [VECS2] = MSG_IDLE_VECS2,
+ [VECS3] = MSG_IDLE_VECS3,
+ [CCS0] = MSG_IDLE_CS,
+ [CCS1] = MSG_IDLE_CS,
+ [CCS2] = MSG_IDLE_CS,
+ [CCS3] = MSG_IDLE_CS,
+ };
+ u32 val;
+
+ if (!_reg[engine->id].reg) {
+ drm_err(&engine->i915->drm,
+ "MSG IDLE undefined for engine id %u\n", engine->id);
+ return 0;
+ }
+
+ val = intel_uncore_read(engine->uncore, _reg[engine->id]);
+
+ /* bits[29:25] & bits[13:9] >> shift */
+ return (val & (val >> 16) & MSG_IDLE_FW_MASK) >> MSG_IDLE_FW_SHIFT;
+}
+
+static void __gpm_wait_for_fw_complete(struct intel_gt *gt, u32 fw_mask)
+{
+ int ret;
+
+ /* Ensure GPM receives fw up/down after CS is stopped */
+ udelay(1);
+
+ /* Wait for forcewake request to complete in GPM */
+ ret = __intel_wait_for_register_fw(gt->uncore,
+ GEN9_PWRGT_DOMAIN_STATUS,
+ fw_mask, fw_mask, 5000, 0, NULL);
+
+ /* Ensure CS receives fw ack from GPM */
+ udelay(1);
+
+ if (ret)
+ GT_TRACE(gt, "Failed to complete pending forcewake %d\n", ret);
+}
+
+/*
+ * Wa_22011802037:gen12: In addition to stopping the cs, we need to wait for any
+ * pending MI_FORCE_WAKEUP requests that the CS has initiated to complete. The
+ * pending status is indicated by bits[13:9] (masked by bits[29:25]) in the
+ * MSG_IDLE register. There's one MSG_IDLE register per reset domain. Since we
+ * are concerned only with the gt reset here, we use a logical OR of pending
+ * forcewakeups from all reset domains and then wait for them to complete by
+ * querying PWRGT_DOMAIN_STATUS.
+ */
+void intel_engine_wait_for_pending_mi_fw(struct intel_engine_cs *engine)
+{
+ u32 fw_pending = __cs_pending_mi_force_wakes(engine);
+
+ if (fw_pending)
+ __gpm_wait_for_fw_complete(engine->gt, fw_pending);
+}
+
static u32
read_subslice_reg(const struct intel_engine_cs *engine,
int slice, int subslice, i915_reg_t reg)
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 2b0266cab66b..0627fa10d2dc 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2968,6 +2968,13 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
ring_set_paused(engine, 1);
intel_engine_stop_cs(engine);
+ /*
+ * Wa_22011802037:gen11/gen12: In addition to stopping the cs, we need
+ * to wait for any pending mi force wakeups
+ */
+ if (IS_GRAPHICS_VER(engine->i915, 11, 12))
+ intel_engine_wait_for_pending_mi_fw(engine);
+
engine->execlists.reset_ccid = active_ccid(engine);
}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 2c4ad4a65089..8c6885f43d1a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -310,8 +310,8 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc)
if (IS_DG2(gt->i915))
flags |= GUC_WA_DUAL_QUEUE;
- /* Wa_22011802037: graphics version 12 */
- if (GRAPHICS_VER(gt->i915) == 12)
+ /* Wa_22011802037: graphics version 11/12 */
+ if (IS_GRAPHICS_VER(gt->i915, 11, 12))
flags |= GUC_WA_PRE_PARSER;
/* Wa_16011777198:dg2 */
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 9ffb343d0f79..2d9f5f1c79d3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1578,87 +1578,18 @@ static void guc_reset_state(struct intel_context *ce, u32 head, bool scrub)
lrc_update_regs(ce, engine, head);
}
-static u32 __cs_pending_mi_force_wakes(struct intel_engine_cs *engine)
-{
- static const i915_reg_t _reg[I915_NUM_ENGINES] = {
- [RCS0] = MSG_IDLE_CS,
- [BCS0] = MSG_IDLE_BCS,
- [VCS0] = MSG_IDLE_VCS0,
- [VCS1] = MSG_IDLE_VCS1,
- [VCS2] = MSG_IDLE_VCS2,
- [VCS3] = MSG_IDLE_VCS3,
- [VCS4] = MSG_IDLE_VCS4,
- [VCS5] = MSG_IDLE_VCS5,
- [VCS6] = MSG_IDLE_VCS6,
- [VCS7] = MSG_IDLE_VCS7,
- [VECS0] = MSG_IDLE_VECS0,
- [VECS1] = MSG_IDLE_VECS1,
- [VECS2] = MSG_IDLE_VECS2,
- [VECS3] = MSG_IDLE_VECS3,
- [CCS0] = MSG_IDLE_CS,
- [CCS1] = MSG_IDLE_CS,
- [CCS2] = MSG_IDLE_CS,
- [CCS3] = MSG_IDLE_CS,
- };
- u32 val;
-
- if (!_reg[engine->id].reg)
- return 0;
-
- val = intel_uncore_read(engine->uncore, _reg[engine->id]);
-
- /* bits[29:25] & bits[13:9] >> shift */
- return (val & (val >> 16) & MSG_IDLE_FW_MASK) >> MSG_IDLE_FW_SHIFT;
-}
-
-static void __gpm_wait_for_fw_complete(struct intel_gt *gt, u32 fw_mask)
-{
- int ret;
-
- /* Ensure GPM receives fw up/down after CS is stopped */
- udelay(1);
-
- /* Wait for forcewake request to complete in GPM */
- ret = __intel_wait_for_register_fw(gt->uncore,
- GEN9_PWRGT_DOMAIN_STATUS,
- fw_mask, fw_mask, 5000, 0, NULL);
-
- /* Ensure CS receives fw ack from GPM */
- udelay(1);
-
- if (ret)
- GT_TRACE(gt, "Failed to complete pending forcewake %d\n", ret);
-}
-
-/*
- * Wa_22011802037:gen12: In addition to stopping the cs, we need to wait for any
- * pending MI_FORCE_WAKEUP requests that the CS has initiated to complete. The
- * pending status is indicated by bits[13:9] (masked by bits[ 29:25]) in the
- * MSG_IDLE register. There's one MSG_IDLE register per reset domain. Since we
- * are concerned only with the gt reset here, we use a logical OR of pending
- * forcewakeups from all reset domains and then wait for them to complete by
- * querying PWRGT_DOMAIN_STATUS.
- */
static void guc_engine_reset_prepare(struct intel_engine_cs *engine)
{
- u32 fw_pending;
-
- if (GRAPHICS_VER(engine->i915) != 12)
+ if (!IS_GRAPHICS_VER(engine->i915, 11, 12))
return;
- /*
- * Wa_22011802037
- * TODO: Occasionally trying to stop the cs times out, but does not
- * adversely affect functionality. The timeout is set as a config
- * parameter that defaults to 100ms. Assuming that this timeout is
- * sufficient for any pending MI_FORCEWAKEs to complete, ignore the
- * timeout returned here until it is root caused.
- */
intel_engine_stop_cs(engine);
- fw_pending = __cs_pending_mi_force_wakes(engine);
- if (fw_pending)
- __gpm_wait_for_fw_complete(engine->gt, fw_pending);
+ /*
+ * Wa_22011802037:gen11/gen12: In addition to stopping the cs, we need
+ * to wait for any pending mi force wakeups
+ */
+ intel_engine_wait_for_pending_mi_fw(engine);
}
static void guc_reset_nop(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 7ba66ad68a8a..16356611b5b9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -680,7 +680,11 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
goto out_free_dma;
for (i = 0; i < npages; i += max) {
- args.end = start + (max << PAGE_SHIFT);
+ if (args.start + (max << PAGE_SHIFT) > end)
+ args.end = end;
+ else
+ args.end = args.start + (max << PAGE_SHIFT);
+
ret = migrate_vma_setup(&args);
if (ret)
goto out_free_pfns;
diff --git a/drivers/gpu/drm/tiny/simpledrm.c b/drivers/gpu/drm/tiny/simpledrm.c
index 768242a78e2b..5422363690e7 100644
--- a/drivers/gpu/drm/tiny/simpledrm.c
+++ b/drivers/gpu/drm/tiny/simpledrm.c
@@ -627,7 +627,7 @@ static const struct drm_connector_funcs simpledrm_connector_funcs = {
.atomic_destroy_state = drm_atomic_helper_connector_destroy_state,
};
-static int
+static enum drm_mode_status
simpledrm_simple_display_pipe_mode_valid(struct drm_simple_display_pipe *pipe,
const struct drm_display_mode *mode)
{
diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c
index 4e239bd75b1d..5a9d47a229e4 100644
--- a/drivers/hwmon/k10temp.c
+++ b/drivers/hwmon/k10temp.c
@@ -428,6 +428,10 @@ static int k10temp_probe(struct pci_dev *pdev, const struct pci_device_id *id)
data->ccd_offset = 0x154;
k10temp_get_ccd_support(pdev, data, 8);
break;
+ case 0xa0 ... 0xaf:
+ data->ccd_offset = 0x300;
+ k10temp_get_ccd_support(pdev, data, 8);
+ break;
}
} else if (boot_cpu_data.x86 == 0x19) {
data->temp_adjust_mask = ZEN_CUR_TEMP_RANGE_SEL_MASK;
@@ -445,6 +449,11 @@ static int k10temp_probe(struct pci_dev *pdev, const struct pci_device_id *id)
data->ccd_offset = 0x300;
k10temp_get_ccd_support(pdev, data, 8);
break;
+ case 0x60 ... 0x6f:
+ case 0x70 ... 0x7f:
+ data->ccd_offset = 0x308;
+ k10temp_get_ccd_support(pdev, data, 8);
+ break;
case 0x10 ... 0x1f:
case 0xa0 ... 0xaf:
data->ccd_offset = 0x300;
@@ -489,10 +498,13 @@ static const struct pci_device_id k10temp_id_table[] = {
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M10H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
{ PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) },
+ { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) },
{ PCI_VDEVICE(HYGON, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{}
};
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index f5c6802aa6c3..445b19d20b9a 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -56,6 +56,7 @@
#include <asm/nospec-branch.h>
#include <asm/mwait.h>
#include <asm/msr.h>
+#include <asm/fpu/api.h>
#define INTEL_IDLE_VERSION "0.5.1"
@@ -114,6 +115,11 @@ static unsigned int mwait_substates __initdata;
#define CPUIDLE_FLAG_IBRS BIT(16)
/*
+ * Initialize large xstate for the C6-state entrance.
+ */
+#define CPUIDLE_FLAG_INIT_XSTATE BIT(17)
+
+/*
* MWAIT takes an 8-bit "hint" in EAX "suggesting"
* the C-state (top nibble) and sub-state (bottom nibble)
* 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc.
@@ -162,7 +168,13 @@ static __cpuidle int intel_idle_irq(struct cpuidle_device *dev,
raw_local_irq_enable();
ret = __intel_idle(dev, drv, index);
- raw_local_irq_disable();
+
+ /*
+ * The lockdep hardirqs state may be changed to 'on' with timer
+ * tick interrupt followed by __do_softirq(). Use local_irq_disable()
+ * to keep the hardirqs state correct.
+ */
+ local_irq_disable();
return ret;
}
@@ -185,6 +197,13 @@ static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
return ret;
}
+static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
+{
+ fpu_idle_fpregs();
+ return __intel_idle(dev, drv, index);
+}
+
/**
* intel_idle_s2idle - Ask the processor to enter the given idle state.
* @dev: cpuidle device of the target CPU.
@@ -200,8 +219,12 @@ static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
static __cpuidle int intel_idle_s2idle(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index)
{
- unsigned long eax = flg2MWAIT(drv->states[index].flags);
unsigned long ecx = 1; /* break on interrupt flag */
+ struct cpuidle_state *state = &drv->states[index];
+ unsigned long eax = flg2MWAIT(state->flags);
+
+ if (state->flags & CPUIDLE_FLAG_INIT_XSTATE)
+ fpu_idle_fpregs();
mwait_idle_with_hints(eax, ecx);
@@ -936,7 +959,8 @@ static struct cpuidle_state spr_cstates[] __initdata = {
{
.name = "C6",
.desc = "MWAIT 0x20",
- .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED |
+ CPUIDLE_FLAG_INIT_XSTATE,
.exit_latency = 290,
.target_residency = 800,
.enter = &intel_idle,
@@ -1851,6 +1875,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
drv->states[drv->state_count].enter = intel_idle_ibrs;
}
+ if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_INIT_XSTATE)
+ drv->states[drv->state_count].enter = intel_idle_xstate;
+
if ((disabled_states_mask & BIT(drv->state_count)) ||
((icpu->use_acpi || force_use_acpi) &&
intel_idle_off_by_default(mwait_hint) &&
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_rx.c b/drivers/net/ethernet/fungible/funeth/funeth_rx.c
index 0f6a549b9f67..29a6c2ede43a 100644
--- a/drivers/net/ethernet/fungible/funeth/funeth_rx.c
+++ b/drivers/net/ethernet/fungible/funeth/funeth_rx.c
@@ -142,6 +142,7 @@ static void *fun_run_xdp(struct funeth_rxq *q, skb_frag_t *frags, void *buf_va,
int ref_ok, struct funeth_txq *xdp_q)
{
struct bpf_prog *xdp_prog;
+ struct xdp_frame *xdpf;
struct xdp_buff xdp;
u32 act;
@@ -163,7 +164,9 @@ static void *fun_run_xdp(struct funeth_rxq *q, skb_frag_t *frags, void *buf_va,
case XDP_TX:
if (unlikely(!ref_ok))
goto pass;
- if (!fun_xdp_tx(xdp_q, xdp.data, xdp.data_end - xdp.data))
+
+ xdpf = xdp_convert_buff_to_frame(&xdp);
+ if (!xdpf || !fun_xdp_tx(xdp_q, xdpf))
goto xdp_error;
FUN_QSTAT_INC(q, xdp_tx);
q->xdp_flush |= FUN_XDP_FLUSH_TX;
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_tx.c b/drivers/net/ethernet/fungible/funeth/funeth_tx.c
index ff6e29237253..2f6698b98b03 100644
--- a/drivers/net/ethernet/fungible/funeth/funeth_tx.c
+++ b/drivers/net/ethernet/fungible/funeth/funeth_tx.c
@@ -466,7 +466,7 @@ static unsigned int fun_xdpq_clean(struct funeth_txq *q, unsigned int budget)
do {
fun_xdp_unmap(q, reclaim_idx);
- page_frag_free(q->info[reclaim_idx].vaddr);
+ xdp_return_frame(q->info[reclaim_idx].xdpf);
trace_funeth_tx_free(q, reclaim_idx, 1, head);
@@ -479,11 +479,11 @@ static unsigned int fun_xdpq_clean(struct funeth_txq *q, unsigned int budget)
return npkts;
}
-bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len)
+bool fun_xdp_tx(struct funeth_txq *q, struct xdp_frame *xdpf)
{
struct fun_eth_tx_req *req;
struct fun_dataop_gl *gle;
- unsigned int idx;
+ unsigned int idx, len;
dma_addr_t dma;
if (fun_txq_avail(q) < FUN_XDP_CLEAN_THRES)
@@ -494,7 +494,8 @@ bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len)
return false;
}
- dma = dma_map_single(q->dma_dev, data, len, DMA_TO_DEVICE);
+ len = xdpf->len;
+ dma = dma_map_single(q->dma_dev, xdpf->data, len, DMA_TO_DEVICE);
if (unlikely(dma_mapping_error(q->dma_dev, dma))) {
FUN_QSTAT_INC(q, tx_map_err);
return false;
@@ -514,7 +515,7 @@ bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len)
gle = (struct fun_dataop_gl *)req->dataop.imm;
fun_dataop_gl_init(gle, 0, 0, len, dma);
- q->info[idx].vaddr = data;
+ q->info[idx].xdpf = xdpf;
u64_stats_update_begin(&q->syncp);
q->stats.tx_bytes += len;
@@ -545,12 +546,9 @@ int fun_xdp_xmit_frames(struct net_device *dev, int n,
if (unlikely(q_idx >= fp->num_xdpqs))
return -ENXIO;
- for (q = xdpqs[q_idx], i = 0; i < n; i++) {
- const struct xdp_frame *xdpf = frames[i];
-
- if (!fun_xdp_tx(q, xdpf->data, xdpf->len))
+ for (q = xdpqs[q_idx], i = 0; i < n; i++)
+ if (!fun_xdp_tx(q, frames[i]))
break;
- }
if (unlikely(flags & XDP_XMIT_FLUSH))
fun_txq_wr_db(q);
@@ -577,7 +575,7 @@ static void fun_xdpq_purge(struct funeth_txq *q)
unsigned int idx = q->cons_cnt & q->mask;
fun_xdp_unmap(q, idx);
- page_frag_free(q->info[idx].vaddr);
+ xdp_return_frame(q->info[idx].xdpf);
q->cons_cnt++;
}
}
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h
index 04c9f91b7489..8708e2895946 100644
--- a/drivers/net/ethernet/fungible/funeth/funeth_txrx.h
+++ b/drivers/net/ethernet/fungible/funeth/funeth_txrx.h
@@ -95,8 +95,8 @@ struct funeth_txq_stats { /* per Tx queue SW counters */
struct funeth_tx_info { /* per Tx descriptor state */
union {
- struct sk_buff *skb; /* associated packet */
- void *vaddr; /* start address for XDP */
+ struct sk_buff *skb; /* associated packet (sk_buff path) */
+ struct xdp_frame *xdpf; /* associated XDP frame (XDP path) */
};
};
@@ -245,7 +245,7 @@ static inline int fun_irq_node(const struct fun_irq *p)
int fun_rxq_napi_poll(struct napi_struct *napi, int budget);
int fun_txq_napi_poll(struct napi_struct *napi, int budget);
netdev_tx_t fun_start_xmit(struct sk_buff *skb, struct net_device *netdev);
-bool fun_xdp_tx(struct funeth_txq *q, void *data, unsigned int len);
+bool fun_xdp_tx(struct funeth_txq *q, struct xdp_frame *xdpf);
int fun_xdp_xmit_frames(struct net_device *dev, int n,
struct xdp_frame **frames, u32 flags);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 7f1a0d90dc51..685556e968f2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1925,11 +1925,15 @@ static void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
* non-zero req_queue_pairs says that user requested a new
* queue count via ethtool's set_channels, so use this
* value for queues distribution across traffic classes
+ * We need at least one queue pair for the interface
+ * to be usable as we see in else statement.
*/
if (vsi->req_queue_pairs > 0)
vsi->num_queue_pairs = vsi->req_queue_pairs;
else if (pf->flags & I40E_FLAG_MSIX_ENABLED)
vsi->num_queue_pairs = pf->num_lan_msix;
+ else
+ vsi->num_queue_pairs = 1;
}
/* Number of queues per enabled TC */
diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c
index 70335f6e8524..4efa5e5846e0 100644
--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -658,7 +658,8 @@ static int ice_lbtest_receive_frames(struct ice_rx_ring *rx_ring)
rx_desc = ICE_RX_DESC(rx_ring, i);
if (!(rx_desc->wb.status_error0 &
- cpu_to_le16(ICE_TX_DESC_CMD_EOP | ICE_TX_DESC_CMD_RS)))
+ (cpu_to_le16(BIT(ICE_RX_FLEX_DESC_STATUS0_DD_S)) |
+ cpu_to_le16(BIT(ICE_RX_FLEX_DESC_STATUS0_EOF_S)))))
continue;
rx_buf = &rx_ring->rx_buf[i];
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index ff2eac2f8c64..9f02b60459f1 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -4656,6 +4656,8 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
ice_set_safe_mode_caps(hw);
}
+ hw->ucast_shared = true;
+
err = ice_init_pf(pf);
if (err) {
dev_err(dev, "ice_init_pf failed: %d\n", err);
@@ -6011,10 +6013,12 @@ int ice_vsi_cfg(struct ice_vsi *vsi)
if (vsi->netdev) {
ice_set_rx_mode(vsi->netdev);
- err = ice_vsi_vlan_setup(vsi);
+ if (vsi->type != ICE_VSI_LB) {
+ err = ice_vsi_vlan_setup(vsi);
- if (err)
- return err;
+ if (err)
+ return err;
+ }
}
ice_vsi_cfg_dcb_rings(vsi);
diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c
index bb1721f1321d..f4907a3c2d19 100644
--- a/drivers/net/ethernet/intel/ice/ice_sriov.c
+++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
@@ -1310,39 +1310,6 @@ out_put_vf:
}
/**
- * ice_unicast_mac_exists - check if the unicast MAC exists on the PF's switch
- * @pf: PF used to reference the switch's rules
- * @umac: unicast MAC to compare against existing switch rules
- *
- * Return true on the first/any match, else return false
- */
-static bool ice_unicast_mac_exists(struct ice_pf *pf, u8 *umac)
-{
- struct ice_sw_recipe *mac_recipe_list =
- &pf->hw.switch_info->recp_list[ICE_SW_LKUP_MAC];
- struct ice_fltr_mgmt_list_entry *list_itr;
- struct list_head *rule_head;
- struct mutex *rule_lock; /* protect MAC filter list access */
-
- rule_head = &mac_recipe_list->filt_rules;
- rule_lock = &mac_recipe_list->filt_rule_lock;
-
- mutex_lock(rule_lock);
- list_for_each_entry(list_itr, rule_head, list_entry) {
- u8 *existing_mac = &list_itr->fltr_info.l_data.mac.mac_addr[0];
-
- if (ether_addr_equal(existing_mac, umac)) {
- mutex_unlock(rule_lock);
- return true;
- }
- }
-
- mutex_unlock(rule_lock);
-
- return false;
-}
-
-/**
* ice_set_vf_mac
* @netdev: network interface device structure
* @vf_id: VF identifier
@@ -1376,13 +1343,6 @@ int ice_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac)
if (ret)
goto out_put_vf;
- if (ice_unicast_mac_exists(pf, mac)) {
- netdev_err(netdev, "Unicast MAC %pM already exists on this PF. Preventing setting VF %u unicast MAC address to %pM\n",
- mac, vf_id, mac);
- ret = -EINVAL;
- goto out_put_vf;
- }
-
mutex_lock(&vf->cfg_lock);
/* VF is notified of its new MAC via the PF's response to the
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 3f8b7274ed2f..836dce840712 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -1751,11 +1751,13 @@ int ice_tx_csum(struct ice_tx_buf *first, struct ice_tx_offload_params *off)
protocol = vlan_get_protocol(skb);
- if (eth_p_mpls(protocol))
+ if (eth_p_mpls(protocol)) {
ip.hdr = skb_inner_network_header(skb);
- else
+ l4.hdr = skb_checksum_start(skb);
+ } else {
ip.hdr = skb_network_header(skb);
- l4.hdr = skb_checksum_start(skb);
+ l4.hdr = skb_transport_header(skb);
+ }
/* compute outer L2 header size */
l2_len = ip.hdr - skb->data;
diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
index 4547bc1f7cee..24188ec594d5 100644
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c
@@ -2948,7 +2948,8 @@ ice_vc_validate_add_vlan_filter_list(struct ice_vsi *vsi,
struct virtchnl_vlan_filtering_caps *vfc,
struct virtchnl_vlan_filter_list_v2 *vfl)
{
- u16 num_requested_filters = vsi->num_vlan + vfl->num_elements;
+ u16 num_requested_filters = ice_vsi_num_non_zero_vlans(vsi) +
+ vfl->num_elements;
if (num_requested_filters > vfc->max_filters)
return false;
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
index 28b19945d716..e64318c110fd 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_tc.c
@@ -28,6 +28,9 @@
#define MAX_RATE_EXPONENT 0x0FULL
#define MAX_RATE_MANTISSA 0xFFULL
+#define CN10K_MAX_BURST_MANTISSA 0x7FFFULL
+#define CN10K_MAX_BURST_SIZE 8453888ULL
+
/* Bitfields in NIX_TLX_PIR register */
#define TLX_RATE_MANTISSA GENMASK_ULL(8, 1)
#define TLX_RATE_EXPONENT GENMASK_ULL(12, 9)
@@ -35,6 +38,9 @@
#define TLX_BURST_MANTISSA GENMASK_ULL(36, 29)
#define TLX_BURST_EXPONENT GENMASK_ULL(40, 37)
+#define CN10K_TLX_BURST_MANTISSA GENMASK_ULL(43, 29)
+#define CN10K_TLX_BURST_EXPONENT GENMASK_ULL(47, 44)
+
struct otx2_tc_flow_stats {
u64 bytes;
u64 pkts;
@@ -77,33 +83,42 @@ int otx2_tc_alloc_ent_bitmap(struct otx2_nic *nic)
}
EXPORT_SYMBOL(otx2_tc_alloc_ent_bitmap);
-static void otx2_get_egress_burst_cfg(u32 burst, u32 *burst_exp,
- u32 *burst_mantissa)
+static void otx2_get_egress_burst_cfg(struct otx2_nic *nic, u32 burst,
+ u32 *burst_exp, u32 *burst_mantissa)
{
+ int max_burst, max_mantissa;
unsigned int tmp;
+ if (is_dev_otx2(nic->pdev)) {
+ max_burst = MAX_BURST_SIZE;
+ max_mantissa = MAX_BURST_MANTISSA;
+ } else {
+ max_burst = CN10K_MAX_BURST_SIZE;
+ max_mantissa = CN10K_MAX_BURST_MANTISSA;
+ }
+
/* Burst is calculated as
* ((256 + BURST_MANTISSA) << (1 + BURST_EXPONENT)) / 256
* Max supported burst size is 130,816 bytes.
*/
- burst = min_t(u32, burst, MAX_BURST_SIZE);
+ burst = min_t(u32, burst, max_burst);
if (burst) {
*burst_exp = ilog2(burst) ? ilog2(burst) - 1 : 0;
tmp = burst - rounddown_pow_of_two(burst);
- if (burst < MAX_BURST_MANTISSA)
+ if (burst < max_mantissa)
*burst_mantissa = tmp * 2;
else
*burst_mantissa = tmp / (1ULL << (*burst_exp - 7));
} else {
*burst_exp = MAX_BURST_EXPONENT;
- *burst_mantissa = MAX_BURST_MANTISSA;
+ *burst_mantissa = max_mantissa;
}
}
-static void otx2_get_egress_rate_cfg(u32 maxrate, u32 *exp,
+static void otx2_get_egress_rate_cfg(u64 maxrate, u32 *exp,
u32 *mantissa, u32 *div_exp)
{
- unsigned int tmp;
+ u64 tmp;
/* Rate calculation by hardware
*
@@ -132,21 +147,44 @@ static void otx2_get_egress_rate_cfg(u32 maxrate, u32 *exp,
}
}
-static int otx2_set_matchall_egress_rate(struct otx2_nic *nic, u32 burst, u32 maxrate)
+static u64 otx2_get_txschq_rate_regval(struct otx2_nic *nic,
+ u64 maxrate, u32 burst)
{
- struct otx2_hw *hw = &nic->hw;
- struct nix_txschq_config *req;
u32 burst_exp, burst_mantissa;
u32 exp, mantissa, div_exp;
+ u64 regval = 0;
+
+ /* Get exponent and mantissa values from the desired rate */
+ otx2_get_egress_burst_cfg(nic, burst, &burst_exp, &burst_mantissa);
+ otx2_get_egress_rate_cfg(maxrate, &exp, &mantissa, &div_exp);
+
+ if (is_dev_otx2(nic->pdev)) {
+ regval = FIELD_PREP(TLX_BURST_EXPONENT, (u64)burst_exp) |
+ FIELD_PREP(TLX_BURST_MANTISSA, (u64)burst_mantissa) |
+ FIELD_PREP(TLX_RATE_DIVIDER_EXPONENT, div_exp) |
+ FIELD_PREP(TLX_RATE_EXPONENT, exp) |
+ FIELD_PREP(TLX_RATE_MANTISSA, mantissa) | BIT_ULL(0);
+ } else {
+ regval = FIELD_PREP(CN10K_TLX_BURST_EXPONENT, (u64)burst_exp) |
+ FIELD_PREP(CN10K_TLX_BURST_MANTISSA, (u64)burst_mantissa) |
+ FIELD_PREP(TLX_RATE_DIVIDER_EXPONENT, div_exp) |
+ FIELD_PREP(TLX_RATE_EXPONENT, exp) |
+ FIELD_PREP(TLX_RATE_MANTISSA, mantissa) | BIT_ULL(0);
+ }
+
+ return regval;
+}
+
+static int otx2_set_matchall_egress_rate(struct otx2_nic *nic,
+ u32 burst, u64 maxrate)
+{
+ struct otx2_hw *hw = &nic->hw;
+ struct nix_txschq_config *req;
int txschq, err;
/* All SQs share the same TL4, so pick the first scheduler */
txschq = hw->txschq_list[NIX_TXSCH_LVL_TL4][0];
- /* Get exponent and mantissa values from the desired rate */
- otx2_get_egress_burst_cfg(burst, &burst_exp, &burst_mantissa);
- otx2_get_egress_rate_cfg(maxrate, &exp, &mantissa, &div_exp);
-
mutex_lock(&nic->mbox.lock);
req = otx2_mbox_alloc_msg_nix_txschq_cfg(&nic->mbox);
if (!req) {
@@ -157,11 +195,7 @@ static int otx2_set_matchall_egress_rate(struct otx2_nic *nic, u32 burst, u32 ma
req->lvl = NIX_TXSCH_LVL_TL4;
req->num_regs = 1;
req->reg[0] = NIX_AF_TL4X_PIR(txschq);
- req->regval[0] = FIELD_PREP(TLX_BURST_EXPONENT, burst_exp) |
- FIELD_PREP(TLX_BURST_MANTISSA, burst_mantissa) |
- FIELD_PREP(TLX_RATE_DIVIDER_EXPONENT, div_exp) |
- FIELD_PREP(TLX_RATE_EXPONENT, exp) |
- FIELD_PREP(TLX_RATE_MANTISSA, mantissa) | BIT_ULL(0);
+ req->regval[0] = otx2_get_txschq_rate_regval(nic, maxrate, burst);
err = otx2_sync_mbox_msg(&nic->mbox);
mutex_unlock(&nic->mbox.lock);
@@ -230,7 +264,7 @@ static int otx2_tc_egress_matchall_install(struct otx2_nic *nic,
struct netlink_ext_ack *extack = cls->common.extack;
struct flow_action *actions = &cls->rule->action;
struct flow_action_entry *entry;
- u32 rate;
+ u64 rate;
int err;
err = otx2_tc_validate_flow(nic, actions, extack);
@@ -256,7 +290,7 @@ static int otx2_tc_egress_matchall_install(struct otx2_nic *nic,
}
/* Convert bytes per second to Mbps */
rate = entry->police.rate_bytes_ps * 8;
- rate = max_t(u32, rate / 1000000, 1);
+ rate = max_t(u64, rate / 1000000, 1);
err = otx2_set_matchall_egress_rate(nic, entry->police.burst, rate);
if (err)
return err;
@@ -614,21 +648,27 @@ static int otx2_tc_prepare_flow(struct otx2_nic *nic, struct otx2_tc_flow *node,
flow_spec->dport = match.key->dst;
flow_mask->dport = match.mask->dst;
- if (ip_proto == IPPROTO_UDP)
- req->features |= BIT_ULL(NPC_DPORT_UDP);
- else if (ip_proto == IPPROTO_TCP)
- req->features |= BIT_ULL(NPC_DPORT_TCP);
- else if (ip_proto == IPPROTO_SCTP)
- req->features |= BIT_ULL(NPC_DPORT_SCTP);
+
+ if (flow_mask->dport) {
+ if (ip_proto == IPPROTO_UDP)
+ req->features |= BIT_ULL(NPC_DPORT_UDP);
+ else if (ip_proto == IPPROTO_TCP)
+ req->features |= BIT_ULL(NPC_DPORT_TCP);
+ else if (ip_proto == IPPROTO_SCTP)
+ req->features |= BIT_ULL(NPC_DPORT_SCTP);
+ }
flow_spec->sport = match.key->src;
flow_mask->sport = match.mask->src;
- if (ip_proto == IPPROTO_UDP)
- req->features |= BIT_ULL(NPC_SPORT_UDP);
- else if (ip_proto == IPPROTO_TCP)
- req->features |= BIT_ULL(NPC_SPORT_TCP);
- else if (ip_proto == IPPROTO_SCTP)
- req->features |= BIT_ULL(NPC_SPORT_SCTP);
+
+ if (flow_mask->sport) {
+ if (ip_proto == IPPROTO_UDP)
+ req->features |= BIT_ULL(NPC_SPORT_UDP);
+ else if (ip_proto == IPPROTO_TCP)
+ req->features |= BIT_ULL(NPC_SPORT_TCP);
+ else if (ip_proto == IPPROTO_SCTP)
+ req->features |= BIT_ULL(NPC_SPORT_SCTP);
+ }
}
return otx2_tc_parse_actions(nic, &rule->action, req, f, node);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index e31f8fbbc696..df2ab5cbd49b 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -4233,7 +4233,7 @@ static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
}
/* If the chain is ended by an load/store pair then this
- * could serve as the new head of the the next chain.
+ * could serve as the new head of the next chain.
*/
if (curr_pair_is_memcpy(meta1, meta2)) {
head_ld_meta = meta1;
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index 4625f85acab2..10ad0b93d283 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -1100,7 +1100,29 @@ static void efx_ptp_xmit_skb_queue(struct efx_nic *efx, struct sk_buff *skb)
tx_queue = efx_channel_get_tx_queue(ptp_data->channel, type);
if (tx_queue && tx_queue->timestamping) {
+ /* This code invokes normal driver TX code which is always
+ * protected from softirqs when called from generic TX code,
+ * which in turn disables preemption. Look at __dev_queue_xmit
+ * which uses rcu_read_lock_bh disabling preemption for RCU
+ * plus disabling softirqs. We do not need RCU reader
+ * protection here.
+ *
+ * Although it is theoretically safe for current PTP TX/RX code
+ * running without disabling softirqs, there are three good
+ * reasond for doing so:
+ *
+ * 1) The code invoked is mainly implemented for non-PTP
+ * packets and it is always executed with softirqs
+ * disabled.
+ * 2) This being a single PTP packet, better to not
+ * interrupt its processing by softirqs which can lead
+ * to high latencies.
+ * 3) netdev_xmit_more checks preemption is disabled and
+ * triggers a BUG_ON if not.
+ */
+ local_bh_disable();
efx_enqueue_skb(tx_queue, skb);
+ local_bh_enable();
} else {
WARN_ONCE(1, "PTP channel has no timestamped tx queue\n");
dev_kfree_skb_any(skb);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
index ca8ab290013c..d42e1afb6521 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c
@@ -688,18 +688,19 @@ static int mediatek_dwmac_probe(struct platform_device *pdev)
ret = mediatek_dwmac_clks_config(priv_plat, true);
if (ret)
- return ret;
+ goto err_remove_config_dt;
ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res);
- if (ret) {
- stmmac_remove_config_dt(pdev, plat_dat);
+ if (ret)
goto err_drv_probe;
- }
return 0;
err_drv_probe:
mediatek_dwmac_clks_config(priv_plat, false);
+err_remove_config_dt:
+ stmmac_remove_config_dt(pdev, plat_dat);
+
return ret;
}
diff --git a/drivers/net/ipa/ipa_qmi_msg.h b/drivers/net/ipa/ipa_qmi_msg.h
index 3233d145fd87..495e85abe50b 100644
--- a/drivers/net/ipa/ipa_qmi_msg.h
+++ b/drivers/net/ipa/ipa_qmi_msg.h
@@ -214,7 +214,7 @@ struct ipa_init_modem_driver_req {
/* The response to a IPA_QMI_INIT_DRIVER request begins with a standard
* QMI response, but contains other information as well. Currently we
- * simply wait for the the INIT_DRIVER transaction to complete and
+ * simply wait for the INIT_DRIVER transaction to complete and
* ignore any other data that might be returned.
*/
struct ipa_init_modem_driver_rsp {
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 817577e713d7..f354fad05714 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -243,6 +243,7 @@ static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb)
#define DEFAULT_SEND_SCI true
#define DEFAULT_ENCRYPT false
#define DEFAULT_ENCODING_SA 0
+#define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1))
static bool send_sci(const struct macsec_secy *secy)
{
@@ -1697,7 +1698,7 @@ static bool validate_add_rxsa(struct nlattr **attrs)
return false;
if (attrs[MACSEC_SA_ATTR_PN] &&
- *(u64 *)nla_data(attrs[MACSEC_SA_ATTR_PN]) == 0)
+ nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0)
return false;
if (attrs[MACSEC_SA_ATTR_ACTIVE]) {
@@ -1753,7 +1754,8 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info)
}
pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN;
- if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) {
+ if (tb_sa[MACSEC_SA_ATTR_PN] &&
+ nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) {
pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n",
nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len);
rtnl_unlock();
@@ -1769,7 +1771,7 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info)
if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) {
pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n",
nla_len(tb_sa[MACSEC_SA_ATTR_SALT]),
- MACSEC_SA_ATTR_SALT);
+ MACSEC_SALT_LEN);
rtnl_unlock();
return -EINVAL;
}
@@ -1842,7 +1844,7 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info)
return 0;
cleanup:
- kfree(rx_sa);
+ macsec_rxsa_put(rx_sa);
rtnl_unlock();
return err;
}
@@ -1939,7 +1941,7 @@ static bool validate_add_txsa(struct nlattr **attrs)
if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN)
return false;
- if (nla_get_u32(attrs[MACSEC_SA_ATTR_PN]) == 0)
+ if (nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0)
return false;
if (attrs[MACSEC_SA_ATTR_ACTIVE]) {
@@ -2011,7 +2013,7 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info)
if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) {
pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n",
nla_len(tb_sa[MACSEC_SA_ATTR_SALT]),
- MACSEC_SA_ATTR_SALT);
+ MACSEC_SALT_LEN);
rtnl_unlock();
return -EINVAL;
}
@@ -2085,7 +2087,7 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info)
cleanup:
secy->operational = was_operational;
- kfree(tx_sa);
+ macsec_txsa_put(tx_sa);
rtnl_unlock();
return err;
}
@@ -2293,7 +2295,7 @@ static bool validate_upd_sa(struct nlattr **attrs)
if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN)
return false;
- if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u32(attrs[MACSEC_SA_ATTR_PN]) == 0)
+ if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0)
return false;
if (attrs[MACSEC_SA_ATTR_ACTIVE]) {
@@ -3745,9 +3747,6 @@ static int macsec_changelink_common(struct net_device *dev,
secy->operational = tx_sa && tx_sa->active;
}
- if (data[IFLA_MACSEC_WINDOW])
- secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]);
-
if (data[IFLA_MACSEC_ENCRYPT])
tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]);
@@ -3793,6 +3792,16 @@ static int macsec_changelink_common(struct net_device *dev,
}
}
+ if (data[IFLA_MACSEC_WINDOW]) {
+ secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]);
+
+ /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window
+ * for XPN cipher suites */
+ if (secy->xpn &&
+ secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW)
+ return -EINVAL;
+ }
+
return 0;
}
@@ -3822,7 +3831,7 @@ static int macsec_changelink(struct net_device *dev, struct nlattr *tb[],
ret = macsec_changelink_common(dev, data);
if (ret)
- return ret;
+ goto cleanup;
/* If h/w offloading is available, propagate to the device */
if (macsec_is_offloaded(macsec)) {
diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c
index 4cfd05c15aee..d25fbb9caeba 100644
--- a/drivers/net/pcs/pcs-xpcs.c
+++ b/drivers/net/pcs/pcs-xpcs.c
@@ -896,7 +896,7 @@ static int xpcs_get_state_c37_sgmii(struct dw_xpcs *xpcs,
*/
ret = xpcs_read(xpcs, MDIO_MMD_VEND2, DW_VR_MII_AN_INTR_STS);
if (ret < 0)
- return false;
+ return ret;
if (ret & DW_VR_MII_C37_ANSGM_SP_LNKSTS) {
int speed_value;
diff --git a/drivers/net/sungem_phy.c b/drivers/net/sungem_phy.c
index ff22b6b1c686..36803d932dff 100644
--- a/drivers/net/sungem_phy.c
+++ b/drivers/net/sungem_phy.c
@@ -450,6 +450,7 @@ static int bcm5421_init(struct mii_phy* phy)
int can_low_power = 1;
if (np == NULL || of_get_property(np, "no-autolowpower", NULL))
can_low_power = 0;
+ of_node_put(np);
if (can_low_power) {
/* Enable automatic low-power */
sungem_phy_write(phy, 0x1c, 0x9002);
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 356cf8dd4164..ec8e1b3108c3 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -242,9 +242,15 @@ struct virtnet_info {
/* Packet virtio header size */
u8 hdr_len;
- /* Work struct for refilling if we run low on memory. */
+ /* Work struct for delayed refilling if we run low on memory. */
struct delayed_work refill;
+ /* Is delayed refill enabled? */
+ bool refill_enabled;
+
+ /* The lock to synchronize the access to refill_enabled */
+ spinlock_t refill_lock;
+
/* Work struct for config space updates */
struct work_struct config_work;
@@ -348,6 +354,20 @@ static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
return p;
}
+static void enable_delayed_refill(struct virtnet_info *vi)
+{
+ spin_lock_bh(&vi->refill_lock);
+ vi->refill_enabled = true;
+ spin_unlock_bh(&vi->refill_lock);
+}
+
+static void disable_delayed_refill(struct virtnet_info *vi)
+{
+ spin_lock_bh(&vi->refill_lock);
+ vi->refill_enabled = false;
+ spin_unlock_bh(&vi->refill_lock);
+}
+
static void virtqueue_napi_schedule(struct napi_struct *napi,
struct virtqueue *vq)
{
@@ -1527,8 +1547,12 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
}
if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
- if (!try_fill_recv(vi, rq, GFP_ATOMIC))
- schedule_delayed_work(&vi->refill, 0);
+ if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
+ spin_lock(&vi->refill_lock);
+ if (vi->refill_enabled)
+ schedule_delayed_work(&vi->refill, 0);
+ spin_unlock(&vi->refill_lock);
+ }
}
u64_stats_update_begin(&rq->stats.syncp);
@@ -1651,6 +1675,8 @@ static int virtnet_open(struct net_device *dev)
struct virtnet_info *vi = netdev_priv(dev);
int i, err;
+ enable_delayed_refill(vi);
+
for (i = 0; i < vi->max_queue_pairs; i++) {
if (i < vi->curr_queue_pairs)
/* Make sure we have some buffers: if oom use wq. */
@@ -2033,6 +2059,8 @@ static int virtnet_close(struct net_device *dev)
struct virtnet_info *vi = netdev_priv(dev);
int i;
+ /* Make sure NAPI doesn't schedule refill work */
+ disable_delayed_refill(vi);
/* Make sure refill_work doesn't re-enable napi! */
cancel_delayed_work_sync(&vi->refill);
@@ -2792,6 +2820,8 @@ static int virtnet_restore_up(struct virtio_device *vdev)
virtio_device_ready(vdev);
+ enable_delayed_refill(vi);
+
if (netif_running(vi->dev)) {
err = virtnet_open(vi->dev);
if (err)
@@ -3535,6 +3565,7 @@ static int virtnet_probe(struct virtio_device *vdev)
vdev->priv = vi;
INIT_WORK(&vi->config_work, virtnet_config_changed_work);
+ spin_lock_init(&vi->refill_lock);
/* If we can receive ANY GSO packets, we must allocate large ones. */
if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 58c72d55769a..73d9fcba3b1c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3515,6 +3515,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+ { PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */
+ .driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
diff --git a/drivers/of/kexec.c b/drivers/of/kexec.c
index 8d374cc552be..f2e58ddfaed2 100644
--- a/drivers/of/kexec.c
+++ b/drivers/of/kexec.c
@@ -9,6 +9,7 @@
* Copyright (C) 2016 IBM Corporation
*/
+#include <linux/ima.h>
#include <linux/kernel.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
@@ -115,6 +116,7 @@ static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
return 0;
}
+#ifdef CONFIG_HAVE_IMA_KEXEC
/**
* ima_get_kexec_buffer - get IMA buffer from the previous kernel
* @addr: On successful return, set to point to the buffer contents.
@@ -122,16 +124,13 @@ static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
*
* Return: 0 on success, negative errno on error.
*/
-int ima_get_kexec_buffer(void **addr, size_t *size)
+int __init ima_get_kexec_buffer(void **addr, size_t *size)
{
int ret, len;
unsigned long tmp_addr;
size_t tmp_size;
const void *prop;
- if (!IS_ENABLED(CONFIG_HAVE_IMA_KEXEC))
- return -ENOTSUPP;
-
prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
if (!prop)
return -ENOENT;
@@ -149,16 +148,13 @@ int ima_get_kexec_buffer(void **addr, size_t *size)
/**
* ima_free_kexec_buffer - free memory used by the IMA buffer
*/
-int ima_free_kexec_buffer(void)
+int __init ima_free_kexec_buffer(void)
{
int ret;
unsigned long addr;
size_t size;
struct property *prop;
- if (!IS_ENABLED(CONFIG_HAVE_IMA_KEXEC))
- return -ENOTSUPP;
-
prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
if (!prop)
return -ENOENT;
@@ -173,6 +169,7 @@ int ima_free_kexec_buffer(void)
return memblock_phys_free(addr, size);
}
+#endif
/**
* remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
diff --git a/drivers/perf/arm-cci.c b/drivers/perf/arm-cci.c
index 96e09fa40909..03b1309875ae 100644
--- a/drivers/perf/arm-cci.c
+++ b/drivers/perf/arm-cci.c
@@ -1139,7 +1139,7 @@ static void cci_pmu_start(struct perf_event *event, int pmu_flags)
/*
* To handle interrupt latency, we always reprogram the period
- * regardlesss of PERF_EF_RELOAD.
+ * regardless of PERF_EF_RELOAD.
*/
if (pmu_flags & PERF_EF_RELOAD)
WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
@@ -1261,7 +1261,7 @@ static int validate_group(struct perf_event *event)
*/
.used_mask = mask,
};
- memset(mask, 0, BITS_TO_LONGS(cci_pmu->num_cntrs) * sizeof(unsigned long));
+ bitmap_zero(mask, cci_pmu->num_cntrs);
if (!validate_event(event->pmu, &fake_pmu, leader))
return -EINVAL;
@@ -1629,10 +1629,9 @@ static struct cci_pmu *cci_pmu_alloc(struct device *dev)
GFP_KERNEL);
if (!cci_pmu->hw_events.events)
return ERR_PTR(-ENOMEM);
- cci_pmu->hw_events.used_mask = devm_kcalloc(dev,
- BITS_TO_LONGS(CCI_PMU_MAX_HW_CNTRS(model)),
- sizeof(*cci_pmu->hw_events.used_mask),
- GFP_KERNEL);
+ cci_pmu->hw_events.used_mask = devm_bitmap_zalloc(dev,
+ CCI_PMU_MAX_HW_CNTRS(model),
+ GFP_KERNEL);
if (!cci_pmu->hw_events.used_mask)
return ERR_PTR(-ENOMEM);
diff --git a/drivers/perf/arm-ccn.c b/drivers/perf/arm-ccn.c
index 40b352e8aa7f..728d13d8e98a 100644
--- a/drivers/perf/arm-ccn.c
+++ b/drivers/perf/arm-ccn.c
@@ -1250,7 +1250,7 @@ static int arm_ccn_pmu_init(struct arm_ccn *ccn)
ccn->dt.cmp_mask[CCN_IDX_MASK_OPCODE].h = ~(0x1f << 9);
/* Get a convenient /sys/event_source/devices/ name */
- ccn->dt.id = ida_simple_get(&arm_ccn_pmu_ida, 0, 0, GFP_KERNEL);
+ ccn->dt.id = ida_alloc(&arm_ccn_pmu_ida, GFP_KERNEL);
if (ccn->dt.id == 0) {
name = "ccn";
} else {
@@ -1312,7 +1312,7 @@ error_pmu_register:
&ccn->dt.node);
error_set_affinity:
error_choose_name:
- ida_simple_remove(&arm_ccn_pmu_ida, ccn->dt.id);
+ ida_free(&arm_ccn_pmu_ida, ccn->dt.id);
for (i = 0; i < ccn->num_xps; i++)
writel(0, ccn->xp[i].base + CCN_XP_DT_CONTROL);
writel(0, ccn->dt.base + CCN_DT_PMCR);
@@ -1329,7 +1329,7 @@ static void arm_ccn_pmu_cleanup(struct arm_ccn *ccn)
writel(0, ccn->xp[i].base + CCN_XP_DT_CONTROL);
writel(0, ccn->dt.base + CCN_DT_PMCR);
perf_pmu_unregister(&ccn->dt.pmu);
- ida_simple_remove(&arm_ccn_pmu_ida, ccn->dt.id);
+ ida_free(&arm_ccn_pmu_ida, ccn->dt.id);
}
static int arm_ccn_for_each_valid_region(struct arm_ccn *ccn,
diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index db670b265897..b65a7d9640e1 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -39,6 +39,24 @@
#include <asm/mmu.h>
#include <asm/sysreg.h>
+/*
+ * Cache if the event is allowed to trace Context information.
+ * This allows us to perform the check, i.e, perfmon_capable(),
+ * in the context of the event owner, once, during the event_init().
+ */
+#define SPE_PMU_HW_FLAGS_CX BIT(0)
+
+static void set_spe_event_has_cx(struct perf_event *event)
+{
+ if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable())
+ event->hw.flags |= SPE_PMU_HW_FLAGS_CX;
+}
+
+static bool get_spe_event_has_cx(struct perf_event *event)
+{
+ return !!(event->hw.flags & SPE_PMU_HW_FLAGS_CX);
+}
+
#define ARM_SPE_BUF_PAD_BYTE 0
struct arm_spe_pmu_buf {
@@ -272,7 +290,7 @@ static u64 arm_spe_event_to_pmscr(struct perf_event *event)
if (!attr->exclude_kernel)
reg |= BIT(SYS_PMSCR_EL1_E1SPE_SHIFT);
- if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR) && perfmon_capable())
+ if (get_spe_event_has_cx(event))
reg |= BIT(SYS_PMSCR_EL1_CX_SHIFT);
return reg;
@@ -709,10 +727,10 @@ static int arm_spe_pmu_event_init(struct perf_event *event)
!(spe_pmu->features & SPE_PMU_FEAT_FILT_LAT))
return -EOPNOTSUPP;
+ set_spe_event_has_cx(event);
reg = arm_spe_event_to_pmscr(event);
if (!perfmon_capable() &&
(reg & (BIT(SYS_PMSCR_EL1_PA_SHIFT) |
- BIT(SYS_PMSCR_EL1_CX_SHIFT) |
BIT(SYS_PMSCR_EL1_PCT_SHIFT))))
return -EACCES;
diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c
index b1b2a55de77f..8e058e08fe81 100644
--- a/drivers/perf/fsl_imx8_ddr_perf.c
+++ b/drivers/perf/fsl_imx8_ddr_perf.c
@@ -611,7 +611,7 @@ static int ddr_perf_init(struct ddr_pmu *pmu, void __iomem *base,
.dev = dev,
};
- pmu->id = ida_simple_get(&ddr_ida, 0, 0, GFP_KERNEL);
+ pmu->id = ida_alloc(&ddr_ida, GFP_KERNEL);
return pmu->id;
}
@@ -765,7 +765,7 @@ ddr_perf_err:
cpuhp_instance_err:
cpuhp_remove_multi_state(pmu->cpuhp_state);
cpuhp_state_err:
- ida_simple_remove(&ddr_ida, pmu->id);
+ ida_free(&ddr_ida, pmu->id);
dev_warn(&pdev->dev, "i.MX8 DDR Perf PMU failed (%d), disabled\n", ret);
return ret;
}
@@ -779,7 +779,7 @@ static int ddr_perf_remove(struct platform_device *pdev)
perf_pmu_unregister(&pmu->pmu);
- ida_simple_remove(&ddr_ida, pmu->id);
+ ida_free(&ddr_ida, pmu->id);
return 0;
}
diff --git a/drivers/perf/hisilicon/Kconfig b/drivers/perf/hisilicon/Kconfig
index 5546218b5598..171bfc1b6bc2 100644
--- a/drivers/perf/hisilicon/Kconfig
+++ b/drivers/perf/hisilicon/Kconfig
@@ -14,3 +14,13 @@ config HISI_PCIE_PMU
RCiEP devices.
Adds the PCIe PMU into perf events system for monitoring latency,
bandwidth etc.
+
+config HNS3_PMU
+ tristate "HNS3 PERF PMU"
+ depends on ARM64 || COMPILE_TEST
+ depends on PCI
+ help
+ Provide support for HNS3 performance monitoring unit (PMU) RCiEP
+ devices.
+ Adds the HNS3 PMU into perf events system for monitoring latency,
+ bandwidth etc.
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 6be83517acaa..4d2c9abe3372 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \
hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o
obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o
+obj-$(CONFIG_HNS3_PMU) += hns3_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
index 62299ab5a9be..50d0c0a2f1fe 100644
--- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c
@@ -516,21 +516,7 @@ static int hisi_ddrc_pmu_probe(struct platform_device *pdev)
"hisi_sccl%u_ddrc%u", ddrc_pmu->sccl_id,
ddrc_pmu->index_id);
- ddrc_pmu->pmu = (struct pmu) {
- .name = name,
- .module = THIS_MODULE,
- .task_ctx_nr = perf_invalid_context,
- .event_init = hisi_uncore_pmu_event_init,
- .pmu_enable = hisi_uncore_pmu_enable,
- .pmu_disable = hisi_uncore_pmu_disable,
- .add = hisi_uncore_pmu_add,
- .del = hisi_uncore_pmu_del,
- .start = hisi_uncore_pmu_start,
- .stop = hisi_uncore_pmu_stop,
- .read = hisi_uncore_pmu_read,
- .attr_groups = ddrc_pmu->pmu_events.attr_groups,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
- };
+ hisi_pmu_init(&ddrc_pmu->pmu, name, ddrc_pmu->pmu_events.attr_groups, THIS_MODULE);
ret = perf_pmu_register(&ddrc_pmu->pmu, name, -1);
if (ret) {
diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
index 393513150106..13017b3412a5 100644
--- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c
@@ -519,21 +519,7 @@ static int hisi_hha_pmu_probe(struct platform_device *pdev)
name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%u_hha%u",
hha_pmu->sccl_id, hha_pmu->index_id);
- hha_pmu->pmu = (struct pmu) {
- .name = name,
- .module = THIS_MODULE,
- .task_ctx_nr = perf_invalid_context,
- .event_init = hisi_uncore_pmu_event_init,
- .pmu_enable = hisi_uncore_pmu_enable,
- .pmu_disable = hisi_uncore_pmu_disable,
- .add = hisi_uncore_pmu_add,
- .del = hisi_uncore_pmu_del,
- .start = hisi_uncore_pmu_start,
- .stop = hisi_uncore_pmu_stop,
- .read = hisi_uncore_pmu_read,
- .attr_groups = hha_pmu->pmu_events.attr_groups,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
- };
+ hisi_pmu_init(&hha_pmu->pmu, name, hha_pmu->pmu_events.attr_groups, THIS_MODULE);
ret = perf_pmu_register(&hha_pmu->pmu, name, -1);
if (ret) {
diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
index 560ab964c8b5..2995f3630d49 100644
--- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c
@@ -557,21 +557,7 @@ static int hisi_l3c_pmu_probe(struct platform_device *pdev)
*/
name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sccl%u_l3c%u",
l3c_pmu->sccl_id, l3c_pmu->ccl_id);
- l3c_pmu->pmu = (struct pmu) {
- .name = name,
- .module = THIS_MODULE,
- .task_ctx_nr = perf_invalid_context,
- .event_init = hisi_uncore_pmu_event_init,
- .pmu_enable = hisi_uncore_pmu_enable,
- .pmu_disable = hisi_uncore_pmu_disable,
- .add = hisi_uncore_pmu_add,
- .del = hisi_uncore_pmu_del,
- .start = hisi_uncore_pmu_start,
- .stop = hisi_uncore_pmu_stop,
- .read = hisi_uncore_pmu_read,
- .attr_groups = l3c_pmu->pmu_events.attr_groups,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
- };
+ hisi_pmu_init(&l3c_pmu->pmu, name, l3c_pmu->pmu_events.attr_groups, THIS_MODULE);
ret = perf_pmu_register(&l3c_pmu->pmu, name, -1);
if (ret) {
diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
index a0ee84d97c41..47d3cc9b6eec 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c
@@ -412,21 +412,7 @@ static int hisi_pa_pmu_probe(struct platform_device *pdev)
return ret;
}
- pa_pmu->pmu = (struct pmu) {
- .module = THIS_MODULE,
- .task_ctx_nr = perf_invalid_context,
- .event_init = hisi_uncore_pmu_event_init,
- .pmu_enable = hisi_uncore_pmu_enable,
- .pmu_disable = hisi_uncore_pmu_disable,
- .add = hisi_uncore_pmu_add,
- .del = hisi_uncore_pmu_del,
- .start = hisi_uncore_pmu_start,
- .stop = hisi_uncore_pmu_stop,
- .read = hisi_uncore_pmu_read,
- .attr_groups = pa_pmu->pmu_events.attr_groups,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
- };
-
+ hisi_pmu_init(&pa_pmu->pmu, name, pa_pmu->pmu_events.attr_groups, THIS_MODULE);
ret = perf_pmu_register(&pa_pmu->pmu, name, -1);
if (ret) {
dev_err(pa_pmu->dev, "PMU register failed, ret = %d\n", ret);
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c
index 980b9ee6eb14..fbc8a93d5eac 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c
@@ -531,4 +531,22 @@ int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
}
EXPORT_SYMBOL_GPL(hisi_uncore_pmu_offline_cpu);
+void hisi_pmu_init(struct pmu *pmu, const char *name,
+ const struct attribute_group **attr_groups, struct module *module)
+{
+ pmu->name = name;
+ pmu->module = module;
+ pmu->task_ctx_nr = perf_invalid_context;
+ pmu->event_init = hisi_uncore_pmu_event_init;
+ pmu->pmu_enable = hisi_uncore_pmu_enable;
+ pmu->pmu_disable = hisi_uncore_pmu_disable;
+ pmu->add = hisi_uncore_pmu_add;
+ pmu->del = hisi_uncore_pmu_del;
+ pmu->start = hisi_uncore_pmu_start;
+ pmu->stop = hisi_uncore_pmu_stop;
+ pmu->read = hisi_uncore_pmu_read;
+ pmu->attr_groups = attr_groups;
+}
+EXPORT_SYMBOL_GPL(hisi_pmu_init);
+
MODULE_LICENSE("GPL v2");
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h
index 96eeddad55ff..b59de33cd059 100644
--- a/drivers/perf/hisilicon/hisi_uncore_pmu.h
+++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h
@@ -121,4 +121,6 @@ ssize_t hisi_uncore_pmu_identifier_attr_show(struct device *dev,
int hisi_uncore_pmu_init_irq(struct hisi_pmu *hisi_pmu,
struct platform_device *pdev);
+void hisi_pmu_init(struct pmu *pmu, const char *name,
+ const struct attribute_group **attr_groups, struct module *module);
#endif /* __HISI_UNCORE_PMU_H__ */
diff --git a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
index 6aedc303ff56..b9c79f17230c 100644
--- a/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
+++ b/drivers/perf/hisilicon/hisi_uncore_sllc_pmu.c
@@ -445,20 +445,7 @@ static int hisi_sllc_pmu_probe(struct platform_device *pdev)
return ret;
}
- sllc_pmu->pmu = (struct pmu) {
- .module = THIS_MODULE,
- .task_ctx_nr = perf_invalid_context,
- .event_init = hisi_uncore_pmu_event_init,
- .pmu_enable = hisi_uncore_pmu_enable,
- .pmu_disable = hisi_uncore_pmu_disable,
- .add = hisi_uncore_pmu_add,
- .del = hisi_uncore_pmu_del,
- .start = hisi_uncore_pmu_start,
- .stop = hisi_uncore_pmu_stop,
- .read = hisi_uncore_pmu_read,
- .attr_groups = sllc_pmu->pmu_events.attr_groups,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
- };
+ hisi_pmu_init(&sllc_pmu->pmu, name, sllc_pmu->pmu_events.attr_groups, THIS_MODULE);
ret = perf_pmu_register(&sllc_pmu->pmu, name, -1);
if (ret) {
diff --git a/drivers/perf/hisilicon/hns3_pmu.c b/drivers/perf/hisilicon/hns3_pmu.c
new file mode 100644
index 000000000000..e0457d84af6b
--- /dev/null
+++ b/drivers/perf/hisilicon/hns3_pmu.c
@@ -0,0 +1,1671 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This driver adds support for HNS3 PMU iEP device. Related perf events are
+ * bandwidth, latency, packet rate, interrupt rate etc.
+ *
+ * Copyright (C) 2022 HiSilicon Limited
+ */
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/cpuhotplug.h>
+#include <linux/cpumask.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/iopoll.h>
+#include <linux/io-64-nonatomic-hi-lo.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci-epf.h>
+#include <linux/perf_event.h>
+#include <linux/smp.h>
+
+/* registers offset address */
+#define HNS3_PMU_REG_GLOBAL_CTRL 0x0000
+#define HNS3_PMU_REG_CLOCK_FREQ 0x0020
+#define HNS3_PMU_REG_BDF 0x0fe0
+#define HNS3_PMU_REG_VERSION 0x0fe4
+#define HNS3_PMU_REG_DEVICE_ID 0x0fe8
+
+#define HNS3_PMU_REG_EVENT_OFFSET 0x1000
+#define HNS3_PMU_REG_EVENT_SIZE 0x1000
+#define HNS3_PMU_REG_EVENT_CTRL_LOW 0x00
+#define HNS3_PMU_REG_EVENT_CTRL_HIGH 0x04
+#define HNS3_PMU_REG_EVENT_INTR_STATUS 0x08
+#define HNS3_PMU_REG_EVENT_INTR_MASK 0x0c
+#define HNS3_PMU_REG_EVENT_COUNTER 0x10
+#define HNS3_PMU_REG_EVENT_EXT_COUNTER 0x18
+#define HNS3_PMU_REG_EVENT_QID_CTRL 0x28
+#define HNS3_PMU_REG_EVENT_QID_PARA 0x2c
+
+#define HNS3_PMU_FILTER_SUPPORT_GLOBAL BIT(0)
+#define HNS3_PMU_FILTER_SUPPORT_PORT BIT(1)
+#define HNS3_PMU_FILTER_SUPPORT_PORT_TC BIT(2)
+#define HNS3_PMU_FILTER_SUPPORT_FUNC BIT(3)
+#define HNS3_PMU_FILTER_SUPPORT_FUNC_QUEUE BIT(4)
+#define HNS3_PMU_FILTER_SUPPORT_FUNC_INTR BIT(5)
+
+#define HNS3_PMU_FILTER_ALL_TC 0xf
+#define HNS3_PMU_FILTER_ALL_QUEUE 0xffff
+
+#define HNS3_PMU_CTRL_SUBEVENT_S 4
+#define HNS3_PMU_CTRL_FILTER_MODE_S 24
+
+#define HNS3_PMU_GLOBAL_START BIT(0)
+
+#define HNS3_PMU_EVENT_STATUS_RESET BIT(11)
+#define HNS3_PMU_EVENT_EN BIT(12)
+#define HNS3_PMU_EVENT_OVERFLOW_RESTART BIT(15)
+
+#define HNS3_PMU_QID_PARA_FUNC_S 0
+#define HNS3_PMU_QID_PARA_QUEUE_S 16
+
+#define HNS3_PMU_QID_CTRL_REQ_ENABLE BIT(0)
+#define HNS3_PMU_QID_CTRL_DONE BIT(1)
+#define HNS3_PMU_QID_CTRL_MISS BIT(2)
+
+#define HNS3_PMU_INTR_MASK_OVERFLOW BIT(1)
+
+#define HNS3_PMU_MAX_HW_EVENTS 8
+
+/*
+ * Each hardware event contains two registers (counter and ext_counter) for
+ * bandwidth, packet rate, latency and interrupt rate. These two registers will
+ * be triggered to run at the same when a hardware event is enabled. The meaning
+ * of counter and ext_counter of different event type are different, their
+ * meaning show as follow:
+ *
+ * +----------------+------------------+---------------+
+ * | event type | counter | ext_counter |
+ * +----------------+------------------+---------------+
+ * | bandwidth | byte number | cycle number |
+ * +----------------+------------------+---------------+
+ * | packet rate | packet number | cycle number |
+ * +----------------+------------------+---------------+
+ * | latency | cycle number | packet number |
+ * +----------------+------------------+---------------+
+ * | interrupt rate | interrupt number | cycle number |
+ * +----------------+------------------+---------------+
+ *
+ * The cycle number indicates increment of counter of hardware timer, the
+ * frequency of hardware timer can be read from hw_clk_freq file.
+ *
+ * Performance of each hardware event is calculated by: counter / ext_counter.
+ *
+ * Since processing of data is preferred to be done in userspace, we expose
+ * ext_counter as a separate event for userspace and use bit 16 to indicate it.
+ * For example, event 0x00001 and 0x10001 are actually one event for hardware
+ * because bit 0-15 are same. If the bit 16 of one event is 0 means to read
+ * counter register, otherwise means to read ext_counter register.
+ */
+/* bandwidth events */
+#define HNS3_PMU_EVT_BW_SSU_EGU_BYTE_NUM 0x00001
+#define HNS3_PMU_EVT_BW_SSU_EGU_TIME 0x10001
+#define HNS3_PMU_EVT_BW_SSU_RPU_BYTE_NUM 0x00002
+#define HNS3_PMU_EVT_BW_SSU_RPU_TIME 0x10002
+#define HNS3_PMU_EVT_BW_SSU_ROCE_BYTE_NUM 0x00003
+#define HNS3_PMU_EVT_BW_SSU_ROCE_TIME 0x10003
+#define HNS3_PMU_EVT_BW_ROCE_SSU_BYTE_NUM 0x00004
+#define HNS3_PMU_EVT_BW_ROCE_SSU_TIME 0x10004
+#define HNS3_PMU_EVT_BW_TPU_SSU_BYTE_NUM 0x00005
+#define HNS3_PMU_EVT_BW_TPU_SSU_TIME 0x10005
+#define HNS3_PMU_EVT_BW_RPU_RCBRX_BYTE_NUM 0x00006
+#define HNS3_PMU_EVT_BW_RPU_RCBRX_TIME 0x10006
+#define HNS3_PMU_EVT_BW_RCBTX_TXSCH_BYTE_NUM 0x00008
+#define HNS3_PMU_EVT_BW_RCBTX_TXSCH_TIME 0x10008
+#define HNS3_PMU_EVT_BW_WR_FBD_BYTE_NUM 0x00009
+#define HNS3_PMU_EVT_BW_WR_FBD_TIME 0x10009
+#define HNS3_PMU_EVT_BW_WR_EBD_BYTE_NUM 0x0000a
+#define HNS3_PMU_EVT_BW_WR_EBD_TIME 0x1000a
+#define HNS3_PMU_EVT_BW_RD_FBD_BYTE_NUM 0x0000b
+#define HNS3_PMU_EVT_BW_RD_FBD_TIME 0x1000b
+#define HNS3_PMU_EVT_BW_RD_EBD_BYTE_NUM 0x0000c
+#define HNS3_PMU_EVT_BW_RD_EBD_TIME 0x1000c
+#define HNS3_PMU_EVT_BW_RD_PAY_M0_BYTE_NUM 0x0000d
+#define HNS3_PMU_EVT_BW_RD_PAY_M0_TIME 0x1000d
+#define HNS3_PMU_EVT_BW_RD_PAY_M1_BYTE_NUM 0x0000e
+#define HNS3_PMU_EVT_BW_RD_PAY_M1_TIME 0x1000e
+#define HNS3_PMU_EVT_BW_WR_PAY_M0_BYTE_NUM 0x0000f
+#define HNS3_PMU_EVT_BW_WR_PAY_M0_TIME 0x1000f
+#define HNS3_PMU_EVT_BW_WR_PAY_M1_BYTE_NUM 0x00010
+#define HNS3_PMU_EVT_BW_WR_PAY_M1_TIME 0x10010
+
+/* packet rate events */
+#define HNS3_PMU_EVT_PPS_IGU_SSU_PACKET_NUM 0x00100
+#define HNS3_PMU_EVT_PPS_IGU_SSU_TIME 0x10100
+#define HNS3_PMU_EVT_PPS_SSU_EGU_PACKET_NUM 0x00101
+#define HNS3_PMU_EVT_PPS_SSU_EGU_TIME 0x10101
+#define HNS3_PMU_EVT_PPS_SSU_RPU_PACKET_NUM 0x00102
+#define HNS3_PMU_EVT_PPS_SSU_RPU_TIME 0x10102
+#define HNS3_PMU_EVT_PPS_SSU_ROCE_PACKET_NUM 0x00103
+#define HNS3_PMU_EVT_PPS_SSU_ROCE_TIME 0x10103
+#define HNS3_PMU_EVT_PPS_ROCE_SSU_PACKET_NUM 0x00104
+#define HNS3_PMU_EVT_PPS_ROCE_SSU_TIME 0x10104
+#define HNS3_PMU_EVT_PPS_TPU_SSU_PACKET_NUM 0x00105
+#define HNS3_PMU_EVT_PPS_TPU_SSU_TIME 0x10105
+#define HNS3_PMU_EVT_PPS_RPU_RCBRX_PACKET_NUM 0x00106
+#define HNS3_PMU_EVT_PPS_RPU_RCBRX_TIME 0x10106
+#define HNS3_PMU_EVT_PPS_RCBTX_TPU_PACKET_NUM 0x00107
+#define HNS3_PMU_EVT_PPS_RCBTX_TPU_TIME 0x10107
+#define HNS3_PMU_EVT_PPS_RCBTX_TXSCH_PACKET_NUM 0x00108
+#define HNS3_PMU_EVT_PPS_RCBTX_TXSCH_TIME 0x10108
+#define HNS3_PMU_EVT_PPS_WR_FBD_PACKET_NUM 0x00109
+#define HNS3_PMU_EVT_PPS_WR_FBD_TIME 0x10109
+#define HNS3_PMU_EVT_PPS_WR_EBD_PACKET_NUM 0x0010a
+#define HNS3_PMU_EVT_PPS_WR_EBD_TIME 0x1010a
+#define HNS3_PMU_EVT_PPS_RD_FBD_PACKET_NUM 0x0010b
+#define HNS3_PMU_EVT_PPS_RD_FBD_TIME 0x1010b
+#define HNS3_PMU_EVT_PPS_RD_EBD_PACKET_NUM 0x0010c
+#define HNS3_PMU_EVT_PPS_RD_EBD_TIME 0x1010c
+#define HNS3_PMU_EVT_PPS_RD_PAY_M0_PACKET_NUM 0x0010d
+#define HNS3_PMU_EVT_PPS_RD_PAY_M0_TIME 0x1010d
+#define HNS3_PMU_EVT_PPS_RD_PAY_M1_PACKET_NUM 0x0010e
+#define HNS3_PMU_EVT_PPS_RD_PAY_M1_TIME 0x1010e
+#define HNS3_PMU_EVT_PPS_WR_PAY_M0_PACKET_NUM 0x0010f
+#define HNS3_PMU_EVT_PPS_WR_PAY_M0_TIME 0x1010f
+#define HNS3_PMU_EVT_PPS_WR_PAY_M1_PACKET_NUM 0x00110
+#define HNS3_PMU_EVT_PPS_WR_PAY_M1_TIME 0x10110
+#define HNS3_PMU_EVT_PPS_NICROH_TX_PRE_PACKET_NUM 0x00111
+#define HNS3_PMU_EVT_PPS_NICROH_TX_PRE_TIME 0x10111
+#define HNS3_PMU_EVT_PPS_NICROH_RX_PRE_PACKET_NUM 0x00112
+#define HNS3_PMU_EVT_PPS_NICROH_RX_PRE_TIME 0x10112
+
+/* latency events */
+#define HNS3_PMU_EVT_DLY_TX_PUSH_TIME 0x00202
+#define HNS3_PMU_EVT_DLY_TX_PUSH_PACKET_NUM 0x10202
+#define HNS3_PMU_EVT_DLY_TX_TIME 0x00204
+#define HNS3_PMU_EVT_DLY_TX_PACKET_NUM 0x10204
+#define HNS3_PMU_EVT_DLY_SSU_TX_NIC_TIME 0x00206
+#define HNS3_PMU_EVT_DLY_SSU_TX_NIC_PACKET_NUM 0x10206
+#define HNS3_PMU_EVT_DLY_SSU_TX_ROCE_TIME 0x00207
+#define HNS3_PMU_EVT_DLY_SSU_TX_ROCE_PACKET_NUM 0x10207
+#define HNS3_PMU_EVT_DLY_SSU_RX_NIC_TIME 0x00208
+#define HNS3_PMU_EVT_DLY_SSU_RX_NIC_PACKET_NUM 0x10208
+#define HNS3_PMU_EVT_DLY_SSU_RX_ROCE_TIME 0x00209
+#define HNS3_PMU_EVT_DLY_SSU_RX_ROCE_PACKET_NUM 0x10209
+#define HNS3_PMU_EVT_DLY_RPU_TIME 0x0020e
+#define HNS3_PMU_EVT_DLY_RPU_PACKET_NUM 0x1020e
+#define HNS3_PMU_EVT_DLY_TPU_TIME 0x0020f
+#define HNS3_PMU_EVT_DLY_TPU_PACKET_NUM 0x1020f
+#define HNS3_PMU_EVT_DLY_RPE_TIME 0x00210
+#define HNS3_PMU_EVT_DLY_RPE_PACKET_NUM 0x10210
+#define HNS3_PMU_EVT_DLY_TPE_TIME 0x00211
+#define HNS3_PMU_EVT_DLY_TPE_PACKET_NUM 0x10211
+#define HNS3_PMU_EVT_DLY_TPE_PUSH_TIME 0x00212
+#define HNS3_PMU_EVT_DLY_TPE_PUSH_PACKET_NUM 0x10212
+#define HNS3_PMU_EVT_DLY_WR_FBD_TIME 0x00213
+#define HNS3_PMU_EVT_DLY_WR_FBD_PACKET_NUM 0x10213
+#define HNS3_PMU_EVT_DLY_WR_EBD_TIME 0x00214
+#define HNS3_PMU_EVT_DLY_WR_EBD_PACKET_NUM 0x10214
+#define HNS3_PMU_EVT_DLY_RD_FBD_TIME 0x00215
+#define HNS3_PMU_EVT_DLY_RD_FBD_PACKET_NUM 0x10215
+#define HNS3_PMU_EVT_DLY_RD_EBD_TIME 0x00216
+#define HNS3_PMU_EVT_DLY_RD_EBD_PACKET_NUM 0x10216
+#define HNS3_PMU_EVT_DLY_RD_PAY_M0_TIME 0x00217
+#define HNS3_PMU_EVT_DLY_RD_PAY_M0_PACKET_NUM 0x10217
+#define HNS3_PMU_EVT_DLY_RD_PAY_M1_TIME 0x00218
+#define HNS3_PMU_EVT_DLY_RD_PAY_M1_PACKET_NUM 0x10218
+#define HNS3_PMU_EVT_DLY_WR_PAY_M0_TIME 0x00219
+#define HNS3_PMU_EVT_DLY_WR_PAY_M0_PACKET_NUM 0x10219
+#define HNS3_PMU_EVT_DLY_WR_PAY_M1_TIME 0x0021a
+#define HNS3_PMU_EVT_DLY_WR_PAY_M1_PACKET_NUM 0x1021a
+#define HNS3_PMU_EVT_DLY_MSIX_WRITE_TIME 0x0021c
+#define HNS3_PMU_EVT_DLY_MSIX_WRITE_PACKET_NUM 0x1021c
+
+/* interrupt rate events */
+#define HNS3_PMU_EVT_PPS_MSIX_NIC_INTR_NUM 0x00300
+#define HNS3_PMU_EVT_PPS_MSIX_NIC_TIME 0x10300
+
+/* filter mode supported by each bandwidth event */
+#define HNS3_PMU_FILTER_BW_SSU_EGU 0x07
+#define HNS3_PMU_FILTER_BW_SSU_RPU 0x1f
+#define HNS3_PMU_FILTER_BW_SSU_ROCE 0x0f
+#define HNS3_PMU_FILTER_BW_ROCE_SSU 0x0f
+#define HNS3_PMU_FILTER_BW_TPU_SSU 0x1f
+#define HNS3_PMU_FILTER_BW_RPU_RCBRX 0x11
+#define HNS3_PMU_FILTER_BW_RCBTX_TXSCH 0x11
+#define HNS3_PMU_FILTER_BW_WR_FBD 0x1b
+#define HNS3_PMU_FILTER_BW_WR_EBD 0x11
+#define HNS3_PMU_FILTER_BW_RD_FBD 0x01
+#define HNS3_PMU_FILTER_BW_RD_EBD 0x1b
+#define HNS3_PMU_FILTER_BW_RD_PAY_M0 0x01
+#define HNS3_PMU_FILTER_BW_RD_PAY_M1 0x01
+#define HNS3_PMU_FILTER_BW_WR_PAY_M0 0x01
+#define HNS3_PMU_FILTER_BW_WR_PAY_M1 0x01
+
+/* filter mode supported by each packet rate event */
+#define HNS3_PMU_FILTER_PPS_IGU_SSU 0x07
+#define HNS3_PMU_FILTER_PPS_SSU_EGU 0x07
+#define HNS3_PMU_FILTER_PPS_SSU_RPU 0x1f
+#define HNS3_PMU_FILTER_PPS_SSU_ROCE 0x0f
+#define HNS3_PMU_FILTER_PPS_ROCE_SSU 0x0f
+#define HNS3_PMU_FILTER_PPS_TPU_SSU 0x1f
+#define HNS3_PMU_FILTER_PPS_RPU_RCBRX 0x11
+#define HNS3_PMU_FILTER_PPS_RCBTX_TPU 0x1f
+#define HNS3_PMU_FILTER_PPS_RCBTX_TXSCH 0x11
+#define HNS3_PMU_FILTER_PPS_WR_FBD 0x1b
+#define HNS3_PMU_FILTER_PPS_WR_EBD 0x11
+#define HNS3_PMU_FILTER_PPS_RD_FBD 0x01
+#define HNS3_PMU_FILTER_PPS_RD_EBD 0x1b
+#define HNS3_PMU_FILTER_PPS_RD_PAY_M0 0x01
+#define HNS3_PMU_FILTER_PPS_RD_PAY_M1 0x01
+#define HNS3_PMU_FILTER_PPS_WR_PAY_M0 0x01
+#define HNS3_PMU_FILTER_PPS_WR_PAY_M1 0x01
+#define HNS3_PMU_FILTER_PPS_NICROH_TX_PRE 0x01
+#define HNS3_PMU_FILTER_PPS_NICROH_RX_PRE 0x01
+
+/* filter mode supported by each latency event */
+#define HNS3_PMU_FILTER_DLY_TX_PUSH 0x01
+#define HNS3_PMU_FILTER_DLY_TX 0x01
+#define HNS3_PMU_FILTER_DLY_SSU_TX_NIC 0x07
+#define HNS3_PMU_FILTER_DLY_SSU_TX_ROCE 0x07
+#define HNS3_PMU_FILTER_DLY_SSU_RX_NIC 0x07
+#define HNS3_PMU_FILTER_DLY_SSU_RX_ROCE 0x07
+#define HNS3_PMU_FILTER_DLY_RPU 0x11
+#define HNS3_PMU_FILTER_DLY_TPU 0x1f
+#define HNS3_PMU_FILTER_DLY_RPE 0x01
+#define HNS3_PMU_FILTER_DLY_TPE 0x0b
+#define HNS3_PMU_FILTER_DLY_TPE_PUSH 0x1b
+#define HNS3_PMU_FILTER_DLY_WR_FBD 0x1b
+#define HNS3_PMU_FILTER_DLY_WR_EBD 0x11
+#define HNS3_PMU_FILTER_DLY_RD_FBD 0x01
+#define HNS3_PMU_FILTER_DLY_RD_EBD 0x1b
+#define HNS3_PMU_FILTER_DLY_RD_PAY_M0 0x01
+#define HNS3_PMU_FILTER_DLY_RD_PAY_M1 0x01
+#define HNS3_PMU_FILTER_DLY_WR_PAY_M0 0x01
+#define HNS3_PMU_FILTER_DLY_WR_PAY_M1 0x01
+#define HNS3_PMU_FILTER_DLY_MSIX_WRITE 0x01
+
+/* filter mode supported by each interrupt rate event */
+#define HNS3_PMU_FILTER_INTR_MSIX_NIC 0x01
+
+enum hns3_pmu_hw_filter_mode {
+ HNS3_PMU_HW_FILTER_GLOBAL,
+ HNS3_PMU_HW_FILTER_PORT,
+ HNS3_PMU_HW_FILTER_PORT_TC,
+ HNS3_PMU_HW_FILTER_FUNC,
+ HNS3_PMU_HW_FILTER_FUNC_QUEUE,
+ HNS3_PMU_HW_FILTER_FUNC_INTR,
+};
+
+struct hns3_pmu_event_attr {
+ u32 event;
+ u16 filter_support;
+};
+
+struct hns3_pmu {
+ struct perf_event *hw_events[HNS3_PMU_MAX_HW_EVENTS];
+ struct hlist_node node;
+ struct pci_dev *pdev;
+ struct pmu pmu;
+ void __iomem *base;
+ int irq;
+ int on_cpu;
+ u32 identifier;
+ u32 hw_clk_freq; /* hardware clock frequency of PMU */
+ /* maximum and minimum bdf allowed by PMU */
+ u16 bdf_min;
+ u16 bdf_max;
+};
+
+#define to_hns3_pmu(p) (container_of((p), struct hns3_pmu, pmu))
+
+#define GET_PCI_DEVFN(bdf) ((bdf) & 0xff)
+
+#define FILTER_CONDITION_PORT(port) ((1 << (port)) & 0xff)
+#define FILTER_CONDITION_PORT_TC(port, tc) (((port) << 3) | ((tc) & 0x07))
+#define FILTER_CONDITION_FUNC_INTR(func, intr) (((intr) << 8) | (func))
+
+#define HNS3_PMU_FILTER_ATTR(_name, _config, _start, _end) \
+ static inline u64 hns3_pmu_get_##_name(struct perf_event *event) \
+ { \
+ return FIELD_GET(GENMASK_ULL(_end, _start), \
+ event->attr._config); \
+ }
+
+HNS3_PMU_FILTER_ATTR(subevent, config, 0, 7);
+HNS3_PMU_FILTER_ATTR(event_type, config, 8, 15);
+HNS3_PMU_FILTER_ATTR(ext_counter_used, config, 16, 16);
+HNS3_PMU_FILTER_ATTR(port, config1, 0, 3);
+HNS3_PMU_FILTER_ATTR(tc, config1, 4, 7);
+HNS3_PMU_FILTER_ATTR(bdf, config1, 8, 23);
+HNS3_PMU_FILTER_ATTR(queue, config1, 24, 39);
+HNS3_PMU_FILTER_ATTR(intr, config1, 40, 51);
+HNS3_PMU_FILTER_ATTR(global, config1, 52, 52);
+
+#define HNS3_BW_EVT_BYTE_NUM(_name) (&(struct hns3_pmu_event_attr) {\
+ HNS3_PMU_EVT_BW_##_name##_BYTE_NUM, \
+ HNS3_PMU_FILTER_BW_##_name})
+#define HNS3_BW_EVT_TIME(_name) (&(struct hns3_pmu_event_attr) {\
+ HNS3_PMU_EVT_BW_##_name##_TIME, \
+ HNS3_PMU_FILTER_BW_##_name})
+#define HNS3_PPS_EVT_PACKET_NUM(_name) (&(struct hns3_pmu_event_attr) {\
+ HNS3_PMU_EVT_PPS_##_name##_PACKET_NUM, \
+ HNS3_PMU_FILTER_PPS_##_name})
+#define HNS3_PPS_EVT_TIME(_name) (&(struct hns3_pmu_event_attr) {\
+ HNS3_PMU_EVT_PPS_##_name##_TIME, \
+ HNS3_PMU_FILTER_PPS_##_name})
+#define HNS3_DLY_EVT_TIME(_name) (&(struct hns3_pmu_event_attr) {\
+ HNS3_PMU_EVT_DLY_##_name##_TIME, \
+ HNS3_PMU_FILTER_DLY_##_name})
+#define HNS3_DLY_EVT_PACKET_NUM(_name) (&(struct hns3_pmu_event_attr) {\
+ HNS3_PMU_EVT_DLY_##_name##_PACKET_NUM, \
+ HNS3_PMU_FILTER_DLY_##_name})
+#define HNS3_INTR_EVT_INTR_NUM(_name) (&(struct hns3_pmu_event_attr) {\
+ HNS3_PMU_EVT_PPS_##_name##_INTR_NUM, \
+ HNS3_PMU_FILTER_INTR_##_name})
+#define HNS3_INTR_EVT_TIME(_name) (&(struct hns3_pmu_event_attr) {\
+ HNS3_PMU_EVT_PPS_##_name##_TIME, \
+ HNS3_PMU_FILTER_INTR_##_name})
+
+static ssize_t hns3_pmu_format_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct dev_ext_attribute *eattr;
+
+ eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+ return sysfs_emit(buf, "%s\n", (char *)eattr->var);
+}
+
+static ssize_t hns3_pmu_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hns3_pmu_event_attr *event;
+ struct dev_ext_attribute *eattr;
+
+ eattr = container_of(attr, struct dev_ext_attribute, attr);
+ event = eattr->var;
+
+ return sysfs_emit(buf, "config=0x%x\n", event->event);
+}
+
+static ssize_t hns3_pmu_filter_mode_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct hns3_pmu_event_attr *event;
+ struct dev_ext_attribute *eattr;
+ int len;
+
+ eattr = container_of(attr, struct dev_ext_attribute, attr);
+ event = eattr->var;
+
+ len = sysfs_emit_at(buf, 0, "filter mode supported: ");
+ if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_GLOBAL)
+ len += sysfs_emit_at(buf, len, "global ");
+ if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_PORT)
+ len += sysfs_emit_at(buf, len, "port ");
+ if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_PORT_TC)
+ len += sysfs_emit_at(buf, len, "port-tc ");
+ if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC)
+ len += sysfs_emit_at(buf, len, "func ");
+ if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC_QUEUE)
+ len += sysfs_emit_at(buf, len, "func-queue ");
+ if (event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC_INTR)
+ len += sysfs_emit_at(buf, len, "func-intr ");
+
+ len += sysfs_emit_at(buf, len, "\n");
+
+ return len;
+}
+
+#define HNS3_PMU_ATTR(_name, _func, _config) \
+ (&((struct dev_ext_attribute[]) { \
+ { __ATTR(_name, 0444, _func, NULL), (void *)_config } \
+ })[0].attr.attr)
+
+#define HNS3_PMU_FORMAT_ATTR(_name, _format) \
+ HNS3_PMU_ATTR(_name, hns3_pmu_format_show, (void *)_format)
+#define HNS3_PMU_EVENT_ATTR(_name, _event) \
+ HNS3_PMU_ATTR(_name, hns3_pmu_event_show, (void *)_event)
+#define HNS3_PMU_FLT_MODE_ATTR(_name, _event) \
+ HNS3_PMU_ATTR(_name, hns3_pmu_filter_mode_show, (void *)_event)
+
+#define HNS3_PMU_BW_EVT_PAIR(_name, _macro) \
+ HNS3_PMU_EVENT_ATTR(_name##_byte_num, HNS3_BW_EVT_BYTE_NUM(_macro)), \
+ HNS3_PMU_EVENT_ATTR(_name##_time, HNS3_BW_EVT_TIME(_macro))
+#define HNS3_PMU_PPS_EVT_PAIR(_name, _macro) \
+ HNS3_PMU_EVENT_ATTR(_name##_packet_num, HNS3_PPS_EVT_PACKET_NUM(_macro)), \
+ HNS3_PMU_EVENT_ATTR(_name##_time, HNS3_PPS_EVT_TIME(_macro))
+#define HNS3_PMU_DLY_EVT_PAIR(_name, _macro) \
+ HNS3_PMU_EVENT_ATTR(_name##_time, HNS3_DLY_EVT_TIME(_macro)), \
+ HNS3_PMU_EVENT_ATTR(_name##_packet_num, HNS3_DLY_EVT_PACKET_NUM(_macro))
+#define HNS3_PMU_INTR_EVT_PAIR(_name, _macro) \
+ HNS3_PMU_EVENT_ATTR(_name##_intr_num, HNS3_INTR_EVT_INTR_NUM(_macro)), \
+ HNS3_PMU_EVENT_ATTR(_name##_time, HNS3_INTR_EVT_TIME(_macro))
+
+#define HNS3_PMU_BW_FLT_MODE_PAIR(_name, _macro) \
+ HNS3_PMU_FLT_MODE_ATTR(_name##_byte_num, HNS3_BW_EVT_BYTE_NUM(_macro)), \
+ HNS3_PMU_FLT_MODE_ATTR(_name##_time, HNS3_BW_EVT_TIME(_macro))
+#define HNS3_PMU_PPS_FLT_MODE_PAIR(_name, _macro) \
+ HNS3_PMU_FLT_MODE_ATTR(_name##_packet_num, HNS3_PPS_EVT_PACKET_NUM(_macro)), \
+ HNS3_PMU_FLT_MODE_ATTR(_name##_time, HNS3_PPS_EVT_TIME(_macro))
+#define HNS3_PMU_DLY_FLT_MODE_PAIR(_name, _macro) \
+ HNS3_PMU_FLT_MODE_ATTR(_name##_time, HNS3_DLY_EVT_TIME(_macro)), \
+ HNS3_PMU_FLT_MODE_ATTR(_name##_packet_num, HNS3_DLY_EVT_PACKET_NUM(_macro))
+#define HNS3_PMU_INTR_FLT_MODE_PAIR(_name, _macro) \
+ HNS3_PMU_FLT_MODE_ATTR(_name##_intr_num, HNS3_INTR_EVT_INTR_NUM(_macro)), \
+ HNS3_PMU_FLT_MODE_ATTR(_name##_time, HNS3_INTR_EVT_TIME(_macro))
+
+static u8 hns3_pmu_hw_filter_modes[] = {
+ HNS3_PMU_HW_FILTER_GLOBAL,
+ HNS3_PMU_HW_FILTER_PORT,
+ HNS3_PMU_HW_FILTER_PORT_TC,
+ HNS3_PMU_HW_FILTER_FUNC,
+ HNS3_PMU_HW_FILTER_FUNC_QUEUE,
+ HNS3_PMU_HW_FILTER_FUNC_INTR,
+};
+
+#define HNS3_PMU_SET_HW_FILTER(_hwc, _mode) \
+ ((_hwc)->addr_filters = (void *)&hns3_pmu_hw_filter_modes[(_mode)])
+
+static ssize_t identifier_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+
+ return sysfs_emit(buf, "0x%x\n", hns3_pmu->identifier);
+}
+static DEVICE_ATTR_RO(identifier);
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+
+ return sysfs_emit(buf, "%d\n", hns3_pmu->on_cpu);
+}
+static DEVICE_ATTR_RO(cpumask);
+
+static ssize_t bdf_min_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+ u16 bdf = hns3_pmu->bdf_min;
+
+ return sysfs_emit(buf, "%02x:%02x.%x\n", PCI_BUS_NUM(bdf),
+ PCI_SLOT(bdf), PCI_FUNC(bdf));
+}
+static DEVICE_ATTR_RO(bdf_min);
+
+static ssize_t bdf_max_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+ u16 bdf = hns3_pmu->bdf_max;
+
+ return sysfs_emit(buf, "%02x:%02x.%x\n", PCI_BUS_NUM(bdf),
+ PCI_SLOT(bdf), PCI_FUNC(bdf));
+}
+static DEVICE_ATTR_RO(bdf_max);
+
+static ssize_t hw_clk_freq_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(dev_get_drvdata(dev));
+
+ return sysfs_emit(buf, "%u\n", hns3_pmu->hw_clk_freq);
+}
+static DEVICE_ATTR_RO(hw_clk_freq);
+
+static struct attribute *hns3_pmu_events_attr[] = {
+ /* bandwidth events */
+ HNS3_PMU_BW_EVT_PAIR(bw_ssu_egu, SSU_EGU),
+ HNS3_PMU_BW_EVT_PAIR(bw_ssu_rpu, SSU_RPU),
+ HNS3_PMU_BW_EVT_PAIR(bw_ssu_roce, SSU_ROCE),
+ HNS3_PMU_BW_EVT_PAIR(bw_roce_ssu, ROCE_SSU),
+ HNS3_PMU_BW_EVT_PAIR(bw_tpu_ssu, TPU_SSU),
+ HNS3_PMU_BW_EVT_PAIR(bw_rpu_rcbrx, RPU_RCBRX),
+ HNS3_PMU_BW_EVT_PAIR(bw_rcbtx_txsch, RCBTX_TXSCH),
+ HNS3_PMU_BW_EVT_PAIR(bw_wr_fbd, WR_FBD),
+ HNS3_PMU_BW_EVT_PAIR(bw_wr_ebd, WR_EBD),
+ HNS3_PMU_BW_EVT_PAIR(bw_rd_fbd, RD_FBD),
+ HNS3_PMU_BW_EVT_PAIR(bw_rd_ebd, RD_EBD),
+ HNS3_PMU_BW_EVT_PAIR(bw_rd_pay_m0, RD_PAY_M0),
+ HNS3_PMU_BW_EVT_PAIR(bw_rd_pay_m1, RD_PAY_M1),
+ HNS3_PMU_BW_EVT_PAIR(bw_wr_pay_m0, WR_PAY_M0),
+ HNS3_PMU_BW_EVT_PAIR(bw_wr_pay_m1, WR_PAY_M1),
+
+ /* packet rate events */
+ HNS3_PMU_PPS_EVT_PAIR(pps_igu_ssu, IGU_SSU),
+ HNS3_PMU_PPS_EVT_PAIR(pps_ssu_egu, SSU_EGU),
+ HNS3_PMU_PPS_EVT_PAIR(pps_ssu_rpu, SSU_RPU),
+ HNS3_PMU_PPS_EVT_PAIR(pps_ssu_roce, SSU_ROCE),
+ HNS3_PMU_PPS_EVT_PAIR(pps_roce_ssu, ROCE_SSU),
+ HNS3_PMU_PPS_EVT_PAIR(pps_tpu_ssu, TPU_SSU),
+ HNS3_PMU_PPS_EVT_PAIR(pps_rpu_rcbrx, RPU_RCBRX),
+ HNS3_PMU_PPS_EVT_PAIR(pps_rcbtx_tpu, RCBTX_TPU),
+ HNS3_PMU_PPS_EVT_PAIR(pps_rcbtx_txsch, RCBTX_TXSCH),
+ HNS3_PMU_PPS_EVT_PAIR(pps_wr_fbd, WR_FBD),
+ HNS3_PMU_PPS_EVT_PAIR(pps_wr_ebd, WR_EBD),
+ HNS3_PMU_PPS_EVT_PAIR(pps_rd_fbd, RD_FBD),
+ HNS3_PMU_PPS_EVT_PAIR(pps_rd_ebd, RD_EBD),
+ HNS3_PMU_PPS_EVT_PAIR(pps_rd_pay_m0, RD_PAY_M0),
+ HNS3_PMU_PPS_EVT_PAIR(pps_rd_pay_m1, RD_PAY_M1),
+ HNS3_PMU_PPS_EVT_PAIR(pps_wr_pay_m0, WR_PAY_M0),
+ HNS3_PMU_PPS_EVT_PAIR(pps_wr_pay_m1, WR_PAY_M1),
+ HNS3_PMU_PPS_EVT_PAIR(pps_intr_nicroh_tx_pre, NICROH_TX_PRE),
+ HNS3_PMU_PPS_EVT_PAIR(pps_intr_nicroh_rx_pre, NICROH_RX_PRE),
+
+ /* latency events */
+ HNS3_PMU_DLY_EVT_PAIR(dly_tx_push_to_mac, TX_PUSH),
+ HNS3_PMU_DLY_EVT_PAIR(dly_tx_normal_to_mac, TX),
+ HNS3_PMU_DLY_EVT_PAIR(dly_ssu_tx_th_nic, SSU_TX_NIC),
+ HNS3_PMU_DLY_EVT_PAIR(dly_ssu_tx_th_roce, SSU_TX_ROCE),
+ HNS3_PMU_DLY_EVT_PAIR(dly_ssu_rx_th_nic, SSU_RX_NIC),
+ HNS3_PMU_DLY_EVT_PAIR(dly_ssu_rx_th_roce, SSU_RX_ROCE),
+ HNS3_PMU_DLY_EVT_PAIR(dly_rpu, RPU),
+ HNS3_PMU_DLY_EVT_PAIR(dly_tpu, TPU),
+ HNS3_PMU_DLY_EVT_PAIR(dly_rpe, RPE),
+ HNS3_PMU_DLY_EVT_PAIR(dly_tpe_normal, TPE),
+ HNS3_PMU_DLY_EVT_PAIR(dly_tpe_push, TPE_PUSH),
+ HNS3_PMU_DLY_EVT_PAIR(dly_wr_fbd, WR_FBD),
+ HNS3_PMU_DLY_EVT_PAIR(dly_wr_ebd, WR_EBD),
+ HNS3_PMU_DLY_EVT_PAIR(dly_rd_fbd, RD_FBD),
+ HNS3_PMU_DLY_EVT_PAIR(dly_rd_ebd, RD_EBD),
+ HNS3_PMU_DLY_EVT_PAIR(dly_rd_pay_m0, RD_PAY_M0),
+ HNS3_PMU_DLY_EVT_PAIR(dly_rd_pay_m1, RD_PAY_M1),
+ HNS3_PMU_DLY_EVT_PAIR(dly_wr_pay_m0, WR_PAY_M0),
+ HNS3_PMU_DLY_EVT_PAIR(dly_wr_pay_m1, WR_PAY_M1),
+ HNS3_PMU_DLY_EVT_PAIR(dly_msix_write, MSIX_WRITE),
+
+ /* interrupt rate events */
+ HNS3_PMU_INTR_EVT_PAIR(pps_intr_msix_nic, MSIX_NIC),
+
+ NULL
+};
+
+static struct attribute *hns3_pmu_filter_mode_attr[] = {
+ /* bandwidth events */
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_ssu_egu, SSU_EGU),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_ssu_rpu, SSU_RPU),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_ssu_roce, SSU_ROCE),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_roce_ssu, ROCE_SSU),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_tpu_ssu, TPU_SSU),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_rpu_rcbrx, RPU_RCBRX),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_rcbtx_txsch, RCBTX_TXSCH),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_wr_fbd, WR_FBD),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_wr_ebd, WR_EBD),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_rd_fbd, RD_FBD),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_rd_ebd, RD_EBD),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_rd_pay_m0, RD_PAY_M0),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_rd_pay_m1, RD_PAY_M1),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_wr_pay_m0, WR_PAY_M0),
+ HNS3_PMU_BW_FLT_MODE_PAIR(bw_wr_pay_m1, WR_PAY_M1),
+
+ /* packet rate events */
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_igu_ssu, IGU_SSU),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_ssu_egu, SSU_EGU),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_ssu_rpu, SSU_RPU),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_ssu_roce, SSU_ROCE),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_roce_ssu, ROCE_SSU),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_tpu_ssu, TPU_SSU),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rpu_rcbrx, RPU_RCBRX),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rcbtx_tpu, RCBTX_TPU),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rcbtx_txsch, RCBTX_TXSCH),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_wr_fbd, WR_FBD),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_wr_ebd, WR_EBD),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rd_fbd, RD_FBD),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rd_ebd, RD_EBD),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rd_pay_m0, RD_PAY_M0),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_rd_pay_m1, RD_PAY_M1),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_wr_pay_m0, WR_PAY_M0),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_wr_pay_m1, WR_PAY_M1),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_intr_nicroh_tx_pre, NICROH_TX_PRE),
+ HNS3_PMU_PPS_FLT_MODE_PAIR(pps_intr_nicroh_rx_pre, NICROH_RX_PRE),
+
+ /* latency events */
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tx_push_to_mac, TX_PUSH),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tx_normal_to_mac, TX),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_ssu_tx_th_nic, SSU_TX_NIC),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_ssu_tx_th_roce, SSU_TX_ROCE),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_ssu_rx_th_nic, SSU_RX_NIC),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_ssu_rx_th_roce, SSU_RX_ROCE),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rpu, RPU),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tpu, TPU),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rpe, RPE),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tpe_normal, TPE),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_tpe_push, TPE_PUSH),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_wr_fbd, WR_FBD),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_wr_ebd, WR_EBD),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rd_fbd, RD_FBD),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rd_ebd, RD_EBD),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rd_pay_m0, RD_PAY_M0),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_rd_pay_m1, RD_PAY_M1),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_wr_pay_m0, WR_PAY_M0),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_wr_pay_m1, WR_PAY_M1),
+ HNS3_PMU_DLY_FLT_MODE_PAIR(dly_msix_write, MSIX_WRITE),
+
+ /* interrupt rate events */
+ HNS3_PMU_INTR_FLT_MODE_PAIR(pps_intr_msix_nic, MSIX_NIC),
+
+ NULL
+};
+
+static struct attribute_group hns3_pmu_events_group = {
+ .name = "events",
+ .attrs = hns3_pmu_events_attr,
+};
+
+static struct attribute_group hns3_pmu_filter_mode_group = {
+ .name = "filtermode",
+ .attrs = hns3_pmu_filter_mode_attr,
+};
+
+static struct attribute *hns3_pmu_format_attr[] = {
+ HNS3_PMU_FORMAT_ATTR(subevent, "config:0-7"),
+ HNS3_PMU_FORMAT_ATTR(event_type, "config:8-15"),
+ HNS3_PMU_FORMAT_ATTR(ext_counter_used, "config:16"),
+ HNS3_PMU_FORMAT_ATTR(port, "config1:0-3"),
+ HNS3_PMU_FORMAT_ATTR(tc, "config1:4-7"),
+ HNS3_PMU_FORMAT_ATTR(bdf, "config1:8-23"),
+ HNS3_PMU_FORMAT_ATTR(queue, "config1:24-39"),
+ HNS3_PMU_FORMAT_ATTR(intr, "config1:40-51"),
+ HNS3_PMU_FORMAT_ATTR(global, "config1:52"),
+ NULL
+};
+
+static struct attribute_group hns3_pmu_format_group = {
+ .name = "format",
+ .attrs = hns3_pmu_format_attr,
+};
+
+static struct attribute *hns3_pmu_cpumask_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL
+};
+
+static struct attribute_group hns3_pmu_cpumask_attr_group = {
+ .attrs = hns3_pmu_cpumask_attrs,
+};
+
+static struct attribute *hns3_pmu_identifier_attrs[] = {
+ &dev_attr_identifier.attr,
+ NULL
+};
+
+static struct attribute_group hns3_pmu_identifier_attr_group = {
+ .attrs = hns3_pmu_identifier_attrs,
+};
+
+static struct attribute *hns3_pmu_bdf_range_attrs[] = {
+ &dev_attr_bdf_min.attr,
+ &dev_attr_bdf_max.attr,
+ NULL
+};
+
+static struct attribute_group hns3_pmu_bdf_range_attr_group = {
+ .attrs = hns3_pmu_bdf_range_attrs,
+};
+
+static struct attribute *hns3_pmu_hw_clk_freq_attrs[] = {
+ &dev_attr_hw_clk_freq.attr,
+ NULL
+};
+
+static struct attribute_group hns3_pmu_hw_clk_freq_attr_group = {
+ .attrs = hns3_pmu_hw_clk_freq_attrs,
+};
+
+static const struct attribute_group *hns3_pmu_attr_groups[] = {
+ &hns3_pmu_events_group,
+ &hns3_pmu_filter_mode_group,
+ &hns3_pmu_format_group,
+ &hns3_pmu_cpumask_attr_group,
+ &hns3_pmu_identifier_attr_group,
+ &hns3_pmu_bdf_range_attr_group,
+ &hns3_pmu_hw_clk_freq_attr_group,
+ NULL
+};
+
+static u32 hns3_pmu_get_event(struct perf_event *event)
+{
+ return hns3_pmu_get_ext_counter_used(event) << 16 |
+ hns3_pmu_get_event_type(event) << 8 |
+ hns3_pmu_get_subevent(event);
+}
+
+static u32 hns3_pmu_get_real_event(struct perf_event *event)
+{
+ return hns3_pmu_get_event_type(event) << 8 |
+ hns3_pmu_get_subevent(event);
+}
+
+static u32 hns3_pmu_get_offset(u32 offset, u32 idx)
+{
+ return offset + HNS3_PMU_REG_EVENT_OFFSET +
+ HNS3_PMU_REG_EVENT_SIZE * idx;
+}
+
+static u32 hns3_pmu_readl(struct hns3_pmu *hns3_pmu, u32 reg_offset, u32 idx)
+{
+ u32 offset = hns3_pmu_get_offset(reg_offset, idx);
+
+ return readl(hns3_pmu->base + offset);
+}
+
+static void hns3_pmu_writel(struct hns3_pmu *hns3_pmu, u32 reg_offset, u32 idx,
+ u32 val)
+{
+ u32 offset = hns3_pmu_get_offset(reg_offset, idx);
+
+ writel(val, hns3_pmu->base + offset);
+}
+
+static u64 hns3_pmu_readq(struct hns3_pmu *hns3_pmu, u32 reg_offset, u32 idx)
+{
+ u32 offset = hns3_pmu_get_offset(reg_offset, idx);
+
+ return readq(hns3_pmu->base + offset);
+}
+
+static void hns3_pmu_writeq(struct hns3_pmu *hns3_pmu, u32 reg_offset, u32 idx,
+ u64 val)
+{
+ u32 offset = hns3_pmu_get_offset(reg_offset, idx);
+
+ writeq(val, hns3_pmu->base + offset);
+}
+
+static bool hns3_pmu_cmp_event(struct perf_event *target,
+ struct perf_event *event)
+{
+ return hns3_pmu_get_real_event(target) == hns3_pmu_get_real_event(event);
+}
+
+static int hns3_pmu_find_related_event_idx(struct hns3_pmu *hns3_pmu,
+ struct perf_event *event)
+{
+ struct perf_event *sibling;
+ int hw_event_used = 0;
+ int idx;
+
+ for (idx = 0; idx < HNS3_PMU_MAX_HW_EVENTS; idx++) {
+ sibling = hns3_pmu->hw_events[idx];
+ if (!sibling)
+ continue;
+
+ hw_event_used++;
+
+ if (!hns3_pmu_cmp_event(sibling, event))
+ continue;
+
+ /* Related events is used in group */
+ if (sibling->group_leader == event->group_leader)
+ return idx;
+ }
+
+ /* No related event and all hardware events are used up */
+ if (hw_event_used >= HNS3_PMU_MAX_HW_EVENTS)
+ return -EBUSY;
+
+ /* No related event and there is extra hardware events can be use */
+ return -ENOENT;
+}
+
+static int hns3_pmu_get_event_idx(struct hns3_pmu *hns3_pmu)
+{
+ int idx;
+
+ for (idx = 0; idx < HNS3_PMU_MAX_HW_EVENTS; idx++) {
+ if (!hns3_pmu->hw_events[idx])
+ return idx;
+ }
+
+ return -EBUSY;
+}
+
+static bool hns3_pmu_valid_bdf(struct hns3_pmu *hns3_pmu, u16 bdf)
+{
+ struct pci_dev *pdev;
+
+ if (bdf < hns3_pmu->bdf_min || bdf > hns3_pmu->bdf_max) {
+ pci_err(hns3_pmu->pdev, "Invalid EP device: %#x!\n", bdf);
+ return false;
+ }
+
+ pdev = pci_get_domain_bus_and_slot(pci_domain_nr(hns3_pmu->pdev->bus),
+ PCI_BUS_NUM(bdf),
+ GET_PCI_DEVFN(bdf));
+ if (!pdev) {
+ pci_err(hns3_pmu->pdev, "Nonexistent EP device: %#x!\n", bdf);
+ return false;
+ }
+
+ pci_dev_put(pdev);
+ return true;
+}
+
+static void hns3_pmu_set_qid_para(struct hns3_pmu *hns3_pmu, u32 idx, u16 bdf,
+ u16 queue)
+{
+ u32 val;
+
+ val = GET_PCI_DEVFN(bdf);
+ val |= (u32)queue << HNS3_PMU_QID_PARA_QUEUE_S;
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_QID_PARA, idx, val);
+}
+
+static bool hns3_pmu_qid_req_start(struct hns3_pmu *hns3_pmu, u32 idx)
+{
+ bool queue_id_valid = false;
+ u32 reg_qid_ctrl, val;
+ int err;
+
+ /* enable queue id request */
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_QID_CTRL, idx,
+ HNS3_PMU_QID_CTRL_REQ_ENABLE);
+
+ reg_qid_ctrl = hns3_pmu_get_offset(HNS3_PMU_REG_EVENT_QID_CTRL, idx);
+ err = readl_poll_timeout(hns3_pmu->base + reg_qid_ctrl, val,
+ val & HNS3_PMU_QID_CTRL_DONE, 1, 1000);
+ if (err == -ETIMEDOUT) {
+ pci_err(hns3_pmu->pdev, "QID request timeout!\n");
+ goto out;
+ }
+
+ queue_id_valid = !(val & HNS3_PMU_QID_CTRL_MISS);
+
+out:
+ /* disable qid request and clear status */
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_QID_CTRL, idx, 0);
+
+ return queue_id_valid;
+}
+
+static bool hns3_pmu_valid_queue(struct hns3_pmu *hns3_pmu, u32 idx, u16 bdf,
+ u16 queue)
+{
+ hns3_pmu_set_qid_para(hns3_pmu, idx, bdf, queue);
+
+ return hns3_pmu_qid_req_start(hns3_pmu, idx);
+}
+
+static struct hns3_pmu_event_attr *hns3_pmu_get_pmu_event(u32 event)
+{
+ struct hns3_pmu_event_attr *pmu_event;
+ struct dev_ext_attribute *eattr;
+ struct device_attribute *dattr;
+ struct attribute *attr;
+ u32 i;
+
+ for (i = 0; i < ARRAY_SIZE(hns3_pmu_events_attr) - 1; i++) {
+ attr = hns3_pmu_events_attr[i];
+ dattr = container_of(attr, struct device_attribute, attr);
+ eattr = container_of(dattr, struct dev_ext_attribute, attr);
+ pmu_event = eattr->var;
+
+ if (event == pmu_event->event)
+ return pmu_event;
+ }
+
+ return NULL;
+}
+
+static int hns3_pmu_set_func_mode(struct perf_event *event,
+ struct hns3_pmu *hns3_pmu)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u16 bdf = hns3_pmu_get_bdf(event);
+
+ if (!hns3_pmu_valid_bdf(hns3_pmu, bdf))
+ return -ENOENT;
+
+ HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_FUNC);
+
+ return 0;
+}
+
+static int hns3_pmu_set_func_queue_mode(struct perf_event *event,
+ struct hns3_pmu *hns3_pmu)
+{
+ u16 queue_id = hns3_pmu_get_queue(event);
+ struct hw_perf_event *hwc = &event->hw;
+ u16 bdf = hns3_pmu_get_bdf(event);
+
+ if (!hns3_pmu_valid_bdf(hns3_pmu, bdf))
+ return -ENOENT;
+
+ if (!hns3_pmu_valid_queue(hns3_pmu, hwc->idx, bdf, queue_id)) {
+ pci_err(hns3_pmu->pdev, "Invalid queue: %u\n", queue_id);
+ return -ENOENT;
+ }
+
+ HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_FUNC_QUEUE);
+
+ return 0;
+}
+
+static bool
+hns3_pmu_is_enabled_global_mode(struct perf_event *event,
+ struct hns3_pmu_event_attr *pmu_event)
+{
+ u8 global = hns3_pmu_get_global(event);
+
+ if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_GLOBAL))
+ return false;
+
+ return global;
+}
+
+static bool hns3_pmu_is_enabled_func_mode(struct perf_event *event,
+ struct hns3_pmu_event_attr *pmu_event)
+{
+ u16 queue_id = hns3_pmu_get_queue(event);
+ u16 bdf = hns3_pmu_get_bdf(event);
+
+ if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC))
+ return false;
+ else if (queue_id != HNS3_PMU_FILTER_ALL_QUEUE)
+ return false;
+
+ return bdf;
+}
+
+static bool
+hns3_pmu_is_enabled_func_queue_mode(struct perf_event *event,
+ struct hns3_pmu_event_attr *pmu_event)
+{
+ u16 queue_id = hns3_pmu_get_queue(event);
+ u16 bdf = hns3_pmu_get_bdf(event);
+
+ if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC_QUEUE))
+ return false;
+ else if (queue_id == HNS3_PMU_FILTER_ALL_QUEUE)
+ return false;
+
+ return bdf;
+}
+
+static bool hns3_pmu_is_enabled_port_mode(struct perf_event *event,
+ struct hns3_pmu_event_attr *pmu_event)
+{
+ u8 tc_id = hns3_pmu_get_tc(event);
+
+ if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_PORT))
+ return false;
+
+ return tc_id == HNS3_PMU_FILTER_ALL_TC;
+}
+
+static bool
+hns3_pmu_is_enabled_port_tc_mode(struct perf_event *event,
+ struct hns3_pmu_event_attr *pmu_event)
+{
+ u8 tc_id = hns3_pmu_get_tc(event);
+
+ if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_PORT_TC))
+ return false;
+
+ return tc_id != HNS3_PMU_FILTER_ALL_TC;
+}
+
+static bool
+hns3_pmu_is_enabled_func_intr_mode(struct perf_event *event,
+ struct hns3_pmu *hns3_pmu,
+ struct hns3_pmu_event_attr *pmu_event)
+{
+ u16 bdf = hns3_pmu_get_bdf(event);
+
+ if (!(pmu_event->filter_support & HNS3_PMU_FILTER_SUPPORT_FUNC_INTR))
+ return false;
+
+ return hns3_pmu_valid_bdf(hns3_pmu, bdf);
+}
+
+static int hns3_pmu_select_filter_mode(struct perf_event *event,
+ struct hns3_pmu *hns3_pmu)
+{
+ u32 event_id = hns3_pmu_get_event(event);
+ struct hw_perf_event *hwc = &event->hw;
+ struct hns3_pmu_event_attr *pmu_event;
+
+ pmu_event = hns3_pmu_get_pmu_event(event_id);
+ if (!pmu_event) {
+ pci_err(hns3_pmu->pdev, "Invalid pmu event\n");
+ return -ENOENT;
+ }
+
+ if (hns3_pmu_is_enabled_global_mode(event, pmu_event)) {
+ HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_GLOBAL);
+ return 0;
+ }
+
+ if (hns3_pmu_is_enabled_func_mode(event, pmu_event))
+ return hns3_pmu_set_func_mode(event, hns3_pmu);
+
+ if (hns3_pmu_is_enabled_func_queue_mode(event, pmu_event))
+ return hns3_pmu_set_func_queue_mode(event, hns3_pmu);
+
+ if (hns3_pmu_is_enabled_port_mode(event, pmu_event)) {
+ HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_PORT);
+ return 0;
+ }
+
+ if (hns3_pmu_is_enabled_port_tc_mode(event, pmu_event)) {
+ HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_PORT_TC);
+ return 0;
+ }
+
+ if (hns3_pmu_is_enabled_func_intr_mode(event, hns3_pmu, pmu_event)) {
+ HNS3_PMU_SET_HW_FILTER(hwc, HNS3_PMU_HW_FILTER_FUNC_INTR);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static bool hns3_pmu_validate_event_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+ struct perf_event *event_group[HNS3_PMU_MAX_HW_EVENTS];
+ int counters = 1;
+ int num;
+
+ event_group[0] = leader;
+ if (!is_software_event(leader)) {
+ if (leader->pmu != event->pmu)
+ return false;
+
+ if (leader != event && !hns3_pmu_cmp_event(leader, event))
+ event_group[counters++] = event;
+ }
+
+ for_each_sibling_event(sibling, event->group_leader) {
+ if (is_software_event(sibling))
+ continue;
+
+ if (sibling->pmu != event->pmu)
+ return false;
+
+ for (num = 0; num < counters; num++) {
+ if (hns3_pmu_cmp_event(event_group[num], sibling))
+ break;
+ }
+
+ if (num == counters)
+ event_group[counters++] = sibling;
+ }
+
+ return counters <= HNS3_PMU_MAX_HW_EVENTS;
+}
+
+static u32 hns3_pmu_get_filter_condition(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u16 intr_id = hns3_pmu_get_intr(event);
+ u8 port_id = hns3_pmu_get_port(event);
+ u16 bdf = hns3_pmu_get_bdf(event);
+ u8 tc_id = hns3_pmu_get_tc(event);
+ u8 filter_mode;
+
+ filter_mode = *(u8 *)hwc->addr_filters;
+ switch (filter_mode) {
+ case HNS3_PMU_HW_FILTER_PORT:
+ return FILTER_CONDITION_PORT(port_id);
+ case HNS3_PMU_HW_FILTER_PORT_TC:
+ return FILTER_CONDITION_PORT_TC(port_id, tc_id);
+ case HNS3_PMU_HW_FILTER_FUNC:
+ case HNS3_PMU_HW_FILTER_FUNC_QUEUE:
+ return GET_PCI_DEVFN(bdf);
+ case HNS3_PMU_HW_FILTER_FUNC_INTR:
+ return FILTER_CONDITION_FUNC_INTR(GET_PCI_DEVFN(bdf), intr_id);
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+static void hns3_pmu_config_filter(struct perf_event *event)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+ u8 event_type = hns3_pmu_get_event_type(event);
+ u8 subevent_id = hns3_pmu_get_subevent(event);
+ u16 queue_id = hns3_pmu_get_queue(event);
+ struct hw_perf_event *hwc = &event->hw;
+ u8 filter_mode = *(u8 *)hwc->addr_filters;
+ u16 bdf = hns3_pmu_get_bdf(event);
+ u32 idx = hwc->idx;
+ u32 val;
+
+ val = event_type;
+ val |= subevent_id << HNS3_PMU_CTRL_SUBEVENT_S;
+ val |= filter_mode << HNS3_PMU_CTRL_FILTER_MODE_S;
+ val |= HNS3_PMU_EVENT_OVERFLOW_RESTART;
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+
+ val = hns3_pmu_get_filter_condition(event);
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_HIGH, idx, val);
+
+ if (filter_mode == HNS3_PMU_HW_FILTER_FUNC_QUEUE)
+ hns3_pmu_set_qid_para(hns3_pmu, idx, bdf, queue_id);
+}
+
+static void hns3_pmu_enable_counter(struct hns3_pmu *hns3_pmu,
+ struct hw_perf_event *hwc)
+{
+ u32 idx = hwc->idx;
+ u32 val;
+
+ val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx);
+ val |= HNS3_PMU_EVENT_EN;
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+}
+
+static void hns3_pmu_disable_counter(struct hns3_pmu *hns3_pmu,
+ struct hw_perf_event *hwc)
+{
+ u32 idx = hwc->idx;
+ u32 val;
+
+ val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx);
+ val &= ~HNS3_PMU_EVENT_EN;
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+}
+
+static void hns3_pmu_enable_intr(struct hns3_pmu *hns3_pmu,
+ struct hw_perf_event *hwc)
+{
+ u32 idx = hwc->idx;
+ u32 val;
+
+ val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_INTR_MASK, idx);
+ val &= ~HNS3_PMU_INTR_MASK_OVERFLOW;
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_INTR_MASK, idx, val);
+}
+
+static void hns3_pmu_disable_intr(struct hns3_pmu *hns3_pmu,
+ struct hw_perf_event *hwc)
+{
+ u32 idx = hwc->idx;
+ u32 val;
+
+ val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_INTR_MASK, idx);
+ val |= HNS3_PMU_INTR_MASK_OVERFLOW;
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_INTR_MASK, idx, val);
+}
+
+static void hns3_pmu_clear_intr_status(struct hns3_pmu *hns3_pmu, u32 idx)
+{
+ u32 val;
+
+ val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx);
+ val |= HNS3_PMU_EVENT_STATUS_RESET;
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+
+ val = hns3_pmu_readl(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx);
+ val &= ~HNS3_PMU_EVENT_STATUS_RESET;
+ hns3_pmu_writel(hns3_pmu, HNS3_PMU_REG_EVENT_CTRL_LOW, idx, val);
+}
+
+static u64 hns3_pmu_read_counter(struct perf_event *event)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+
+ return hns3_pmu_readq(hns3_pmu, event->hw.event_base, event->hw.idx);
+}
+
+static void hns3_pmu_write_counter(struct perf_event *event, u64 value)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+ u32 idx = event->hw.idx;
+
+ hns3_pmu_writeq(hns3_pmu, HNS3_PMU_REG_EVENT_COUNTER, idx, value);
+ hns3_pmu_writeq(hns3_pmu, HNS3_PMU_REG_EVENT_EXT_COUNTER, idx, value);
+}
+
+static void hns3_pmu_init_counter(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ local64_set(&hwc->prev_count, 0);
+ hns3_pmu_write_counter(event, 0);
+}
+
+static int hns3_pmu_event_init(struct perf_event *event)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx;
+ int ret;
+
+ if (event->attr.type != event->pmu->type)
+ return -ENOENT;
+
+ /* Sampling is not supported */
+ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+ return -EOPNOTSUPP;
+
+ event->cpu = hns3_pmu->on_cpu;
+
+ idx = hns3_pmu_get_event_idx(hns3_pmu);
+ if (idx < 0) {
+ pci_err(hns3_pmu->pdev, "Up to %u events are supported!\n",
+ HNS3_PMU_MAX_HW_EVENTS);
+ return -EBUSY;
+ }
+
+ hwc->idx = idx;
+
+ ret = hns3_pmu_select_filter_mode(event, hns3_pmu);
+ if (ret) {
+ pci_err(hns3_pmu->pdev, "Invalid filter, ret = %d.\n", ret);
+ return ret;
+ }
+
+ if (!hns3_pmu_validate_event_group(event)) {
+ pci_err(hns3_pmu->pdev, "Invalid event group.\n");
+ return -EINVAL;
+ }
+
+ if (hns3_pmu_get_ext_counter_used(event))
+ hwc->event_base = HNS3_PMU_REG_EVENT_EXT_COUNTER;
+ else
+ hwc->event_base = HNS3_PMU_REG_EVENT_COUNTER;
+
+ return 0;
+}
+
+static void hns3_pmu_read(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 new_cnt, prev_cnt, delta;
+
+ do {
+ prev_cnt = local64_read(&hwc->prev_count);
+ new_cnt = hns3_pmu_read_counter(event);
+ } while (local64_cmpxchg(&hwc->prev_count, prev_cnt, new_cnt) !=
+ prev_cnt);
+
+ delta = new_cnt - prev_cnt;
+ local64_add(delta, &event->count);
+}
+
+static void hns3_pmu_start(struct perf_event *event, int flags)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+ return;
+
+ WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+ hwc->state = 0;
+
+ hns3_pmu_config_filter(event);
+ hns3_pmu_init_counter(event);
+ hns3_pmu_enable_intr(hns3_pmu, hwc);
+ hns3_pmu_enable_counter(hns3_pmu, hwc);
+
+ perf_event_update_userpage(event);
+}
+
+static void hns3_pmu_stop(struct perf_event *event, int flags)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ hns3_pmu_disable_counter(hns3_pmu, hwc);
+ hns3_pmu_disable_intr(hns3_pmu, hwc);
+
+ WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+ hwc->state |= PERF_HES_STOPPED;
+
+ if (hwc->state & PERF_HES_UPTODATE)
+ return;
+
+ /* Read hardware counter and update the perf counter statistics */
+ hns3_pmu_read(event);
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int hns3_pmu_add(struct perf_event *event, int flags)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ int idx;
+
+ hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+ /* Check all working events to find a related event. */
+ idx = hns3_pmu_find_related_event_idx(hns3_pmu, event);
+ if (idx < 0 && idx != -ENOENT)
+ return idx;
+
+ /* Current event shares an enabled hardware event with related event */
+ if (idx >= 0 && idx < HNS3_PMU_MAX_HW_EVENTS) {
+ hwc->idx = idx;
+ goto start_count;
+ }
+
+ idx = hns3_pmu_get_event_idx(hns3_pmu);
+ if (idx < 0)
+ return idx;
+
+ hwc->idx = idx;
+ hns3_pmu->hw_events[idx] = event;
+
+start_count:
+ if (flags & PERF_EF_START)
+ hns3_pmu_start(event, PERF_EF_RELOAD);
+
+ return 0;
+}
+
+static void hns3_pmu_del(struct perf_event *event, int flags)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+
+ hns3_pmu_stop(event, PERF_EF_UPDATE);
+ hns3_pmu->hw_events[hwc->idx] = NULL;
+ perf_event_update_userpage(event);
+}
+
+static void hns3_pmu_enable(struct pmu *pmu)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(pmu);
+ u32 val;
+
+ val = readl(hns3_pmu->base + HNS3_PMU_REG_GLOBAL_CTRL);
+ val |= HNS3_PMU_GLOBAL_START;
+ writel(val, hns3_pmu->base + HNS3_PMU_REG_GLOBAL_CTRL);
+}
+
+static void hns3_pmu_disable(struct pmu *pmu)
+{
+ struct hns3_pmu *hns3_pmu = to_hns3_pmu(pmu);
+ u32 val;
+
+ val = readl(hns3_pmu->base + HNS3_PMU_REG_GLOBAL_CTRL);
+ val &= ~HNS3_PMU_GLOBAL_START;
+ writel(val, hns3_pmu->base + HNS3_PMU_REG_GLOBAL_CTRL);
+}
+
+static int hns3_pmu_alloc_pmu(struct pci_dev *pdev, struct hns3_pmu *hns3_pmu)
+{
+ u16 device_id;
+ char *name;
+ u32 val;
+
+ hns3_pmu->base = pcim_iomap_table(pdev)[BAR_2];
+ if (!hns3_pmu->base) {
+ pci_err(pdev, "ioremap failed\n");
+ return -ENOMEM;
+ }
+
+ hns3_pmu->hw_clk_freq = readl(hns3_pmu->base + HNS3_PMU_REG_CLOCK_FREQ);
+
+ val = readl(hns3_pmu->base + HNS3_PMU_REG_BDF);
+ hns3_pmu->bdf_min = val & 0xffff;
+ hns3_pmu->bdf_max = val >> 16;
+
+ val = readl(hns3_pmu->base + HNS3_PMU_REG_DEVICE_ID);
+ device_id = val & 0xffff;
+ name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hns3_pmu_sicl_%u", device_id);
+ if (!name)
+ return -ENOMEM;
+
+ hns3_pmu->pdev = pdev;
+ hns3_pmu->on_cpu = -1;
+ hns3_pmu->identifier = readl(hns3_pmu->base + HNS3_PMU_REG_VERSION);
+ hns3_pmu->pmu = (struct pmu) {
+ .name = name,
+ .module = THIS_MODULE,
+ .event_init = hns3_pmu_event_init,
+ .pmu_enable = hns3_pmu_enable,
+ .pmu_disable = hns3_pmu_disable,
+ .add = hns3_pmu_add,
+ .del = hns3_pmu_del,
+ .start = hns3_pmu_start,
+ .stop = hns3_pmu_stop,
+ .read = hns3_pmu_read,
+ .task_ctx_nr = perf_invalid_context,
+ .attr_groups = hns3_pmu_attr_groups,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ };
+
+ return 0;
+}
+
+static irqreturn_t hns3_pmu_irq(int irq, void *data)
+{
+ struct hns3_pmu *hns3_pmu = data;
+ u32 intr_status, idx;
+
+ for (idx = 0; idx < HNS3_PMU_MAX_HW_EVENTS; idx++) {
+ intr_status = hns3_pmu_readl(hns3_pmu,
+ HNS3_PMU_REG_EVENT_INTR_STATUS,
+ idx);
+
+ /*
+ * As each counter will restart from 0 when it is overflowed,
+ * extra processing is no need, just clear interrupt status.
+ */
+ if (intr_status)
+ hns3_pmu_clear_intr_status(hns3_pmu, idx);
+ }
+
+ return IRQ_HANDLED;
+}
+
+static int hns3_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+ struct hns3_pmu *hns3_pmu;
+
+ hns3_pmu = hlist_entry_safe(node, struct hns3_pmu, node);
+ if (!hns3_pmu)
+ return -ENODEV;
+
+ if (hns3_pmu->on_cpu == -1) {
+ hns3_pmu->on_cpu = cpu;
+ irq_set_affinity(hns3_pmu->irq, cpumask_of(cpu));
+ }
+
+ return 0;
+}
+
+static int hns3_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+ struct hns3_pmu *hns3_pmu;
+ unsigned int target;
+
+ hns3_pmu = hlist_entry_safe(node, struct hns3_pmu, node);
+ if (!hns3_pmu)
+ return -ENODEV;
+
+ /* Nothing to do if this CPU doesn't own the PMU */
+ if (hns3_pmu->on_cpu != cpu)
+ return 0;
+
+ /* Choose a new CPU from all online cpus */
+ target = cpumask_any_but(cpu_online_mask, cpu);
+ if (target >= nr_cpu_ids)
+ return 0;
+
+ perf_pmu_migrate_context(&hns3_pmu->pmu, cpu, target);
+ hns3_pmu->on_cpu = target;
+ irq_set_affinity(hns3_pmu->irq, cpumask_of(target));
+
+ return 0;
+}
+
+static void hns3_pmu_free_irq(void *data)
+{
+ struct pci_dev *pdev = data;
+
+ pci_free_irq_vectors(pdev);
+}
+
+static int hns3_pmu_irq_register(struct pci_dev *pdev,
+ struct hns3_pmu *hns3_pmu)
+{
+ int irq, ret;
+
+ ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
+ if (ret < 0) {
+ pci_err(pdev, "failed to enable MSI vectors, ret = %d.\n", ret);
+ return ret;
+ }
+
+ ret = devm_add_action(&pdev->dev, hns3_pmu_free_irq, pdev);
+ if (ret) {
+ pci_err(pdev, "failed to add free irq action, ret = %d.\n", ret);
+ return ret;
+ }
+
+ irq = pci_irq_vector(pdev, 0);
+ ret = devm_request_irq(&pdev->dev, irq, hns3_pmu_irq, 0,
+ hns3_pmu->pmu.name, hns3_pmu);
+ if (ret) {
+ pci_err(pdev, "failed to register irq, ret = %d.\n", ret);
+ return ret;
+ }
+
+ hns3_pmu->irq = irq;
+
+ return 0;
+}
+
+static int hns3_pmu_init_pmu(struct pci_dev *pdev, struct hns3_pmu *hns3_pmu)
+{
+ int ret;
+
+ ret = hns3_pmu_alloc_pmu(pdev, hns3_pmu);
+ if (ret)
+ return ret;
+
+ ret = hns3_pmu_irq_register(pdev, hns3_pmu);
+ if (ret)
+ return ret;
+
+ ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+ &hns3_pmu->node);
+ if (ret) {
+ pci_err(pdev, "failed to register hotplug, ret = %d.\n", ret);
+ return ret;
+ }
+
+ ret = perf_pmu_register(&hns3_pmu->pmu, hns3_pmu->pmu.name, -1);
+ if (ret) {
+ pci_err(pdev, "failed to register perf PMU, ret = %d.\n", ret);
+ cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+ &hns3_pmu->node);
+ }
+
+ return ret;
+}
+
+static void hns3_pmu_uninit_pmu(struct pci_dev *pdev)
+{
+ struct hns3_pmu *hns3_pmu = pci_get_drvdata(pdev);
+
+ perf_pmu_unregister(&hns3_pmu->pmu);
+ cpuhp_state_remove_instance(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+ &hns3_pmu->node);
+}
+
+static int hns3_pmu_init_dev(struct pci_dev *pdev)
+{
+ int ret;
+
+ ret = pcim_enable_device(pdev);
+ if (ret) {
+ pci_err(pdev, "failed to enable pci device, ret = %d.\n", ret);
+ return ret;
+ }
+
+ ret = pcim_iomap_regions(pdev, BIT(BAR_2), "hns3_pmu");
+ if (ret < 0) {
+ pci_err(pdev, "failed to request pci region, ret = %d.\n", ret);
+ return ret;
+ }
+
+ pci_set_master(pdev);
+
+ return 0;
+}
+
+static int hns3_pmu_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct hns3_pmu *hns3_pmu;
+ int ret;
+
+ hns3_pmu = devm_kzalloc(&pdev->dev, sizeof(*hns3_pmu), GFP_KERNEL);
+ if (!hns3_pmu)
+ return -ENOMEM;
+
+ ret = hns3_pmu_init_dev(pdev);
+ if (ret)
+ return ret;
+
+ ret = hns3_pmu_init_pmu(pdev, hns3_pmu);
+ if (ret) {
+ pci_clear_master(pdev);
+ return ret;
+ }
+
+ pci_set_drvdata(pdev, hns3_pmu);
+
+ return ret;
+}
+
+static void hns3_pmu_remove(struct pci_dev *pdev)
+{
+ hns3_pmu_uninit_pmu(pdev);
+ pci_clear_master(pdev);
+ pci_set_drvdata(pdev, NULL);
+}
+
+static const struct pci_device_id hns3_pmu_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, 0xa22b) },
+ { 0, }
+};
+MODULE_DEVICE_TABLE(pci, hns3_pmu_ids);
+
+static struct pci_driver hns3_pmu_driver = {
+ .name = "hns3_pmu",
+ .id_table = hns3_pmu_ids,
+ .probe = hns3_pmu_probe,
+ .remove = hns3_pmu_remove,
+};
+
+static int __init hns3_pmu_module_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
+ "AP_PERF_ARM_HNS3_PMU_ONLINE",
+ hns3_pmu_online_cpu,
+ hns3_pmu_offline_cpu);
+ if (ret) {
+ pr_err("failed to setup HNS3 PMU hotplug, ret = %d.\n", ret);
+ return ret;
+ }
+
+ ret = pci_register_driver(&hns3_pmu_driver);
+ if (ret) {
+ pr_err("failed to register pci driver, ret = %d.\n", ret);
+ cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE);
+ }
+
+ return ret;
+}
+module_init(hns3_pmu_module_init);
+
+static void __exit hns3_pmu_module_exit(void)
+{
+ pci_unregister_driver(&hns3_pmu_driver);
+ cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE);
+}
+module_exit(hns3_pmu_module_exit);
+
+MODULE_DESCRIPTION("HNS3 PMU driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c
index 282d3a071a67..69c3050a4348 100644
--- a/drivers/perf/marvell_cn10k_tad_pmu.c
+++ b/drivers/perf/marvell_cn10k_tad_pmu.c
@@ -2,10 +2,6 @@
/* Marvell CN10K LLC-TAD perf driver
*
* Copyright (C) 2021 Marvell
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) "tad_pmu: " fmt
@@ -18,9 +14,9 @@
#include <linux/perf_event.h>
#include <linux/platform_device.h>
-#define TAD_PFC_OFFSET 0x0
+#define TAD_PFC_OFFSET 0x800
#define TAD_PFC(counter) (TAD_PFC_OFFSET | (counter << 3))
-#define TAD_PRF_OFFSET 0x100
+#define TAD_PRF_OFFSET 0x900
#define TAD_PRF(counter) (TAD_PRF_OFFSET | (counter << 3))
#define TAD_PRF_CNTSEL_MASK 0xFF
#define TAD_MAX_COUNTERS 8
@@ -100,9 +96,7 @@ static void tad_pmu_event_counter_start(struct perf_event *event, int flags)
* which sets TAD()_PRF()[CNTSEL] != 0
*/
for (i = 0; i < tad_pmu->region_cnt; i++) {
- reg_val = readq_relaxed(tad_pmu->regions[i].base +
- TAD_PRF(counter_idx));
- reg_val |= (event_idx & 0xFF);
+ reg_val = event_idx & 0xFF;
writeq_relaxed(reg_val, tad_pmu->regions[i].base +
TAD_PRF(counter_idx));
}
diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
index b2b8d2074ed0..2c961839903d 100644
--- a/drivers/perf/riscv_pmu.c
+++ b/drivers/perf/riscv_pmu.c
@@ -121,7 +121,7 @@ u64 riscv_pmu_event_update(struct perf_event *event)
return delta;
}
-static void riscv_pmu_stop(struct perf_event *event, int flags)
+void riscv_pmu_stop(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
@@ -175,7 +175,7 @@ int riscv_pmu_event_set_period(struct perf_event *event)
return overflow;
}
-static void riscv_pmu_start(struct perf_event *event, int flags)
+void riscv_pmu_start(struct perf_event *event, int flags)
{
struct hw_perf_event *hwc = &event->hw;
struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index dca3537a8dcc..79a3de515ece 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -17,10 +17,30 @@
#include <linux/irqdomain.h>
#include <linux/of_irq.h>
#include <linux/of.h>
+#include <linux/cpu_pm.h>
#include <asm/sbi.h>
#include <asm/hwcap.h>
+PMU_FORMAT_ATTR(event, "config:0-47");
+PMU_FORMAT_ATTR(firmware, "config:63");
+
+static struct attribute *riscv_arch_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_firmware.attr,
+ NULL,
+};
+
+static struct attribute_group riscv_pmu_format_group = {
+ .name = "format",
+ .attrs = riscv_arch_formats_attr,
+};
+
+static const struct attribute_group *riscv_pmu_attr_groups[] = {
+ &riscv_pmu_format_group,
+ NULL,
+};
+
union sbi_pmu_ctr_info {
unsigned long value;
struct {
@@ -666,12 +686,15 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde
child = of_get_compatible_child(cpu, "riscv,cpu-intc");
if (!child) {
pr_err("Failed to find INTC node\n");
+ of_node_put(cpu);
return -ENODEV;
}
domain = irq_find_host(child);
of_node_put(child);
- if (domain)
+ if (domain) {
+ of_node_put(cpu);
break;
+ }
}
if (!domain) {
pr_err("Failed to find INTC IRQ root domain\n");
@@ -693,6 +716,73 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde
return 0;
}
+#ifdef CONFIG_CPU_PM
+static int riscv_pm_pmu_notify(struct notifier_block *b, unsigned long cmd,
+ void *v)
+{
+ struct riscv_pmu *rvpmu = container_of(b, struct riscv_pmu, riscv_pm_nb);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events);
+ int enabled = bitmap_weight(cpuc->used_hw_ctrs, RISCV_MAX_COUNTERS);
+ struct perf_event *event;
+ int idx;
+
+ if (!enabled)
+ return NOTIFY_OK;
+
+ for (idx = 0; idx < RISCV_MAX_COUNTERS; idx++) {
+ event = cpuc->events[idx];
+ if (!event)
+ continue;
+
+ switch (cmd) {
+ case CPU_PM_ENTER:
+ /*
+ * Stop and update the counter
+ */
+ riscv_pmu_stop(event, PERF_EF_UPDATE);
+ break;
+ case CPU_PM_EXIT:
+ case CPU_PM_ENTER_FAILED:
+ /*
+ * Restore and enable the counter.
+ *
+ * Requires RCU read locking to be functional,
+ * wrap the call within RCU_NONIDLE to make the
+ * RCU subsystem aware this cpu is not idle from
+ * an RCU perspective for the riscv_pmu_start() call
+ * duration.
+ */
+ RCU_NONIDLE(riscv_pmu_start(event, PERF_EF_RELOAD));
+ break;
+ default:
+ break;
+ }
+ }
+
+ return NOTIFY_OK;
+}
+
+static int riscv_pm_pmu_register(struct riscv_pmu *pmu)
+{
+ pmu->riscv_pm_nb.notifier_call = riscv_pm_pmu_notify;
+ return cpu_pm_register_notifier(&pmu->riscv_pm_nb);
+}
+
+static void riscv_pm_pmu_unregister(struct riscv_pmu *pmu)
+{
+ cpu_pm_unregister_notifier(&pmu->riscv_pm_nb);
+}
+#else
+static inline int riscv_pm_pmu_register(struct riscv_pmu *pmu) { return 0; }
+static inline void riscv_pm_pmu_unregister(struct riscv_pmu *pmu) { }
+#endif
+
+static void riscv_pmu_destroy(struct riscv_pmu *pmu)
+{
+ riscv_pm_pmu_unregister(pmu);
+ cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
+}
+
static int pmu_sbi_device_probe(struct platform_device *pdev)
{
struct riscv_pmu *pmu = NULL;
@@ -720,6 +810,7 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE;
}
+ pmu->pmu.attr_groups = riscv_pmu_attr_groups;
pmu->num_counters = num_counters;
pmu->ctr_start = pmu_sbi_ctr_start;
pmu->ctr_stop = pmu_sbi_ctr_stop;
@@ -733,14 +824,19 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
if (ret)
return ret;
+ ret = riscv_pm_pmu_register(pmu);
+ if (ret)
+ goto out_unregister;
+
ret = perf_pmu_register(&pmu->pmu, "cpu", PERF_TYPE_RAW);
- if (ret) {
- cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
- return ret;
- }
+ if (ret)
+ goto out_unregister;
return 0;
+out_unregister:
+ riscv_pmu_destroy(pmu);
+
out_free:
kfree(pmu);
return ret;
diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c
index f5eced0842b3..6a88eb7e9f75 100644
--- a/drivers/powercap/dtpm_cpu.c
+++ b/drivers/powercap/dtpm_cpu.c
@@ -71,34 +71,19 @@ static u64 set_pd_power_limit(struct dtpm *dtpm, u64 power_limit)
static u64 scale_pd_power_uw(struct cpumask *pd_mask, u64 power)
{
- unsigned long max = 0, sum_util = 0;
+ unsigned long max, sum_util = 0;
int cpu;
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
-
- /*
- * The capacity is the same for all CPUs belonging to
- * the same perf domain, so a single call to
- * arch_scale_cpu_capacity() is enough. However, we
- * need the CPU parameter to be initialized by the
- * loop, so the call ends up in this block.
- *
- * We can initialize 'max' with a cpumask_first() call
- * before the loop but the bits computation is not
- * worth given the arch_scale_cpu_capacity() just
- * returns a value where the resulting assembly code
- * will be optimized by the compiler.
- */
- max = arch_scale_cpu_capacity(cpu);
- sum_util += sched_cpu_util(cpu, max);
- }
-
/*
- * In the improbable case where all the CPUs of the perf
- * domain are offline, 'max' will be zero and will lead to an
- * illegal operation with a zero division.
+ * The capacity is the same for all CPUs belonging to
+ * the same perf domain.
*/
- return max ? (power * ((sum_util << 10) / max)) >> 10 : 0;
+ max = arch_scale_cpu_capacity(cpumask_first(pd_mask));
+
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask)
+ sum_util += sched_cpu_util(cpu);
+
+ return (power * ((sum_util << 10) / max)) >> 10;
}
static u64 get_pd_power_uw(struct dtpm *dtpm)
diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig
index 458218f88c5e..fe4971b65c64 100644
--- a/drivers/ptp/Kconfig
+++ b/drivers/ptp/Kconfig
@@ -176,6 +176,7 @@ config PTP_1588_CLOCK_OCP
depends on !S390
depends on COMMON_CLK
select NET_DEVLINK
+ select CRC16
help
This driver adds support for an OpenCompute time card.
diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c
index 9e54fe76a9b2..35d4b398c197 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3565,7 +3565,7 @@ static void qeth_flush_buffers(struct qeth_qdio_out_q *queue, int index,
if (!atomic_read(&queue->set_pci_flags_count)) {
/*
* there's no outstanding PCI any more, so we
- * have to request a PCI to be sure the the PCI
+ * have to request a PCI to be sure the PCI
* will wake at some time in the future then we
* can flush packed buffers that might still be
* hanging around, which can happen if no
diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
index b519f4b59d30..5e8887fa02c8 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c
@@ -11386,6 +11386,7 @@ scsih_shutdown(struct pci_dev *pdev)
_scsih_ir_shutdown(ioc);
_scsih_nvme_shutdown(ioc);
mpt3sas_base_mask_interrupts(ioc);
+ mpt3sas_base_stop_watchdog(ioc);
ioc->shost_recovery = 1;
mpt3sas_base_make_ioc_ready(ioc, SOFT_RESET);
ioc->shost_recovery = 0;
diff --git a/drivers/scsi/scsi_ioctl.c b/drivers/scsi/scsi_ioctl.c
index a480c4d589f5..729e309e6034 100644
--- a/drivers/scsi/scsi_ioctl.c
+++ b/drivers/scsi/scsi_ioctl.c
@@ -450,7 +450,7 @@ static int sg_io(struct scsi_device *sdev, struct sg_io_hdr *hdr, fmode_t mode)
goto out_put_request;
ret = 0;
- if (hdr->iovec_count) {
+ if (hdr->iovec_count && hdr->dxfer_len) {
struct iov_iter i;
struct iovec *iov = NULL;
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index b8151d95a806..b263b0fde03c 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -137,11 +137,9 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
int cpu_idx)
{
- unsigned long max = arch_scale_cpu_capacity(cpu);
- unsigned long util;
+ unsigned long util = sched_cpu_util(cpu);
- util = sched_cpu_util(cpu, max);
- return (util * 100) / max;
+ return (util * 100) / arch_scale_cpu_capacity(cpu);
}
#else /* !CONFIG_SMP */
static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index c7b337480e3e..3d367be71728 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -2953,37 +2953,59 @@ ufshcd_dev_cmd_completion(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
static int ufshcd_wait_for_dev_cmd(struct ufs_hba *hba,
struct ufshcd_lrb *lrbp, int max_timeout)
{
- int err = 0;
- unsigned long time_left;
+ unsigned long time_left = msecs_to_jiffies(max_timeout);
unsigned long flags;
+ bool pending;
+ int err;
+retry:
time_left = wait_for_completion_timeout(hba->dev_cmd.complete,
- msecs_to_jiffies(max_timeout));
+ time_left);
- spin_lock_irqsave(hba->host->host_lock, flags);
- hba->dev_cmd.complete = NULL;
if (likely(time_left)) {
+ /*
+ * The completion handler called complete() and the caller of
+ * this function still owns the @lrbp tag so the code below does
+ * not trigger any race conditions.
+ */
+ hba->dev_cmd.complete = NULL;
err = ufshcd_get_tr_ocs(lrbp);
if (!err)
err = ufshcd_dev_cmd_completion(hba, lrbp);
- }
- spin_unlock_irqrestore(hba->host->host_lock, flags);
-
- if (!time_left) {
+ } else {
err = -ETIMEDOUT;
dev_dbg(hba->dev, "%s: dev_cmd request timedout, tag %d\n",
__func__, lrbp->task_tag);
- if (!ufshcd_clear_cmds(hba, 1U << lrbp->task_tag))
+ if (ufshcd_clear_cmds(hba, 1U << lrbp->task_tag) == 0) {
/* successfully cleared the command, retry if needed */
err = -EAGAIN;
- /*
- * in case of an error, after clearing the doorbell,
- * we also need to clear the outstanding_request
- * field in hba
- */
- spin_lock_irqsave(&hba->outstanding_lock, flags);
- __clear_bit(lrbp->task_tag, &hba->outstanding_reqs);
- spin_unlock_irqrestore(&hba->outstanding_lock, flags);
+ /*
+ * Since clearing the command succeeded we also need to
+ * clear the task tag bit from the outstanding_reqs
+ * variable.
+ */
+ spin_lock_irqsave(&hba->outstanding_lock, flags);
+ pending = test_bit(lrbp->task_tag,
+ &hba->outstanding_reqs);
+ if (pending) {
+ hba->dev_cmd.complete = NULL;
+ __clear_bit(lrbp->task_tag,
+ &hba->outstanding_reqs);
+ }
+ spin_unlock_irqrestore(&hba->outstanding_lock, flags);
+
+ if (!pending) {
+ /*
+ * The completion handler ran while we tried to
+ * clear the command.
+ */
+ time_left = 1;
+ goto retry;
+ }
+ } else {
+ dev_err(hba->dev, "%s: failed to clear tag %d\n",
+ __func__, lrbp->task_tag);
+ }
}
return err;
diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c
index e7332cc65b1f..173aea8e9997 100644
--- a/drivers/ufs/host/ufshcd-pltfrm.c
+++ b/drivers/ufs/host/ufshcd-pltfrm.c
@@ -108,9 +108,20 @@ out:
return ret;
}
+static bool phandle_exists(const struct device_node *np,
+ const char *phandle_name, int index)
+{
+ struct device_node *parse_np = of_parse_phandle(np, phandle_name, index);
+
+ if (parse_np)
+ of_node_put(parse_np);
+
+ return parse_np != NULL;
+}
+
#define MAX_PROP_SIZE 32
static int ufshcd_populate_vreg(struct device *dev, const char *name,
- struct ufs_vreg **out_vreg)
+ struct ufs_vreg **out_vreg)
{
char prop_name[MAX_PROP_SIZE];
struct ufs_vreg *vreg = NULL;
@@ -122,7 +133,7 @@ static int ufshcd_populate_vreg(struct device *dev, const char *name,
}
snprintf(prop_name, MAX_PROP_SIZE, "%s-supply", name);
- if (!of_parse_phandle(np, prop_name, 0)) {
+ if (!phandle_exists(np, prop_name, 0)) {
dev_info(dev, "%s: Unable to find %s regulator, assuming enabled\n",
__func__, prop_name);
goto out;
diff --git a/fs/attr.c b/fs/attr.c
index dbe996b0dedf..b5b8835ddf15 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -22,7 +22,7 @@
* chown_ok - verify permissions to chown inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
- * @uid: uid to chown @inode to
+ * @ia_vfsuid: uid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
@@ -31,15 +31,15 @@
* performed on the raw inode simply passs init_user_ns.
*/
static bool chown_ok(struct user_namespace *mnt_userns,
- const struct inode *inode,
- kuid_t uid)
+ const struct inode *inode, vfsuid_t ia_vfsuid)
{
- kuid_t kuid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, inode->i_uid))
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()) &&
+ vfsuid_eq(ia_vfsuid, vfsuid))
return true;
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
- if (uid_eq(kuid, INVALID_UID) &&
+ if (!vfsuid_valid(vfsuid) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
@@ -49,7 +49,7 @@ static bool chown_ok(struct user_namespace *mnt_userns,
* chgrp_ok - verify permissions to chgrp inode
* @mnt_userns: user namespace of the mount @inode was found from
* @inode: inode to check permissions on
- * @gid: gid to chown @inode to
+ * @ia_vfsgid: gid to chown @inode to
*
* If the inode has been found through an idmapped mount the user namespace of
* the vfsmount must be passed through @mnt_userns. This function will then
@@ -58,21 +58,19 @@ static bool chown_ok(struct user_namespace *mnt_userns,
* performed on the raw inode simply passs init_user_ns.
*/
static bool chgrp_ok(struct user_namespace *mnt_userns,
- const struct inode *inode, kgid_t gid)
+ const struct inode *inode, vfsgid_t ia_vfsgid)
{
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) {
- kgid_t mapped_gid;
-
- if (gid_eq(gid, inode->i_gid))
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ vfsuid_t vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) {
+ if (vfsgid_eq(ia_vfsgid, vfsgid))
return true;
- mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid);
- if (in_group_p(mapped_gid))
+ if (vfsgid_in_group_p(ia_vfsgid))
return true;
}
if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN))
return true;
- if (gid_eq(kgid, INVALID_GID) &&
+ if (!vfsgid_valid(vfsgid) &&
ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN))
return true;
return false;
@@ -120,28 +118,29 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry,
goto kill_priv;
/* Make sure a caller can chown. */
- if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid))
+ if ((ia_valid & ATTR_UID) &&
+ !chown_ok(mnt_userns, inode, attr->ia_vfsuid))
return -EPERM;
/* Make sure caller can chgrp. */
- if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid))
+ if ((ia_valid & ATTR_GID) &&
+ !chgrp_ok(mnt_userns, inode, attr->ia_vfsgid))
return -EPERM;
/* Make sure a caller can chmod. */
if (ia_valid & ATTR_MODE) {
- kgid_t mapped_gid;
+ vfsgid_t vfsgid;
if (!inode_owner_or_capable(mnt_userns, inode))
return -EPERM;
if (ia_valid & ATTR_GID)
- mapped_gid = mapped_kgid_fs(mnt_userns,
- i_user_ns(inode), attr->ia_gid);
+ vfsgid = attr->ia_vfsgid;
else
- mapped_gid = i_gid_into_mnt(mnt_userns, inode);
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
/* Also check the setgid bit! */
- if (!in_group_p(mapped_gid) &&
+ if (!vfsgid_in_group_p(vfsgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
@@ -219,9 +218,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
* setattr_copy must be called with i_mutex held.
*
* setattr_copy updates the inode's metadata with that specified
- * in attr on idmapped mounts. If file ownership is changed setattr_copy
- * doesn't map ia_uid and ia_gid. It will asssume the caller has already
- * provided the intended values. Necessary permission checks to determine
+ * in attr on idmapped mounts. Necessary permission checks to determine
* whether or not the S_ISGID property needs to be removed are performed with
* the correct idmapped mount permission helpers.
* Noticeably missing is inode size update, which is more complex
@@ -242,10 +239,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
{
unsigned int ia_valid = attr->ia_valid;
- if (ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
if (ia_valid & ATTR_ATIME)
inode->i_atime = attr->ia_atime;
if (ia_valid & ATTR_MTIME)
@@ -254,8 +249,8 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode,
inode->i_ctime = attr->ia_ctime;
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
- kgid_t kgid = i_gid_into_mnt(mnt_userns, inode);
- if (!in_group_p(kgid) &&
+ vfsgid_t vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (!vfsgid_in_group_p(vfsgid) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
@@ -306,9 +301,6 @@ EXPORT_SYMBOL(may_setattr);
* retry. Because breaking a delegation may take a long time, the
* caller should drop the i_mutex before doing so.
*
- * If file ownership is changed notify_change() doesn't map ia_uid and
- * ia_gid. It will asssume the caller has already provided the intended values.
- *
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
* to be NFS exported. Also, passing NULL is fine for callers holding
@@ -397,23 +389,25 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry,
* namespace of the superblock.
*/
if (ia_valid & ATTR_UID &&
- !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+ !vfsuid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+ attr->ia_vfsuid))
return -EOVERFLOW;
if (ia_valid & ATTR_GID &&
- !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+ !vfsgid_has_fsmapping(mnt_userns, inode->i_sb->s_user_ns,
+ attr->ia_vfsgid))
return -EOVERFLOW;
/* Don't allow modifications of files with invalid uids or
* gids unless those uids & gids are being made valid.
*/
if (!(ia_valid & ATTR_UID) &&
- !uid_valid(i_uid_into_mnt(mnt_userns, inode)))
+ !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)))
return -EOVERFLOW;
if (!(ia_valid & ATTR_GID) &&
- !gid_valid(i_gid_into_mnt(mnt_userns, inode)))
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode)))
return -EOVERFLOW;
- error = security_inode_setattr(dentry, attr);
+ error = security_inode_setattr(mnt_userns, dentry, attr);
if (error)
return error;
error = try_break_deleg(inode, delegated_inode);
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index ee92634196a8..1105ce3c80cb 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -9,6 +9,15 @@ menuconfig DLM
A general purpose distributed lock manager for kernel or userspace
applications.
+config DLM_DEPRECATED_API
+ bool "DLM deprecated API"
+ depends on DLM
+ help
+ Enables deprecated DLM timeout features that will be removed in
+ later Linux kernel releases.
+
+ If you are unsure, say N.
+
config DLM_DEBUG
bool "DLM debugging"
depends on DLM
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index 3545fdafc6fb..71dab733cf9a 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -9,7 +9,6 @@ dlm-y := ast.o \
member.o \
memory.o \
midcomms.o \
- netlink.o \
lowcomms.o \
plock.o \
rcom.o \
@@ -18,5 +17,6 @@ dlm-y := ast.o \
requestqueue.o \
user.o \
util.o
+dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o
dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index bfac462dd3e8..19ef136f9e4f 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -255,13 +255,13 @@ void dlm_callback_work(struct work_struct *work)
if (callbacks[i].flags & DLM_CB_SKIP) {
continue;
} else if (callbacks[i].flags & DLM_CB_BAST) {
- bastfn(lkb->lkb_astparam, callbacks[i].mode);
trace_dlm_bast(ls, lkb, callbacks[i].mode);
+ bastfn(lkb->lkb_astparam, callbacks[i].mode);
} else if (callbacks[i].flags & DLM_CB_CAST) {
lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
+ trace_dlm_ast(ls, lkb);
castfn(lkb->lkb_astparam);
- trace_dlm_ast(ls, lkb, lkb->lkb_lksb);
}
}
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 42eee2783756..ac8b62106ce0 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -75,8 +75,9 @@ struct dlm_cluster {
unsigned int cl_log_info;
unsigned int cl_protocol;
unsigned int cl_mark;
+#ifdef CONFIG_DLM_DEPRECATED_API
unsigned int cl_timewarn_cs;
- unsigned int cl_waitwarn_us;
+#endif
unsigned int cl_new_rsb_count;
unsigned int cl_recover_callbacks;
char cl_cluster_name[DLM_LOCKSPACE_LEN];
@@ -102,8 +103,9 @@ enum {
CLUSTER_ATTR_LOG_INFO,
CLUSTER_ATTR_PROTOCOL,
CLUSTER_ATTR_MARK,
+#ifdef CONFIG_DLM_DEPRECATED_API
CLUSTER_ATTR_TIMEWARN_CS,
- CLUSTER_ATTR_WAITWARN_US,
+#endif
CLUSTER_ATTR_NEW_RSB_COUNT,
CLUSTER_ATTR_RECOVER_CALLBACKS,
CLUSTER_ATTR_CLUSTER_NAME,
@@ -224,8 +226,9 @@ CLUSTER_ATTR(log_debug, NULL);
CLUSTER_ATTR(log_info, NULL);
CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running);
CLUSTER_ATTR(mark, NULL);
+#ifdef CONFIG_DLM_DEPRECATED_API
CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
-CLUSTER_ATTR(waitwarn_us, NULL);
+#endif
CLUSTER_ATTR(new_rsb_count, NULL);
CLUSTER_ATTR(recover_callbacks, NULL);
@@ -240,8 +243,9 @@ static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info,
[CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol,
[CLUSTER_ATTR_MARK] = &cluster_attr_mark,
+#ifdef CONFIG_DLM_DEPRECATED_API
[CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs,
- [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us,
+#endif
[CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count,
[CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks,
[CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name,
@@ -432,8 +436,9 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_log_debug = dlm_config.ci_log_debug;
cl->cl_log_info = dlm_config.ci_log_info;
cl->cl_protocol = dlm_config.ci_protocol;
+#ifdef CONFIG_DLM_DEPRECATED_API
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
- cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
+#endif
cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
@@ -954,8 +959,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
#define DEFAULT_LOG_INFO 1
#define DEFAULT_PROTOCOL DLM_PROTO_TCP
#define DEFAULT_MARK 0
+#ifdef CONFIG_DLM_DEPRECATED_API
#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
-#define DEFAULT_WAITWARN_US 0
+#endif
#define DEFAULT_NEW_RSB_COUNT 128
#define DEFAULT_RECOVER_CALLBACKS 0
#define DEFAULT_CLUSTER_NAME ""
@@ -971,8 +977,9 @@ struct dlm_config_info dlm_config = {
.ci_log_info = DEFAULT_LOG_INFO,
.ci_protocol = DEFAULT_PROTOCOL,
.ci_mark = DEFAULT_MARK,
+#ifdef CONFIG_DLM_DEPRECATED_API
.ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
- .ci_waitwarn_us = DEFAULT_WAITWARN_US,
+#endif
.ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
.ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
.ci_cluster_name = DEFAULT_CLUSTER_NAME
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index df92b0a07fc6..55c5f2c13ebd 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -37,8 +37,9 @@ struct dlm_config_info {
int ci_log_info;
int ci_protocol;
int ci_mark;
+#ifdef CONFIG_DLM_DEPRECATED_API
int ci_timewarn_cs;
- int ci_waitwarn_us;
+#endif
int ci_new_rsb_count;
int ci_recover_callbacks;
char ci_cluster_name[DLM_LOCKSPACE_LEN];
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 776c3ed519f0..8aca8085d24e 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -145,7 +145,9 @@ struct dlm_args {
void (*bastfn) (void *astparam, int mode);
int mode;
struct dlm_lksb *lksb;
+#ifdef CONFIG_DLM_DEPRECATED_API
unsigned long timeout;
+#endif
};
@@ -203,10 +205,20 @@ struct dlm_args {
#define DLM_IFL_OVERLAP_UNLOCK 0x00080000
#define DLM_IFL_OVERLAP_CANCEL 0x00100000
#define DLM_IFL_ENDOFLIFE 0x00200000
+#ifdef CONFIG_DLM_DEPRECATED_API
#define DLM_IFL_WATCH_TIMEWARN 0x00400000
#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
+#endif
#define DLM_IFL_DEADLOCK_CANCEL 0x01000000
#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */
+/* least significant 2 bytes are message changed, they are full transmitted
+ * but at receive side only the 2 bytes LSB will be set.
+ *
+ * Even wireshark dlm dissector does only evaluate the lower bytes and note
+ * that they may not be used on transceiver side, we assume the higher bytes
+ * are for internal use or reserved so long they are not parsed on receiver
+ * side.
+ */
#define DLM_IFL_USER 0x00000001
#define DLM_IFL_ORPHAN 0x00000002
@@ -249,10 +261,12 @@ struct dlm_lkb {
struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */
struct list_head lkb_wait_reply; /* waiting for remote reply */
struct list_head lkb_ownqueue; /* list of locks for a process */
- struct list_head lkb_time_list;
ktime_t lkb_timestamp;
- ktime_t lkb_wait_time;
+
+#ifdef CONFIG_DLM_DEPRECATED_API
+ struct list_head lkb_time_list;
unsigned long lkb_timeout_cs;
+#endif
struct mutex lkb_cb_mutex;
struct work_struct lkb_cb_work;
@@ -568,8 +582,10 @@ struct dlm_ls {
struct mutex ls_orphans_mutex;
struct list_head ls_orphans;
+#ifdef CONFIG_DLM_DEPRECATED_API
struct mutex ls_timeout_mutex;
struct list_head ls_timeout;
+#endif
spinlock_t ls_new_rsb_spin;
int ls_new_rsb_count;
@@ -606,8 +622,8 @@ struct dlm_ls {
wait_queue_head_t ls_uevent_wait; /* user part of join/leave */
int ls_uevent_result;
- struct completion ls_members_done;
- int ls_members_result;
+ struct completion ls_recovery_done;
+ int ls_recovery_result;
struct miscdevice ls_device;
@@ -688,7 +704,9 @@ struct dlm_ls {
#define LSFL_RCOM_READY 5
#define LSFL_RCOM_WAIT 6
#define LSFL_UEVENT_WAIT 7
+#ifdef CONFIG_DLM_DEPRECATED_API
#define LSFL_TIMEWARN 8
+#endif
#define LSFL_CB_DELAY 9
#define LSFL_NODIR 10
@@ -741,9 +759,15 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
return test_bit(LSFL_NODIR, &ls->ls_flags);
}
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_netlink_init(void);
void dlm_netlink_exit(void);
void dlm_timeout_warn(struct dlm_lkb *lkb);
+#else
+static inline int dlm_netlink_init(void) { return 0; }
+static inline void dlm_netlink_exit(void) { };
+static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { };
+#endif
int dlm_plock_init(void);
void dlm_plock_exit(void);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 226822f49d30..dac7eb75dba9 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -296,12 +296,14 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
+#ifdef CONFIG_DLM_DEPRECATED_API
/* if the operation was a cancel, then return -DLM_ECANCEL, if a
timeout caused the cancel then return -ETIMEDOUT */
if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) {
lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL;
rv = -ETIMEDOUT;
}
+#endif
if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) {
lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL;
@@ -1210,7 +1212,9 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
kref_init(&lkb->lkb_ref);
INIT_LIST_HEAD(&lkb->lkb_ownqueue);
INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
+#ifdef CONFIG_DLM_DEPRECATED_API
INIT_LIST_HEAD(&lkb->lkb_time_list);
+#endif
INIT_LIST_HEAD(&lkb->lkb_cb_list);
mutex_init(&lkb->lkb_cb_mutex);
INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
@@ -1306,6 +1310,13 @@ static inline void hold_lkb(struct dlm_lkb *lkb)
kref_get(&lkb->lkb_ref);
}
+static void unhold_lkb_assert(struct kref *kref)
+{
+ struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);
+
+ DLM_ASSERT(false, dlm_print_lkb(lkb););
+}
+
/* This is called when we need to remove a reference and are certain
it's not the last ref. e.g. del_lkb is always called between a
find_lkb/put_lkb and is always the inverse of a previous add_lkb.
@@ -1313,9 +1324,7 @@ static inline void hold_lkb(struct dlm_lkb *lkb)
static inline void unhold_lkb(struct dlm_lkb *lkb)
{
- int rv;
- rv = kref_put(&lkb->lkb_ref, kill_lkb);
- DLM_ASSERT(!rv, dlm_print_lkb(lkb););
+ kref_put(&lkb->lkb_ref, unhold_lkb_assert);
}
static void lkb_add_ordered(struct list_head *new, struct list_head *head,
@@ -1402,75 +1411,6 @@ static int msg_reply_type(int mstype)
return -1;
}
-static int nodeid_warned(int nodeid, int num_nodes, int *warned)
-{
- int i;
-
- for (i = 0; i < num_nodes; i++) {
- if (!warned[i]) {
- warned[i] = nodeid;
- return 0;
- }
- if (warned[i] == nodeid)
- return 1;
- }
- return 0;
-}
-
-void dlm_scan_waiters(struct dlm_ls *ls)
-{
- struct dlm_lkb *lkb;
- s64 us;
- s64 debug_maxus = 0;
- u32 debug_scanned = 0;
- u32 debug_expired = 0;
- int num_nodes = 0;
- int *warned = NULL;
-
- if (!dlm_config.ci_waitwarn_us)
- return;
-
- mutex_lock(&ls->ls_waiters_mutex);
-
- list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (!lkb->lkb_wait_time)
- continue;
-
- debug_scanned++;
-
- us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time));
-
- if (us < dlm_config.ci_waitwarn_us)
- continue;
-
- lkb->lkb_wait_time = 0;
-
- debug_expired++;
- if (us > debug_maxus)
- debug_maxus = us;
-
- if (!num_nodes) {
- num_nodes = ls->ls_num_nodes;
- warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL);
- }
- if (!warned)
- continue;
- if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned))
- continue;
-
- log_error(ls, "waitwarn %x %lld %d us check connection to "
- "node %d", lkb->lkb_id, (long long)us,
- dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid);
- }
- mutex_unlock(&ls->ls_waiters_mutex);
- kfree(warned);
-
- if (debug_expired)
- log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us",
- debug_scanned, debug_expired,
- dlm_config.ci_waitwarn_us, (long long)debug_maxus);
-}
-
/* add/remove lkb from global waiters list of lkb's waiting for
a reply from a remote node */
@@ -1514,7 +1454,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
lkb->lkb_wait_count++;
lkb->lkb_wait_type = mstype;
- lkb->lkb_wait_time = ktime_get();
lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
hold_lkb(lkb);
list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
@@ -1842,6 +1781,7 @@ void dlm_scan_rsbs(struct dlm_ls *ls)
}
}
+#ifdef CONFIG_DLM_DEPRECATED_API
static void add_timeout(struct dlm_lkb *lkb)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
@@ -1962,17 +1902,11 @@ void dlm_adjust_timeouts(struct dlm_ls *ls)
list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
mutex_unlock(&ls->ls_timeout_mutex);
-
- if (!dlm_config.ci_waitwarn_us)
- return;
-
- mutex_lock(&ls->ls_waiters_mutex);
- list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (ktime_to_us(lkb->lkb_wait_time))
- lkb->lkb_wait_time = ktime_get();
- }
- mutex_unlock(&ls->ls_waiters_mutex);
}
+#else
+static void add_timeout(struct dlm_lkb *lkb) { }
+static void del_timeout(struct dlm_lkb *lkb) { }
+#endif
/* lkb is master or local copy */
@@ -2837,12 +2771,20 @@ static void confirm_master(struct dlm_rsb *r, int error)
}
}
+#ifdef CONFIG_DLM_DEPRECATED_API
static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
int namelen, unsigned long timeout_cs,
void (*ast) (void *astparam),
void *astparam,
void (*bast) (void *astparam, int mode),
struct dlm_args *args)
+#else
+static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
+ int namelen, void (*ast)(void *astparam),
+ void *astparam,
+ void (*bast)(void *astparam, int mode),
+ struct dlm_args *args)
+#endif
{
int rv = -EINVAL;
@@ -2895,7 +2837,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
args->astfn = ast;
args->astparam = astparam;
args->bastfn = bast;
+#ifdef CONFIG_DLM_DEPRECATED_API
args->timeout = timeout_cs;
+#endif
args->mode = mode;
args->lksb = lksb;
rv = 0;
@@ -2951,7 +2895,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
lkb->lkb_lksb = args->lksb;
lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
lkb->lkb_ownpid = (int) current->pid;
+#ifdef CONFIG_DLM_DEPRECATED_API
lkb->lkb_timeout_cs = args->timeout;
+#endif
rv = 0;
out:
if (rv)
@@ -3472,10 +3418,15 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error)
goto out;
- trace_dlm_lock_start(ls, lkb, mode, flags);
+ trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);
+#ifdef CONFIG_DLM_DEPRECATED_API
error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
astarg, bast, &args);
+#else
+ error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast,
+ &args);
+#endif
if (error)
goto out_put;
@@ -3487,7 +3438,7 @@ int dlm_lock(dlm_lockspace_t *lockspace,
if (error == -EINPROGRESS)
error = 0;
out_put:
- trace_dlm_lock_end(ls, lkb, mode, flags, error);
+ trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error);
if (convert || error)
__put_lkb(ls, lkb);
@@ -5839,9 +5790,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
return 0;
}
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
int mode, uint32_t flags, void *name, unsigned int namelen,
unsigned long timeout_cs)
+#else
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
+ int mode, uint32_t flags, void *name, unsigned int namelen)
+#endif
{
struct dlm_lkb *lkb;
struct dlm_args args;
@@ -5864,8 +5820,13 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
goto out;
}
}
+#ifdef CONFIG_DLM_DEPRECATED_API
error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
fake_astfn, ua, fake_bastfn, &args);
+#else
+ error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua,
+ fake_bastfn, &args);
+#endif
if (error) {
kfree(ua->lksb.sb_lvbptr);
ua->lksb.sb_lvbptr = NULL;
@@ -5904,9 +5865,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
return error;
}
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
unsigned long timeout_cs)
+#else
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
+#endif
{
struct dlm_lkb *lkb;
struct dlm_args args;
@@ -5941,8 +5907,13 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
ua->bastaddr = ua_tmp->bastaddr;
ua->user_lksb = ua_tmp->user_lksb;
+#ifdef CONFIG_DLM_DEPRECATED_API
error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
fake_astfn, ua, fake_bastfn, &args);
+#else
+ error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua,
+ fake_bastfn, &args);
+#endif
if (error)
goto out_put;
@@ -5966,7 +5937,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, void *name, unsigned int namelen,
- unsigned long timeout_cs, uint32_t *lkid)
+ uint32_t *lkid)
{
struct dlm_lkb *lkb = NULL, *iter;
struct dlm_user_args *ua;
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 252a5898f908..a7b6474f009d 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -24,9 +24,15 @@ int dlm_put_lkb(struct dlm_lkb *lkb);
void dlm_scan_rsbs(struct dlm_ls *ls);
int dlm_lock_recovery_try(struct dlm_ls *ls);
void dlm_unlock_recovery(struct dlm_ls *ls);
-void dlm_scan_waiters(struct dlm_ls *ls);
+
+#ifdef CONFIG_DLM_DEPRECATED_API
void dlm_scan_timeout(struct dlm_ls *ls);
void dlm_adjust_timeouts(struct dlm_ls *ls);
+#else
+static inline void dlm_scan_timeout(struct dlm_ls *ls) { }
+static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { }
+#endif
+
int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len,
unsigned int flags, int *r_nodeid, int *result);
@@ -41,15 +47,22 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls);
int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc);
+#ifdef CONFIG_DLM_DEPRECATED_API
int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
uint32_t flags, void *name, unsigned int namelen,
unsigned long timeout_cs);
int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
unsigned long timeout_cs);
+#else
+int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode,
+ uint32_t flags, void *name, unsigned int namelen);
+int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
+ int mode, uint32_t flags, uint32_t lkid, char *lvb_in);
+#endif
int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, void *name, unsigned int namelen,
- unsigned long timeout_cs, uint32_t *lkid);
+ uint32_t *lkid);
int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
uint32_t flags, uint32_t lkid, char *lvb_in);
int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 19ed41a5da93..3972f4d86c75 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -275,7 +275,6 @@ static int dlm_scand(void *data)
ls->ls_scan_time = jiffies;
dlm_scan_rsbs(ls);
dlm_scan_timeout(ls);
- dlm_scan_waiters(ls);
dlm_unlock_recovery(ls);
} else {
ls->ls_scan_time += HZ;
@@ -490,13 +489,28 @@ static int new_lockspace(const char *name, const char *cluster,
ls->ls_ops_arg = ops_arg;
}
- if (flags & DLM_LSFL_TIMEWARN)
+#ifdef CONFIG_DLM_DEPRECATED_API
+ if (flags & DLM_LSFL_TIMEWARN) {
+ pr_warn_once("===============================================================\n"
+ "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n"
+ " will be removed in v6.2!\n"
+ " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n"
+ "===============================================================\n");
+
set_bit(LSFL_TIMEWARN, &ls->ls_flags);
+ }
/* ls_exflags are forced to match among nodes, and we don't
- need to require all nodes to have some flags set */
+ * need to require all nodes to have some flags set
+ */
ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
DLM_LSFL_NEWEXCL));
+#else
+ /* ls_exflags are forced to match among nodes, and we don't
+ * need to require all nodes to have some flags set
+ */
+ ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL));
+#endif
size = READ_ONCE(dlm_config.ci_rsbtbl_size);
ls->ls_rsbtbl_size = size;
@@ -527,8 +541,10 @@ static int new_lockspace(const char *name, const char *cluster,
mutex_init(&ls->ls_waiters_mutex);
INIT_LIST_HEAD(&ls->ls_orphans);
mutex_init(&ls->ls_orphans_mutex);
+#ifdef CONFIG_DLM_DEPRECATED_API
INIT_LIST_HEAD(&ls->ls_timeout);
mutex_init(&ls->ls_timeout_mutex);
+#endif
INIT_LIST_HEAD(&ls->ls_new_rsb);
spin_lock_init(&ls->ls_new_rsb_spin);
@@ -548,8 +564,8 @@ static int new_lockspace(const char *name, const char *cluster,
init_waitqueue_head(&ls->ls_uevent_wait);
ls->ls_uevent_result = 0;
- init_completion(&ls->ls_members_done);
- ls->ls_members_result = -1;
+ init_completion(&ls->ls_recovery_done);
+ ls->ls_recovery_result = -1;
mutex_init(&ls->ls_cb_mutex);
INIT_LIST_HEAD(&ls->ls_cb_delay);
@@ -645,8 +661,9 @@ static int new_lockspace(const char *name, const char *cluster,
if (error)
goto out_recoverd;
- wait_for_completion(&ls->ls_members_done);
- error = ls->ls_members_result;
+ /* wait until recovery is successful or failed */
+ wait_for_completion(&ls->ls_recovery_done);
+ error = ls->ls_recovery_result;
if (error)
goto out_members;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 19e82f08c0e0..a4e84e8d94c8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -529,7 +529,7 @@ static void lowcomms_write_space(struct sock *sk)
return;
if (!test_and_set_bit(CF_CONNECTED, &con->flags)) {
- log_print("successful connected to node %d", con->nodeid);
+ log_print("connected to node %d", con->nodeid);
queue_work(send_workqueue, &con->swork);
return;
}
@@ -1931,7 +1931,7 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock,
return ret;
if (!test_and_set_bit(CF_CONNECTED, &con->flags))
- log_print("successful connected to node %d", con->nodeid);
+ log_print("connected to node %d", con->nodeid);
return 0;
}
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 98084e0cfccf..2af2ccfe43a9 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -534,7 +534,11 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
int i, error, neg = 0, low = -1;
/* previously removed members that we've not finished removing need to
- count as a negative change so the "neg" recovery steps will happen */
+ * count as a negative change so the "neg" recovery steps will happen
+ *
+ * This functionality must report all member changes to lsops or
+ * midcomms layer and must never return before.
+ */
list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
log_rinfo(ls, "prev removed member %d", memb->nodeid);
@@ -583,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
*neg_out = neg;
error = ping_members(ls);
- /* error -EINTR means that a new recovery action is triggered.
- * We ignore this recovery action and let run the new one which might
- * have new member configuration.
- */
- if (error == -EINTR)
- error = 0;
-
- /* new_lockspace() may be waiting to know if the config
- * is good or bad
- */
- ls->ls_members_result = error;
- complete(&ls->ls_members_done);
-
log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
}
@@ -675,7 +666,16 @@ int dlm_ls_stop(struct dlm_ls *ls)
if (!ls->ls_recover_begin)
ls->ls_recover_begin = jiffies;
- dlm_lsop_recover_prep(ls);
+ /* call recover_prep ops only once and not multiple times
+ * for each possible dlm_ls_stop() when recovery is already
+ * stopped.
+ *
+ * If we successful was able to clear LSFL_RUNNING bit and
+ * it was set we know it is the first dlm_ls_stop() call.
+ */
+ if (new)
+ dlm_lsop_recover_prep(ls);
+
return 0;
}
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 0993eebf2060..737f185aad8d 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -29,6 +29,8 @@ struct plock_async_data {
struct plock_op {
struct list_head list;
int done;
+ /* if lock op got interrupted while waiting dlm_controld reply */
+ bool sigint;
struct dlm_plock_info info;
/* if set indicates async handling */
struct plock_async_data *data;
@@ -79,8 +81,7 @@ static void send_op(struct plock_op *op)
abandoned waiter. So, we have to insert the unlock-close when the
lock call is interrupted. */
-static void do_unlock_close(struct dlm_ls *ls, u64 number,
- struct file *file, struct file_lock *fl)
+static void do_unlock_close(const struct dlm_plock_info *info)
{
struct plock_op *op;
@@ -89,15 +90,12 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number,
return;
op->info.optype = DLM_PLOCK_OP_UNLOCK;
- op->info.pid = fl->fl_pid;
- op->info.fsid = ls->ls_global_id;
- op->info.number = number;
+ op->info.pid = info->pid;
+ op->info.fsid = info->fsid;
+ op->info.number = info->number;
op->info.start = 0;
op->info.end = OFFSET_MAX;
- if (fl->fl_lmops && fl->fl_lmops->lm_grant)
- op->info.owner = (__u64) fl->fl_pid;
- else
- op->info.owner = (__u64)(long) fl->fl_owner;
+ op->info.owner = info->owner;
op->info.flags |= DLM_PLOCK_FL_CLOSE;
send_op(op);
@@ -161,16 +159,24 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = wait_event_interruptible(recv_wq, (op->done != 0));
if (rv == -ERESTARTSYS) {
spin_lock(&ops_lock);
- list_del(&op->list);
+ /* recheck under ops_lock if we got a done != 0,
+ * if so this interrupt case should be ignored
+ */
+ if (op->done != 0) {
+ spin_unlock(&ops_lock);
+ goto do_lock_wait;
+ }
+
+ op->sigint = true;
spin_unlock(&ops_lock);
- log_print("%s: wait interrupted %x %llx, op removed",
+ log_debug(ls, "%s: wait interrupted %x %llx pid %d",
__func__, ls->ls_global_id,
- (unsigned long long)number);
- dlm_release_plock_op(op);
- do_unlock_close(ls, number, file, fl);
+ (unsigned long long)number, op->info.pid);
goto out;
}
+do_lock_wait:
+
WARN_ON(!list_empty(&op->list));
rv = op->info.rv;
@@ -378,7 +384,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
spin_lock(&ops_lock);
if (!list_empty(&send_list)) {
- op = list_entry(send_list.next, struct plock_op, list);
+ op = list_first_entry(&send_list, struct plock_op, list);
if (op->info.flags & DLM_PLOCK_FL_CLOSE)
list_del(&op->list);
else
@@ -425,6 +431,19 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
if (iter->info.fsid == info.fsid &&
iter->info.number == info.number &&
iter->info.owner == info.owner) {
+ if (iter->sigint) {
+ list_del(&iter->list);
+ spin_unlock(&ops_lock);
+
+ pr_debug("%s: sigint cleanup %x %llx pid %d",
+ __func__, iter->info.fsid,
+ (unsigned long long)iter->info.number,
+ iter->info.pid);
+ do_unlock_close(&iter->info);
+ memcpy(&iter->info, &info, sizeof(info));
+ dlm_release_plock_op(iter);
+ return count;
+ }
list_del_init(&iter->list);
memcpy(&iter->info, &info, sizeof(info));
if (iter->data)
@@ -443,7 +462,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
else
wake_up(&recv_wq);
} else
- log_print("%s: no op %x %llx - may got interrupted?", __func__,
+ log_print("%s: no op %x %llx", __func__,
info.fsid, (unsigned long long)info.number);
return count;
}
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index a55dfce705dd..e15eb511b04b 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -70,6 +70,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
/*
* Add or remove nodes from the lockspace's ls_nodes list.
+ *
+ * Due to the fact that we must report all membership changes to lsops
+ * or midcomms layer, it is not permitted to abort ls_recover() until
+ * this is done.
*/
error = dlm_recover_members(ls, rv, &neg);
@@ -239,14 +243,12 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
jiffies_to_msecs(jiffies - start));
mutex_unlock(&ls->ls_recoverd_active);
- dlm_lsop_recover_done(ls);
return 0;
fail:
dlm_release_root_list(ls);
- log_rinfo(ls, "dlm_recover %llu error %d",
- (unsigned long long)rv->seq, error);
mutex_unlock(&ls->ls_recoverd_active);
+
return error;
}
@@ -257,6 +259,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
static void do_ls_recovery(struct dlm_ls *ls)
{
struct dlm_recover *rv = NULL;
+ int error;
spin_lock(&ls->ls_recover_lock);
rv = ls->ls_recover_args;
@@ -266,7 +269,31 @@ static void do_ls_recovery(struct dlm_ls *ls)
spin_unlock(&ls->ls_recover_lock);
if (rv) {
- ls_recover(ls, rv);
+ error = ls_recover(ls, rv);
+ switch (error) {
+ case 0:
+ ls->ls_recovery_result = 0;
+ complete(&ls->ls_recovery_done);
+
+ dlm_lsop_recover_done(ls);
+ break;
+ case -EINTR:
+ /* if recovery was interrupted -EINTR we wait for the next
+ * ls_recover() iteration until it hopefully succeeds.
+ */
+ log_rinfo(ls, "%s %llu interrupted and should be queued to run again",
+ __func__, (unsigned long long)rv->seq);
+ break;
+ default:
+ log_rinfo(ls, "%s %llu error %d", __func__,
+ (unsigned long long)rv->seq, error);
+
+ /* let new_lockspace() get aware of critical error */
+ ls->ls_recovery_result = error;
+ complete(&ls->ls_recovery_done);
+ break;
+ }
+
kfree(rv->nodes);
kfree(rv);
}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 1060b24f18d4..99e8f0744513 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -250,6 +250,14 @@ static int device_user_lock(struct dlm_user_proc *proc,
goto out;
}
+#ifdef CONFIG_DLM_DEPRECATED_API
+ if (params->timeout)
+ pr_warn_once("========================================================\n"
+ "WARNING: the lkb timeout feature is being deprecated and\n"
+ " will be removed in v6.2!\n"
+ "========================================================\n");
+#endif
+
ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
if (!ua)
goto out;
@@ -262,23 +270,34 @@ static int device_user_lock(struct dlm_user_proc *proc,
ua->xid = params->xid;
if (params->flags & DLM_LKF_CONVERT) {
+#ifdef CONFIG_DLM_DEPRECATED_API
error = dlm_user_convert(ls, ua,
params->mode, params->flags,
params->lkid, params->lvb,
(unsigned long) params->timeout);
+#else
+ error = dlm_user_convert(ls, ua,
+ params->mode, params->flags,
+ params->lkid, params->lvb);
+#endif
} else if (params->flags & DLM_LKF_ORPHAN) {
error = dlm_user_adopt_orphan(ls, ua,
params->mode, params->flags,
params->name, params->namelen,
- (unsigned long) params->timeout,
&lkid);
if (!error)
error = lkid;
} else {
+#ifdef CONFIG_DLM_DEPRECATED_API
error = dlm_user_request(ls, ua,
params->mode, params->flags,
params->name, params->namelen,
(unsigned long) params->timeout);
+#else
+ error = dlm_user_request(ls, ua,
+ params->mode, params->flags,
+ params->name, params->namelen);
+#endif
if (!error)
error = ua->lksb.sb_lkid;
}
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 19e6c56a9f47..26fa170090b8 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -17,7 +17,7 @@ struct z_erofs_decompress_req {
/* indicate the algorithm will be used for decompression */
unsigned int alg;
- bool inplace_io, partial_decoding;
+ bool inplace_io, partial_decoding, fillgaps;
};
struct z_erofs_decompressor {
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fbb037ba326e..fe8ac0e163f7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -366,42 +366,33 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
return iomap_bmap(mapping, block, &erofs_iomap_ops);
}
-static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to)
+static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
- loff_t align = iocb->ki_pos | iov_iter_count(to) |
- iov_iter_alignment(to);
- struct block_device *bdev = inode->i_sb->s_bdev;
- unsigned int blksize_mask;
-
- if (bdev)
- blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1;
- else
- blksize_mask = (1 << inode->i_blkbits) - 1;
- if (align & blksize_mask)
- return -EINVAL;
- return 0;
-}
-
-static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
/* no need taking (shared) inode lock since it's a ro filesystem */
if (!iov_iter_count(to))
return 0;
#ifdef CONFIG_FS_DAX
- if (IS_DAX(iocb->ki_filp->f_mapping->host))
+ if (IS_DAX(inode))
return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
#endif
if (iocb->ki_flags & IOCB_DIRECT) {
- int err = erofs_prepare_dio(iocb, to);
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ unsigned int blksize_mask;
+
+ if (bdev)
+ blksize_mask = bdev_logical_block_size(bdev) - 1;
+ else
+ blksize_mask = (1 << inode->i_blkbits) - 1;
+
+ if ((iocb->ki_pos | iov_iter_count(to) |
+ iov_iter_alignment(to)) & blksize_mask)
+ return -EINVAL;
- if (!err)
- return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
- NULL, 0, NULL, 0);
- if (err < 0)
- return err;
+ return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+ NULL, 0, NULL, 0);
}
return filemap_read(iocb, to, 0);
}
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 6dca1900c733..2d55569f96ac 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -83,7 +83,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
j = 0;
/* 'valid' bounced can only be tested after a complete round */
- if (test_bit(j, bounced)) {
+ if (!rq->fillgaps && test_bit(j, bounced)) {
DBG_BUGON(i < lz4_max_distance_pages);
DBG_BUGON(top >= lz4_max_distance_pages);
availables[top++] = rq->out[i - lz4_max_distance_pages];
@@ -91,14 +91,18 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
if (page) {
__clear_bit(j, bounced);
- if (kaddr) {
- if (kaddr + PAGE_SIZE == page_address(page))
+ if (!PageHighMem(page)) {
+ if (!i) {
+ kaddr = page_address(page);
+ continue;
+ }
+ if (kaddr &&
+ kaddr + PAGE_SIZE == page_address(page)) {
kaddr += PAGE_SIZE;
- else
- kaddr = NULL;
- } else if (!i) {
- kaddr = page_address(page);
+ continue;
+ }
}
+ kaddr = NULL;
continue;
}
kaddr = NULL;
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 05a3063cf2bc..5e59b3f523eb 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -143,6 +143,7 @@ again:
DBG_BUGON(z_erofs_lzma_head);
z_erofs_lzma_head = head;
spin_unlock(&z_erofs_lzma_lock);
+ wake_up_all(&z_erofs_lzma_wq);
z_erofs_lzma_max_dictsize = dict_size;
mutex_unlock(&lzma_resize_mutex);
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index 18e59821c597..ecf28f66b97d 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name,
}
static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
- void *dentry_blk, unsigned int *ofs,
+ void *dentry_blk, struct erofs_dirent *de,
unsigned int nameoff, unsigned int maxsize)
{
- struct erofs_dirent *de = dentry_blk + *ofs;
const struct erofs_dirent *end = dentry_blk + nameoff;
while (de < end) {
@@ -59,9 +58,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
/* stopped by some reason */
return 1;
++de;
- *ofs += sizeof(struct erofs_dirent);
+ ctx->pos += sizeof(struct erofs_dirent);
}
- *ofs = maxsize;
return 0;
}
@@ -90,33 +88,33 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
nameoff = le16_to_cpu(de->nameoff);
if (nameoff < sizeof(struct erofs_dirent) ||
- nameoff >= PAGE_SIZE) {
+ nameoff >= EROFS_BLKSIZ) {
erofs_err(dir->i_sb,
"invalid de[0].nameoff %u @ nid %llu",
nameoff, EROFS_I(dir)->nid);
err = -EFSCORRUPTED;
- goto skip_this;
+ break;
}
maxsize = min_t(unsigned int,
- dirsize - ctx->pos + ofs, PAGE_SIZE);
+ dirsize - ctx->pos + ofs, EROFS_BLKSIZ);
/* search dirents at the arbitrary position */
if (initial) {
initial = false;
ofs = roundup(ofs, sizeof(struct erofs_dirent));
+ ctx->pos = blknr_to_addr(i) + ofs;
if (ofs >= nameoff)
goto skip_this;
}
- err = erofs_fill_dentries(dir, ctx, de, &ofs,
+ err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs,
nameoff, maxsize);
-skip_this:
- ctx->pos = blknr_to_addr(i) + ofs;
-
if (err)
break;
+skip_this:
+ ctx->pos = blknr_to_addr(i) + maxsize;
++i;
ofs = 0;
}
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 724bb57075f6..5792ca9e0d5e 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2018 HUAWEI, Inc.
* https://www.huawei.com/
+ * Copyright (C) 2022 Alibaba Cloud
*/
#include "zdata.h"
#include "compress.h"
@@ -26,6 +27,82 @@ static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};
+struct z_erofs_bvec_iter {
+ struct page *bvpage;
+ struct z_erofs_bvset *bvset;
+ unsigned int nr, cur;
+};
+
+static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
+{
+ if (iter->bvpage)
+ kunmap_local(iter->bvset);
+ return iter->bvpage;
+}
+
+static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
+{
+ unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
+ /* have to access nextpage in advance, otherwise it will be unmapped */
+ struct page *nextpage = iter->bvset->nextpage;
+ struct page *oldpage;
+
+ DBG_BUGON(!nextpage);
+ oldpage = z_erofs_bvec_iter_end(iter);
+ iter->bvpage = nextpage;
+ iter->bvset = kmap_local_page(nextpage);
+ iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
+ iter->cur = 0;
+ return oldpage;
+}
+
+static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvset_inline *bvset,
+ unsigned int bootstrap_nr,
+ unsigned int cur)
+{
+ *iter = (struct z_erofs_bvec_iter) {
+ .nr = bootstrap_nr,
+ .bvset = (struct z_erofs_bvset *)bvset,
+ };
+
+ while (cur > iter->nr) {
+ cur -= iter->nr;
+ z_erofs_bvset_flip(iter);
+ }
+ iter->cur = cur;
+}
+
+static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvec *bvec,
+ struct page **candidate_bvpage)
+{
+ if (iter->cur == iter->nr) {
+ if (!*candidate_bvpage)
+ return -EAGAIN;
+
+ DBG_BUGON(iter->bvset->nextpage);
+ iter->bvset->nextpage = *candidate_bvpage;
+ z_erofs_bvset_flip(iter);
+
+ iter->bvset->nextpage = NULL;
+ *candidate_bvpage = NULL;
+ }
+ iter->bvset->bvec[iter->cur++] = *bvec;
+ return 0;
+}
+
+static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvec *bvec,
+ struct page **old_bvpage)
+{
+ if (iter->cur == iter->nr)
+ *old_bvpage = z_erofs_bvset_flip(iter);
+ else
+ *old_bvpage = NULL;
+ *bvec = iter->bvset->bvec[iter->cur++];
+}
+
static void z_erofs_destroy_pcluster_pool(void)
{
int i;
@@ -46,7 +123,7 @@ static int z_erofs_create_pcluster_pool(void)
for (pcs = pcluster_pool;
pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
- size = struct_size(a, compressed_pages, pcs->maxpages);
+ size = struct_size(a, compressed_bvecs, pcs->maxpages);
sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
pcs->slab = kmem_cache_create(pcs->name, size, 0,
@@ -150,30 +227,29 @@ int __init z_erofs_init_zip_subsystem(void)
return err;
}
-enum z_erofs_collectmode {
- COLLECT_SECONDARY,
- COLLECT_PRIMARY,
+enum z_erofs_pclustermode {
+ Z_EROFS_PCLUSTER_INFLIGHT,
/*
- * The current collection was the tail of an exist chain, in addition
- * that the previous processed chained collections are all decided to
+ * The current pclusters was the tail of an exist chain, in addition
+ * that the previous processed chained pclusters are all decided to
* be hooked up to it.
- * A new chain will be created for the remaining collections which are
- * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED,
- * the next collection cannot reuse the whole page safely in
- * the following scenario:
+ * A new chain will be created for the remaining pclusters which are
+ * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED,
+ * the next pcluster cannot reuse the whole page safely for inplace I/O
+ * in the following scenario:
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
- * | (belongs to the next cl) | (belongs to the current cl) |
- * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
+ * | (belongs to the next pcl) | (belongs to the current pcl) |
+ * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
*/
- COLLECT_PRIMARY_HOOKED,
+ Z_EROFS_PCLUSTER_HOOKED,
/*
- * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it
+ * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
* could be dispatched into bypass queue later due to uptodated managed
* pages. All related online pages cannot be reused for inplace I/O (or
- * pagevec) since it can be directly decoded without I/O submission.
+ * bvpage) since it can be directly decoded without I/O submission.
*/
- COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
+ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
* The current collection has been linked with the owned chain, and
* could also be linked with the remaining collections, which means
@@ -184,39 +260,36 @@ enum z_erofs_collectmode {
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
* | (of the current cl) | (of the previous collection) |
- * | PRIMARY_FOLLOWED or | |
- * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
+ * | PCLUSTER_FOLLOWED or | |
+ * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
*
* [ (*) the above page can be used as inplace I/O. ]
*/
- COLLECT_PRIMARY_FOLLOWED,
+ Z_EROFS_PCLUSTER_FOLLOWED,
};
struct z_erofs_decompress_frontend {
struct inode *const inode;
struct erofs_map_blocks map;
+ struct z_erofs_bvec_iter biter;
- struct z_erofs_pagevec_ctor vector;
-
+ struct page *candidate_bvpage;
struct z_erofs_pcluster *pcl, *tailpcl;
- /* a pointer used to pick up inplace I/O pages */
- struct page **icpage_ptr;
z_erofs_next_pcluster_t owned_head;
-
- enum z_erofs_collectmode mode;
+ enum z_erofs_pclustermode mode;
bool readahead;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
+
+ /* a pointer used to pick up inplace I/O pages */
+ unsigned int icur;
};
#define DECOMPRESS_FRONTEND_INIT(__i) { \
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true }
-
-static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
-static DEFINE_MUTEX(z_pagemap_global_lock);
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
enum z_erofs_cache_alloctype type,
@@ -231,24 +304,21 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
*/
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
- struct page **pages;
- pgoff_t index;
+ unsigned int i;
- if (fe->mode < COLLECT_PRIMARY_FOLLOWED)
+ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
- pages = pcl->compressed_pages;
- index = pcl->obj.index;
- for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) {
+ for (i = 0; i < pcl->pclusterpages; ++i) {
struct page *page;
compressed_page_t t;
struct page *newpage = NULL;
/* the compressed page was loaded before */
- if (READ_ONCE(*pages))
+ if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- page = find_get_page(mc, index);
+ page = find_get_page(mc, pcl->obj.index + i);
if (page) {
t = tag_compressed_page_justfound(page);
@@ -269,7 +339,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
}
}
- if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
+ if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL,
+ tagptr_cast_ptr(t)))
continue;
if (page)
@@ -283,7 +354,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
* managed cache since it can be moved to the bypass queue instead.
*/
if (standalone)
- fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
/* called by erofs_shrinker to get rid of all compressed_pages */
@@ -300,7 +371,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
* therefore no need to worry about available decompression users.
*/
for (i = 0; i < pcl->pclusterpages; ++i) {
- struct page *page = pcl->compressed_pages[i];
+ struct page *page = pcl->compressed_bvecs[i].page;
if (!page)
continue;
@@ -313,7 +384,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
continue;
/* barrier is implied in the following 'unlock_page' */
- WRITE_ONCE(pcl->compressed_pages[i], NULL);
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
detach_page_private(page);
unlock_page(page);
}
@@ -323,56 +394,59 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
int erofs_try_to_free_cached_page(struct page *page)
{
struct z_erofs_pcluster *const pcl = (void *)page_private(page);
- int ret = 0; /* 0 - busy */
+ int ret, i;
- if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
- unsigned int i;
+ if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
+ return 0;
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- for (i = 0; i < pcl->pclusterpages; ++i) {
- if (pcl->compressed_pages[i] == page) {
- WRITE_ONCE(pcl->compressed_pages[i], NULL);
- ret = 1;
- break;
- }
+ ret = 0;
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ for (i = 0; i < pcl->pclusterpages; ++i) {
+ if (pcl->compressed_bvecs[i].page == page) {
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
+ ret = 1;
+ break;
}
- erofs_workgroup_unfreeze(&pcl->obj, 1);
-
- if (ret)
- detach_page_private(page);
}
+ erofs_workgroup_unfreeze(&pcl->obj, 1);
+ if (ret)
+ detach_page_private(page);
return ret;
}
-/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct page *page)
+ struct z_erofs_bvec *bvec)
{
struct z_erofs_pcluster *const pcl = fe->pcl;
- while (fe->icpage_ptr > pcl->compressed_pages)
- if (!cmpxchg(--fe->icpage_ptr, NULL, page))
+ while (fe->icur > 0) {
+ if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
+ NULL, bvec->page)) {
+ pcl->compressed_bvecs[fe->icur] = *bvec;
return true;
+ }
+ }
return false;
}
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
- struct page *page, enum z_erofs_page_type type,
- bool pvec_safereuse)
+ struct z_erofs_bvec *bvec, bool exclusive)
{
int ret;
- /* give priority for inplaceio */
- if (fe->mode >= COLLECT_PRIMARY &&
- type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
- z_erofs_try_inplace_io(fe, page))
- return 0;
-
- ret = z_erofs_pagevec_enqueue(&fe->vector, page, type,
- pvec_safereuse);
- fe->pcl->vcnt += (unsigned int)ret;
- return ret ? 0 : -EAGAIN;
+ if (exclusive) {
+ /* give priority for inplaceio to use file pages first */
+ if (z_erofs_try_inplace_io(fe, bvec))
+ return 0;
+ /* otherwise, check if it can be used as a bvpage */
+ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
+ !fe->candidate_bvpage)
+ fe->candidate_bvpage = bvec->page;
+ }
+ ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
+ fe->pcl->vcnt += (ret >= 0);
+ return ret;
}
static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
@@ -385,7 +459,7 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
*owned_head) == Z_EROFS_PCLUSTER_NIL) {
*owned_head = &pcl->next;
/* so we can attach this pcluster to our submission chain. */
- f->mode = COLLECT_PRIMARY_FOLLOWED;
+ f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
return;
}
@@ -393,66 +467,21 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
* type 2, link to the end of an existing open chain, be careful
* that its submission is controlled by the original attached chain.
*/
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
+ if (*owned_head != &pcl->next && pcl != f->tailpcl &&
+ cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
*owned_head) == Z_EROFS_PCLUSTER_TAIL) {
*owned_head = Z_EROFS_PCLUSTER_TAIL;
- f->mode = COLLECT_PRIMARY_HOOKED;
+ f->mode = Z_EROFS_PCLUSTER_HOOKED;
f->tailpcl = NULL;
return;
}
/* type 3, it belongs to a chain, but it isn't the end of the chain */
- f->mode = COLLECT_PRIMARY;
+ f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
}
-static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe,
- struct inode *inode,
- struct erofs_map_blocks *map)
-{
- struct z_erofs_pcluster *pcl = fe->pcl;
- unsigned int length;
-
- /* to avoid unexpected loop formed by corrupted images */
- if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
-
- if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
-
- length = READ_ONCE(pcl->length);
- if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
- if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- } else {
- unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;
-
- if (map->m_flags & EROFS_MAP_FULL_MAPPED)
- llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;
-
- while (llen > length &&
- length != cmpxchg_relaxed(&pcl->length, length, llen)) {
- cpu_relax();
- length = READ_ONCE(pcl->length);
- }
- }
- mutex_lock(&pcl->lock);
- /* used to check tail merging loop due to corrupted images */
- if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
- fe->tailpcl = pcl;
-
- z_erofs_try_to_claim_pcluster(fe);
- return 0;
-}
-
-static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
+ struct erofs_map_blocks *map = &fe->map;
bool ztailpacking = map->m_flags & EROFS_MAP_META;
struct z_erofs_pcluster *pcl;
struct erofs_workgroup *grp;
@@ -471,14 +500,13 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe,
atomic_set(&pcl->obj.refcount, 1);
pcl->algorithmformat = map->m_algorithmformat;
- pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
- (map->m_flags & EROFS_MAP_FULL_MAPPED ?
- Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
+ pcl->length = 0;
+ pcl->partial = true;
/* new pclusters should be claimed as type 1, primary and followed */
pcl->next = fe->owned_head;
pcl->pageofs_out = map->m_la & ~PAGE_MASK;
- fe->mode = COLLECT_PRIMARY_FOLLOWED;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
/*
* lock all primary followed works before visible to others
@@ -494,7 +522,7 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe,
} else {
pcl->obj.index = map->m_pa >> PAGE_SHIFT;
- grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
+ grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
if (IS_ERR(grp)) {
err = PTR_ERR(grp);
goto err_out;
@@ -520,11 +548,10 @@ err_out:
return err;
}
-static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
{
- struct erofs_workgroup *grp;
+ struct erofs_map_blocks *map = &fe->map;
+ struct erofs_workgroup *grp = NULL;
int ret;
DBG_BUGON(fe->pcl);
@@ -533,38 +560,35 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe,
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- if (map->m_flags & EROFS_MAP_META) {
- if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- goto tailpacking;
+ if (!(map->m_flags & EROFS_MAP_META)) {
+ grp = erofs_find_workgroup(fe->inode->i_sb,
+ map->m_pa >> PAGE_SHIFT);
+ } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
- grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
if (grp) {
fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ ret = -EEXIST;
} else {
-tailpacking:
- ret = z_erofs_register_pcluster(fe, inode, map);
- if (!ret)
- goto out;
- if (ret != -EEXIST)
- return ret;
+ ret = z_erofs_register_pcluster(fe);
}
- ret = z_erofs_lookup_pcluster(fe, inode, map);
- if (ret) {
- erofs_workgroup_put(&fe->pcl->obj);
+ if (ret == -EEXIST) {
+ mutex_lock(&fe->pcl->lock);
+ /* used to check tail merging loop due to corrupted images */
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = fe->pcl;
+
+ z_erofs_try_to_claim_pcluster(fe);
+ } else if (ret) {
return ret;
}
-
-out:
- z_erofs_pagevec_ctor_init(&fe->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- fe->pcl->pagevec, fe->pcl->vcnt);
+ z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
+ Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
/* since file-backed online pages are traversed in reverse order */
- fe->icpage_ptr = fe->pcl->compressed_pages +
- z_erofs_pclusterpages(fe->pcl);
+ fe->icur = z_erofs_pclusterpages(fe->pcl);
return 0;
}
@@ -593,14 +617,19 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
if (!pcl)
return false;
- z_erofs_pagevec_ctor_exit(&fe->vector, false);
+ z_erofs_bvec_iter_end(&fe->biter);
mutex_unlock(&pcl->lock);
+ if (fe->candidate_bvpage) {
+ DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
+ fe->candidate_bvpage = NULL;
+ }
+
/*
* if all pending pages are added, don't hold its reference
* any longer if the pcluster isn't hosted by ourselves.
*/
- if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
+ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
erofs_workgroup_put(&pcl->obj);
fe->pcl = NULL;
@@ -628,11 +657,10 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
- bool tight = true;
+ bool tight = true, exclusive;
enum z_erofs_cache_alloctype cache_strategy;
- enum z_erofs_page_type page_type;
- unsigned int cur, end, spiltted, index;
+ unsigned int cur, end, spiltted;
int err = 0;
/* register locked file pages as online pages in pack */
@@ -653,7 +681,7 @@ repeat:
map->m_llen = 0;
err = z_erofs_map_blocks_iter(inode, map, 0);
if (err)
- goto err_out;
+ goto out;
} else {
if (fe->pcl)
goto hitted;
@@ -663,9 +691,9 @@ repeat:
if (!(map->m_flags & EROFS_MAP_MAPPED))
goto hitted;
- err = z_erofs_collector_begin(fe, inode, map);
+ err = z_erofs_collector_begin(fe);
if (err)
- goto err_out;
+ goto out;
if (z_erofs_is_inline_pcluster(fe->pcl)) {
void *mp;
@@ -676,11 +704,12 @@ repeat:
err = PTR_ERR(mp);
erofs_err(inode->i_sb,
"failed to get inline page, err %d", err);
- goto err_out;
+ goto out;
}
get_page(fe->map.buf.page);
- WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page);
- fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
+ fe->map.buf.page);
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
@@ -696,10 +725,10 @@ hitted:
* Ensure the current partial page belongs to this submit chain rather
* than other concurrent submit chains or the noio(bypass) chain since
* those chains are handled asynchronously thus the page cannot be used
- * for inplace I/O or pagevec (should be processed in strict order.)
+ * for inplace I/O or bvpage (should be processed in a strict order.)
*/
- tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED &&
- fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
+ fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
@@ -707,60 +736,59 @@ hitted:
goto next_part;
}
- /* let's derive page type */
- page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
- (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
- (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
- Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
-
+ exclusive = (!cur && (!spiltted || tight));
if (cur)
- tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED);
+ tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
retry:
- err = z_erofs_attach_page(fe, page, page_type,
- fe->mode >= COLLECT_PRIMARY_FOLLOWED);
- /* should allocate an additional short-lived page for pagevec */
- if (err == -EAGAIN) {
- struct page *const newpage =
- alloc_page(GFP_NOFS | __GFP_NOFAIL);
-
- set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
- err = z_erofs_attach_page(fe, newpage,
- Z_EROFS_PAGE_TYPE_EXCLUSIVE, true);
- if (!err)
- goto retry;
+ err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
+ .page = page,
+ .offset = offset - map->m_la,
+ .end = end,
+ }), exclusive);
+ /* should allocate an additional short-lived page for bvset */
+ if (err == -EAGAIN && !fe->candidate_bvpage) {
+ fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
+ set_page_private(fe->candidate_bvpage,
+ Z_EROFS_SHORTLIVED_PAGE);
+ goto retry;
}
- if (err)
- goto err_out;
-
- index = page->index - (map->m_la >> PAGE_SHIFT);
-
- z_erofs_onlinepage_fixup(page, index, true);
+ if (err) {
+ DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
+ goto out;
+ }
+ z_erofs_onlinepage_split(page);
/* bump up the number of spiltted parts of a page */
++spiltted;
- /* also update nr_pages */
- fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1);
+ if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
+ fe->pcl->multibases = true;
+
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ fe->pcl->length == map->m_llen)
+ fe->pcl->partial = false;
+ if (fe->pcl->length < offset + end - map->m_la) {
+ fe->pcl->length = offset + end - map->m_la;
+ fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ }
next_part:
- /* can be used for verification */
+ /* shorten the remaining extent to update progress */
map->m_llen = offset + cur - map->m_la;
+ map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
end = cur;
if (end > 0)
goto repeat;
out:
+ if (err)
+ z_erofs_page_mark_eio(page);
z_erofs_onlinepage_endio(page);
erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
__func__, page, spiltted, map->m_llen);
return err;
-
- /* if some error occurred while processing this page */
-err_out:
- SetPageError(page);
- goto out;
}
static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
@@ -783,97 +811,137 @@ static bool z_erofs_page_is_invalidated(struct page *page)
return !page->mapping && !z_erofs_is_shortlived_page(page);
}
-static int z_erofs_decompress_pcluster(struct super_block *sb,
- struct z_erofs_pcluster *pcl,
- struct page **pagepool)
-{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
- struct z_erofs_pagevec_ctor ctor;
- unsigned int i, inputsize, outputsize, llen, nr_pages;
- struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
- struct page **pages, **compressed_pages, *page;
+struct z_erofs_decompress_backend {
+ struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
+ struct super_block *sb;
+ struct z_erofs_pcluster *pcl;
- enum z_erofs_page_type page_type;
- bool overlapped, partial;
- int err;
+ /* pages with the longest decompressed length for deduplication */
+ struct page **decompressed_pages;
+ /* pages to keep the compressed data */
+ struct page **compressed_pages;
- might_sleep();
- DBG_BUGON(!READ_ONCE(pcl->nr_pages));
+ struct list_head decompressed_secondary_bvecs;
+ struct page **pagepool;
+ unsigned int onstack_used, nr_pages;
+};
- mutex_lock(&pcl->lock);
- nr_pages = pcl->nr_pages;
+struct z_erofs_bvec_item {
+ struct z_erofs_bvec bvec;
+ struct list_head list;
+};
- if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
- pages = pages_onstack;
- } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
- mutex_trylock(&z_pagemap_global_lock)) {
- pages = z_pagemap_global;
- } else {
- gfp_t gfp_flags = GFP_KERNEL;
+static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
+ struct z_erofs_bvec *bvec)
+{
+ struct z_erofs_bvec_item *item;
- if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
- gfp_flags |= __GFP_NOFAIL;
+ if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) {
+ unsigned int pgnr;
+ struct page *oldpage;
- pages = kvmalloc_array(nr_pages, sizeof(struct page *),
- gfp_flags);
+ pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
+ DBG_BUGON(pgnr >= be->nr_pages);
+ oldpage = be->decompressed_pages[pgnr];
+ be->decompressed_pages[pgnr] = bvec->page;
- /* fallback to global pagemap for the lowmem scenario */
- if (!pages) {
- mutex_lock(&z_pagemap_global_lock);
- pages = z_pagemap_global;
- }
+ if (!oldpage)
+ return;
}
- for (i = 0; i < nr_pages; ++i)
- pages[i] = NULL;
-
- err = 0;
- z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
- pcl->pagevec, 0);
-
- for (i = 0; i < pcl->vcnt; ++i) {
- unsigned int pagenr;
+ /* (cold path) one pcluster is requested multiple times */
+ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
+ item->bvec = *bvec;
+ list_add(&item->list, &be->decompressed_secondary_bvecs);
+}
- page = z_erofs_pagevec_dequeue(&ctor, &page_type);
+static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
+ int err)
+{
+ unsigned int off0 = be->pcl->pageofs_out;
+ struct list_head *p, *n;
+
+ list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
+ struct z_erofs_bvec_item *bvi;
+ unsigned int end, cur;
+ void *dst, *src;
+
+ bvi = container_of(p, struct z_erofs_bvec_item, list);
+ cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
+ end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
+ bvi->bvec.end);
+ dst = kmap_local_page(bvi->bvec.page);
+ while (cur < end) {
+ unsigned int pgnr, scur, len;
+
+ pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
+ DBG_BUGON(pgnr >= be->nr_pages);
+
+ scur = bvi->bvec.offset + cur -
+ ((pgnr << PAGE_SHIFT) - off0);
+ len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
+ if (!be->decompressed_pages[pgnr]) {
+ err = -EFSCORRUPTED;
+ cur += len;
+ continue;
+ }
+ src = kmap_local_page(be->decompressed_pages[pgnr]);
+ memcpy(dst + cur, src + scur, len);
+ kunmap_local(src);
+ cur += len;
+ }
+ kunmap_local(dst);
+ if (err)
+ z_erofs_page_mark_eio(bvi->bvec.page);
+ z_erofs_onlinepage_endio(bvi->bvec.page);
+ list_del(p);
+ kfree(bvi);
+ }
+}
- /* all pages in pagevec ought to be valid */
- DBG_BUGON(!page);
- DBG_BUGON(z_erofs_page_is_invalidated(page));
+static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
+{
+ struct z_erofs_pcluster *pcl = be->pcl;
+ struct z_erofs_bvec_iter biter;
+ struct page *old_bvpage;
+ int i;
- if (z_erofs_put_shortlivedpage(pagepool, page))
- continue;
+ z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
+ for (i = 0; i < pcl->vcnt; ++i) {
+ struct z_erofs_bvec bvec;
- if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
- pagenr = 0;
- else
- pagenr = z_erofs_onlinepage_index(page);
+ z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
- DBG_BUGON(pagenr >= nr_pages);
+ if (old_bvpage)
+ z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
- /*
- * currently EROFS doesn't support multiref(dedup),
- * so here erroring out one multiref page.
- */
- if (pages[pagenr]) {
- DBG_BUGON(1);
- SetPageError(pages[pagenr]);
- z_erofs_onlinepage_endio(pages[pagenr]);
- err = -EFSCORRUPTED;
- }
- pages[pagenr] = page;
+ DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
+ z_erofs_do_decompressed_bvec(be, &bvec);
}
- z_erofs_pagevec_ctor_exit(&ctor, true);
- overlapped = false;
- compressed_pages = pcl->compressed_pages;
+ old_bvpage = z_erofs_bvec_iter_end(&biter);
+ if (old_bvpage)
+ z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
+}
+static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
+ bool *overlapped)
+{
+ struct z_erofs_pcluster *pcl = be->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ int i, err = 0;
+
+ *overlapped = false;
for (i = 0; i < pclusterpages; ++i) {
- unsigned int pagenr;
+ struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
+ struct page *page = bvec->page;
- page = compressed_pages[i];
- /* all compressed pages ought to be valid */
- DBG_BUGON(!page);
+ /* compressed pages ought to be present before decompressing */
+ if (!page) {
+ DBG_BUGON(1);
+ continue;
+ }
+ be->compressed_pages[i] = page;
if (z_erofs_is_inline_pcluster(pcl)) {
if (!PageUptodate(page))
@@ -883,109 +951,129 @@ static int z_erofs_decompress_pcluster(struct super_block *sb,
DBG_BUGON(z_erofs_page_is_invalidated(page));
if (!z_erofs_is_shortlived_page(page)) {
- if (erofs_page_is_managed(sbi, page)) {
+ if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
if (!PageUptodate(page))
err = -EIO;
continue;
}
+ z_erofs_do_decompressed_bvec(be, bvec);
+ *overlapped = true;
+ }
+ }
- /*
- * only if non-head page can be selected
- * for inplace decompression
- */
- pagenr = z_erofs_onlinepage_index(page);
-
- DBG_BUGON(pagenr >= nr_pages);
- if (pages[pagenr]) {
- DBG_BUGON(1);
- SetPageError(pages[pagenr]);
- z_erofs_onlinepage_endio(pages[pagenr]);
- err = -EFSCORRUPTED;
- }
- pages[pagenr] = page;
+ if (err)
+ return err;
+ return 0;
+}
- overlapped = true;
- }
+static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
+ int err)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
+ struct z_erofs_pcluster *pcl = be->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ unsigned int i, inputsize;
+ int err2;
+ struct page *page;
+ bool overlapped;
- /* PG_error needs checking for all non-managed pages */
- if (PageError(page)) {
- DBG_BUGON(PageUptodate(page));
- err = -EIO;
- }
+ mutex_lock(&pcl->lock);
+ be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
+
+ /* allocate (de)compressed page arrays if cannot be kept on stack */
+ be->decompressed_pages = NULL;
+ be->compressed_pages = NULL;
+ be->onstack_used = 0;
+ if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
+ be->decompressed_pages = be->onstack_pages;
+ be->onstack_used = be->nr_pages;
+ memset(be->decompressed_pages, 0,
+ sizeof(struct page *) * be->nr_pages);
}
+ if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
+ be->compressed_pages = be->onstack_pages + be->onstack_used;
+
+ if (!be->decompressed_pages)
+ be->decompressed_pages =
+ kvcalloc(be->nr_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
+ if (!be->compressed_pages)
+ be->compressed_pages =
+ kvcalloc(pclusterpages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ z_erofs_parse_out_bvecs(be);
+ err2 = z_erofs_parse_in_bvecs(be, &overlapped);
+ if (err2)
+ err = err2;
if (err)
goto out;
- llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
- if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) {
- outputsize = llen;
- partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
- } else {
- outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out;
- partial = true;
- }
-
if (z_erofs_is_inline_pcluster(pcl))
inputsize = pcl->tailpacking_size;
else
inputsize = pclusterpages * PAGE_SIZE;
err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
- .sb = sb,
- .in = compressed_pages,
- .out = pages,
+ .sb = be->sb,
+ .in = be->compressed_pages,
+ .out = be->decompressed_pages,
.pageofs_in = pcl->pageofs_in,
.pageofs_out = pcl->pageofs_out,
.inputsize = inputsize,
- .outputsize = outputsize,
+ .outputsize = pcl->length,
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
- .partial_decoding = partial
- }, pagepool);
+ .partial_decoding = pcl->partial,
+ .fillgaps = pcl->multibases,
+ }, be->pagepool);
out:
/* must handle all compressed pages before actual file pages */
if (z_erofs_is_inline_pcluster(pcl)) {
- page = compressed_pages[0];
- WRITE_ONCE(compressed_pages[0], NULL);
+ page = pcl->compressed_bvecs[0].page;
+ WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
put_page(page);
} else {
for (i = 0; i < pclusterpages; ++i) {
- page = compressed_pages[i];
+ page = pcl->compressed_bvecs[i].page;
if (erofs_page_is_managed(sbi, page))
continue;
/* recycle all individual short-lived pages */
- (void)z_erofs_put_shortlivedpage(pagepool, page);
- WRITE_ONCE(compressed_pages[i], NULL);
+ (void)z_erofs_put_shortlivedpage(be->pagepool, page);
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
}
+ if (be->compressed_pages < be->onstack_pages ||
+ be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
+ kvfree(be->compressed_pages);
+ z_erofs_fill_other_copies(be, err);
- for (i = 0; i < nr_pages; ++i) {
- page = pages[i];
+ for (i = 0; i < be->nr_pages; ++i) {
+ page = be->decompressed_pages[i];
if (!page)
continue;
DBG_BUGON(z_erofs_page_is_invalidated(page));
/* recycle all individual short-lived pages */
- if (z_erofs_put_shortlivedpage(pagepool, page))
+ if (z_erofs_put_shortlivedpage(be->pagepool, page))
continue;
-
- if (err < 0)
- SetPageError(page);
-
+ if (err)
+ z_erofs_page_mark_eio(page);
z_erofs_onlinepage_endio(page);
}
- if (pages == z_pagemap_global)
- mutex_unlock(&z_pagemap_global_lock);
- else if (pages != pages_onstack)
- kvfree(pages);
+ if (be->decompressed_pages != be->onstack_pages)
+ kvfree(be->decompressed_pages);
- pcl->nr_pages = 0;
+ pcl->length = 0;
+ pcl->partial = true;
+ pcl->multibases = false;
+ pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
/* pcluster lock MUST be taken before the following line */
@@ -997,22 +1085,25 @@ out:
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
struct page **pagepool)
{
+ struct z_erofs_decompress_backend be = {
+ .sb = io->sb,
+ .pagepool = pagepool,
+ .decompressed_secondary_bvecs =
+ LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
+ };
z_erofs_next_pcluster_t owned = io->head;
while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
- struct z_erofs_pcluster *pcl;
-
- /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
+ /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
-
- /* no possible that 'owned' equals NULL */
+ /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
- pcl = container_of(owned, struct z_erofs_pcluster, next);
- owned = READ_ONCE(pcl->next);
+ be.pcl = container_of(owned, struct z_erofs_pcluster, next);
+ owned = READ_ONCE(be.pcl->next);
- z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
- erofs_workgroup_put(&pcl->obj);
+ z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
+ erofs_workgroup_put(&be.pcl->obj);
}
}
@@ -1038,7 +1129,6 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
if (sync) {
if (!atomic_add_return(bios, &io->pending_bios))
complete(&io->u.done);
-
return;
}
@@ -1071,7 +1161,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
int justfound;
repeat:
- page = READ_ONCE(pcl->compressed_pages[nr]);
+ page = READ_ONCE(pcl->compressed_bvecs[nr].page);
oldpage = page;
if (!page)
@@ -1087,7 +1177,7 @@ repeat:
* otherwise, it will go inplace I/O path instead.
*/
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
- WRITE_ONCE(pcl->compressed_pages[nr], page);
+ WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
set_page_private(page, 0);
tocache = true;
goto out_tocache;
@@ -1113,14 +1203,13 @@ repeat:
/* the page is still in manage cache */
if (page->mapping == mc) {
- WRITE_ONCE(pcl->compressed_pages[nr], page);
+ WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
- ClearPageError(page);
if (!PagePrivate(page)) {
/*
* impossible to be !PagePrivate(page) for
* the current restriction as well if
- * the page is already in compressed_pages[].
+ * the page is already in compressed_bvecs[].
*/
DBG_BUGON(!justfound);
@@ -1149,7 +1238,8 @@ repeat:
put_page(page);
out_allocpage:
page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
+ if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
+ oldpage, page)) {
erofs_pagepool_add(pagepool, page);
cond_resched();
goto repeat;
@@ -1186,6 +1276,7 @@ fg_out:
q = fgq;
init_completion(&fgq->u.done);
atomic_set(&fgq->pending_bios, 0);
+ q->eio = false;
}
q->sb = sb;
q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
@@ -1246,26 +1337,25 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
DBG_BUGON(PageUptodate(page));
DBG_BUGON(z_erofs_page_is_invalidated(page));
- if (err)
- SetPageError(page);
-
if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
if (!err)
SetPageUptodate(page);
unlock_page(page);
}
}
+ if (err)
+ q->eio = true;
z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
bio_put(bio);
}
-static void z_erofs_submit_queue(struct super_block *sb,
- struct z_erofs_decompress_frontend *f,
+static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ struct super_block *sb = f->inode->i_sb;
+ struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
@@ -1317,7 +1407,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
struct page *page;
page = pickup_page_for_submission(pcl, i++, pagepool,
- MNGD_MAPPING(sbi));
+ mc);
if (!page)
continue;
@@ -1369,15 +1459,14 @@ submit_bio_retry:
z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
}
-static void z_erofs_runqueue(struct super_block *sb,
- struct z_erofs_decompress_frontend *f,
+static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
struct page **pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
+ z_erofs_submit_queue(f, pagepool, io, &force_fg);
/* handle bypass queue (no i/o pclusters) immediately */
z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
@@ -1475,7 +1564,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
(void)z_erofs_collector_end(&f);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+ z_erofs_runqueue(&f, &pagepool,
z_erofs_get_sync_decompress_policy(sbi, 0));
if (err)
@@ -1524,7 +1613,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
(void)z_erofs_collector_end(&f);
- z_erofs_runqueue(inode->i_sb, &f, &pagepool,
+ z_erofs_runqueue(&f, &pagepool,
z_erofs_get_sync_decompress_policy(sbi, nr_pages));
erofs_put_metabuf(&f.map.buf);
erofs_release_pages(&pagepool);
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 58053bb5066f..e7f04c4fbb81 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -7,13 +7,10 @@
#define __EROFS_FS_ZDATA_H
#include "internal.h"
-#include "zpvec.h"
+#include "tagptr.h"
#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
-#define Z_EROFS_NR_INLINE_PAGEVECS 3
-
-#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001
-#define Z_EROFS_PCLUSTER_LENGTH_BIT 1
+#define Z_EROFS_INLINE_BVECS 2
/*
* let's leave a type here in case of introducing
@@ -21,6 +18,21 @@
*/
typedef void *z_erofs_next_pcluster_t;
+struct z_erofs_bvec {
+ struct page *page;
+ int offset;
+ unsigned int end;
+};
+
+#define __Z_EROFS_BVSET(name, total) \
+struct name { \
+ /* point to the next page which contains the following bvecs */ \
+ struct page *nextpage; \
+ struct z_erofs_bvec bvec[total]; \
+}
+__Z_EROFS_BVSET(z_erofs_bvset,);
+__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
+
/*
* Structure fields follow one of the following exclusion rules.
*
@@ -38,24 +50,21 @@ struct z_erofs_pcluster {
/* A: point to next chained pcluster or TAILs */
z_erofs_next_pcluster_t next;
- /* A: lower limit of decompressed length and if full length or not */
+ /* L: the maximum decompression size of this round */
unsigned int length;
+ /* L: total number of bvecs */
+ unsigned int vcnt;
+
/* I: page offset of start position of decompression */
unsigned short pageofs_out;
/* I: page offset of inline compressed data */
unsigned short pageofs_in;
- /* L: maximum relative page index in pagevec[] */
- unsigned short nr_pages;
-
- /* L: total number of pages in pagevec[] */
- unsigned int vcnt;
-
union {
- /* L: inline a certain number of pagevecs for bootstrap */
- erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];
+ /* L: inline a certain number of bvec for bootstrap */
+ struct z_erofs_bvset_inline bvset;
/* I: can be used to free the pcluster by RCU. */
struct rcu_head rcu;
@@ -72,8 +81,14 @@ struct z_erofs_pcluster {
/* I: compression algorithm format */
unsigned char algorithmformat;
- /* A: compressed pages (can be cached or inplaced pages) */
- struct page *compressed_pages[];
+ /* L: whether partial decompression or not */
+ bool partial;
+
+ /* L: indicate several pageofs_outs or not */
+ bool multibases;
+
+ /* A: compressed bvecs (can be cached or inplaced pages) */
+ struct z_erofs_bvec compressed_bvecs[];
};
/* let's avoid the valid 32-bit kernel addresses */
@@ -94,6 +109,8 @@ struct z_erofs_decompressqueue {
struct completion done;
struct work_struct work;
} u;
+
+ bool eio;
};
static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
@@ -108,38 +125,17 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
return pcl->pclusterpages;
}
-#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
-#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
-#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
-
/*
- * waiters (aka. ongoing_packs): # to unlock the page
- * sub-index: 0 - for partial page, >= 1 full page sub-index
+ * bit 31: I/O error occurred on this page
+ * bit 0 - 30: remaining parts to complete this page
*/
-typedef atomic_t z_erofs_onlinepage_t;
-
-/* type punning */
-union z_erofs_onlinepage_converter {
- z_erofs_onlinepage_t *o;
- unsigned long *v;
-};
-
-static inline unsigned int z_erofs_onlinepage_index(struct page *page)
-{
- union z_erofs_onlinepage_converter u;
-
- DBG_BUGON(!PagePrivate(page));
- u.v = &page_private(page);
-
- return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
-}
+#define Z_EROFS_PAGE_EIO (1 << 31)
static inline void z_erofs_onlinepage_init(struct page *page)
{
union {
- z_erofs_onlinepage_t o;
+ atomic_t o;
unsigned long v;
- /* keep from being unlocked in advance */
} u = { .o = ATOMIC_INIT(1) };
set_page_private(page, u.v);
@@ -147,49 +143,36 @@ static inline void z_erofs_onlinepage_init(struct page *page)
SetPagePrivate(page);
}
-static inline void z_erofs_onlinepage_fixup(struct page *page,
- uintptr_t index, bool down)
+static inline void z_erofs_onlinepage_split(struct page *page)
{
- union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
- int orig, orig_index, val;
-
-repeat:
- orig = atomic_read(u.o);
- orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
- if (orig_index) {
- if (!index)
- return;
+ atomic_inc((atomic_t *)&page->private);
+}
- DBG_BUGON(orig_index != index);
- }
+static inline void z_erofs_page_mark_eio(struct page *page)
+{
+ int orig;
- val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
- ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
- if (atomic_cmpxchg(u.o, orig, val) != orig)
- goto repeat;
+ do {
+ orig = atomic_read((atomic_t *)&page->private);
+ } while (atomic_cmpxchg((atomic_t *)&page->private, orig,
+ orig | Z_EROFS_PAGE_EIO) != orig);
}
static inline void z_erofs_onlinepage_endio(struct page *page)
{
- union z_erofs_onlinepage_converter u;
unsigned int v;
DBG_BUGON(!PagePrivate(page));
- u.v = &page_private(page);
-
- v = atomic_dec_return(u.o);
- if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+ v = atomic_dec_return((atomic_t *)&page->private);
+ if (!(v & ~Z_EROFS_PAGE_EIO)) {
set_page_private(page, 0);
ClearPagePrivate(page);
- if (!PageError(page))
+ if (!(v & Z_EROFS_PAGE_EIO))
SetPageUptodate(page);
unlock_page(page);
}
- erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o));
}
-#define Z_EROFS_VMAP_ONSTACK_PAGES \
- min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U)
-#define Z_EROFS_VMAP_GLOBAL_PAGES 2048
+#define Z_EROFS_ONSTACK_PAGES 32
#endif
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
deleted file mode 100644
index b05464f4a808..000000000000
--- a/fs/erofs/zpvec.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2018 HUAWEI, Inc.
- * https://www.huawei.com/
- */
-#ifndef __EROFS_FS_ZPVEC_H
-#define __EROFS_FS_ZPVEC_H
-
-#include "tagptr.h"
-
-/* page type in pagevec for decompress subsystem */
-enum z_erofs_page_type {
- /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
- Z_EROFS_PAGE_TYPE_EXCLUSIVE,
-
- Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
-
- Z_EROFS_VLE_PAGE_TYPE_HEAD,
- Z_EROFS_VLE_PAGE_TYPE_MAX
-};
-
-extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
- __bad_page_type_exclusive(void);
-
-/* pagevec tagged pointer */
-typedef tagptr2_t erofs_vtptr_t;
-
-/* pagevec collector */
-struct z_erofs_pagevec_ctor {
- struct page *curr, *next;
- erofs_vtptr_t *pages;
-
- unsigned int nr, index;
-};
-
-static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
- bool atomic)
-{
- if (!ctor->curr)
- return;
-
- if (atomic)
- kunmap_atomic(ctor->pages);
- else
- kunmap(ctor->curr);
-}
-
-static inline struct page *
-z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
- unsigned int nr)
-{
- unsigned int index;
-
- /* keep away from occupied pages */
- if (ctor->next)
- return ctor->next;
-
- for (index = 0; index < nr; ++index) {
- const erofs_vtptr_t t = ctor->pages[index];
- const unsigned int tags = tagptr_unfold_tags(t);
-
- if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
- return tagptr_unfold_ptr(t);
- }
- DBG_BUGON(nr >= ctor->nr);
- return NULL;
-}
-
-static inline void
-z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
- bool atomic)
-{
- struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
-
- z_erofs_pagevec_ctor_exit(ctor, atomic);
-
- ctor->curr = next;
- ctor->next = NULL;
- ctor->pages = atomic ?
- kmap_atomic(ctor->curr) : kmap(ctor->curr);
-
- ctor->nr = PAGE_SIZE / sizeof(struct page *);
- ctor->index = 0;
-}
-
-static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
- unsigned int nr,
- erofs_vtptr_t *pages,
- unsigned int i)
-{
- ctor->nr = nr;
- ctor->curr = ctor->next = NULL;
- ctor->pages = pages;
-
- if (i >= nr) {
- i -= nr;
- z_erofs_pagevec_ctor_pagedown(ctor, false);
- while (i > ctor->nr) {
- i -= ctor->nr;
- z_erofs_pagevec_ctor_pagedown(ctor, false);
- }
- }
- ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
- ctor->index = i;
-}
-
-static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
- struct page *page,
- enum z_erofs_page_type type,
- bool pvec_safereuse)
-{
- if (!ctor->next) {
- /* some pages cannot be reused as pvec safely without I/O */
- if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse)
- type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED;
-
- if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
- ctor->index + 1 == ctor->nr)
- return false;
- }
-
- if (ctor->index >= ctor->nr)
- z_erofs_pagevec_ctor_pagedown(ctor, false);
-
- /* exclusive page type must be 0 */
- if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
- __bad_page_type_exclusive();
-
- /* should remind that collector->next never equal to 1, 2 */
- if (type == (uintptr_t)ctor->next) {
- ctor->next = page;
- }
- ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type);
- return true;
-}
-
-static inline struct page *
-z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor,
- enum z_erofs_page_type *type)
-{
- erofs_vtptr_t t;
-
- if (ctor->index >= ctor->nr) {
- DBG_BUGON(!ctor->next);
- z_erofs_pagevec_ctor_pagedown(ctor, true);
- }
-
- t = ctor->pages[ctor->index];
-
- *type = tagptr_unfold_tags(t);
-
- /* should remind that collector->next never equal to 1, 2 */
- if (*type == (uintptr_t)ctor->next)
- ctor->next = tagptr_unfold_ptr(t);
-
- ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0);
- return tagptr_unfold_ptr(t);
-}
-#endif
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e6b932219803..7a192e4e7fa9 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1679,14 +1679,14 @@ int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (error)
return error;
- if (is_quota_modification(inode, iattr)) {
+ if (is_quota_modification(mnt_userns, inode, iattr)) {
error = dquot_initialize(inode);
if (error)
return error;
}
- if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
- (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
- error = dquot_transfer(inode, iattr);
+ if (i_uid_needs_update(mnt_userns, iattr, inode) ||
+ i_gid_needs_update(mnt_userns, iattr, inode)) {
+ error = dquot_transfer(mnt_userns, inode, iattr);
if (error)
return error;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f6a19f6d9f6d..6f475d2e3b18 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1059,9 +1059,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_frags_per_group);
goto failed_mount;
}
- if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
+ if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+ sbi->s_inodes_per_group > sb->s_blocksize * 8) {
ext2_msg(sb, KERN_ERR,
- "error: #inodes per group too big: %lu",
+ "error: invalid #inodes per group: %lu",
sbi->s_inodes_per_group);
goto failed_mount;
}
@@ -1071,6 +1072,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_first_data_block) - 1)
/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
+ if ((u64)sbi->s_groups_count * sbi->s_inodes_per_group !=
+ le32_to_cpu(es->s_inodes_count)) {
+ ext2_msg(sb, KERN_ERR, "error: invalid #inodes: %u vs computed %llu",
+ le32_to_cpu(es->s_inodes_count),
+ (u64)sbi->s_groups_count * sbi->s_inodes_per_group);
+ goto failed_mount;
+ }
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc_array(db_count,
@@ -1490,8 +1498,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
len = i_size-off;
toread = len;
while (toread > 0) {
- tocopy = sb->s_blocksize - offset < toread ?
- sb->s_blocksize - offset : toread;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, toread);
tmp_bh.b_state = 0;
tmp_bh.b_size = sb->s_blocksize;
@@ -1529,8 +1536,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
struct buffer_head *bh;
while (towrite > 0) {
- tocopy = sb->s_blocksize - offset < towrite ?
- sb->s_blocksize - offset : towrite;
+ tocopy = min_t(size_t, sb->s_blocksize - offset, towrite);
tmp_bh.b_state = 0;
tmp_bh.b_size = sb->s_blocksize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 84c0eb55071d..3dcc1dd1f179 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5350,14 +5350,14 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (error)
return error;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
error = dquot_initialize(inode);
if (error)
return error;
}
- if ((ia_valid & ATTR_UID && !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (ia_valid & ATTR_GID && !gid_eq(attr->ia_gid, inode->i_gid))) {
+ if (i_uid_needs_update(mnt_userns, attr, inode) ||
+ i_gid_needs_update(mnt_userns, attr, inode)) {
handle_t *handle;
/* (user+group)*(old+new) structure, inode write (sb,
@@ -5374,7 +5374,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* counts xattr inode references.
*/
down_read(&EXT4_I(inode)->xattr_sem);
- error = dquot_transfer(inode, attr);
+ error = dquot_transfer(mnt_userns, inode, attr);
up_read(&EXT4_I(inode)->xattr_sem);
if (error) {
@@ -5383,10 +5383,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
/* Update corresponding info in inode so that everything is in
* one transaction */
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
error = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
if (unlikely(error)) {
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index bd14cef1b08f..d66e37d80a2d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -861,10 +861,8 @@ static void __setattr_copy(struct user_namespace *mnt_userns,
{
unsigned int ia_valid = attr->ia_valid;
- if (ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
if (ia_valid & ATTR_ATIME)
inode->i_atime = attr->ia_atime;
if (ia_valid & ATTR_MTIME)
@@ -917,17 +915,15 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (err)
return err;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
err = f2fs_dquot_initialize(inode);
if (err)
return err;
}
- if ((attr->ia_valid & ATTR_UID &&
- !uid_eq(attr->ia_uid, inode->i_uid)) ||
- (attr->ia_valid & ATTR_GID &&
- !gid_eq(attr->ia_gid, inode->i_gid))) {
+ if (i_uid_needs_update(mnt_userns, attr, inode) ||
+ i_gid_needs_update(mnt_userns, attr, inode)) {
f2fs_lock_op(F2FS_I_SB(inode));
- err = dquot_transfer(inode, attr);
+ err = dquot_transfer(mnt_userns, inode, attr);
if (err) {
set_sbi_flag(F2FS_I_SB(inode),
SBI_QUOTA_NEED_REPAIR);
@@ -938,10 +934,8 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* update uid/gid under lock_op(), so that dquot and inode can
* be updated atomically.
*/
- if (attr->ia_valid & ATTR_UID)
- inode->i_uid = attr->ia_uid;
- if (attr->ia_valid & ATTR_GID)
- inode->i_gid = attr->ia_gid;
+ i_uid_update(mnt_userns, attr, inode);
+ i_gid_update(mnt_userns, attr, inode);
f2fs_mark_inode_dirty_sync(inode, true);
f2fs_unlock_op(F2FS_I_SB(inode));
}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 3cb7f8a43b4d..dcd0a1e35095 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -255,18 +255,18 @@ static int recover_quota_data(struct inode *inode, struct page *page)
memset(&attr, 0, sizeof(attr));
- attr.ia_uid = make_kuid(inode->i_sb->s_user_ns, i_uid);
- attr.ia_gid = make_kgid(inode->i_sb->s_user_ns, i_gid);
+ attr.ia_vfsuid = VFSUIDT_INIT(make_kuid(inode->i_sb->s_user_ns, i_uid));
+ attr.ia_vfsgid = VFSGIDT_INIT(make_kgid(inode->i_sb->s_user_ns, i_gid));
- if (!uid_eq(attr.ia_uid, inode->i_uid))
+ if (!vfsuid_eq(attr.ia_vfsuid, i_uid_into_vfsuid(&init_user_ns, inode)))
attr.ia_valid |= ATTR_UID;
- if (!gid_eq(attr.ia_gid, inode->i_gid))
+ if (!vfsgid_eq(attr.ia_vfsgid, i_gid_into_vfsgid(&init_user_ns, inode)))
attr.ia_valid |= ATTR_GID;
if (!attr.ia_valid)
return 0;
- err = dquot_transfer(inode, &attr);
+ err = dquot_transfer(&init_user_ns, inode, &attr);
if (err)
set_sbi_flag(F2FS_I_SB(inode), SBI_QUOTA_NEED_REPAIR);
return err;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 3dae3ed60f3a..3e4eb3467cb4 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -90,7 +90,8 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
* out the RO attribute for checking by the security
* module, just because it maps to a file mode.
*/
- err = security_inode_setattr(file->f_path.dentry, &ia);
+ err = security_inode_setattr(file_mnt_user_ns(file),
+ file->f_path.dentry, &ia);
if (err)
goto out_unlock_inode;
@@ -516,9 +517,11 @@ int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
if (((attr->ia_valid & ATTR_UID) &&
- (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) ||
+ (!uid_eq(from_vfsuid(mnt_userns, i_user_ns(inode), attr->ia_vfsuid),
+ sbi->options.fs_uid))) ||
((attr->ia_valid & ATTR_GID) &&
- (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) ||
+ (!gid_eq(from_vfsgid(mnt_userns, i_user_ns(inode), attr->ia_vfsgid),
+ sbi->options.fs_gid))) ||
((attr->ia_valid & ATTR_MODE) &&
(attr->ia_mode & ~FAT_VALID_MODE)))
error = -EPERM;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 1d732fd223d4..332dc9ac47a9 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -95,14 +95,14 @@ int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (rc)
return rc;
- if (is_quota_modification(inode, iattr)) {
+ if (is_quota_modification(mnt_userns, inode, iattr)) {
rc = dquot_initialize(inode);
if (rc)
return rc;
}
if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) ||
(iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) {
- rc = dquot_transfer(inode, iattr);
+ rc = dquot_transfer(mnt_userns, inode, iattr);
if (rc)
return rc;
}
diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c
index 05efcdf7a4a7..7c849024999f 100644
--- a/fs/ksmbd/vfs.c
+++ b/fs/ksmbd/vfs.c
@@ -963,7 +963,7 @@ ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns,
*/
int ksmbd_vfs_setxattr(struct user_namespace *user_ns,
struct dentry *dentry, const char *attr_name,
- const void *attr_value, size_t attr_size, int flags)
+ void *attr_value, size_t attr_size, int flags)
{
int err;
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h
index 8c37aaf936ab..70da4c0ba7ad 100644
--- a/fs/ksmbd/vfs.h
+++ b/fs/ksmbd/vfs.h
@@ -109,7 +109,7 @@ ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns,
int attr_name_len);
int ksmbd_vfs_setxattr(struct user_namespace *user_ns,
struct dentry *dentry, const char *attr_name,
- const void *attr_value, size_t attr_size, int flags);
+ void *attr_value, size_t attr_size, int flags);
int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
size_t *xattr_stream_name_size, int s_type);
int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns,
diff --git a/fs/locks.c b/fs/locks.c
index ca28e0e50e56..c266cfdc3291 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -425,21 +425,9 @@ static inline int flock_translate_cmd(int cmd) {
}
/* Fill in a file_lock structure with an appropriate FLOCK lock. */
-static struct file_lock *
-flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl)
+static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
{
- int type = flock_translate_cmd(cmd);
-
- if (type < 0)
- return ERR_PTR(type);
-
- if (fl == NULL) {
- fl = locks_alloc_lock();
- if (fl == NULL)
- return ERR_PTR(-ENOMEM);
- } else {
- locks_init_lock(fl);
- }
+ locks_init_lock(fl);
fl->fl_file = filp;
fl->fl_owner = filp;
@@ -447,8 +435,6 @@ flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl)
fl->fl_flags = FL_FLOCK;
fl->fl_type = type;
fl->fl_end = OFFSET_MAX;
-
- return fl;
}
static int assign_type(struct file_lock *fl, long type)
@@ -2097,21 +2083,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait);
*/
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
- struct fd f = fdget(fd);
- struct file_lock *lock;
- int can_sleep, unlock;
- int error;
-
- error = -EBADF;
- if (!f.file)
- goto out;
-
- can_sleep = !(cmd & LOCK_NB);
- cmd &= ~LOCK_NB;
- unlock = (cmd == LOCK_UN);
-
- if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
- goto out_putf;
+ int can_sleep, error, type;
+ struct file_lock fl;
+ struct fd f;
/*
* LOCK_MAND locks were broken for a long time in that they never
@@ -2123,36 +2097,41 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
if (cmd & LOCK_MAND) {
pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n");
- error = 0;
- goto out_putf;
+ return 0;
}
- lock = flock_make_lock(f.file, cmd, NULL);
- if (IS_ERR(lock)) {
- error = PTR_ERR(lock);
+ type = flock_translate_cmd(cmd & ~LOCK_NB);
+ if (type < 0)
+ return type;
+
+ error = -EBADF;
+ f = fdget(fd);
+ if (!f.file)
+ return error;
+
+ if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE)))
goto out_putf;
- }
- if (can_sleep)
- lock->fl_flags |= FL_SLEEP;
+ flock_make_lock(f.file, &fl, type);
- error = security_file_lock(f.file, lock->fl_type);
+ error = security_file_lock(f.file, fl.fl_type);
if (error)
- goto out_free;
+ goto out_putf;
+
+ can_sleep = !(cmd & LOCK_NB);
+ if (can_sleep)
+ fl.fl_flags |= FL_SLEEP;
if (f.file->f_op->flock)
error = f.file->f_op->flock(f.file,
- (can_sleep) ? F_SETLKW : F_SETLK,
- lock);
+ (can_sleep) ? F_SETLKW : F_SETLK,
+ &fl);
else
- error = locks_lock_file_wait(f.file, lock);
-
- out_free:
- locks_free_lock(lock);
+ error = locks_lock_file_wait(f.file, &fl);
out_putf:
fdput(f);
- out:
+
return error;
}
@@ -2614,7 +2593,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
if (list_empty(&flctx->flc_flock))
return;
- flock_make_lock(filp, LOCK_UN, &fl);
+ flock_make_lock(filp, &fl, F_UNLCK);
fl.fl_flags |= FL_CLOSE;
if (filp->f_op->flock)
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 4f897e109547..cd7d09a569ff 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -295,12 +295,13 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
const void *data, int data_type,
struct inode *dir)
{
- __u32 marks_mask = 0, marks_ignored_mask = 0;
+ __u32 marks_mask = 0, marks_ignore_mask = 0;
__u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS |
FANOTIFY_EVENT_FLAGS;
const struct path *path = fsnotify_data_path(data, data_type);
unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
struct fsnotify_mark *mark;
+ bool ondir = event_mask & FAN_ONDIR;
int type;
pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
@@ -315,19 +316,21 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
return 0;
} else if (!(fid_mode & FAN_REPORT_FID)) {
/* Do we have a directory inode to report? */
- if (!dir && !(event_mask & FS_ISDIR))
+ if (!dir && !ondir)
return 0;
}
fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
- /* Apply ignore mask regardless of mark's ISDIR flag */
- marks_ignored_mask |= mark->ignored_mask;
+ /*
+ * Apply ignore mask depending on event flags in ignore mask.
+ */
+ marks_ignore_mask |=
+ fsnotify_effective_ignore_mask(mark, ondir, type);
/*
- * If the event is on dir and this mark doesn't care about
- * events on dir, don't send it!
+ * Send the event depending on event flags in mark mask.
*/
- if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR))
+ if (!fsnotify_mask_applicable(mark->mask, ondir, type))
continue;
marks_mask |= mark->mask;
@@ -336,7 +339,7 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
*match_mask |= 1U << type;
}
- test_mask = event_mask & marks_mask & ~marks_ignored_mask;
+ test_mask = event_mask & marks_mask & ~marks_ignore_mask;
/*
* For dirent modification events (create/delete/move) that do not carry
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 80e0ec95b113..1d9f11255c64 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -499,6 +499,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)
mflags |= FAN_MARK_EVICTABLE;
+ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
+ mflags |= FAN_MARK_IGNORE;
return mflags;
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b08ce0d821a7..f0e49a406ffa 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1009,10 +1009,10 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
mask &= ~umask;
spin_lock(&fsn_mark->lock);
oldmask = fsnotify_calc_mask(fsn_mark);
- if (!(flags & FAN_MARK_IGNORED_MASK)) {
+ if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
fsn_mark->mask &= ~mask;
} else {
- fsn_mark->ignored_mask &= ~mask;
+ fsn_mark->ignore_mask &= ~mask;
}
newmask = fsnotify_calc_mask(fsn_mark);
/*
@@ -1021,7 +1021,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
* changes to the mask.
* Destroy mark when only umask bits remain.
*/
- *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
+ *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
spin_unlock(&fsn_mark->lock);
return oldmask & ~newmask;
@@ -1085,15 +1085,24 @@ static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
unsigned int fan_flags)
{
bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
+ unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
bool recalc = false;
/*
+ * When using FAN_MARK_IGNORE for the first time, mark starts using
+ * independent event flags in ignore mask. After that, trying to
+ * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
+ * will result in EEXIST error.
+ */
+ if (ignore == FAN_MARK_IGNORE)
+ fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;
+
+ /*
* Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
* the removal of the FS_MODIFY bit in calculated mask if it was set
- * because of an ignored mask that is now going to survive FS_MODIFY.
+ * because of an ignore mask that is now going to survive FS_MODIFY.
*/
- if ((fan_flags & FAN_MARK_IGNORED_MASK) &&
- (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
!(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
if (!(fsn_mark->mask & FS_MODIFY))
@@ -1120,10 +1129,10 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
bool recalc;
spin_lock(&fsn_mark->lock);
- if (!(fan_flags & FAN_MARK_IGNORED_MASK))
+ if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
fsn_mark->mask |= mask;
else
- fsn_mark->ignored_mask |= mask;
+ fsn_mark->ignore_mask |= mask;
recalc = fsnotify_calc_mask(fsn_mark) &
~fsnotify_conn_mask(fsn_mark->connector);
@@ -1187,6 +1196,37 @@ static int fanotify_group_init_error_pool(struct fsnotify_group *group)
sizeof(struct fanotify_error_event));
}
+static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
+ unsigned int fan_flags)
+{
+ /*
+ * Non evictable mark cannot be downgraded to evictable mark.
+ */
+ if (fan_flags & FAN_MARK_EVICTABLE &&
+ !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+ return -EEXIST;
+
+ /*
+ * New ignore mask semantics cannot be downgraded to old semantics.
+ */
+ if (fan_flags & FAN_MARK_IGNORED_MASK &&
+ fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
+ return -EEXIST;
+
+ /*
+ * An ignore mask that survives modify could never be downgraded to not
+ * survive modify. With new FAN_MARK_IGNORE semantics we make that rule
+ * explicit and return an error when trying to update the ignore mask
+ * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
+ */
+ if (fan_flags & FAN_MARK_IGNORE &&
+ !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
+ return -EEXIST;
+
+ return 0;
+}
+
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int obj_type,
__u32 mask, unsigned int fan_flags,
@@ -1208,19 +1248,18 @@ static int fanotify_add_mark(struct fsnotify_group *group,
}
/*
- * Non evictable mark cannot be downgraded to evictable mark.
+ * Check if requested mark flags conflict with an existing mark flags.
*/
- if (fan_flags & FAN_MARK_EVICTABLE &&
- !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) {
- ret = -EEXIST;
+ ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
+ if (ret)
goto out;
- }
/*
* Error events are pre-allocated per group, only if strictly
* needed (i.e. FAN_FS_ERROR was requested).
*/
- if (!(fan_flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) {
+ if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
+ (mask & FAN_FS_ERROR)) {
ret = fanotify_group_init_error_pool(group);
if (ret)
goto out;
@@ -1261,10 +1300,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
/*
* If some other task has this inode open for write we should not add
- * an ignored mark, unless that ignored mark is supposed to survive
+ * an ignore mask, unless that ignore mask is supposed to survive
* modification changes anyway.
*/
- if ((flags & FAN_MARK_IGNORED_MASK) &&
+ if ((flags & FANOTIFY_MARK_IGNORE_BITS) &&
!(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
inode_is_open_for_write(inode))
return 0;
@@ -1520,7 +1559,8 @@ static int fanotify_events_supported(struct fsnotify_group *group,
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
- (mask & FAN_RENAME);
+ (mask & FAN_RENAME) ||
+ (flags & FAN_MARK_IGNORE);
/*
* Some filesystems such as 'proc' acquire unusual locks when opening
@@ -1557,7 +1597,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
__kernel_fsid_t __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
- bool ignored = flags & FAN_MARK_IGNORED_MASK;
+ unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
+ unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
unsigned int obj_type, fid_mode;
u32 umask = 0;
int ret;
@@ -1586,7 +1627,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EINVAL;
}
- switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+ switch (mark_cmd) {
case FAN_MARK_ADD:
case FAN_MARK_REMOVE:
if (!mask)
@@ -1606,9 +1647,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (mask & ~valid_mask)
return -EINVAL;
- /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
- if (ignored)
+
+ /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
+ if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
+ return -EINVAL;
+
+ /*
+ * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
+ * FAN_MARK_IGNORED_MASK.
+ */
+ if (ignore == FAN_MARK_IGNORED_MASK) {
mask &= ~FANOTIFY_EVENT_FLAGS;
+ umask = FANOTIFY_EVENT_FLAGS;
+ }
f = fdget(fanotify_fd);
if (unlikely(!f.file))
@@ -1672,7 +1723,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
goto fput_and_out;
- if (flags & FAN_MARK_FLUSH) {
+ if (mark_cmd == FAN_MARK_FLUSH) {
ret = 0;
if (mark_type == FAN_MARK_MOUNT)
fsnotify_clear_vfsmount_marks_by_group(group);
@@ -1688,7 +1739,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
if (ret)
goto fput_and_out;
- if (flags & FAN_MARK_ADD) {
+ if (mark_cmd == FAN_MARK_ADD) {
ret = fanotify_events_supported(group, &path, mask, flags);
if (ret)
goto path_put_and_out;
@@ -1712,6 +1763,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
else
mnt = path.mnt;
+ ret = mnt ? -EINVAL : -EISDIR;
+ /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
+ if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE &&
+ (mnt || S_ISDIR(inode->i_mode)) &&
+ !(flags & FAN_MARK_IGNORED_SURV_MODIFY))
+ goto path_put_and_out;
+
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
if (mnt || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
@@ -1721,12 +1779,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* events with parent/name info for non-directory.
*/
if ((fid_mode & FAN_REPORT_DIR_FID) &&
- (flags & FAN_MARK_ADD) && !ignored)
+ (flags & FAN_MARK_ADD) && !ignore)
mask |= FAN_EVENT_ON_CHILD;
}
/* create/update an inode mark */
- switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
+ switch (mark_cmd) {
case FAN_MARK_ADD:
if (mark_type == FAN_MARK_MOUNT)
ret = fanotify_add_vfsmount_mark(group, mnt, mask,
@@ -1804,7 +1862,7 @@ static int __init fanotify_user_setup(void)
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 10);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 59fb40abe33d..55081ae3a6ec 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -113,7 +113,7 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
return;
seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
inode->i_ino, inode->i_sb->s_dev,
- mflags, mark->mask, mark->ignored_mask);
+ mflags, mark->mask, mark->ignore_mask);
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
@@ -121,12 +121,12 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
struct mount *mnt = fsnotify_conn_mount(mark->connector);
seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
- mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
+ mnt->mnt_id, mflags, mark->mask, mark->ignore_mask);
} else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) {
struct super_block *sb = fsnotify_conn_sb(mark->connector);
seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
- sb->s_dev, mflags, mark->mask, mark->ignored_mask);
+ sb->s_dev, mflags, mark->mask, mark->ignore_mask);
}
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 0b3e74935cb4..7974e91ffe13 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -100,7 +100,7 @@ void fsnotify_sb_delete(struct super_block *sb)
* Given an inode, first check if we care what happens to our children. Inotify
* and dnotify both tell their parents about events. If we care about any event
* on a child we run all of our children and set a dentry flag saying that the
- * parent cares. Thus when an event happens on a child it can quickly tell if
+ * parent cares. Thus when an event happens on a child it can quickly tell
* if there is a need to find a parent and send the event to the parent.
*/
void __fsnotify_update_child_dentry_flags(struct inode *inode)
@@ -324,7 +324,8 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
struct fsnotify_group *group = NULL;
__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
__u32 marks_mask = 0;
- __u32 marks_ignored_mask = 0;
+ __u32 marks_ignore_mask = 0;
+ bool is_dir = mask & FS_ISDIR;
struct fsnotify_mark *mark;
int type;
@@ -336,7 +337,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
if (!(mark->flags &
FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
- mark->ignored_mask = 0;
+ mark->ignore_mask = 0;
}
}
@@ -344,14 +345,15 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
group = mark->group;
marks_mask |= mark->mask;
- marks_ignored_mask |= mark->ignored_mask;
+ marks_ignore_mask |=
+ fsnotify_effective_ignore_mask(mark, is_dir, type);
}
- pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
- __func__, group, mask, marks_mask, marks_ignored_mask,
+ pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
+ __func__, group, mask, marks_mask, marks_ignore_mask,
data, data_type, dir, cookie);
- if (!(test_mask & marks_mask & ~marks_ignored_mask))
+ if (!(test_mask & marks_mask & ~marks_ignore_mask))
return 0;
if (group->ops->handle_event) {
@@ -423,7 +425,8 @@ static bool fsnotify_iter_select_report_types(
* But is *this mark* watching children?
*/
if (type == FSNOTIFY_ITER_TYPE_PARENT &&
- !(mark->mask & FS_EVENT_ON_CHILD))
+ !(mark->mask & FS_EVENT_ON_CHILD) &&
+ !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD))
continue;
fsnotify_iter_set_report_type(iter_info, type);
@@ -532,8 +535,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
/*
- * If this is a modify event we may need to clear some ignored masks.
- * In that case, the object with ignored masks will have the FS_MODIFY
+ * If this is a modify event we may need to clear some ignore masks.
+ * In that case, the object with ignore masks will have the FS_MODIFY
* event in its mask.
* Otherwise, return if none of the marks care about this type of event.
*/
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ed42a189faa2..1c4bfdab008d 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -136,7 +136,7 @@ static inline u32 inotify_mask_to_arg(__u32 mask)
IN_Q_OVERFLOW);
}
-/* intofiy userspace file descriptor functions */
+/* inotify userspace file descriptor functions */
static __poll_t inotify_poll(struct file *file, poll_table *wait)
{
struct fsnotify_group *group = file->private_data;
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 4de597a83b88..52615e6090e1 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -592,8 +592,12 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
a = (ATTR_RECORD*)((u8*)ctx->attr +
le32_to_cpu(ctx->attr->length));
for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
- if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_allocated))
+ u8 *mrec_end = (u8 *)ctx->mrec +
+ le32_to_cpu(ctx->mrec->bytes_allocated);
+ u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+ a->name_length * sizeof(ntfschar);
+ if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end ||
+ name_end > mrec_end)
break;
ctx->attr = a;
if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7497cd592258..9c67edd215d5 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1146,7 +1146,7 @@ int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
if (status)
return status;
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
status = dquot_initialize(inode);
if (status)
return status;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 337527571461..740b64238312 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -277,7 +277,6 @@ enum ocfs2_mount_options
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
- OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -673,8 +672,7 @@ static inline int ocfs2_cluster_o2cb_global_heartbeat(struct ocfs2_super *osb)
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
{
- return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)
- || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER));
+ return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
}
static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 0b0ae3ebb0cf..da7718cef735 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -252,16 +252,14 @@ static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
int i, ret = -ENOSPC;
if ((preferred >= 0) && (preferred < si->si_num_slots)) {
- if (!si->si_slots[preferred].sl_valid ||
- !si->si_slots[preferred].sl_node_num) {
+ if (!si->si_slots[preferred].sl_valid) {
ret = preferred;
goto out;
}
}
for(i = 0; i < si->si_num_slots; i++) {
- if (!si->si_slots[i].sl_valid ||
- !si->si_slots[i].sl_node_num) {
+ if (!si->si_slots[i].sl_valid) {
ret = i;
break;
}
@@ -456,30 +454,24 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- if (ocfs2_mount_local(osb))
- /* use slot 0 directly in local mode */
- slot = 0;
- else {
- /* search for ourselves first and take the slot if it already
- * exists. Perhaps we need to mark this in a variable for our
- * own journal recovery? Possibly not, though we certainly
- * need to warn to the user */
- slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ /* search for ourselves first and take the slot if it already
+ * exists. Perhaps we need to mark this in a variable for our
+ * own journal recovery? Possibly not, though we certainly
+ * need to warn to the user */
+ slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ if (slot < 0) {
+ /* if no slot yet, then just take 1st available
+ * one. */
+ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
if (slot < 0) {
- /* if no slot yet, then just take 1st available
- * one. */
- slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
- if (slot < 0) {
- spin_unlock(&osb->osb_lock);
- mlog(ML_ERROR, "no free slots available!\n");
- status = -EINVAL;
- goto bail;
- }
- } else
- printk(KERN_INFO "ocfs2: Slot %d on device (%s) was "
- "already allocated to this node!\n",
- slot, osb->dev_str);
- }
+ spin_unlock(&osb->osb_lock);
+ mlog(ML_ERROR, "no free slots available!\n");
+ status = -EINVAL;
+ goto bail;
+ }
+ } else
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f7298816d8d9..438be028935d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -172,7 +172,6 @@ enum {
Opt_dir_resv_level,
Opt_journal_async_commit,
Opt_err_cont,
- Opt_nocluster,
Opt_err,
};
@@ -206,7 +205,6 @@ static const match_table_t tokens = {
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_err_cont, "errors=continue"},
- {Opt_nocluster, "nocluster"},
{Opt_err, NULL}
};
@@ -618,13 +616,6 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
- tmp = OCFS2_MOUNT_NOCLUSTER;
- if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
- ret = -EINVAL;
- mlog(ML_ERROR, "Cannot change nocluster option on remount\n");
- goto out;
- }
-
tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
OCFS2_MOUNT_HB_NONE;
if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
@@ -865,7 +856,6 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
}
if (ocfs2_userspace_stack(osb) &&
- !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
OCFS2_STACK_LABEL_LEN)) {
mlog(ML_ERROR,
@@ -1137,11 +1127,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
"ordered");
- if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
- !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT))
- printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted "
- "without cluster aware mode.\n", osb->dev_str);
-
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
wake_up(&osb->osb_mount_event);
@@ -1452,9 +1437,6 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_journal_async_commit:
mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
break;
- case Opt_nocluster:
- mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER;
- break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1566,9 +1548,6 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
seq_printf(s, ",journal_async_commit");
- if (opts & OCFS2_MOUNT_NOCLUSTER)
- seq_printf(s, ",nocluster");
-
return 0;
}
diff --git a/fs/open.c b/fs/open.c
index 1d57fbde2feb..2790aac66e58 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -663,6 +663,42 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
return do_fchmodat(AT_FDCWD, filename, mode);
}
+/**
+ * setattr_vfsuid - check and set ia_fsuid attribute
+ * @kuid: new inode owner
+ *
+ * Check whether @kuid is valid and if so generate and set vfsuid_t in
+ * ia_vfsuid.
+ *
+ * Return: true if @kuid is valid, false if not.
+ */
+static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
+{
+ if (!uid_valid(kuid))
+ return false;
+ attr->ia_valid |= ATTR_UID;
+ attr->ia_vfsuid = VFSUIDT_INIT(kuid);
+ return true;
+}
+
+/**
+ * setattr_vfsgid - check and set ia_fsgid attribute
+ * @kgid: new inode owner
+ *
+ * Check whether @kgid is valid and if so generate and set vfsgid_t in
+ * ia_vfsgid.
+ *
+ * Return: true if @kgid is valid, false if not.
+ */
+static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
+{
+ if (!gid_valid(kgid))
+ return false;
+ attr->ia_valid |= ATTR_GID;
+ attr->ia_vfsgid = VFSGIDT_INIT(kgid);
+ return true;
+}
+
int chown_common(const struct path *path, uid_t user, gid_t group)
{
struct user_namespace *mnt_userns, *fs_userns;
@@ -678,28 +714,22 @@ int chown_common(const struct path *path, uid_t user, gid_t group)
mnt_userns = mnt_user_ns(path->mnt);
fs_userns = i_user_ns(inode);
- uid = mapped_kuid_user(mnt_userns, fs_userns, uid);
- gid = mapped_kgid_user(mnt_userns, fs_userns, gid);
retry_deleg:
newattrs.ia_valid = ATTR_CTIME;
- if (user != (uid_t) -1) {
- if (!uid_valid(uid))
- return -EINVAL;
- newattrs.ia_valid |= ATTR_UID;
- newattrs.ia_uid = uid;
- }
- if (group != (gid_t) -1) {
- if (!gid_valid(gid))
- return -EINVAL;
- newattrs.ia_valid |= ATTR_GID;
- newattrs.ia_gid = gid;
- }
+ if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
+ return -EINVAL;
+ if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
+ return -EINVAL;
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
inode_lock(inode);
- error = security_path_chown(path, uid, gid);
+ /* Continue to send actual fs values, not the mount values. */
+ error = security_path_chown(
+ path,
+ from_vfsuid(mnt_userns, fs_userns, newattrs.ia_vfsuid),
+ from_vfsgid(mnt_userns, fs_userns, newattrs.ia_vfsgid));
if (!error)
error = notify_change(mnt_userns, path->dentry, &newattrs,
&delegated_inode);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 714ec569d25b..245e2cb62708 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -331,8 +331,8 @@ int ovl_set_attr(struct ovl_fs *ofs, struct dentry *upperdentry,
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
- .ia_uid = stat->uid,
- .ia_gid = stat->gid,
+ .ia_vfsuid = VFSUIDT_INIT(stat->uid),
+ .ia_vfsgid = VFSGIDT_INIT(stat->gid),
};
err = ovl_do_notify_change(ofs, upperdentry, &attr);
}
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 492eddeb481f..7922b619f6c8 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -454,23 +454,94 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
return res;
}
+/*
+ * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
+ * of the POSIX ACLs retrieved from the lower layer to this function to not
+ * alter the POSIX ACLs for the underlying filesystem.
+ */
+static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns,
+ struct posix_acl *acl)
+{
+ for (unsigned int i = 0; i < acl->a_count; i++) {
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
+
+ struct posix_acl_entry *e = &acl->a_entries[i];
+ switch (e->e_tag) {
+ case ACL_USER:
+ vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid);
+ e->e_uid = vfsuid_into_kuid(vfsuid);
+ break;
+ case ACL_GROUP:
+ vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid);
+ e->e_gid = vfsgid_into_kgid(vfsgid);
+ break;
+ }
+ }
+}
+
+/*
+ * When the relevant layer is an idmapped mount we need to take the idmapping
+ * of the layer into account and translate any ACL_{GROUP,USER} values
+ * according to the idmapped mount.
+ *
+ * We cannot alter the ACLs returned from the relevant layer as that would
+ * alter the cached values filesystem wide for the lower filesystem. Instead we
+ * can clone the ACLs and then apply the relevant idmapping of the layer.
+ *
+ * This is obviously only relevant when idmapped layers are used.
+ */
struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu)
{
struct inode *realinode = ovl_inode_real(inode);
- const struct cred *old_cred;
- struct posix_acl *acl;
+ struct posix_acl *acl, *clone;
+ struct path realpath;
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
return NULL;
- if (rcu)
- return get_cached_acl_rcu(realinode, type);
+ /* Careful in RCU walk mode */
+ ovl_i_path_real(inode, &realpath);
+ if (!realpath.dentry) {
+ WARN_ON(!rcu);
+ return ERR_PTR(-ECHILD);
+ }
- old_cred = ovl_override_creds(inode->i_sb);
- acl = get_acl(realinode, type);
- revert_creds(old_cred);
+ if (rcu) {
+ acl = get_cached_acl_rcu(realinode, type);
+ } else {
+ const struct cred *old_cred;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ acl = get_acl(realinode, type);
+ revert_creds(old_cred);
+ }
+ /*
+ * If there are no POSIX ACLs, or we encountered an error,
+ * or the layer isn't idmapped we don't need to do anything.
+ */
+ if (!is_idmapped_mnt(realpath.mnt) || IS_ERR_OR_NULL(acl))
+ return acl;
- return acl;
+ /*
+ * We only get here if the layer is idmapped. So drop out of RCU path
+ * walk so we can clone the ACLs. There's no need to release the ACLs
+ * since get_cached_acl_rcu() doesn't take a reference on the ACLs.
+ */
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ clone = posix_acl_clone(acl, GFP_KERNEL);
+ if (!clone)
+ clone = ERR_PTR(-ENOMEM);
+ else
+ ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone);
+ /*
+ * Since we're not in RCU path walk we always need to release the
+ * original ACLs.
+ */
+ posix_acl_release(acl);
+ return clone;
}
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 4f34b7e02eee..6ec815b84d48 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -139,17 +139,7 @@ static inline int ovl_do_notify_change(struct ovl_fs *ofs,
struct dentry *upperdentry,
struct iattr *attr)
{
- struct user_namespace *upper_mnt_userns = ovl_upper_mnt_userns(ofs);
- struct user_namespace *fs_userns = i_user_ns(d_inode(upperdentry));
-
- if (attr->ia_valid & ATTR_UID)
- attr->ia_uid = mapped_kuid_user(upper_mnt_userns,
- fs_userns, attr->ia_uid);
- if (attr->ia_valid & ATTR_GID)
- attr->ia_gid = mapped_kgid_user(upper_mnt_userns,
- fs_userns, attr->ia_gid);
-
- return notify_change(upper_mnt_userns, upperdentry, attr, NULL);
+ return notify_change(ovl_upper_mnt_userns(ofs), upperdentry, attr, NULL);
}
static inline int ovl_do_rmdir(struct ovl_fs *ofs,
@@ -259,7 +249,8 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry,
const char *name, const void *value,
size_t size, int flags)
{
- int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name, value, size, flags);
+ int err = vfs_setxattr(ovl_upper_mnt_userns(ofs), dentry, name,
+ (void *)value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, %d) = %i\n",
dentry, name, min((int)size, 48), value, size, flags, err);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 1ce5c9698393..e0a2e0468ee7 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1003,9 +1003,6 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
struct dentry *dentry, struct inode *inode,
const char *name, void *buffer, size_t size)
{
- if (!IS_POSIXACL(inode))
- return -EOPNOTSUPP;
-
return ovl_xattr_get(dentry, inode, handler->name, buffer, size);
}
@@ -1021,9 +1018,6 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
struct posix_acl *acl = NULL;
int err;
- if (!IS_POSIXACL(inode))
- return -EOPNOTSUPP;
-
/* Check that everything is OK before copy-up */
if (value) {
acl = posix_acl_from_xattr(&init_user_ns, value, size);
@@ -1966,20 +1960,6 @@ static struct dentry *ovl_get_root(struct super_block *sb,
return root;
}
-static bool ovl_has_idmapped_layers(struct ovl_fs *ofs)
-{
-
- unsigned int i;
- const struct vfsmount *mnt;
-
- for (i = 0; i < ofs->numlayer; i++) {
- mnt = ofs->layers[i].mnt;
- if (mnt && is_idmapped_mnt(mnt))
- return true;
- }
- return false;
-}
-
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { };
@@ -2149,10 +2129,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
ovl_trusted_xattr_handlers;
sb->s_fs_info = ofs;
- if (ovl_has_idmapped_layers(ofs))
- pr_warn("POSIX ACLs are not yet supported with idmapped layers, mounting without ACL support.\n");
- else
- sb->s_flags |= SB_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
sb->s_iflags |= SB_I_SKIP_SYNC;
err = -ENOMEM;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 962d32468eb4..1d17d7b13dcd 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -199,7 +199,7 @@ EXPORT_SYMBOL(posix_acl_alloc);
/*
* Clone an ACL.
*/
-static struct posix_acl *
+struct posix_acl *
posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
{
struct posix_acl *clone = NULL;
@@ -213,6 +213,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
}
return clone;
}
+EXPORT_SYMBOL_GPL(posix_acl_clone);
/*
* Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
@@ -361,8 +362,8 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
{
const struct posix_acl_entry *pa, *pe, *mask_obj;
int found = 0;
- kuid_t uid;
- kgid_t gid;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
want &= MAY_READ | MAY_WRITE | MAY_EXEC;
@@ -370,30 +371,28 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode,
switch(pa->e_tag) {
case ACL_USER_OBJ:
/* (May have been checked already) */
- uid = i_uid_into_mnt(mnt_userns, inode);
- if (uid_eq(uid, current_fsuid()))
+ vfsuid = i_uid_into_vfsuid(mnt_userns, inode);
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
goto check_perm;
break;
case ACL_USER:
- uid = mapped_kuid_fs(mnt_userns,
- i_user_ns(inode),
+ vfsuid = make_vfsuid(mnt_userns, &init_user_ns,
pa->e_uid);
- if (uid_eq(uid, current_fsuid()))
+ if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
goto mask;
break;
case ACL_GROUP_OBJ:
- gid = i_gid_into_mnt(mnt_userns, inode);
- if (in_group_p(gid)) {
+ vfsgid = i_gid_into_vfsgid(mnt_userns, inode);
+ if (vfsgid_in_group_p(vfsgid)) {
found = 1;
if ((pa->e_perm & want) == want)
goto mask;
}
break;
case ACL_GROUP:
- gid = mapped_kgid_fs(mnt_userns,
- i_user_ns(inode),
+ vfsgid = make_vfsgid(mnt_userns, &init_user_ns,
pa->e_gid);
- if (in_group_p(gid)) {
+ if (vfsgid_in_group_p(vfsgid)) {
found = 1;
if ((pa->e_perm & want) == want)
goto mask;
@@ -699,7 +698,7 @@ int posix_acl_update_mode(struct user_namespace *mnt_userns,
return error;
if (error == 0)
*acl = NULL;
- if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) &&
+ if (!vfsgid_in_group_p(i_gid_into_vfsgid(mnt_userns, inode)) &&
!capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID))
mode &= ~S_ISGID;
*mode_p = mode;
@@ -710,46 +709,127 @@ EXPORT_SYMBOL(posix_acl_update_mode);
/*
* Fix up the uids and gids in posix acl extended attributes in place.
*/
-static void posix_acl_fix_xattr_userns(
- struct user_namespace *to, struct user_namespace *from,
- struct user_namespace *mnt_userns,
- void *value, size_t size, bool from_user)
+static int posix_acl_fix_xattr_common(void *value, size_t size)
+{
+ struct posix_acl_xattr_header *header = value;
+ int count;
+
+ if (!header)
+ return -EINVAL;
+ if (size < sizeof(struct posix_acl_xattr_header))
+ return -EINVAL;
+ if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+ return -EINVAL;
+
+ count = posix_acl_xattr_count(size);
+ if (count < 0)
+ return -EINVAL;
+ if (count == 0)
+ return -EINVAL;
+
+ return count;
+}
+
+void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+ const struct inode *inode,
+ void *value, size_t size)
{
struct posix_acl_xattr_header *header = value;
struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
int count;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
kuid_t uid;
kgid_t gid;
- if (!value)
+ if (no_idmapping(mnt_userns, i_user_ns(inode)))
return;
- if (size < sizeof(struct posix_acl_xattr_header))
+
+ count = posix_acl_fix_xattr_common(value, size);
+ if (count < 0)
return;
- if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+
+ for (end = entry + count; entry != end; entry++) {
+ switch (le16_to_cpu(entry->e_tag)) {
+ case ACL_USER:
+ uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
+ vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid);
+ entry->e_id = cpu_to_le32(from_kuid(&init_user_ns,
+ vfsuid_into_kuid(vfsuid)));
+ break;
+ case ACL_GROUP:
+ gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
+ vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid);
+ entry->e_id = cpu_to_le32(from_kgid(&init_user_ns,
+ vfsgid_into_kgid(vfsgid)));
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+ const struct inode *inode,
+ void *value, size_t size)
+{
+ struct posix_acl_xattr_header *header = value;
+ struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+ int count;
+ vfsuid_t vfsuid;
+ vfsgid_t vfsgid;
+ kuid_t uid;
+ kgid_t gid;
+
+ if (no_idmapping(mnt_userns, i_user_ns(inode)))
return;
- count = posix_acl_xattr_count(size);
+ count = posix_acl_fix_xattr_common(value, size);
if (count < 0)
return;
- if (count == 0)
+
+ for (end = entry + count; entry != end; entry++) {
+ switch (le16_to_cpu(entry->e_tag)) {
+ case ACL_USER:
+ uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id));
+ vfsuid = VFSUIDT_INIT(uid);
+ uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid);
+ entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid));
+ break;
+ case ACL_GROUP:
+ gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id));
+ vfsgid = VFSGIDT_INIT(gid);
+ gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid);
+ entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid));
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static void posix_acl_fix_xattr_userns(
+ struct user_namespace *to, struct user_namespace *from,
+ void *value, size_t size)
+{
+ struct posix_acl_xattr_header *header = value;
+ struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
+ int count;
+ kuid_t uid;
+ kgid_t gid;
+
+ count = posix_acl_fix_xattr_common(value, size);
+ if (count < 0)
return;
for (end = entry + count; entry != end; entry++) {
switch(le16_to_cpu(entry->e_tag)) {
case ACL_USER:
uid = make_kuid(from, le32_to_cpu(entry->e_id));
- if (from_user)
- uid = mapped_kuid_user(mnt_userns, &init_user_ns, uid);
- else
- uid = mapped_kuid_fs(mnt_userns, &init_user_ns, uid);
entry->e_id = cpu_to_le32(from_kuid(to, uid));
break;
case ACL_GROUP:
gid = make_kgid(from, le32_to_cpu(entry->e_id));
- if (from_user)
- gid = mapped_kgid_user(mnt_userns, &init_user_ns, gid);
- else
- gid = mapped_kgid_fs(mnt_userns, &init_user_ns, gid);
entry->e_id = cpu_to_le32(from_kgid(to, gid));
break;
default:
@@ -758,34 +838,20 @@ static void posix_acl_fix_xattr_userns(
}
}
-void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
- struct inode *inode,
- void *value, size_t size)
+void posix_acl_fix_xattr_from_user(void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
-
- /* Leave ids untouched on non-idmapped mounts. */
- if (no_idmapping(mnt_userns, i_user_ns(inode)))
- mnt_userns = &init_user_ns;
- if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
+ if (user_ns == &init_user_ns)
return;
- posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value,
- size, true);
+ posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
}
-void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
- struct inode *inode,
- void *value, size_t size)
+void posix_acl_fix_xattr_to_user(void *value, size_t size)
{
struct user_namespace *user_ns = current_user_ns();
-
- /* Leave ids untouched on non-idmapped mounts. */
- if (no_idmapping(mnt_userns, i_user_ns(inode)))
- mnt_userns = &init_user_ns;
- if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns))
+ if (user_ns == &init_user_ns)
return;
- posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value,
- size, false);
+ posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
}
/*
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 09d1307959d0..28966da7834e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2085,7 +2085,8 @@ EXPORT_SYMBOL(__dquot_transfer);
/* Wrapper for transferring ownership of an inode for uid/gid only
* Called from FSXXX_setattr()
*/
-int dquot_transfer(struct inode *inode, struct iattr *iattr)
+int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode,
+ struct iattr *iattr)
{
struct dquot *transfer_to[MAXQUOTAS] = {};
struct dquot *dquot;
@@ -2095,8 +2096,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
if (!dquot_active(inode))
return 0;
- if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){
- dquot = dqget(sb, make_kqid_uid(iattr->ia_uid));
+ if (i_uid_needs_update(mnt_userns, iattr, inode)) {
+ kuid_t kuid = from_vfsuid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsuid);
+
+ dquot = dqget(sb, make_kqid_uid(kuid));
if (IS_ERR(dquot)) {
if (PTR_ERR(dquot) != -ESRCH) {
ret = PTR_ERR(dquot);
@@ -2106,8 +2110,11 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
}
transfer_to[USRQUOTA] = dquot;
}
- if (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid)){
- dquot = dqget(sb, make_kqid_gid(iattr->ia_gid));
+ if (i_gid_needs_update(mnt_userns, iattr, inode)) {
+ kgid_t kgid = from_vfsgid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsgid);
+
+ dquot = dqget(sb, make_kqid_gid(kgid));
if (IS_ERR(dquot)) {
if (PTR_ERR(dquot) != -ESRCH) {
ret = PTR_ERR(dquot);
diff --git a/fs/read_write.c b/fs/read_write.c
index e0777eefd846..397da0236607 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1263,6 +1263,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
count, fl);
file_end_write(out.file);
} else {
+ if (out.file->f_flags & O_NONBLOCK)
+ fl |= SPLICE_F_NONBLOCK;
+
retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0cffe054b78e..0df48d176732 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -290,7 +290,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
struct buffer_head *bh;
struct item_head *ih, tmp_ih;
b_blocknr_t blocknr;
- char *p = NULL;
+ char *p;
int chars;
int ret;
int result;
@@ -305,8 +305,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
result = search_for_position_by_key(inode->i_sb, &key, &path);
if (result != POSITION_FOUND) {
pathrelse(&path);
- if (p)
- kunmap(bh_result->b_page);
if (result == IO_ERROR)
return -EIO;
/*
@@ -352,8 +350,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
}
pathrelse(&path);
- if (p)
- kunmap(bh_result->b_page);
return ret;
}
/* requested data are in direct item(s) */
@@ -363,8 +359,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
* when it is stored in direct item(s)
*/
pathrelse(&path);
- if (p)
- kunmap(bh_result->b_page);
return -ENOENT;
}
@@ -396,9 +390,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
* sure we need to. But, this means the item might move if
* kmap schedules
*/
- if (!p)
- p = (char *)kmap(bh_result->b_page);
-
+ p = (char *)kmap(bh_result->b_page);
p += offset;
memset(p, 0, inode->i_sb->s_blocksize);
do {
@@ -3284,7 +3276,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
/* must be turned off for recursive notify_change calls */
ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
- if (is_quota_modification(inode, attr)) {
+ if (is_quota_modification(mnt_userns, inode, attr)) {
error = dquot_initialize(inode);
if (error)
return error;
@@ -3367,7 +3359,7 @@ int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
reiserfs_write_unlock(inode->i_sb);
if (error)
goto out;
- error = dquot_transfer(inode, attr);
+ error = dquot_transfer(mnt_userns, inode, attr);
reiserfs_write_lock(inode->i_sb);
if (error) {
journal_end(&th);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e943370107d0..de86f5b2859f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -192,17 +192,19 @@ static inline void msg_init(struct uffd_msg *msg)
}
static inline struct uffd_msg userfault_msg(unsigned long address,
+ unsigned long real_address,
unsigned int flags,
unsigned long reason,
unsigned int features)
{
struct uffd_msg msg;
+
msg_init(&msg);
msg.event = UFFD_EVENT_PAGEFAULT;
- if (!(features & UFFD_FEATURE_EXACT_ADDRESS))
- address &= PAGE_MASK;
- msg.arg.pagefault.address = address;
+ msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ?
+ real_address : address;
+
/*
* These flags indicate why the userfault occurred:
* - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
@@ -488,8 +490,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
uwq.wq.private = current;
- uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason,
- ctx->features);
+ uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
+ reason, ctx->features);
uwq.ctx = ctx;
uwq.waken = false;
diff --git a/fs/xattr.c b/fs/xattr.c
index e8dd03e4561e..a1f4998bc6be 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -282,9 +282,15 @@ out:
}
EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);
+static inline bool is_posix_acl_xattr(const char *name)
+{
+ return (strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
+ (strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0);
+}
+
int
vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
- const char *name, const void *value, size_t size, int flags)
+ const char *name, void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
struct inode *delegated_inode = NULL;
@@ -292,12 +298,16 @@ vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
int error;
if (size && strcmp(name, XATTR_NAME_CAPS) == 0) {
- error = cap_convert_nscap(mnt_userns, dentry, &value, size);
+ error = cap_convert_nscap(mnt_userns, dentry,
+ (const void **)&value, size);
if (error < 0)
return error;
size = error;
}
+ if (size && is_posix_acl_xattr(name))
+ posix_acl_setxattr_idmapped_mnt(mnt_userns, inode, value, size);
+
retry_deleg:
inode_lock(inode);
error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size,
@@ -431,7 +441,10 @@ vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
return ret;
}
nolsm:
- return __vfs_getxattr(dentry, inode, name, value, size);
+ error = __vfs_getxattr(dentry, inode, name, value, size);
+ if (error > 0 && is_posix_acl_xattr(name))
+ posix_acl_getxattr_idmapped_mnt(mnt_userns, inode, value, size);
+ return error;
}
EXPORT_SYMBOL_GPL(vfs_getxattr);
@@ -577,8 +590,7 @@ static void setxattr_convert(struct user_namespace *mnt_userns,
if (ctx->size &&
((strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(ctx->kname->name, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)))
- posix_acl_fix_xattr_from_user(mnt_userns, d_inode(d),
- ctx->kvalue, ctx->size);
+ posix_acl_fix_xattr_from_user(ctx->kvalue, ctx->size);
}
int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
@@ -695,8 +707,7 @@ do_getxattr(struct user_namespace *mnt_userns, struct dentry *d,
if (error > 0) {
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
- posix_acl_fix_xattr_to_user(mnt_userns, d_inode(d),
- ctx->kvalue, error);
+ posix_acl_fix_xattr_to_user(ctx->kvalue, error);
if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 29f5b8b8aca6..a7402f6ea510 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -667,13 +667,15 @@ xfs_setattr_nonsize(
uint qflags = 0;
if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
- uid = iattr->ia_uid;
+ uid = from_vfsuid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsuid);
qflags |= XFS_QMOPT_UQUOTA;
} else {
uid = inode->i_uid;
}
if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
- gid = iattr->ia_gid;
+ gid = from_vfsgid(mnt_userns, i_user_ns(inode),
+ iattr->ia_vfsgid);
qflags |= XFS_QMOPT_GQUOTA;
} else {
gid = inode->i_gid;
@@ -704,13 +706,13 @@ xfs_setattr_nonsize(
* didn't have the inode locked, inode's dquot(s) would have changed
* also.
*/
- if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp) &&
- !uid_eq(inode->i_uid, iattr->ia_uid)) {
+ if (XFS_IS_UQUOTA_ON(mp) &&
+ i_uid_needs_update(mnt_userns, iattr, inode)) {
ASSERT(udqp);
old_udqp = xfs_qm_vop_chown(tp, ip, &ip->i_udquot, udqp);
}
- if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp) &&
- !gid_eq(inode->i_gid, iattr->ia_gid)) {
+ if (XFS_IS_GQUOTA_ON(mp) &&
+ i_gid_needs_update(mnt_userns, iattr, inode)) {
ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp));
ASSERT(gdqp);
old_gdqp = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp);
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 053299758deb..f5d8338967cb 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -616,7 +616,7 @@ static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
!uid_eq(iattr->ia_uid, inode->i_uid)) ||
((iattr->ia_valid & ATTR_GID) &&
!gid_eq(iattr->ia_gid, inode->i_gid))) {
- ret = dquot_transfer(inode, iattr);
+ ret = dquot_transfer(mnt_userns, inode, iattr);
if (ret)
return ret;
}
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index fd7e8fbaeef1..961f4d88f9ef 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -38,6 +38,10 @@
#define wmb() do { kcsan_wmb(); __wmb(); } while (0)
#endif
+#ifdef __dma_mb
+#define dma_mb() do { kcsan_mb(); __dma_mb(); } while (0)
+#endif
+
#ifdef __dma_rmb
#define dma_rmb() do { kcsan_rmb(); __dma_rmb(); } while (0)
#endif
@@ -65,6 +69,10 @@
#define wmb() mb()
#endif
+#ifndef dma_mb
+#define dma_mb() mb()
+#endif
+
#ifndef dma_rmb
#define dma_rmb() rmb()
#endif
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 7ce93aaf69f8..72974cb81343 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -964,7 +964,34 @@ static inline void iounmap(volatile void __iomem *addr)
#elif defined(CONFIG_GENERIC_IOREMAP)
#include <linux/pgtable.h>
-void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot);
+/*
+ * Arch code can implement the following two hooks when using GENERIC_IOREMAP
+ * ioremap_allowed() return a bool,
+ * - true means continue to remap
+ * - false means skip remap and return directly
+ * iounmap_allowed() return a bool,
+ * - true means continue to vunmap
+ * - false means skip vunmap and return directly
+ */
+#ifndef ioremap_allowed
+#define ioremap_allowed ioremap_allowed
+static inline bool ioremap_allowed(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
+{
+ return true;
+}
+#endif
+
+#ifndef iounmap_allowed
+#define iounmap_allowed iounmap_allowed
+static inline bool iounmap_allowed(void *addr)
+{
+ return true;
+}
+#endif
+
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot);
void iounmap(volatile void __iomem *addr);
static inline void __iomem *ioremap(phys_addr_t addr, size_t size)
@@ -1125,9 +1152,7 @@ static inline void memcpy_toio(volatile void __iomem *addr, const void *buffer,
}
#endif
-#ifndef CONFIG_GENERIC_DEVMEM_IS_ALLOWED
extern int devmem_is_allowed(unsigned long pfn);
-#endif
#endif /* __KERNEL__ */
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index d4427d0a0e18..187b54a9567e 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -288,6 +288,10 @@ struct css_set {
struct cgroup_base_stat {
struct task_cputime cputime;
+
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_sum;
+#endif
};
/*
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 19f0dbfdd7fe..154daff0b46b 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -130,7 +130,6 @@ enum cpuhp_state {
CPUHP_ZCOMP_PREPARE,
CPUHP_TIMERS_PREPARE,
CPUHP_MIPS_SOC_PREPARE,
- CPUHP_LOONGARCH_SOC_PREPARE,
CPUHP_BP_PREPARE_DYN,
CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20,
CPUHP_BRINGUP_CPU,
@@ -230,6 +229,7 @@ enum cpuhp_state {
CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
+ CPUHP_AP_PERF_ARM_HNS3_PMU_ONLINE,
CPUHP_AP_PERF_ARM_L2X0_ONLINE,
CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,
diff --git a/include/linux/evm.h b/include/linux/evm.h
index 4c374be70247..aa63e0b3c0a2 100644
--- a/include/linux/evm.h
+++ b/include/linux/evm.h
@@ -21,7 +21,8 @@ extern enum integrity_status evm_verifyxattr(struct dentry *dentry,
void *xattr_value,
size_t xattr_value_len,
struct integrity_iint_cache *iint);
-extern int evm_inode_setattr(struct dentry *dentry, struct iattr *attr);
+extern int evm_inode_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr);
extern void evm_inode_post_setattr(struct dentry *dentry, int ia_valid);
extern int evm_inode_setxattr(struct user_namespace *mnt_userns,
struct dentry *dentry, const char *name,
@@ -68,7 +69,8 @@ static inline enum integrity_status evm_verifyxattr(struct dentry *dentry,
}
#endif
-static inline int evm_inode_setattr(struct dentry *dentry, struct iattr *attr)
+static inline int evm_inode_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
return 0;
}
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index e517dbcf74ed..8ad743def6f3 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -59,15 +59,19 @@
#define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \
FAN_MARK_FILESYSTEM)
+#define FANOTIFY_MARK_CMD_BITS (FAN_MARK_ADD | FAN_MARK_REMOVE | \
+ FAN_MARK_FLUSH)
+
+#define FANOTIFY_MARK_IGNORE_BITS (FAN_MARK_IGNORED_MASK | \
+ FAN_MARK_IGNORE)
+
#define FANOTIFY_MARK_FLAGS (FANOTIFY_MARK_TYPE_BITS | \
- FAN_MARK_ADD | \
- FAN_MARK_REMOVE | \
+ FANOTIFY_MARK_CMD_BITS | \
+ FANOTIFY_MARK_IGNORE_BITS | \
FAN_MARK_DONT_FOLLOW | \
FAN_MARK_ONLYDIR | \
- FAN_MARK_IGNORED_MASK | \
FAN_MARK_IGNORED_SURV_MODIFY | \
- FAN_MARK_EVICTABLE | \
- FAN_MARK_FLUSH)
+ FAN_MARK_EVICTABLE)
/*
* Events that can be reported with data type FSNOTIFY_EVENT_PATH.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ad5e3520fae..ec2e35886779 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -221,8 +221,26 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
struct iattr {
unsigned int ia_valid;
umode_t ia_mode;
- kuid_t ia_uid;
- kgid_t ia_gid;
+ /*
+ * The two anonymous unions wrap structures with the same member.
+ *
+ * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which
+ * are a dedicated type requiring the filesystem to use the dedicated
+ * helpers. Other filesystem can continue to use ia_{g,u}id until they
+ * have been ported.
+ *
+ * They always contain the same value. In other words FS_ALLOW_IDMAP
+ * pass down the same value on idmapped mounts as they would on regular
+ * mounts.
+ */
+ union {
+ kuid_t ia_uid;
+ vfsuid_t ia_vfsuid;
+ };
+ union {
+ kgid_t ia_gid;
+ vfsgid_t ia_vfsgid;
+ };
loff_t ia_size;
struct timespec64 ia_atime;
struct timespec64 ia_mtime;
@@ -1600,13 +1618,68 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
* @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode to map
*
+ * Note, this will eventually be removed completely in favor of the type-safe
+ * i_uid_into_vfsuid().
+ *
* Return: the inode's i_uid mapped down according to @mnt_userns.
* If the inode's i_uid has no mapping INVALID_UID is returned.
*/
static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
const struct inode *inode)
{
- return mapped_kuid_fs(mnt_userns, i_user_ns(inode), inode->i_uid);
+ return AS_KUIDT(make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid));
+}
+
+/**
+ * i_uid_into_vfsuid - map an inode's i_uid down into a mnt_userns
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @inode: inode to map
+ *
+ * Return: whe inode's i_uid mapped down according to @mnt_userns.
+ * If the inode's i_uid has no mapping INVALID_VFSUID is returned.
+ */
+static inline vfsuid_t i_uid_into_vfsuid(struct user_namespace *mnt_userns,
+ const struct inode *inode)
+{
+ return make_vfsuid(mnt_userns, i_user_ns(inode), inode->i_uid);
+}
+
+/**
+ * i_uid_needs_update - check whether inode's i_uid needs to be updated
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @attr: the new attributes of @inode
+ * @inode: the inode to update
+ *
+ * Check whether the $inode's i_uid field needs to be updated taking idmapped
+ * mounts into account if the filesystem supports it.
+ *
+ * Return: true if @inode's i_uid field needs to be updated, false if not.
+ */
+static inline bool i_uid_needs_update(struct user_namespace *mnt_userns,
+ const struct iattr *attr,
+ const struct inode *inode)
+{
+ return ((attr->ia_valid & ATTR_UID) &&
+ !vfsuid_eq(attr->ia_vfsuid,
+ i_uid_into_vfsuid(mnt_userns, inode)));
+}
+
+/**
+ * i_uid_update - update @inode's i_uid field
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @attr: the new attributes of @inode
+ * @inode: the inode to update
+ *
+ * Safely update @inode's i_uid field translating the vfsuid of any idmapped
+ * mount into the filesystem kuid.
+ */
+static inline void i_uid_update(struct user_namespace *mnt_userns,
+ const struct iattr *attr,
+ struct inode *inode)
+{
+ if (attr->ia_valid & ATTR_UID)
+ inode->i_uid = from_vfsuid(mnt_userns, i_user_ns(inode),
+ attr->ia_vfsuid);
}
/**
@@ -1614,13 +1687,68 @@ static inline kuid_t i_uid_into_mnt(struct user_namespace *mnt_userns,
* @mnt_userns: user namespace of the mount the inode was found from
* @inode: inode to map
*
+ * Note, this will eventually be removed completely in favor of the type-safe
+ * i_gid_into_vfsgid().
+ *
* Return: the inode's i_gid mapped down according to @mnt_userns.
* If the inode's i_gid has no mapping INVALID_GID is returned.
*/
static inline kgid_t i_gid_into_mnt(struct user_namespace *mnt_userns,
const struct inode *inode)
{
- return mapped_kgid_fs(mnt_userns, i_user_ns(inode), inode->i_gid);
+ return AS_KGIDT(make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid));
+}
+
+/**
+ * i_gid_into_vfsgid - map an inode's i_gid down into a mnt_userns
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @inode: inode to map
+ *
+ * Return: the inode's i_gid mapped down according to @mnt_userns.
+ * If the inode's i_gid has no mapping INVALID_VFSGID is returned.
+ */
+static inline vfsgid_t i_gid_into_vfsgid(struct user_namespace *mnt_userns,
+ const struct inode *inode)
+{
+ return make_vfsgid(mnt_userns, i_user_ns(inode), inode->i_gid);
+}
+
+/**
+ * i_gid_needs_update - check whether inode's i_gid needs to be updated
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @attr: the new attributes of @inode
+ * @inode: the inode to update
+ *
+ * Check whether the $inode's i_gid field needs to be updated taking idmapped
+ * mounts into account if the filesystem supports it.
+ *
+ * Return: true if @inode's i_gid field needs to be updated, false if not.
+ */
+static inline bool i_gid_needs_update(struct user_namespace *mnt_userns,
+ const struct iattr *attr,
+ const struct inode *inode)
+{
+ return ((attr->ia_valid & ATTR_GID) &&
+ !vfsgid_eq(attr->ia_vfsgid,
+ i_gid_into_vfsgid(mnt_userns, inode)));
+}
+
+/**
+ * i_gid_update - update @inode's i_gid field
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @attr: the new attributes of @inode
+ * @inode: the inode to update
+ *
+ * Safely update @inode's i_gid field translating the vfsgid of any idmapped
+ * mount into the filesystem kgid.
+ */
+static inline void i_gid_update(struct user_namespace *mnt_userns,
+ const struct iattr *attr,
+ struct inode *inode)
+{
+ if (attr->ia_valid & ATTR_GID)
+ inode->i_gid = from_vfsgid(mnt_userns, i_user_ns(inode),
+ attr->ia_vfsgid);
}
/**
@@ -2195,8 +2323,8 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns,
struct inode *inode)
{
- return !uid_valid(i_uid_into_mnt(mnt_userns, inode)) ||
- !gid_valid(i_gid_into_mnt(mnt_userns, inode));
+ return !vfsuid_valid(i_uid_into_vfsuid(mnt_userns, inode)) ||
+ !vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode));
}
static inline int iocb_flags(struct file *file);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 9560734759fa..d7d96c806bff 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -518,8 +518,8 @@ struct fsnotify_mark {
struct hlist_node obj_list;
/* Head of list of marks for an object [mark ref] */
struct fsnotify_mark_connector *connector;
- /* Events types to ignore [mark->lock, group->mark_mutex] */
- __u32 ignored_mask;
+ /* Events types and flags to ignore [mark->lock, group->mark_mutex] */
+ __u32 ignore_mask;
/* General fsnotify mark flags */
#define FSNOTIFY_MARK_FLAG_ALIVE 0x0001
#define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002
@@ -529,6 +529,7 @@ struct fsnotify_mark {
/* fanotify mark flags */
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100
#define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200
+#define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400
unsigned int flags; /* flags [mark->lock] */
};
@@ -655,15 +656,91 @@ extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
/* functions used to manipulate the marks attached to inodes */
-/* Get mask for calculating object interest taking ignored mask into account */
+/*
+ * Canonical "ignore mask" including event flags.
+ *
+ * Note the subtle semantic difference from the legacy ->ignored_mask.
+ * ->ignored_mask traditionally only meant which events should be ignored,
+ * while ->ignore_mask also includes flags regarding the type of objects on
+ * which events should be ignored.
+ */
+static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark)
+{
+ __u32 ignore_mask = mark->ignore_mask;
+
+ /* The event flags in ignore mask take effect */
+ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
+ return ignore_mask;
+
+ /*
+ * Legacy behavior:
+ * - Always ignore events on dir
+ * - Ignore events on child if parent is watching children
+ */
+ ignore_mask |= FS_ISDIR;
+ ignore_mask &= ~FS_EVENT_ON_CHILD;
+ ignore_mask |= mark->mask & FS_EVENT_ON_CHILD;
+
+ return ignore_mask;
+}
+
+/* Legacy ignored_mask - only event types to ignore */
+static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark)
+{
+ return mark->ignore_mask & ALL_FSNOTIFY_EVENTS;
+}
+
+/*
+ * Check if mask (or ignore mask) should be applied depending if victim is a
+ * directory and whether it is reported to a watching parent.
+ */
+static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir,
+ int iter_type)
+{
+ /* Should mask be applied to a directory? */
+ if (is_dir && !(mask & FS_ISDIR))
+ return false;
+
+ /* Should mask be applied to a child? */
+ if (iter_type == FSNOTIFY_ITER_TYPE_PARENT &&
+ !(mask & FS_EVENT_ON_CHILD))
+ return false;
+
+ return true;
+}
+
+/*
+ * Effective ignore mask taking into account if event victim is a
+ * directory and whether it is reported to a watching parent.
+ */
+static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark,
+ bool is_dir, int iter_type)
+{
+ __u32 ignore_mask = fsnotify_ignored_events(mark);
+
+ if (!ignore_mask)
+ return 0;
+
+ /* For non-dir and non-child, no need to consult the event flags */
+ if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT)
+ return ignore_mask;
+
+ ignore_mask = fsnotify_ignore_mask(mark);
+ if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type))
+ return 0;
+
+ return ignore_mask & ALL_FSNOTIFY_EVENTS;
+}
+
+/* Get mask for calculating object interest taking ignore mask into account */
static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
{
__u32 mask = mark->mask;
- if (!mark->ignored_mask)
+ if (!fsnotify_ignored_events(mark))
return mask;
- /* Interest in FS_MODIFY may be needed for clearing ignored mask */
+ /* Interest in FS_MODIFY may be needed for clearing ignore mask */
if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
mask |= FS_MODIFY;
@@ -671,7 +748,7 @@ static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
* If mark is interested in ignoring events on children, the object must
* show interest in those events for fsnotify_parent() to notice it.
*/
- return mask | (mark->ignored_mask & ALL_FSNOTIFY_EVENTS);
+ return mask | mark->ignore_mask;
}
/* Get mask of events for a list of marks */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 2d2ccae933c2..0ace7759acd2 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -348,7 +348,7 @@ struct vm_area_struct;
#define GFP_DMA32 __GFP_DMA32
#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | \
- __GFP_SKIP_KASAN_POISON)
+ __GFP_SKIP_KASAN_POISON | __GFP_SKIP_KASAN_UNPOISON)
#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
__GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index de29821231c9..4ddaf6ad73ef 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -461,4 +461,16 @@ static inline int split_folio_to_list(struct folio *folio,
return split_huge_page_to_list(&folio->page, list);
}
+/*
+ * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
+ * limitations in the implementation like arm64 MTE can override this to
+ * false
+ */
+#ifndef arch_thp_swp_supported
+static inline bool arch_thp_swp_supported(void)
+{
+ return true;
+}
+#endif
+
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 426b1744215e..81708ca0ebc7 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -140,6 +140,11 @@ static inline int ima_measure_critical_data(const char *event_label,
#endif /* CONFIG_IMA */
+#ifdef CONFIG_HAVE_IMA_KEXEC
+int __init ima_free_kexec_buffer(void);
+int __init ima_get_kexec_buffer(void **addr, size_t *size);
+#endif
+
#ifdef CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT
extern bool arch_ima_get_secureboot(void);
extern const char * const *arch_get_ima_policy(void);
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index bf1eef337a07..570831ca9951 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -220,8 +220,6 @@ extern void jump_label_lock(void);
extern void jump_label_unlock(void);
extern void arch_jump_label_transform(struct jump_entry *entry,
enum jump_label_type type);
-extern void arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type);
extern bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type);
extern void arch_jump_label_transform_apply(void);
@@ -230,12 +228,12 @@ extern void static_key_slow_inc(struct static_key *key);
extern void static_key_slow_dec(struct static_key *key);
extern void static_key_slow_inc_cpuslocked(struct static_key *key);
extern void static_key_slow_dec_cpuslocked(struct static_key *key);
-extern void jump_label_apply_nops(struct module *mod);
extern int static_key_count(struct static_key *key);
extern void static_key_enable(struct static_key *key);
extern void static_key_disable(struct static_key *key);
extern void static_key_enable_cpuslocked(struct static_key *key);
extern void static_key_disable_cpuslocked(struct static_key *key);
+extern enum jump_label_type jump_label_init_type(struct jump_entry *entry);
/*
* We should be using ATOMIC_INIT() for initializing .enabled, but
@@ -303,11 +301,6 @@ static inline int jump_label_text_reserved(void *start, void *end)
static inline void jump_label_lock(void) {}
static inline void jump_label_unlock(void) {}
-static inline int jump_label_apply_nops(struct module *mod)
-{
- return 0;
-}
-
static inline void static_key_enable(struct static_key *key)
{
STATIC_KEY_CHECK_USE(key);
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 69ae6b278464..ddb5a358fd82 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,6 +28,9 @@ enum cpu_usage_stat {
CPUTIME_STEAL,
CPUTIME_GUEST,
CPUTIME_GUEST_NICE,
+#ifdef CONFIG_SCHED_CORE
+ CPUTIME_FORCEIDLE,
+#endif
NR_STATS,
};
@@ -115,4 +118,8 @@ extern void account_process_tick(struct task_struct *, int user);
extern void account_idle_ticks(unsigned long ticks);
+#ifdef CONFIG_SCHED_CORE
+extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
+#endif
+
#endif /* _LINUX_KERNEL_STAT_H */
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index b6829b970093..1f1099dac3f0 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -188,7 +188,7 @@ static inline void
lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass, u8 inner, u8 outer)
{
- lockdep_init_map_type(lock, name, key, subclass, inner, LD_WAIT_INV, LD_LOCK_NORMAL);
+ lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL);
}
static inline void
@@ -211,24 +211,28 @@ static inline void lockdep_init_map(struct lockdep_map *lock, const char *name,
* or they are too narrow (they suffer from a false class-split):
*/
#define lockdep_set_class(lock, key) \
- lockdep_init_map_waits(&(lock)->dep_map, #key, key, 0, \
- (lock)->dep_map.wait_type_inner, \
- (lock)->dep_map.wait_type_outer)
+ lockdep_init_map_type(&(lock)->dep_map, #key, key, 0, \
+ (lock)->dep_map.wait_type_inner, \
+ (lock)->dep_map.wait_type_outer, \
+ (lock)->dep_map.lock_type)
#define lockdep_set_class_and_name(lock, key, name) \
- lockdep_init_map_waits(&(lock)->dep_map, name, key, 0, \
- (lock)->dep_map.wait_type_inner, \
- (lock)->dep_map.wait_type_outer)
+ lockdep_init_map_type(&(lock)->dep_map, name, key, 0, \
+ (lock)->dep_map.wait_type_inner, \
+ (lock)->dep_map.wait_type_outer, \
+ (lock)->dep_map.lock_type)
#define lockdep_set_class_and_subclass(lock, key, sub) \
- lockdep_init_map_waits(&(lock)->dep_map, #key, key, sub,\
- (lock)->dep_map.wait_type_inner, \
- (lock)->dep_map.wait_type_outer)
+ lockdep_init_map_type(&(lock)->dep_map, #key, key, sub, \
+ (lock)->dep_map.wait_type_inner, \
+ (lock)->dep_map.wait_type_outer, \
+ (lock)->dep_map.lock_type)
#define lockdep_set_subclass(lock, sub) \
- lockdep_init_map_waits(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\
- (lock)->dep_map.wait_type_inner, \
- (lock)->dep_map.wait_type_outer)
+ lockdep_init_map_type(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\
+ (lock)->dep_map.wait_type_inner, \
+ (lock)->dep_map.wait_type_outer, \
+ (lock)->dep_map.lock_type)
#define lockdep_set_novalidate_class(lock) \
lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cf3d0d673f6b..7898e29bcfb5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1130,23 +1130,27 @@ static inline bool is_zone_movable_page(const struct page *page)
#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
-bool __put_devmap_managed_page(struct page *page);
-static inline bool put_devmap_managed_page(struct page *page)
+bool __put_devmap_managed_page_refs(struct page *page, int refs);
+static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
{
if (!static_branch_unlikely(&devmap_managed_key))
return false;
if (!is_zone_device_page(page))
return false;
- return __put_devmap_managed_page(page);
+ return __put_devmap_managed_page_refs(page, refs);
}
-
#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page(struct page *page)
+static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
{
return false;
}
#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+static inline bool put_devmap_managed_page(struct page *page)
+{
+ return put_devmap_managed_page_refs(page, 1);
+}
+
/* 127: arbitrary random number, small enough to assemble well */
#define folio_ref_zero_or_close_to_overflow(folio) \
((unsigned int) folio_ref_count(folio) + 127u <= 127u)
diff --git a/include/linux/mnt_idmapping.h b/include/linux/mnt_idmapping.h
index ee5a217de2a8..f6e5369d2928 100644
--- a/include/linux/mnt_idmapping.h
+++ b/include/linux/mnt_idmapping.h
@@ -13,6 +13,129 @@ struct user_namespace;
*/
extern struct user_namespace init_user_ns;
+typedef struct {
+ uid_t val;
+} vfsuid_t;
+
+typedef struct {
+ gid_t val;
+} vfsgid_t;
+
+static_assert(sizeof(vfsuid_t) == sizeof(kuid_t));
+static_assert(sizeof(vfsgid_t) == sizeof(kgid_t));
+static_assert(offsetof(vfsuid_t, val) == offsetof(kuid_t, val));
+static_assert(offsetof(vfsgid_t, val) == offsetof(kgid_t, val));
+
+#ifdef CONFIG_MULTIUSER
+static inline uid_t __vfsuid_val(vfsuid_t uid)
+{
+ return uid.val;
+}
+
+static inline gid_t __vfsgid_val(vfsgid_t gid)
+{
+ return gid.val;
+}
+#else
+static inline uid_t __vfsuid_val(vfsuid_t uid)
+{
+ return 0;
+}
+
+static inline gid_t __vfsgid_val(vfsgid_t gid)
+{
+ return 0;
+}
+#endif
+
+static inline bool vfsuid_valid(vfsuid_t uid)
+{
+ return __vfsuid_val(uid) != (uid_t)-1;
+}
+
+static inline bool vfsgid_valid(vfsgid_t gid)
+{
+ return __vfsgid_val(gid) != (gid_t)-1;
+}
+
+static inline bool vfsuid_eq(vfsuid_t left, vfsuid_t right)
+{
+ return vfsuid_valid(left) && __vfsuid_val(left) == __vfsuid_val(right);
+}
+
+static inline bool vfsgid_eq(vfsgid_t left, vfsgid_t right)
+{
+ return vfsgid_valid(left) && __vfsgid_val(left) == __vfsgid_val(right);
+}
+
+/**
+ * vfsuid_eq_kuid - check whether kuid and vfsuid have the same value
+ * @vfsuid: the vfsuid to compare
+ * @kuid: the kuid to compare
+ *
+ * Check whether @vfsuid and @kuid have the same values.
+ *
+ * Return: true if @vfsuid and @kuid have the same value, false if not.
+ * Comparison between two invalid uids returns false.
+ */
+static inline bool vfsuid_eq_kuid(vfsuid_t vfsuid, kuid_t kuid)
+{
+ return vfsuid_valid(vfsuid) && __vfsuid_val(vfsuid) == __kuid_val(kuid);
+}
+
+/**
+ * vfsgid_eq_kgid - check whether kgid and vfsgid have the same value
+ * @vfsgid: the vfsgid to compare
+ * @kgid: the kgid to compare
+ *
+ * Check whether @vfsgid and @kgid have the same values.
+ *
+ * Return: true if @vfsgid and @kgid have the same value, false if not.
+ * Comparison between two invalid gids returns false.
+ */
+static inline bool vfsgid_eq_kgid(vfsgid_t vfsgid, kgid_t kgid)
+{
+ return vfsgid_valid(vfsgid) && __vfsgid_val(vfsgid) == __kgid_val(kgid);
+}
+
+/*
+ * vfs{g,u}ids are created from k{g,u}ids.
+ * We don't allow them to be created from regular {u,g}id.
+ */
+#define VFSUIDT_INIT(val) (vfsuid_t){ __kuid_val(val) }
+#define VFSGIDT_INIT(val) (vfsgid_t){ __kgid_val(val) }
+
+#define INVALID_VFSUID VFSUIDT_INIT(INVALID_UID)
+#define INVALID_VFSGID VFSGIDT_INIT(INVALID_GID)
+
+/*
+ * Allow a vfs{g,u}id to be used as a k{g,u}id where we want to compare
+ * whether the mapped value is identical to value of a k{g,u}id.
+ */
+#define AS_KUIDT(val) (kuid_t){ __vfsuid_val(val) }
+#define AS_KGIDT(val) (kgid_t){ __vfsgid_val(val) }
+
+#ifdef CONFIG_MULTIUSER
+/**
+ * vfsgid_in_group_p() - check whether a vfsuid matches the caller's groups
+ * @vfsgid: the mnt gid to match
+ *
+ * This function can be used to determine whether @vfsuid matches any of the
+ * caller's groups.
+ *
+ * Return: 1 if vfsuid matches caller's groups, 0 if not.
+ */
+static inline int vfsgid_in_group_p(vfsgid_t vfsgid)
+{
+ return in_group_p(AS_KGIDT(vfsgid));
+}
+#else
+static inline int vfsgid_in_group_p(vfsgid_t vfsgid)
+{
+ return 1;
+}
+#endif
+
/**
* initial_idmapping - check whether this is the initial mapping
* @ns: idmapping to check
@@ -48,7 +171,7 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns,
}
/**
- * mapped_kuid_fs - map a filesystem kuid into a mnt_userns
+ * make_vfsuid - map a filesystem kuid into a mnt_userns
* @mnt_userns: the mount's idmapping
* @fs_userns: the filesystem's idmapping
* @kuid : kuid to be mapped
@@ -67,25 +190,33 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns,
* If @kuid has no mapping in either @mnt_userns or @fs_userns INVALID_UID is
* returned.
*/
-static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- kuid_t kuid)
+
+static inline vfsuid_t make_vfsuid(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ kuid_t kuid)
{
uid_t uid;
if (no_idmapping(mnt_userns, fs_userns))
- return kuid;
+ return VFSUIDT_INIT(kuid);
if (initial_idmapping(fs_userns))
uid = __kuid_val(kuid);
else
uid = from_kuid(fs_userns, kuid);
if (uid == (uid_t)-1)
- return INVALID_UID;
- return make_kuid(mnt_userns, uid);
+ return INVALID_VFSUID;
+ return VFSUIDT_INIT(make_kuid(mnt_userns, uid));
+}
+
+static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ kuid_t kuid)
+{
+ return AS_KUIDT(make_vfsuid(mnt_userns, fs_userns, kuid));
}
/**
- * mapped_kgid_fs - map a filesystem kgid into a mnt_userns
+ * make_vfsgid - map a filesystem kgid into a mnt_userns
* @mnt_userns: the mount's idmapping
* @fs_userns: the filesystem's idmapping
* @kgid : kgid to be mapped
@@ -104,21 +235,56 @@ static inline kuid_t mapped_kuid_fs(struct user_namespace *mnt_userns,
* If @kgid has no mapping in either @mnt_userns or @fs_userns INVALID_GID is
* returned.
*/
-static inline kgid_t mapped_kgid_fs(struct user_namespace *mnt_userns,
- struct user_namespace *fs_userns,
- kgid_t kgid)
+
+static inline vfsgid_t make_vfsgid(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ kgid_t kgid)
{
gid_t gid;
if (no_idmapping(mnt_userns, fs_userns))
- return kgid;
+ return VFSGIDT_INIT(kgid);
if (initial_idmapping(fs_userns))
gid = __kgid_val(kgid);
else
gid = from_kgid(fs_userns, kgid);
if (gid == (gid_t)-1)
- return INVALID_GID;
- return make_kgid(mnt_userns, gid);
+ return INVALID_VFSGID;
+ return VFSGIDT_INIT(make_kgid(mnt_userns, gid));
+}
+
+static inline kgid_t mapped_kgid_fs(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ kgid_t kgid)
+{
+ return AS_KGIDT(make_vfsgid(mnt_userns, fs_userns, kgid));
+}
+
+/**
+ * from_vfsuid - map a vfsuid into the filesystem idmapping
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @vfsuid : vfsuid to be mapped
+ *
+ * Map @vfsuid into the filesystem idmapping. This function has to be used in
+ * order to e.g. write @vfsuid to inode->i_uid.
+ *
+ * Return: @vfsuid mapped into the filesystem idmapping
+ */
+static inline kuid_t from_vfsuid(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ vfsuid_t vfsuid)
+{
+ uid_t uid;
+
+ if (no_idmapping(mnt_userns, fs_userns))
+ return AS_KUIDT(vfsuid);
+ uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid));
+ if (uid == (uid_t)-1)
+ return INVALID_UID;
+ if (initial_idmapping(fs_userns))
+ return KUIDT_INIT(uid);
+ return make_kuid(fs_userns, uid);
}
/**
@@ -145,16 +311,66 @@ static inline kuid_t mapped_kuid_user(struct user_namespace *mnt_userns,
struct user_namespace *fs_userns,
kuid_t kuid)
{
- uid_t uid;
+ return from_vfsuid(mnt_userns, fs_userns, VFSUIDT_INIT(kuid));
+}
+
+/**
+ * vfsuid_has_fsmapping - check whether a vfsuid maps into the filesystem
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @vfsuid: vfsuid to be mapped
+ *
+ * Check whether @vfsuid has a mapping in the filesystem idmapping. Use this
+ * function to check whether the filesystem idmapping has a mapping for
+ * @vfsuid.
+ *
+ * Return: true if @vfsuid has a mapping in the filesystem, false if not.
+ */
+static inline bool vfsuid_has_fsmapping(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ vfsuid_t vfsuid)
+{
+ return uid_valid(from_vfsuid(mnt_userns, fs_userns, vfsuid));
+}
+
+/**
+ * vfsuid_into_kuid - convert vfsuid into kuid
+ * @vfsuid: the vfsuid to convert
+ *
+ * This can be used when a vfsuid is committed as a kuid.
+ *
+ * Return: a kuid with the value of @vfsuid
+ */
+static inline kuid_t vfsuid_into_kuid(vfsuid_t vfsuid)
+{
+ return AS_KUIDT(vfsuid);
+}
+
+/**
+ * from_vfsgid - map a vfsgid into the filesystem idmapping
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @vfsgid : vfsgid to be mapped
+ *
+ * Map @vfsgid into the filesystem idmapping. This function has to be used in
+ * order to e.g. write @vfsgid to inode->i_gid.
+ *
+ * Return: @vfsgid mapped into the filesystem idmapping
+ */
+static inline kgid_t from_vfsgid(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ vfsgid_t vfsgid)
+{
+ gid_t gid;
if (no_idmapping(mnt_userns, fs_userns))
- return kuid;
- uid = from_kuid(mnt_userns, kuid);
- if (uid == (uid_t)-1)
- return INVALID_UID;
+ return AS_KGIDT(vfsgid);
+ gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid));
+ if (gid == (gid_t)-1)
+ return INVALID_GID;
if (initial_idmapping(fs_userns))
- return KUIDT_INIT(uid);
- return make_kuid(fs_userns, uid);
+ return KGIDT_INIT(gid);
+ return make_kgid(fs_userns, gid);
}
/**
@@ -181,16 +397,39 @@ static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns,
struct user_namespace *fs_userns,
kgid_t kgid)
{
- gid_t gid;
+ return from_vfsgid(mnt_userns, fs_userns, VFSGIDT_INIT(kgid));
+}
- if (no_idmapping(mnt_userns, fs_userns))
- return kgid;
- gid = from_kgid(mnt_userns, kgid);
- if (gid == (gid_t)-1)
- return INVALID_GID;
- if (initial_idmapping(fs_userns))
- return KGIDT_INIT(gid);
- return make_kgid(fs_userns, gid);
+/**
+ * vfsgid_has_fsmapping - check whether a vfsgid maps into the filesystem
+ * @mnt_userns: the mount's idmapping
+ * @fs_userns: the filesystem's idmapping
+ * @vfsgid: vfsgid to be mapped
+ *
+ * Check whether @vfsgid has a mapping in the filesystem idmapping. Use this
+ * function to check whether the filesystem idmapping has a mapping for
+ * @vfsgid.
+ *
+ * Return: true if @vfsgid has a mapping in the filesystem, false if not.
+ */
+static inline bool vfsgid_has_fsmapping(struct user_namespace *mnt_userns,
+ struct user_namespace *fs_userns,
+ vfsgid_t vfsgid)
+{
+ return gid_valid(from_vfsgid(mnt_userns, fs_userns, vfsgid));
+}
+
+/**
+ * vfsgid_into_kgid - convert vfsgid into kgid
+ * @vfsgid: the vfsgid to convert
+ *
+ * This can be used when a vfsgid is committed as a kgid.
+ *
+ * Return: a kgid with the value of @vfsgid
+ */
+static inline kgid_t vfsgid_into_kgid(vfsgid_t vfsgid)
+{
+ return AS_KGIDT(vfsgid);
}
/**
@@ -209,7 +448,8 @@ static inline kgid_t mapped_kgid_user(struct user_namespace *mnt_userns,
static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns,
struct user_namespace *fs_userns)
{
- return mapped_kuid_user(mnt_userns, fs_userns, current_fsuid());
+ return from_vfsuid(mnt_userns, fs_userns,
+ VFSUIDT_INIT(current_fsuid()));
}
/**
@@ -228,7 +468,8 @@ static inline kuid_t mapped_fsuid(struct user_namespace *mnt_userns,
static inline kgid_t mapped_fsgid(struct user_namespace *mnt_userns,
struct user_namespace *fs_userns)
{
- return mapped_kgid_user(mnt_userns, fs_userns, current_fsgid());
+ return from_vfsgid(mnt_userns, fs_userns,
+ VFSGIDT_INIT(current_fsgid()));
}
#endif /* _LINUX_MNT_IDMAPPING_H */
diff --git a/include/linux/of.h b/include/linux/of.h
index f0a5d6b10c5a..20a4e7cb7afe 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -441,8 +441,6 @@ void *of_kexec_alloc_and_setup_fdt(const struct kimage *image,
unsigned long initrd_load_addr,
unsigned long initrd_len,
const char *cmdline, size_t extra_fdt_size);
-int ima_get_kexec_buffer(void **addr, size_t *size);
-int ima_free_kexec_buffer(void);
#else /* CONFIG_OF */
static inline void of_core_init(void)
diff --git a/include/linux/once_lite.h b/include/linux/once_lite.h
index 861e606b820f..b7bce4983638 100644
--- a/include/linux/once_lite.h
+++ b/include/linux/once_lite.h
@@ -9,15 +9,27 @@
*/
#define DO_ONCE_LITE(func, ...) \
DO_ONCE_LITE_IF(true, func, ##__VA_ARGS__)
-#define DO_ONCE_LITE_IF(condition, func, ...) \
+
+#define __ONCE_LITE_IF(condition) \
({ \
static bool __section(".data.once") __already_done; \
- bool __ret_do_once = !!(condition); \
+ bool __ret_cond = !!(condition); \
+ bool __ret_once = false; \
\
- if (unlikely(__ret_do_once && !__already_done)) { \
+ if (unlikely(__ret_cond && !__already_done)) { \
__already_done = true; \
- func(__VA_ARGS__); \
+ __ret_once = true; \
} \
+ unlikely(__ret_once); \
+ })
+
+#define DO_ONCE_LITE_IF(condition, func, ...) \
+ ({ \
+ bool __ret_do_once = !!(condition); \
+ \
+ if (__ONCE_LITE_IF(__ret_do_once)) \
+ func(__VA_ARGS__); \
+ \
unlikely(__ret_do_once); \
})
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0178823ce8c2..7fa460ccf7fa 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -556,10 +556,13 @@
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F3 0x1493
#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F3 0x144b
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F3 0x1443
+#define PCI_DEVICE_ID_AMD_17H_MA0H_DF_F3 0x1727
#define PCI_DEVICE_ID_AMD_19H_DF_F3 0x1653
#define PCI_DEVICE_ID_AMD_19H_M10H_DF_F3 0x14b0
#define PCI_DEVICE_ID_AMD_19H_M40H_DF_F3 0x167c
#define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d
+#define PCI_DEVICE_ID_AMD_19H_M60H_DF_F3 0x14e3
+#define PCI_DEVICE_ID_AMD_19H_M70H_DF_F3 0x14f3
#define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703
#define PCI_DEVICE_ID_AMD_LANCE 0x2000
#define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001
diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
index 46f9b6fe306e..bf66fe011fa8 100644
--- a/include/linux/perf/riscv_pmu.h
+++ b/include/linux/perf/riscv_pmu.h
@@ -56,9 +56,13 @@ struct riscv_pmu {
struct cpu_hw_events __percpu *hw_events;
struct hlist_node node;
+ struct notifier_block riscv_pm_nb;
};
#define to_riscv_pmu(p) (container_of(p, struct riscv_pmu, pmu))
+
+void riscv_pmu_start(struct perf_event *event, int flags);
+void riscv_pmu_stop(struct perf_event *event, int flags);
unsigned long riscv_pmu_ctr_read_csr(unsigned long csr);
int riscv_pmu_event_set_period(struct perf_event *event);
uint64_t riscv_pmu_ctr_get_width_mask(struct perf_event *event);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index da759560eec5..ee8b9ecdc03b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -759,6 +759,8 @@ struct perf_event {
struct pid_namespace *ns;
u64 id;
+ atomic64_t lost_samples;
+
u64 (*clock)(void);
perf_overflow_handler_t overflow_handler;
void *overflow_handler_context;
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index b65c877d92b8..7d1e604c1325 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -73,6 +73,7 @@ extern int set_posix_acl(struct user_namespace *, struct inode *, int,
struct posix_acl *);
struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
+struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags);
#ifdef CONFIG_FS_POSIX_ACL
int posix_acl_chmod(struct user_namespace *, struct inode *, umode_t);
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index 1766e1de6956..b6bd3eac2bcc 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -33,21 +33,31 @@ posix_acl_xattr_count(size_t size)
}
#ifdef CONFIG_FS_POSIX_ACL
-void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
- struct inode *inode,
- void *value, size_t size);
-void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
- struct inode *inode,
- void *value, size_t size);
+void posix_acl_fix_xattr_from_user(void *value, size_t size);
+void posix_acl_fix_xattr_to_user(void *value, size_t size);
+void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+ const struct inode *inode,
+ void *value, size_t size);
+void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+ const struct inode *inode,
+ void *value, size_t size);
#else
-static inline void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns,
- struct inode *inode,
- void *value, size_t size)
+static inline void posix_acl_fix_xattr_from_user(void *value, size_t size)
{
}
-static inline void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns,
- struct inode *inode,
- void *value, size_t size)
+static inline void posix_acl_fix_xattr_to_user(void *value, size_t size)
+{
+}
+static inline void
+posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+ const struct inode *inode, void *value,
+ size_t size)
+{
+}
+static inline void
+posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns,
+ const struct inode *inode, void *value,
+ size_t size)
{
}
#endif
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index a0f6668924d3..0d8625d71733 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -20,11 +20,12 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb)
}
/* i_mutex must being held */
-static inline bool is_quota_modification(struct inode *inode, struct iattr *ia)
+static inline bool is_quota_modification(struct user_namespace *mnt_userns,
+ struct inode *inode, struct iattr *ia)
{
- return (ia->ia_valid & ATTR_SIZE) ||
- (ia->ia_valid & ATTR_UID && !uid_eq(ia->ia_uid, inode->i_uid)) ||
- (ia->ia_valid & ATTR_GID && !gid_eq(ia->ia_gid, inode->i_gid));
+ return ((ia->ia_valid & ATTR_SIZE) ||
+ i_uid_needs_update(mnt_userns, ia, inode) ||
+ i_gid_needs_update(mnt_userns, ia, inode));
}
#if defined(CONFIG_QUOTA)
@@ -115,7 +116,8 @@ int dquot_set_dqblk(struct super_block *sb, struct kqid id,
struct qc_dqblk *di);
int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
-int dquot_transfer(struct inode *inode, struct iattr *iattr);
+int dquot_transfer(struct user_namespace *mnt_userns, struct inode *inode,
+ struct iattr *iattr);
static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
{
@@ -234,7 +236,8 @@ static inline void dquot_free_inode(struct inode *inode)
{
}
-static inline int dquot_transfer(struct inode *inode, struct iattr *iattr)
+static inline int dquot_transfer(struct user_namespace *mnt_userns,
+ struct inode *inode, struct iattr *iattr)
{
return 0;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c46f3a63b758..88b8817b827d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2257,7 +2257,7 @@ static inline bool owner_on_cpu(struct task_struct *owner)
}
/* Returns effective CPU energy utilization, as seen by the scheduler */
-unsigned long sched_cpu_util(int cpu, unsigned long max);
+unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */
#ifdef CONFIG_RSEQ
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index e5af028c08b4..994c25640e15 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -39,20 +39,12 @@ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p)
}
extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task);
extern void rt_mutex_adjust_pi(struct task_struct *p);
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
- return tsk->pi_blocked_on != NULL;
-}
#else
static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
{
return NULL;
}
# define rt_mutex_adjust_pi(p) do { } while (0)
-static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
-{
- return false;
-}
#endif
extern void normalize_rt_tasks(void);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 56cffe42abbc..816df6cc444e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -81,6 +81,7 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
+ int nr_idle_scan;
};
struct sched_domain {
diff --git a/include/linux/security.h b/include/linux/security.h
index 7fc4e9f49f54..4d0baf30266e 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -353,7 +353,8 @@ int security_inode_readlink(struct dentry *dentry);
int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
bool rcu);
int security_inode_permission(struct inode *inode, int mask);
-int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
+int security_inode_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr);
int security_inode_getattr(const struct path *path);
int security_inode_setxattr(struct user_namespace *mnt_userns,
struct dentry *dentry, const char *name,
@@ -848,8 +849,9 @@ static inline int security_inode_permission(struct inode *inode, int mask)
return 0;
}
-static inline int security_inode_setattr(struct dentry *dentry,
- struct iattr *attr)
+static inline int security_inode_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry,
+ struct iattr *attr)
{
return 0;
}
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 4c379d23ec6e..979a9d3e5bfb 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -61,7 +61,7 @@ int __vfs_setxattr_locked(struct user_namespace *, struct dentry *,
const char *, const void *, size_t, int,
struct inode **);
int vfs_setxattr(struct user_namespace *, struct dentry *, const char *,
- const void *, size_t, int);
+ void *, size_t, int);
int __vfs_removexattr(struct user_namespace *, struct dentry *, const char *);
int __vfs_removexattr_locked(struct user_namespace *, struct dentry *,
const char *, struct inode **);
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index f7506f08e505..c04f359655b8 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -405,6 +405,9 @@ static inline bool ip6_ignore_linkdown(const struct net_device *dev)
{
const struct inet6_dev *idev = __in6_dev_get(dev);
+ if (unlikely(!idev))
+ return true;
+
return !!idev->cnf.ignore_routes_with_linkdown;
}
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 3c4f550e5a8b..2f766e3437ce 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -847,6 +847,7 @@ enum {
};
void l2cap_chan_hold(struct l2cap_chan *c);
+struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c);
void l2cap_chan_put(struct l2cap_chan *c);
static inline void l2cap_chan_lock(struct l2cap_chan *chan)
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 85cd695e7fd1..ee88f0f1350f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -321,7 +321,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
-#define TCP_PINGPONG_THRESH 3
+#define TCP_PINGPONG_THRESH 1
static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
{
@@ -338,14 +338,6 @@ static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
}
-static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- if (icsk->icsk_ack.pingpong < U8_MAX)
- icsk->icsk_ack.pingpong++;
-}
-
static inline bool inet_csk_has_ulp(struct sock *sk)
{
return inet_sk(sk)->is_icsk && !!inet_csk(sk)->icsk_ulp_ops;
diff --git a/include/net/sock.h b/include/net/sock.h
index 9fa54762e077..7a48991cdb19 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2843,18 +2843,18 @@ static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_wmem ? */
if (proto->sysctl_wmem_offset)
- return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset));
- return *proto->sysctl_wmem;
+ return READ_ONCE(*proto->sysctl_wmem);
}
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_rmem ? */
if (proto->sysctl_rmem_offset)
- return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
+ return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset));
- return *proto->sysctl_rmem;
+ return READ_ONCE(*proto->sysctl_rmem);
}
/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 071735e10872..78a64e1b33a7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1419,7 +1419,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space,
static inline int tcp_win_from_space(const struct sock *sk, int space)
{
- int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale;
+ int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale);
return tcp_adv_win_scale <= 0 ?
(space>>(-tcp_adv_win_scale)) :
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index 32088c603244..bad21222130e 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -49,38 +49,52 @@
/* note: we begin tracing dlm_lock_start() only if ls and lkb are found */
TRACE_EVENT(dlm_lock_start,
- TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode,
- __u32 flags),
+ TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, void *name,
+ unsigned int namelen, int mode, __u32 flags),
- TP_ARGS(ls, lkb, mode, flags),
+ TP_ARGS(ls, lkb, name, namelen, mode, flags),
TP_STRUCT__entry(
__field(__u32, ls_id)
__field(__u32, lkb_id)
__field(int, mode)
__field(__u32, flags)
+ __dynamic_array(unsigned char, res_name,
+ lkb->lkb_resource ? lkb->lkb_resource->res_length : namelen)
),
TP_fast_assign(
+ struct dlm_rsb *r;
+
__entry->ls_id = ls->ls_global_id;
__entry->lkb_id = lkb->lkb_id;
__entry->mode = mode;
__entry->flags = flags;
+
+ r = lkb->lkb_resource;
+ if (r)
+ memcpy(__get_dynamic_array(res_name), r->res_name,
+ __get_dynamic_array_len(res_name));
+ else if (name)
+ memcpy(__get_dynamic_array(res_name), name,
+ __get_dynamic_array_len(res_name));
),
- TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s",
+ TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s res_name=%s",
__entry->ls_id, __entry->lkb_id,
show_lock_mode(__entry->mode),
- show_lock_flags(__entry->flags))
+ show_lock_flags(__entry->flags),
+ __print_hex_str(__get_dynamic_array(res_name),
+ __get_dynamic_array_len(res_name)))
);
TRACE_EVENT(dlm_lock_end,
- TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode, __u32 flags,
- int error),
+ TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, void *name,
+ unsigned int namelen, int mode, __u32 flags, int error),
- TP_ARGS(ls, lkb, mode, flags, error),
+ TP_ARGS(ls, lkb, name, namelen, mode, flags, error),
TP_STRUCT__entry(
__field(__u32, ls_id)
@@ -88,14 +102,26 @@ TRACE_EVENT(dlm_lock_end,
__field(int, mode)
__field(__u32, flags)
__field(int, error)
+ __dynamic_array(unsigned char, res_name,
+ lkb->lkb_resource ? lkb->lkb_resource->res_length : namelen)
),
TP_fast_assign(
+ struct dlm_rsb *r;
+
__entry->ls_id = ls->ls_global_id;
__entry->lkb_id = lkb->lkb_id;
__entry->mode = mode;
__entry->flags = flags;
+ r = lkb->lkb_resource;
+ if (r)
+ memcpy(__get_dynamic_array(res_name), r->res_name,
+ __get_dynamic_array_len(res_name));
+ else if (name)
+ memcpy(__get_dynamic_array(res_name), name,
+ __get_dynamic_array_len(res_name));
+
/* return value will be zeroed in those cases by dlm_lock()
* we do it here again to not introduce more overhead if
* trace isn't running and error reflects the return value.
@@ -104,12 +130,15 @@ TRACE_EVENT(dlm_lock_end,
__entry->error = 0;
else
__entry->error = error;
+
),
- TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s error=%d",
+ TP_printk("ls_id=%u lkb_id=%x mode=%s flags=%s error=%d res_name=%s",
__entry->ls_id, __entry->lkb_id,
show_lock_mode(__entry->mode),
- show_lock_flags(__entry->flags), __entry->error)
+ show_lock_flags(__entry->flags), __entry->error,
+ __print_hex_str(__get_dynamic_array(res_name),
+ __get_dynamic_array_len(res_name)))
);
@@ -123,42 +152,65 @@ TRACE_EVENT(dlm_bast,
__field(__u32, ls_id)
__field(__u32, lkb_id)
__field(int, mode)
+ __dynamic_array(unsigned char, res_name,
+ lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
),
TP_fast_assign(
+ struct dlm_rsb *r;
+
__entry->ls_id = ls->ls_global_id;
__entry->lkb_id = lkb->lkb_id;
__entry->mode = mode;
+
+ r = lkb->lkb_resource;
+ if (r)
+ memcpy(__get_dynamic_array(res_name), r->res_name,
+ __get_dynamic_array_len(res_name));
),
- TP_printk("ls_id=%u lkb_id=%x mode=%s", __entry->ls_id,
- __entry->lkb_id, show_lock_mode(__entry->mode))
+ TP_printk("ls_id=%u lkb_id=%x mode=%s res_name=%s",
+ __entry->ls_id, __entry->lkb_id,
+ show_lock_mode(__entry->mode),
+ __print_hex_str(__get_dynamic_array(res_name),
+ __get_dynamic_array_len(res_name)))
);
TRACE_EVENT(dlm_ast,
- TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, struct dlm_lksb *lksb),
+ TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb),
- TP_ARGS(ls, lkb, lksb),
+ TP_ARGS(ls, lkb),
TP_STRUCT__entry(
__field(__u32, ls_id)
__field(__u32, lkb_id)
__field(u8, sb_flags)
__field(int, sb_status)
+ __dynamic_array(unsigned char, res_name,
+ lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
),
TP_fast_assign(
+ struct dlm_rsb *r;
+
__entry->ls_id = ls->ls_global_id;
__entry->lkb_id = lkb->lkb_id;
- __entry->sb_flags = lksb->sb_flags;
- __entry->sb_status = lksb->sb_status;
+ __entry->sb_flags = lkb->lkb_lksb->sb_flags;
+ __entry->sb_status = lkb->lkb_lksb->sb_status;
+
+ r = lkb->lkb_resource;
+ if (r)
+ memcpy(__get_dynamic_array(res_name), r->res_name,
+ __get_dynamic_array_len(res_name));
),
- TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d",
+ TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d res_name=%s",
__entry->ls_id, __entry->lkb_id,
- show_dlm_sb_flags(__entry->sb_flags), __entry->sb_status)
+ show_dlm_sb_flags(__entry->sb_flags), __entry->sb_status,
+ __print_hex_str(__get_dynamic_array(res_name),
+ __get_dynamic_array_len(res_name)))
);
@@ -173,17 +225,28 @@ TRACE_EVENT(dlm_unlock_start,
__field(__u32, ls_id)
__field(__u32, lkb_id)
__field(__u32, flags)
+ __dynamic_array(unsigned char, res_name,
+ lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
),
TP_fast_assign(
+ struct dlm_rsb *r;
+
__entry->ls_id = ls->ls_global_id;
__entry->lkb_id = lkb->lkb_id;
__entry->flags = flags;
+
+ r = lkb->lkb_resource;
+ if (r)
+ memcpy(__get_dynamic_array(res_name), r->res_name,
+ __get_dynamic_array_len(res_name));
),
- TP_printk("ls_id=%u lkb_id=%x flags=%s",
+ TP_printk("ls_id=%u lkb_id=%x flags=%s res_name=%s",
__entry->ls_id, __entry->lkb_id,
- show_lock_flags(__entry->flags))
+ show_lock_flags(__entry->flags),
+ __print_hex_str(__get_dynamic_array(res_name),
+ __get_dynamic_array_len(res_name)))
);
@@ -199,18 +262,29 @@ TRACE_EVENT(dlm_unlock_end,
__field(__u32, lkb_id)
__field(__u32, flags)
__field(int, error)
+ __dynamic_array(unsigned char, res_name,
+ lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
),
TP_fast_assign(
+ struct dlm_rsb *r;
+
__entry->ls_id = ls->ls_global_id;
__entry->lkb_id = lkb->lkb_id;
__entry->flags = flags;
__entry->error = error;
+
+ r = lkb->lkb_resource;
+ if (r)
+ memcpy(__get_dynamic_array(res_name), r->res_name,
+ __get_dynamic_array_len(res_name));
),
- TP_printk("ls_id=%u lkb_id=%x flags=%s error=%d",
+ TP_printk("ls_id=%u lkb_id=%x flags=%s error=%d res_name=%s",
__entry->ls_id, __entry->lkb_id,
- show_lock_flags(__entry->flags), __entry->error)
+ show_lock_flags(__entry->flags), __entry->error,
+ __print_hex_str(__get_dynamic_array(res_name),
+ __get_dynamic_array_len(res_name)))
);
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index f76668305ac5..4cb51ace600d 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -13,11 +13,12 @@ DECLARE_EVENT_CLASS(kmem_alloc,
TP_PROTO(unsigned long call_site,
const void *ptr,
+ struct kmem_cache *s,
size_t bytes_req,
size_t bytes_alloc,
gfp_t gfp_flags),
- TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+ TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags),
TP_STRUCT__entry(
__field( unsigned long, call_site )
@@ -25,6 +26,7 @@ DECLARE_EVENT_CLASS(kmem_alloc,
__field( size_t, bytes_req )
__field( size_t, bytes_alloc )
__field( unsigned long, gfp_flags )
+ __field( bool, accounted )
),
TP_fast_assign(
@@ -33,42 +35,47 @@ DECLARE_EVENT_CLASS(kmem_alloc,
__entry->bytes_req = bytes_req;
__entry->bytes_alloc = bytes_alloc;
__entry->gfp_flags = (__force unsigned long)gfp_flags;
+ __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ?
+ ((gfp_flags & __GFP_ACCOUNT) ||
+ (s && s->flags & SLAB_ACCOUNT)) : false;
),
- TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+ TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s accounted=%s",
(void *)__entry->call_site,
__entry->ptr,
__entry->bytes_req,
__entry->bytes_alloc,
- show_gfp_flags(__entry->gfp_flags))
+ show_gfp_flags(__entry->gfp_flags),
+ __entry->accounted ? "true" : "false")
);
DEFINE_EVENT(kmem_alloc, kmalloc,
- TP_PROTO(unsigned long call_site, const void *ptr,
+ TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s,
size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),
- TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
+ TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags)
);
DEFINE_EVENT(kmem_alloc, kmem_cache_alloc,
- TP_PROTO(unsigned long call_site, const void *ptr,
+ TP_PROTO(unsigned long call_site, const void *ptr, struct kmem_cache *s,
size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags),
- TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)
+ TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags)
);
DECLARE_EVENT_CLASS(kmem_alloc_node,
TP_PROTO(unsigned long call_site,
const void *ptr,
+ struct kmem_cache *s,
size_t bytes_req,
size_t bytes_alloc,
gfp_t gfp_flags,
int node),
- TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+ TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node),
TP_STRUCT__entry(
__field( unsigned long, call_site )
@@ -77,6 +84,7 @@ DECLARE_EVENT_CLASS(kmem_alloc_node,
__field( size_t, bytes_alloc )
__field( unsigned long, gfp_flags )
__field( int, node )
+ __field( bool, accounted )
),
TP_fast_assign(
@@ -86,33 +94,37 @@ DECLARE_EVENT_CLASS(kmem_alloc_node,
__entry->bytes_alloc = bytes_alloc;
__entry->gfp_flags = (__force unsigned long)gfp_flags;
__entry->node = node;
+ __entry->accounted = IS_ENABLED(CONFIG_MEMCG_KMEM) ?
+ ((gfp_flags & __GFP_ACCOUNT) ||
+ (s && s->flags & SLAB_ACCOUNT)) : false;
),
- TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+ TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
(void *)__entry->call_site,
__entry->ptr,
__entry->bytes_req,
__entry->bytes_alloc,
show_gfp_flags(__entry->gfp_flags),
- __entry->node)
+ __entry->node,
+ __entry->accounted ? "true" : "false")
);
DEFINE_EVENT(kmem_alloc_node, kmalloc_node,
TP_PROTO(unsigned long call_site, const void *ptr,
- size_t bytes_req, size_t bytes_alloc,
+ struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc,
gfp_t gfp_flags, int node),
- TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
+ TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node)
);
DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node,
TP_PROTO(unsigned long call_site, const void *ptr,
- size_t bytes_req, size_t bytes_alloc,
+ struct kmem_cache *s, size_t bytes_req, size_t bytes_alloc,
gfp_t gfp_flags, int node),
- TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)
+ TP_ARGS(call_site, ptr, s, bytes_req, bytes_alloc, gfp_flags, node)
);
TRACE_EVENT(kfree,
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index f13d37b60775..1ecdb911add8 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -192,6 +192,7 @@ struct f_owner_ex {
#define F_LINUX_SPECIFIC_BASE 1024
+#ifndef HAVE_ARCH_STRUCT_FLOCK
struct flock {
short l_type;
short l_whence;
@@ -216,5 +217,6 @@ struct flock64 {
__ARCH_FLOCK64_PAD
#endif
};
+#endif /* HAVE_ARCH_STRUCT_FLOCK */
#endif /* _ASM_GENERIC_FCNTL_H */
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index f1f89132d60e..d8536d77fea1 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -83,12 +83,20 @@
#define FAN_MARK_FLUSH 0x00000080
/* FAN_MARK_FILESYSTEM is 0x00000100 */
#define FAN_MARK_EVICTABLE 0x00000200
+/* This bit is mutually exclusive with FAN_MARK_IGNORED_MASK bit */
+#define FAN_MARK_IGNORE 0x00000400
/* These are NOT bitwise flags. Both bits can be used togther. */
#define FAN_MARK_INODE 0x00000000
#define FAN_MARK_MOUNT 0x00000010
#define FAN_MARK_FILESYSTEM 0x00000100
+/*
+ * Convenience macro - FAN_MARK_IGNORE requires FAN_MARK_IGNORED_SURV_MODIFY
+ * for non-inode mark types.
+ */
+#define FAN_MARK_IGNORE_SURV (FAN_MARK_IGNORE | FAN_MARK_IGNORED_SURV_MODIFY)
+
/* Deprecated - do not use this in programs and do not add new flags here! */
#define FAN_ALL_MARK_FLAGS (FAN_MARK_ADD |\
FAN_MARK_REMOVE |\
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d37629dbad72..0474ee362151 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -301,6 +301,7 @@ enum {
* { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
* { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
* { u64 id; } && PERF_FORMAT_ID
+ * { u64 lost; } && PERF_FORMAT_LOST
* } && !PERF_FORMAT_GROUP
*
* { u64 nr;
@@ -308,6 +309,7 @@ enum {
* { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
* { u64 value;
* { u64 id; } && PERF_FORMAT_ID
+ * { u64 lost; } && PERF_FORMAT_LOST
* } cntr[nr];
* } && PERF_FORMAT_GROUP
* };
@@ -317,8 +319,9 @@ enum perf_event_read_format {
PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1,
PERF_FORMAT_ID = 1U << 2,
PERF_FORMAT_GROUP = 1U << 3,
+ PERF_FORMAT_LOST = 1U << 4,
- PERF_FORMAT_MAX = 1U << 4, /* non-ABI */
+ PERF_FORMAT_MAX = 1U << 5, /* non-ABI */
};
#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 24b5c2ab5598..feb59380c896 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -310,6 +310,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -318,6 +321,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -398,6 +404,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_SOFTIRQ:
rstatc->bstat.cputime.stime += delta_exec;
break;
+#ifdef CONFIG_SCHED_CORE
+ case CPUTIME_FORCEIDLE:
+ rstatc->bstat.forceidle_sum += delta_exec;
+ break;
+#endif
default:
break;
}
@@ -411,8 +422,9 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
* with how it is done by __cgroup_account_cputime_field for each bit of
* cpu time attributed to a cgroup.
*/
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
+ struct task_cputime *cputime = &bstat->cputime;
int i;
cputime->stime = 0;
@@ -438,6 +450,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+
+#ifdef CONFIG_SCHED_CORE
+ bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
}
}
@@ -445,27 +461,43 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
- struct task_cputime cputime;
+ struct cgroup_base_stat bstat;
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time;
+#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
cgroup_rstat_flush_release();
} else {
- root_cgroup_cputime(&cputime);
- usage = cputime.sum_exec_runtime;
- utime = cputime.utime;
- stime = cputime.stime;
+ root_cgroup_cputime(&bstat);
+ usage = bstat.cputime.sum_exec_runtime;
+ utime = bstat.cputime.utime;
+ stime = bstat.cputime.stime;
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = bstat.forceidle_sum;
+#endif
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
+#ifdef CONFIG_SCHED_CORE
+ do_div(forceidle_time, NSEC_PER_USEC);
+#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
+
+#ifdef CONFIG_SCHED_CORE
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
}
diff --git a/kernel/configs/x86_debug.config b/kernel/configs/x86_debug.config
index dcd86f32f4ed..6fac5b405334 100644
--- a/kernel/configs/x86_debug.config
+++ b/kernel/configs/x86_debug.config
@@ -7,12 +7,11 @@ CONFIG_DEBUG_SLAB=y
CONFIG_DEBUG_KMEMLEAK=y
CONFIG_DEBUG_PAGEALLOC=y
CONFIG_SLUB_DEBUG_ON=y
-CONFIG_KMEMCHECK=y
CONFIG_DEBUG_OBJECTS=y
CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
CONFIG_GCOV_KERNEL=y
CONFIG_LOCKDEP=y
CONFIG_PROVE_LOCKING=y
CONFIG_SCHEDSTATS=y
-CONFIG_VMLINUX_VALIDATION=y
+CONFIG_NOINSTR_VALIDATION=y
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d2b354991bf5..c9d32d4d2e20 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1819,6 +1819,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
if (event->attr.read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
+ if (event->attr.read_format & PERF_FORMAT_LOST)
+ entry += sizeof(u64);
+
if (event->attr.read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
@@ -5260,11 +5263,15 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
}
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -5321,7 +5328,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -5331,6 +5338,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -6858,7 +6867,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
- u64 values[4];
+ u64 values[5];
int n = 0;
values[n++] = perf_event_count(event);
@@ -6872,6 +6881,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -6882,7 +6893,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
- u64 values[5];
+ u64 values[6];
int n = 0;
values[n++] = 1 + leader->nr_siblings;
@@ -6900,6 +6911,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
@@ -6913,6 +6926,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
+ if (read_format & PERF_FORMAT_LOST)
+ values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index fb35b926024c..726132039c38 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle,
goto out;
if (unlikely(rb->paused)) {
- if (rb->nr_pages)
+ if (rb->nr_pages) {
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
+ }
goto out;
}
@@ -254,6 +256,7 @@ __perf_output_begin(struct perf_output_handle *handle,
fail:
local_inc(&rb->lost);
+ atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index b156e152d6b4..714ac4c3b556 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -332,17 +332,13 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
return 0;
}
-/*
- * Update code which is definitely not currently executing.
- * Architectures which need heavyweight synchronization to modify
- * running code can override this to make the non-live update case
- * cheaper.
- */
-void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
- enum jump_label_type type)
+#ifndef arch_jump_label_transform_static
+static void arch_jump_label_transform_static(struct jump_entry *entry,
+ enum jump_label_type type)
{
- arch_jump_label_transform(entry, type);
+ /* nothing to do on most architectures */
}
+#endif
static inline struct jump_entry *static_key_entries(struct static_key *key)
{
@@ -508,7 +504,7 @@ void __init jump_label_init(void)
#ifdef CONFIG_MODULES
-static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+enum jump_label_type jump_label_init_type(struct jump_entry *entry)
{
struct static_key *key = jump_entry_key(entry);
bool type = static_key_type(key);
@@ -596,31 +592,6 @@ static void __jump_label_mod_update(struct static_key *key)
}
}
-/***
- * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
- * @mod: module to patch
- *
- * Allow for run-time selection of the optimal nops. Before the module
- * loads patch these with arch_get_jump_label_nop(), which is specified by
- * the arch specific jump label code.
- */
-void jump_label_apply_nops(struct module *mod)
-{
- struct jump_entry *iter_start = mod->jump_entries;
- struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
- struct jump_entry *iter;
-
- /* if the module doesn't have jump label entries, just return */
- if (iter_start == iter_stop)
- return;
-
- for (iter = iter_start; iter < iter_stop; iter++) {
- /* Only write NOPs for arch_branch_static(). */
- if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
- arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
- }
-}
-
static int jump_label_add_module(struct module *mod)
{
struct jump_entry *iter_start = mod->jump_entries;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index f06b91ca6482..e2f179491b08 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5238,9 +5238,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
}
- lockdep_init_map_waits(lock, name, key, 0,
- lock->wait_type_inner,
- lock->wait_type_outer);
+ lockdep_init_map_type(lock, name, key, 0,
+ lock->wait_type_inner,
+ lock->wait_type_outer,
+ lock->lock_type);
class = register_lock_class(lock, subclass, 0);
hlock->class_idx = class - lock_classes;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 9d1db4a54d34..65f0262f635e 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -335,8 +335,6 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
-
- /* Writer only, not initialized in reader */
bool handoff_set;
};
#define rwsem_first_waiter(sem) \
@@ -459,10 +457,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* to give up the lock), request a HANDOFF to
* force the issue.
*/
- if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
- time_after(jiffies, waiter->timeout)) {
- adjustment -= RWSEM_FLAG_HANDOFF;
- lockevent_inc(rwsem_rlock_handoff);
+ if (time_after(jiffies, waiter->timeout)) {
+ if (!(oldcount & RWSEM_FLAG_HANDOFF)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+ waiter->handoff_set = true;
}
atomic_long_add(-adjustment, &sem->count);
@@ -599,7 +599,7 @@ rwsem_del_wake_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter,
static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
struct rwsem_waiter *waiter)
{
- bool first = rwsem_first_waiter(sem) == waiter;
+ struct rwsem_waiter *first = rwsem_first_waiter(sem);
long count, new;
lockdep_assert_held(&sem->wait_lock);
@@ -609,11 +609,20 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
if (has_handoff) {
- if (!first)
+ /*
+ * Honor handoff bit and yield only when the first
+ * waiter is the one that set it. Otherwisee, we
+ * still try to acquire the rwsem.
+ */
+ if (first->handoff_set && (waiter != first))
return false;
- /* First waiter inherits a previously set handoff bit */
- waiter->handoff_set = true;
+ /*
+ * First waiter can inherit a previously set handoff
+ * bit and spin on rwsem if lock acquisition fails.
+ */
+ if (waiter == first)
+ waiter->handoff_set = true;
}
new = count;
@@ -1027,6 +1036,7 @@ queue:
waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_READ;
waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+ waiter.handoff_set = false;
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list)) {
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 97ac20b4f738..bda8175f8f99 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -18,8 +18,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/rseq.h>
-#define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
- RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
+#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
+ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
+ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
/*
*
@@ -175,23 +176,15 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
u32 flags, event_mask;
int ret;
+ if (WARN_ON_ONCE(cs_flags & RSEQ_CS_NO_RESTART_FLAGS) || cs_flags)
+ return -EINVAL;
+
/* Get thread flags. */
ret = get_user(flags, &t->rseq->flags);
if (ret)
return ret;
- /* Take critical section flags into account. */
- flags |= cs_flags;
-
- /*
- * Restart on signal can only be inhibited when restart on
- * preempt and restart on migrate are inhibited too. Otherwise,
- * a preempted signal handler could fail to restart the prior
- * execution context on sigreturn.
- */
- if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
- (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
- RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
+ if (WARN_ON_ONCE(flags & RSEQ_CS_NO_RESTART_FLAGS) || flags)
return -EINVAL;
/*
@@ -203,7 +196,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
t->rseq_event_mask = 0;
preempt_enable();
- return !!(event_mask & ~flags);
+ return !!event_mask;
}
static int clear_rseq_cs(struct task_struct *t)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da0bf6fe9ecd..5555e49c4e12 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -873,15 +873,11 @@ static inline void hrtick_rq_init(struct rq *rq)
({ \
typeof(ptr) _ptr = (ptr); \
typeof(mask) _mask = (mask); \
- typeof(*_ptr) _old, _val = *_ptr; \
+ typeof(*_ptr) _val = *_ptr; \
\
- for (;;) { \
- _old = cmpxchg(_ptr, _val, _val | _mask); \
- if (_old == _val) \
- break; \
- _val = _old; \
- } \
- _old; \
+ do { \
+ } while (!try_cmpxchg(_ptr, &_val, _val | _mask)); \
+ _val; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
@@ -890,7 +886,7 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
@@ -905,30 +901,28 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+ typeof(ti->flags) val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
return false;
if (val & _TIF_NEED_RESCHED)
return true;
- old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
- if (old == val)
+ if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
break;
- val = old;
}
return true;
}
#else
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
#ifdef CONFIG_SMP
-static bool set_nr_if_polling(struct task_struct *p)
+static inline bool set_nr_if_polling(struct task_struct *p)
{
return false;
}
@@ -3808,7 +3802,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(int cpu)
{
/*
* Do not complicate things with the async wake_list while the CPU is
@@ -3824,13 +3818,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
+ if (cpu == smp_processor_id())
+ return false;
+
/*
- * If the task is descheduling and the only running task on the
- * CPU then use the wakelist to offload the task activation to
- * the soon-to-be-idle CPU as the current CPU is likely busy.
- * nr_running is checked to avoid unnecessary task stacking.
+ * If the wakee cpu is idle, or the task is descheduling and the
+ * only running task on the CPU, then use the wakelist to offload
+ * the task activation to the idle (or soon-to-be-idle) CPU as
+ * the current CPU is likely busy. nr_running is checked to
+ * avoid unnecessary task stacking.
+ *
+ * Note that we can only get here with (wakee) p->on_rq=0,
+ * p->on_cpu can be whatever, we've done the dequeue, so
+ * the wakee has been accounted out of ->nr_running.
*/
- if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ if (!cpu_rq(cpu)->nr_running)
return true;
return false;
@@ -3838,10 +3840,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
- if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
- if (WARN_ON_ONCE(cpu == smp_processor_id()))
- return false;
-
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) {
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
@@ -4163,7 +4162,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* scheduling.
*/
if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
goto unlock;
/*
@@ -4753,7 +4752,8 @@ static inline void prepare_task(struct task_struct *next)
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
*
- * See the ttwu() WF_ON_CPU case and its ordering comment.
+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+ * its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
#endif
@@ -6500,8 +6500,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
io_wq_worker_sleeping(tsk);
}
- if (tsk_is_pi_blocked(tsk))
- return;
+ /*
+ * spinlock and rwlock must not flush block requests. This will
+ * deadlock if the callback attempts to acquire a lock which is
+ * already acquired.
+ */
+ SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
@@ -6998,17 +7002,29 @@ out_unlock:
EXPORT_SYMBOL(set_user_nice);
/*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
* @p: task
* @nice: nice value
*/
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
/* Convert nice value [19,-20] to rlimit style value [1,40]: */
int nice_rlim = nice_to_rlimit(nice);
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
- capable(CAP_SYS_NICE));
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
}
#ifdef __ARCH_WANT_SYS_NICE
@@ -7137,12 +7153,14 @@ struct task_struct *idle_task(int cpu)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum cpu_util_type type,
+ enum cpu_util_type type,
struct task_struct *p)
{
- unsigned long dl_util, util, irq;
+ unsigned long dl_util, util, irq, max;
struct rq *rq = cpu_rq(cpu);
+ max = arch_scale_cpu_capacity(cpu);
+
if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
@@ -7222,10 +7240,9 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
return min(max, util);
}
-unsigned long sched_cpu_util(int cpu, unsigned long max)
+unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
- ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
}
#endif /* CONFIG_SMP */
@@ -7287,6 +7304,69 @@ static bool check_same_owner(struct task_struct *p)
return match;
}
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user, bool pi)
@@ -7328,58 +7408,11 @@ recheck:
(rt_policy(policy) != (attr->sched_priority != 0)))
return -EINVAL;
- /*
- * Allow unprivileged RT tasks to decrease priority:
- */
- if (user && !capable(CAP_SYS_NICE)) {
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !can_nice(p, attr->sched_nice))
- return -EPERM;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio =
- task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- return -EPERM;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- return -EPERM;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- return -EPERM;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!can_nice(p, task_nice(p)))
- return -EPERM;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- return -EPERM;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- return -EPERM;
- }
-
if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
if (attr->sched_flags & SCHED_FLAG_SUGOV)
return -EINVAL;
@@ -9531,7 +9564,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
void __init sched_init(void)
{
@@ -9580,7 +9613,7 @@ void __init sched_init(void)
for_each_possible_cpu(i) {
per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
- per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+ per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
cpumask_size(), GFP_KERNEL, cpu_to_node(i));
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 38a2cec21014..93878cb2a46d 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -56,7 +56,6 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
unsigned long old_cookie;
struct rq_flags rf;
struct rq *rq;
- bool enqueued;
rq = task_rq_lock(p, &rf);
@@ -68,14 +67,16 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
*/
SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
- enqueued = sched_core_enqueued(p);
- if (enqueued)
+ if (sched_core_enqueued(p))
sched_core_dequeue(rq, p, DEQUEUE_SAVE);
old_cookie = p->core_cookie;
p->core_cookie = cookie;
- if (enqueued)
+ /*
+ * Consider the cases: !prev_cookie and !cookie.
+ */
+ if (cookie && task_on_rq_queued(p))
sched_core_enqueue(rq, p);
/*
@@ -277,7 +278,11 @@ void __sched_core_account_forceidle(struct rq *rq)
if (p == rq_i->idle)
continue;
- __schedstat_add(p->stats.core_forceidle_sum, delta);
+ /*
+ * Note: this will account forceidle to the current cpu, even
+ * if it comes from our SMT sibling.
+ */
+ __account_forceidle_time(p, delta);
}
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3dbf351d12d5..1207c78f85c1 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -157,11 +157,10 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
- sg_cpu->max = max;
+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max,
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 78a233d43757..95fc77853743 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -226,6 +226,21 @@ void account_idle_time(u64 cputime)
cpustat[CPUTIME_IDLE] += cputime;
}
+
+#ifdef CONFIG_SCHED_CORE
+/*
+ * Account for forceidle time due to core scheduling.
+ *
+ * REQUIRES: schedstat is enabled.
+ */
+void __account_forceidle_time(struct task_struct *p, u64 delta)
+{
+ __schedstat_add(p->stats.core_forceidle_sum, delta);
+
+ task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+}
+#endif
+
/*
* When a guest is interrupted for a longer amount of time, missed clock
* ticks are not redelivered later. Due to that, this function may on
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7bf561262cb8..0ab79d819a0d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -30,14 +30,16 @@ static struct ctl_table sched_dl_sysctls[] = {
.data = &sysctl_sched_dl_period_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)&sysctl_sched_dl_period_min,
},
{
.procname = "sched_deadline_period_min_us",
.data = &sysctl_sched_dl_period_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = (void *)&sysctl_sched_dl_period_max,
},
{}
};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77b2048a9326..914096c5b1ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -612,11 +612,8 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
}
/* ensure we never gain time by being placed backwards. */
- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ u64_u32_store(cfs_rq->min_vruntime,
+ max_vruntime(cfs_rq->min_vruntime, vruntime));
}
static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
@@ -1055,6 +1052,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+ /*
+ * Allow a NUMA imbalance if busy CPUs is less than the maximum
+ * threshold. Above this threshold, individual tasks may be contending
+ * for both memory bandwidth and any shared HT resources. This is an
+ * approximation as the number of running tasks may not be related to
+ * the number of busy CPUs due to sched_setaffinity.
+ */
+ if (dst_running > imb_numa_nr)
+ return imbalance;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the destination is lightly loaded.
+ */
+ if (imbalance <= NUMA_IMBALANCE_MIN)
+ return 0;
+
+ return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -1548,8 +1572,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1790,6 +1812,15 @@ static bool task_numa_compare(struct task_numa_env *env,
*/
cur_ng = rcu_dereference(cur->numa_group);
if (cur_ng == p_ng) {
+ /*
+ * Do not swap within a group or between tasks that have
+ * no group if there is spare capacity. Swapping does
+ * not address the load imbalance and helps one task at
+ * the cost of punishing another.
+ */
+ if (env->dst_stats.node_type == node_has_spare)
+ goto unlock;
+
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
@@ -2885,6 +2916,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
@@ -3144,6 +3176,8 @@ void reweight_task(struct task_struct *p, int prio)
load->inv_weight = sched_prio_to_wmult[prio];
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/*
@@ -3254,8 +3288,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
@@ -3313,6 +3345,34 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
}
#ifdef CONFIG_SMP
+static inline bool load_avg_is_decayed(struct sched_avg *sa)
+{
+ if (sa->load_sum)
+ return false;
+
+ if (sa->util_sum)
+ return false;
+
+ if (sa->runnable_sum)
+ return false;
+
+ /*
+ * _avg must be null when _sum are null because _avg = _sum / divider
+ * Make sure that rounding and/or propagation of PELT values never
+ * break this.
+ */
+ SCHED_WARN_ON(sa->load_avg ||
+ sa->util_avg ||
+ sa->runnable_avg);
+
+ return true;
+}
+
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return u64_u32_load_copy(cfs_rq->avg.last_update_time,
+ cfs_rq->last_update_time_copy);
+}
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
@@ -3345,27 +3405,12 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->load.weight)
return false;
- if (cfs_rq->avg.load_sum)
- return false;
-
- if (cfs_rq->avg.util_sum)
- return false;
-
- if (cfs_rq->avg.runnable_sum)
+ if (!load_avg_is_decayed(&cfs_rq->avg))
return false;
if (child_cfs_rq_on_list(cfs_rq))
return false;
- /*
- * _avg must be null when _sum are null because _avg = _sum / divider
- * Make sure that rounding and/or propagation of PELT values never
- * break this.
- */
- SCHED_WARN_ON(cfs_rq->avg.load_avg ||
- cfs_rq->avg.util_avg ||
- cfs_rq->avg.runnable_avg);
-
return true;
}
@@ -3423,27 +3468,9 @@ void set_task_rq_fair(struct sched_entity *se,
if (!(se->avg.last_update_time && prev))
return;
-#ifndef CONFIG_64BIT
- {
- u64 p_last_update_time_copy;
- u64 n_last_update_time_copy;
-
- do {
- p_last_update_time_copy = prev->load_last_update_time_copy;
- n_last_update_time_copy = next->load_last_update_time_copy;
-
- smp_rmb();
-
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
+ p_last_update_time = cfs_rq_last_update_time(prev);
+ n_last_update_time = cfs_rq_last_update_time(next);
- } while (p_last_update_time != p_last_update_time_copy ||
- n_last_update_time != n_last_update_time_copy);
- }
-#else
- p_last_update_time = prev->avg.last_update_time;
- n_last_update_time = next->avg.last_update_time;
-#endif
__update_load_avg_blocked_se(p_last_update_time, se);
se->avg.last_update_time = n_last_update_time;
}
@@ -3722,6 +3749,89 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NO_HZ_COMMON
+static inline void migrate_se_pelt_lag(struct sched_entity *se)
+{
+ u64 throttled = 0, now, lut;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ bool is_idle;
+
+ if (load_avg_is_decayed(&se->avg))
+ return;
+
+ cfs_rq = cfs_rq_of(se);
+ rq = rq_of(cfs_rq);
+
+ rcu_read_lock();
+ is_idle = is_idle_task(rcu_dereference(rq->curr));
+ rcu_read_unlock();
+
+ /*
+ * The lag estimation comes with a cost we don't want to pay all the
+ * time. Hence, limiting to the case where the source CPU is idle and
+ * we know we are at the greatest risk to have an outdated clock.
+ */
+ if (!is_idle)
+ return;
+
+ /*
+ * Estimated "now" is: last_update_time + cfs_idle_lag + rq_idle_lag, where:
+ *
+ * last_update_time (the cfs_rq's last_update_time)
+ * = cfs_rq_clock_pelt()@cfs_rq_idle
+ * = rq_clock_pelt()@cfs_rq_idle
+ * - cfs->throttled_clock_pelt_time@cfs_rq_idle
+ *
+ * cfs_idle_lag (delta between rq's update and cfs_rq's update)
+ * = rq_clock_pelt()@rq_idle - rq_clock_pelt()@cfs_rq_idle
+ *
+ * rq_idle_lag (delta between now and rq's update)
+ * = sched_clock_cpu() - rq_clock()@rq_idle
+ *
+ * We can then write:
+ *
+ * now = rq_clock_pelt()@rq_idle - cfs->throttled_clock_pelt_time +
+ * sched_clock_cpu() - rq_clock()@rq_idle
+ * Where:
+ * rq_clock_pelt()@rq_idle is rq->clock_pelt_idle
+ * rq_clock()@rq_idle is rq->clock_idle
+ * cfs->throttled_clock_pelt_time@cfs_rq_idle
+ * is cfs_rq->throttled_pelt_idle
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+ throttled = u64_u32_load(cfs_rq->throttled_pelt_idle);
+ /* The clock has been stopped for throttling */
+ if (throttled == U64_MAX)
+ return;
+#endif
+ now = u64_u32_load(rq->clock_pelt_idle);
+ /*
+ * Paired with _update_idle_rq_clock_pelt(). It ensures at the worst case
+ * is observed the old clock_pelt_idle value and the new clock_idle,
+ * which lead to an underestimation. The opposite would lead to an
+ * overestimation.
+ */
+ smp_rmb();
+ lut = cfs_rq_last_update_time(cfs_rq);
+
+ now -= throttled;
+ if (now < lut)
+ /*
+ * cfs_rq->avg.last_update_time is more recent than our
+ * estimation, let's use it.
+ */
+ now = lut;
+ else
+ now += sched_clock_cpu(cpu_of(rq)) - u64_u32_load(rq->clock_idle);
+
+ __update_load_avg_blocked_se(now, se);
+}
+#else
+static void migrate_se_pelt_lag(struct sched_entity *se) {}
+#endif
+
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_pelt()
@@ -3796,12 +3906,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
}
decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
-
-#ifndef CONFIG_64BIT
- smp_wmb();
- cfs_rq->load_last_update_time_copy = sa->last_update_time;
-#endif
-
+ u64_u32_store_copy(sa->last_update_time,
+ cfs_rq->last_update_time_copy,
+ sa->last_update_time);
return decayed;
}
@@ -3933,27 +4040,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
}
}
-#ifndef CONFIG_64BIT
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- u64 last_update_time_copy;
- u64 last_update_time;
-
- do {
- last_update_time_copy = cfs_rq->load_last_update_time_copy;
- smp_rmb();
- last_update_time = cfs_rq->avg.last_update_time;
- } while (last_update_time != last_update_time_copy);
-
- return last_update_time;
-}
-#else
-static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
-{
- return cfs_rq->avg.last_update_time;
-}
-#endif
-
/*
* Synchronize entity load avg of dequeued entity without locking
* the previous rq.
@@ -4368,16 +4454,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- /*
- * When bandwidth control is enabled, cfs might have been removed
- * because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionally.
- */
- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
- list_add_leaf_cfs_rq(cfs_rq);
-
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_running == 1) {
check_enqueue_throttle(cfs_rq);
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+ }
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -4477,6 +4558,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
+
+ if (cfs_rq->nr_running == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
}
/*
@@ -4992,11 +5076,18 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- /* Nothing to run but something to decay (on_list)? Complete the branch */
if (!cfs_rq->load.weight) {
- if (cfs_rq->on_list)
- goto unthrottle_throttle;
- return;
+ if (!cfs_rq->on_list)
+ return;
+ /*
+ * Nothing to run but something to decay (on_list)?
+ * Complete the branch.
+ */
+ for_each_sched_entity(se) {
+ if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
+ break;
+ }
+ goto unthrottle_throttle;
}
task_delta = cfs_rq->h_nr_running;
@@ -5034,31 +5125,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(qcfs_rq))
- list_add_leaf_cfs_rq(qcfs_rq);
}
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);
unthrottle_throttle:
- /*
- * The cfs_rq_throttled() breaks in the above iteration can result in
- * incomplete leaf list maintenance, resulting in triggering the
- * assertion below.
- */
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(qcfs_rq))
- break;
- }
-
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -5713,13 +5785,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
}
/* At this point se is NULL and we are at root level*/
@@ -5743,21 +5808,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_overutilized_status(rq);
enqueue_throttle:
- if (cfs_bandwidth_used()) {
- /*
- * When bandwidth control is enabled; the cfs_rq_throttled()
- * breaks in the above iteration can result in incomplete
- * leaf list maintenance, resulting in triggering the assertion
- * below.
- */
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
- }
- }
-
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -5844,7 +5894,7 @@ dequeue_throttle:
/* Working cpumask for: load_balance, load_balance_newidle. */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
-DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
+DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
#ifdef CONFIG_NO_HZ_COMMON
@@ -6334,8 +6384,9 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd
*/
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
{
- struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
+ struct sched_domain_shared *sd_share;
struct rq *this_rq = this_rq();
int this = smp_processor_id();
struct sched_domain *this_sd;
@@ -6375,6 +6426,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
time = cpu_clock(this);
}
+ if (sched_feat(SIS_UTIL)) {
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
+ if (sd_share) {
+ /* because !--nr is the condition to stop scan */
+ nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
+ /* overloaded LLC is unlikely to have idle cpu/core */
+ if (nr == 1)
+ return -1;
+ }
+ }
+
for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -6420,7 +6482,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
int cpu, best_cpu = -1;
struct cpumask *cpus;
- cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+ cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
task_util = uclamp_task_util(p);
@@ -6470,7 +6532,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
/*
- * per-cpu select_idle_mask usage
+ * per-cpu select_rq_mask usage
*/
lockdep_assert_irqs_disabled();
@@ -6640,62 +6702,96 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
}
/*
- * compute_energy(): Estimates the energy that @pd would consume if @p was
- * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
- * landscape of @pd's CPUs after the task migration, and uses the Energy Model
- * to compute what would be the energy if we decided to actually migrate that
- * task.
+ * energy_env - Utilization landscape for energy estimation.
+ * @task_busy_time: Utilization contribution by the task for which we test the
+ * placement. Given by eenv_task_busy_time().
+ * @pd_busy_time: Utilization of the whole perf domain without the task
+ * contribution. Given by eenv_pd_busy_time().
+ * @cpu_cap: Maximum CPU capacity for the perf domain.
+ * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap).
+ */
+struct energy_env {
+ unsigned long task_busy_time;
+ unsigned long pd_busy_time;
+ unsigned long cpu_cap;
+ unsigned long pd_cap;
+};
+
+/*
+ * Compute the task busy time for compute_energy(). This time cannot be
+ * injected directly into effective_cpu_util() because of the IRQ scaling.
+ * The latter only makes sense with the most recent CPUs where the task has
+ * run.
+ */
+static inline void eenv_task_busy_time(struct energy_env *eenv,
+ struct task_struct *p, int prev_cpu)
+{
+ unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu);
+ unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu));
+
+ if (unlikely(irq >= max_cap))
+ busy_time = max_cap;
+ else
+ busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap);
+
+ eenv->task_busy_time = busy_time;
+}
+
+/*
+ * Compute the perf_domain (PD) busy time for compute_energy(). Based on the
+ * utilization for each @pd_cpus, it however doesn't take into account
+ * clamping since the ratio (utilization / cpu_capacity) is already enough to
+ * scale the EM reported power consumption at the (eventually clamped)
+ * cpu_capacity.
+ *
+ * The contribution of the task @p for which we want to estimate the
+ * energy cost is removed (by cpu_util_next()) and must be calculated
+ * separately (see eenv_task_busy_time). This ensures:
+ *
+ * - A stable PD utilization, no matter which CPU of that PD we want to place
+ * the task on.
+ *
+ * - A fair comparison between CPUs as the task contribution (task_util())
+ * will always be the same no matter which CPU utilization we rely on
+ * (util_avg or util_est).
+ *
+ * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't
+ * exceed @eenv->pd_cap.
*/
-static long
-compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+static inline void eenv_pd_busy_time(struct energy_env *eenv,
+ struct cpumask *pd_cpus,
+ struct task_struct *p)
{
- struct cpumask *pd_mask = perf_domain_span(pd);
- unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
- unsigned long max_util = 0, sum_util = 0;
- unsigned long _cpu_cap = cpu_cap;
+ unsigned long busy_time = 0;
int cpu;
- _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
+ for_each_cpu(cpu, pd_cpus) {
+ unsigned long util = cpu_util_next(cpu, p, -1);
- /*
- * The capacity state of CPUs of the current rd can be driven by CPUs
- * of another rd if they belong to the same pd. So, account for the
- * utilization of these CPUs too by masking pd with cpu_online_mask
- * instead of the rd span.
- *
- * If an entire pd is outside of the current rd, it will not appear in
- * its pd list and will not be accounted by compute_energy().
- */
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
- unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
- unsigned long cpu_util, util_running = util_freq;
- struct task_struct *tsk = NULL;
+ busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
+ }
- /*
- * When @p is placed on @cpu:
- *
- * util_running = max(cpu_util, cpu_util_est) +
- * max(task_util, _task_util_est)
- *
- * while cpu_util_next is: max(cpu_util + task_util,
- * cpu_util_est + _task_util_est)
- */
- if (cpu == dst_cpu) {
- tsk = p;
- util_running =
- cpu_util_next(cpu, p, -1) + task_util_est(p);
- }
+ eenv->pd_busy_time = min(eenv->pd_cap, busy_time);
+}
- /*
- * Busy time computation: utilization clamping is not
- * required since the ratio (sum_util / cpu_capacity)
- * is already enough to scale the EM reported power
- * consumption at the (eventually clamped) cpu_capacity.
- */
- cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
- ENERGY_UTIL, NULL);
+/*
+ * Compute the maximum utilization for compute_energy() when the task @p
+ * is placed on the cpu @dst_cpu.
+ *
+ * Returns the maximum utilization among @eenv->cpus. This utilization can't
+ * exceed @eenv->cpu_cap.
+ */
+static inline unsigned long
+eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
+ struct task_struct *p, int dst_cpu)
+{
+ unsigned long max_util = 0;
+ int cpu;
- sum_util += min(cpu_util, _cpu_cap);
+ for_each_cpu(cpu, pd_cpus) {
+ struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
+ unsigned long util = cpu_util_next(cpu, p, dst_cpu);
+ unsigned long cpu_util;
/*
* Performance domain frequency: utilization clamping
@@ -6704,12 +6800,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
- FREQUENCY_UTIL, tsk);
- max_util = max(max_util, min(cpu_util, _cpu_cap));
+ cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, cpu_util);
}
- return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
+ return min(max_util, eenv->cpu_cap);
+}
+
+/*
+ * compute_energy(): Use the Energy Model to estimate the energy that @pd would
+ * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task
+ * contribution is ignored.
+ */
+static inline unsigned long
+compute_energy(struct energy_env *eenv, struct perf_domain *pd,
+ struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu)
+{
+ unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu);
+ unsigned long busy_time = eenv->pd_busy_time;
+
+ if (dst_cpu >= 0)
+ busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time);
+
+ return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap);
}
/*
@@ -6753,12 +6866,13 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
*/
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
- struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
- int cpu, best_energy_cpu = prev_cpu, target = -1;
- unsigned long cpu_cap, util, base_energy = 0;
+ struct root_domain *rd = this_rq()->rd;
+ int cpu, best_energy_cpu, target = -1;
struct sched_domain *sd;
struct perf_domain *pd;
+ struct energy_env eenv;
rcu_read_lock();
pd = rcu_dereference(rd->pd);
@@ -6781,20 +6895,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!task_util_est(p))
goto unlock;
+ eenv_task_busy_time(&eenv, p, prev_cpu);
+
for (; pd; pd = pd->next) {
- unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ unsigned long cpu_cap, cpu_thermal_cap, util;
+ unsigned long cur_delta, max_spare_cap = 0;
bool compute_prev_delta = false;
- unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
+ unsigned long base_energy;
+
+ cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
+
+ if (cpumask_empty(cpus))
+ continue;
+
+ /* Account thermal pressure for the energy estimation */
+ cpu = cpumask_first(cpus);
+ cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
+ cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+
+ eenv.cpu_cap = cpu_thermal_cap;
+ eenv.pd_cap = 0;
+
+ for_each_cpu(cpu, cpus) {
+ eenv.pd_cap += cpu_thermal_cap;
+
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+ continue;
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
- spare_cap = cpu_cap;
- lsub_positive(&spare_cap, util);
/*
* Skip CPUs that cannot satisfy the capacity request.
@@ -6807,15 +6940,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!fits_capacity(util, cpu_cap))
continue;
+ lsub_positive(&cpu_cap, util);
+
if (cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
compute_prev_delta = true;
- } else if (spare_cap > max_spare_cap) {
+ } else if (cpu_cap > max_spare_cap) {
/*
* Find the CPU with the maximum spare capacity
* in the performance domain.
*/
- max_spare_cap = spare_cap;
+ max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
}
}
@@ -6823,25 +6958,29 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (max_spare_cap_cpu < 0 && !compute_prev_delta)
continue;
+ eenv_pd_busy_time(&eenv, cpus, p);
/* Compute the 'base' energy of the pd, without @p */
- base_energy_pd = compute_energy(p, -1, pd);
- base_energy += base_energy_pd;
+ base_energy = compute_energy(&eenv, pd, cpus, p, -1);
/* Evaluate the energy impact of using prev_cpu. */
if (compute_prev_delta) {
- prev_delta = compute_energy(p, prev_cpu, pd);
- if (prev_delta < base_energy_pd)
+ prev_delta = compute_energy(&eenv, pd, cpus, p,
+ prev_cpu);
+ /* CPU utilization has changed */
+ if (prev_delta < base_energy)
goto unlock;
- prev_delta -= base_energy_pd;
+ prev_delta -= base_energy;
best_delta = min(best_delta, prev_delta);
}
/* Evaluate the energy impact of using max_spare_cap_cpu. */
if (max_spare_cap_cpu >= 0) {
- cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
- if (cur_delta < base_energy_pd)
+ cur_delta = compute_energy(&eenv, pd, cpus, p,
+ max_spare_cap_cpu);
+ /* CPU utilization has changed */
+ if (cur_delta < base_energy)
goto unlock;
- cur_delta -= base_energy_pd;
+ cur_delta -= base_energy;
if (cur_delta < best_delta) {
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
@@ -6850,12 +6989,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
}
rcu_read_unlock();
- /*
- * Pick the best CPU if prev_cpu cannot be used, or if it saves at
- * least 6% of the energy used by prev_cpu.
- */
- if ((prev_delta == ULONG_MAX) ||
- (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
+ if (best_delta < prev_delta)
target = best_energy_cpu;
return target;
@@ -6951,6 +7085,8 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
*/
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
+ struct sched_entity *se = &p->se;
+
/*
* As blocked tasks retain absolute vruntime the migration needs to
* deal with this by subtracting the old and adding the new
@@ -6958,23 +7094,9 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* the task on the new runqueue.
*/
if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-
- do {
- min_vruntime_copy = cfs_rq->min_vruntime_copy;
- smp_rmb();
- min_vruntime = cfs_rq->min_vruntime;
- } while (min_vruntime != min_vruntime_copy);
-#else
- min_vruntime = cfs_rq->min_vruntime;
-#endif
- se->vruntime -= min_vruntime;
+ se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
}
if (p->on_rq == TASK_ON_RQ_MIGRATING) {
@@ -6983,25 +7105,29 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* rq->lock and can modify state directly.
*/
lockdep_assert_rq_held(task_rq(p));
- detach_entity_cfs_rq(&p->se);
+ detach_entity_cfs_rq(se);
} else {
+ remove_entity_load_avg(se);
+
/*
- * We are supposed to update the task to "current" time, then
- * its up to date and ready to go to new CPU/cfs_rq. But we
- * have difficulty in getting what current time is, so simply
- * throw away the out-of-date time. This will result in the
- * wakee task is less decayed, but giving the wakee more load
- * sounds not bad.
+ * Here, the task's PELT values have been updated according to
+ * the current rq's clock. But if that clock hasn't been
+ * updated in a while, a substantial idle time will be missed,
+ * leading to an inflation after wake-up on the new rq.
+ *
+ * Estimate the missing time from the cfs_rq last_update_time
+ * and update sched_avg to improve the PELT continuity after
+ * migration.
*/
- remove_entity_load_avg(&p->se);
+ migrate_se_pelt_lag(se);
}
/* Tell new CPU we are migrated */
- p->se.avg.last_update_time = 0;
+ se->avg.last_update_time = 0;
/* We have migrated, no longer consider this task hot */
- p->se.exec_start = 0;
+ se->exec_start = 0;
update_scan_period(p, new_cpu);
}
@@ -7585,8 +7711,8 @@ enum group_type {
*/
group_fully_busy,
/*
- * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
- * and must be migrated to a more powerful CPU.
+ * One task doesn't fit with CPU's capacity and must be migrated to a
+ * more powerful CPU.
*/
group_misfit_task,
/*
@@ -8167,6 +8293,9 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);
+ if (cfs_rq->nr_running == 0)
+ update_idle_cfs_rq_clock_pelt(cfs_rq);
+
if (cfs_rq == &rq->cfs)
decayed = true;
}
@@ -8500,7 +8629,7 @@ static inline int sg_imbalanced(struct sched_group *group)
/*
* group_has_capacity returns true if the group has spare capacity that could
* be used by some tasks.
- * We consider that a group has spare capacity if the * number of task is
+ * We consider that a group has spare capacity if the number of task is
* smaller than the number of CPUs or if the utilization is lower than the
* available capacity for CFS tasks.
* For the latter, we use a threshold to stabilize the state, to take into
@@ -8669,6 +8798,19 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
+static inline bool
+sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ /*
+ * When there is more than 1 task, the group_overloaded case already
+ * takes care of cpu with reduced capacity
+ */
+ if (rq->cfs.h_nr_running != 1)
+ return false;
+
+ return check_cpu_capacity(rq, sd);
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8691,8 +8833,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ unsigned long load = cpu_load(rq);
- sgs->group_load += cpu_load(rq);
+ sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -8722,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (local_group)
continue;
- /* Check for a misfit task on the cpu */
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- sgs->group_misfit_task_load < rq->misfit_task_load) {
- sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Check for a misfit task on the cpu */
+ if (sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *sg_status |= SG_OVERLOAD;
+ }
+ } else if ((env->idle != CPU_NOT_IDLE) &&
+ sched_reduced_capacity(rq, env->sd)) {
+ /* Check for a task running on a CPU with reduced capacity */
+ if (sgs->group_misfit_task_load < load)
+ sgs->group_misfit_task_load = load;
}
}
@@ -8779,7 +8928,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
*/
- if (sgs->group_type == group_misfit_task &&
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type == group_misfit_task) &&
(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -9058,16 +9208,6 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
- * This is an approximation as the number of running tasks may not be
- * related to the number of busy CPUs due to sched_setaffinity.
- */
-static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
-{
- return running <= imb_numa_nr;
-}
-
-/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
*
@@ -9183,7 +9323,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
break;
case group_has_spare:
+#ifdef CONFIG_NUMA
if (sd->flags & SD_NUMA) {
+ int imb_numa_nr = sd->imb_numa_nr;
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
/*
@@ -9196,17 +9338,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
/*
* Otherwise, keep the task close to the wakeup source
* and improve locality if the number of running tasks
* would remain below threshold where an imbalance is
- * allowed. If there is a real need of migration,
- * periodic load balance will take care of it.
+ * allowed while accounting for the possibility the
+ * task is pinned to a subset of CPUs. If there is a
+ * real need of migration, periodic load balance will
+ * take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
+ if (p->nr_cpus_allowed != NR_CPUS) {
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
+
+ cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
+ imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ }
+
+ imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+ if (!adjust_numa_imbalance(imbalance,
+ local_sgs.sum_nr_running + 1,
+ imb_numa_nr)) {
return NULL;
+ }
}
+#endif /* CONFIG_NUMA */
/*
* Select group with highest number of idle CPUs. We could also
@@ -9222,6 +9378,77 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
}
+static void update_idle_cpu_scan(struct lb_env *env,
+ unsigned long sum_util)
+{
+ struct sched_domain_shared *sd_share;
+ int llc_weight, pct;
+ u64 x, y, tmp;
+ /*
+ * Update the number of CPUs to scan in LLC domain, which could
+ * be used as a hint in select_idle_cpu(). The update of sd_share
+ * could be expensive because it is within a shared cache line.
+ * So the write of this hint only occurs during periodic load
+ * balancing, rather than CPU_NEWLY_IDLE, because the latter
+ * can fire way more frequently than the former.
+ */
+ if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
+ return;
+
+ llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
+ if (env->sd->span_weight != llc_weight)
+ return;
+
+ sd_share = rcu_dereference(per_cpu(sd_llc_shared, env->dst_cpu));
+ if (!sd_share)
+ return;
+
+ /*
+ * The number of CPUs to search drops as sum_util increases, when
+ * sum_util hits 85% or above, the scan stops.
+ * The reason to choose 85% as the threshold is because this is the
+ * imbalance_pct(117) when a LLC sched group is overloaded.
+ *
+ * let y = SCHED_CAPACITY_SCALE - p * x^2 [1]
+ * and y'= y / SCHED_CAPACITY_SCALE
+ *
+ * x is the ratio of sum_util compared to the CPU capacity:
+ * x = sum_util / (llc_weight * SCHED_CAPACITY_SCALE)
+ * y' is the ratio of CPUs to be scanned in the LLC domain,
+ * and the number of CPUs to scan is calculated by:
+ *
+ * nr_scan = llc_weight * y' [2]
+ *
+ * When x hits the threshold of overloaded, AKA, when
+ * x = 100 / pct, y drops to 0. According to [1],
+ * p should be SCHED_CAPACITY_SCALE * pct^2 / 10000
+ *
+ * Scale x by SCHED_CAPACITY_SCALE:
+ * x' = sum_util / llc_weight; [3]
+ *
+ * and finally [1] becomes:
+ * y = SCHED_CAPACITY_SCALE -
+ * x'^2 * pct^2 / (10000 * SCHED_CAPACITY_SCALE) [4]
+ *
+ */
+ /* equation [3] */
+ x = sum_util;
+ do_div(x, llc_weight);
+
+ /* equation [4] */
+ pct = env->sd->imbalance_pct;
+ tmp = x * x * pct * pct;
+ do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
+ tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);
+ y = SCHED_CAPACITY_SCALE - tmp;
+
+ /* equation [2] */
+ y *= llc_weight;
+ do_div(y, SCHED_CAPACITY_SCALE);
+ if ((int)y != sd_share->nr_idle_scan)
+ WRITE_ONCE(sd_share->nr_idle_scan, (int)y);
+}
+
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
@@ -9234,6 +9461,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
+ unsigned long sum_util = 0;
int sg_status = 0;
do {
@@ -9266,6 +9494,7 @@ next_group:
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
+ sum_util += sgs->group_util;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -9291,24 +9520,8 @@ next_group:
WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
}
-}
-
-#define NUMA_IMBALANCE_MIN 2
-
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr)
-{
- if (!allow_numa_imbalance(dst_running, imb_numa_nr))
- return imbalance;
- /*
- * Allow a small imbalance based on a simple pair of communicating
- * tasks that remain local when the destination is lightly loaded.
- */
- if (imbalance <= NUMA_IMBALANCE_MIN)
- return 0;
-
- return imbalance;
+ update_idle_cpu_scan(env, sum_util);
}
/**
@@ -9325,9 +9538,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
busiest = &sds->busiest_stat;
if (busiest->group_type == group_misfit_task) {
- /* Set imbalance to allow misfit tasks to be balanced. */
- env->migration_type = migrate_misfit;
- env->imbalance = 1;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ } else {
+ /*
+ * Set load imbalance to allow moving task from cpu
+ * with reduced capacity.
+ */
+ env->migration_type = migrate_load;
+ env->imbalance = busiest->group_misfit_task_load;
+ }
return;
}
@@ -9395,7 +9617,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff >> 1;
+ env->imbalance = nr_diff;
} else {
/*
@@ -9403,15 +9625,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* idle cpus.
*/
env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
- busiest->idle_cpus) >> 1);
+ env->imbalance = max_t(long, 0,
+ (local->idle_cpus - busiest->idle_cpus));
}
+#ifdef CONFIG_NUMA
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- local->sum_nr_running + 1, env->sd->imb_numa_nr);
+ local->sum_nr_running + 1,
+ env->sd->imb_numa_nr);
}
+#endif
+
+ /* Number of tasks to move to restore balance */
+ env->imbalance >>= 1;
return;
}
@@ -9834,9 +10062,15 @@ static int should_we_balance(struct lb_env *env)
/*
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
+ *
+ * However, we bail out if we already have tasks or a wakeup pending,
+ * to optimize wakeup latency.
*/
- if (env->idle == CPU_NEWLY_IDLE)
+ if (env->idle == CPU_NEWLY_IDLE) {
+ if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
+ return 0;
return 1;
+ }
/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
@@ -11287,9 +11521,13 @@ static inline bool vruntime_normalized(struct task_struct *p)
*/
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
- list_add_leaf_cfs_rq(cfs_rq_of(se));
+ if (cfs_rq_throttled(cfs_rq))
+ return;
+
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
se = se->parent;
@@ -11297,14 +11535,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- if (!cfs_rq_throttled(cfs_rq)){
- update_load_avg(cfs_rq, se, UPDATE_TG);
- list_add_leaf_cfs_rq(cfs_rq);
- continue;
- }
+ update_load_avg(cfs_rq, se, UPDATE_TG);
- if (list_add_leaf_cfs_rq(cfs_rq))
+ if (cfs_rq_throttled(cfs_rq))
break;
+
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
}
#else
@@ -11422,10 +11659,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
void init_cfs_rq(struct cfs_rq *cfs_rq)
{
cfs_rq->tasks_timeline = RB_ROOT_CACHED;
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
+ u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20)));
#ifdef CONFIG_SMP
raw_spin_lock_init(&cfs_rq->removed.lock);
#endif
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1cf435bbcd9c..ee7f23c76bd3 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -60,7 +60,8 @@ SCHED_FEAT(TTWU_QUEUE, true)
/*
* When doing wakeups, attempt to limit superfluous scans of the LLC domain.
*/
-SCHED_FEAT(SIS_PROP, true)
+SCHED_FEAT(SIS_PROP, false)
+SCHED_FEAT(SIS_UTIL, true)
/*
* Issue a WARN when we do multiple update_rq_clock() calls
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 4ff2ed4f8fa1..3a0e0dc28721 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -61,6 +61,25 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
+/* The rq is idle, we can sync to clock_task */
+static inline void _update_idle_rq_clock_pelt(struct rq *rq)
+{
+ rq->clock_pelt = rq_clock_task(rq);
+
+ u64_u32_store(rq->clock_idle, rq_clock(rq));
+ /* Paired with smp_rmb in migrate_se_pelt_lag() */
+ smp_wmb();
+ u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
+}
+
/*
* The clock_pelt scales the time to reflect the effective amount of
* computation done during the running delta time but then sync back to
@@ -76,8 +95,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
{
if (unlikely(is_idle_task(rq->curr))) {
- /* The rq is idle, we can sync to clock_task */
- rq->clock_pelt = rq_clock_task(rq);
+ _update_idle_rq_clock_pelt(rq);
return;
}
@@ -130,17 +148,23 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
*/
if (util_sum >= divider)
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+
+ _update_idle_rq_clock_pelt(rq);
}
-static inline u64 rq_clock_pelt(struct rq *rq)
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
- lockdep_assert_rq_held(rq);
- assert_clock_updated(rq);
+ u64 throttled;
- return rq->clock_pelt - rq->lost_idle_time;
+ if (unlikely(cfs_rq->throttle_count))
+ throttled = U64_MAX;
+ else
+ throttled = cfs_rq->throttled_clock_pelt_time;
+
+ u64_u32_store(cfs_rq->throttled_pelt_idle, throttled);
}
-#ifdef CONFIG_CFS_BANDWIDTH
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
@@ -150,6 +174,7 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
@@ -204,6 +229,7 @@ update_rq_clock_pelt(struct rq *rq, s64 delta) { }
static inline void
update_idle_rq_clock_pelt(struct rq *rq) { }
+static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
#endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 8c9ed9664840..55f39c8f4203 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -480,7 +480,7 @@ static inline void rt_queue_push_tasks(struct rq *rq)
#endif /* CONFIG_SMP */
static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
-static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
+static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
@@ -601,7 +601,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (!rt_se) {
- dequeue_top_rt_rq(rt_rq);
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
}
@@ -687,7 +687,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
- dequeue_top_rt_rq(rt_rq);
+ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1089,7 +1089,7 @@ static void update_curr_rt(struct rq *rq)
}
static void
-dequeue_top_rt_rq(struct rt_rq *rt_rq)
+dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -1100,7 +1100,7 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
BUG_ON(!rq->nr_running);
- sub_nr_running(rq, rt_rq->rt_nr_running);
+ sub_nr_running(rq, count);
rt_rq->rt_queued = 0;
}
@@ -1486,18 +1486,21 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
+ unsigned int rt_nr_running;
for_each_sched_rt_entity(rt_se) {
rt_se->back = back;
back = rt_se;
}
- dequeue_top_rt_rq(rt_rq_of_se(back));
+ rt_nr_running = rt_rq_of_se(back)->rt_nr_running;
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se, flags);
}
+
+ dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47b89a0fc6e5..aad7f5ee9666 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -520,6 +520,45 @@ struct cfs_bandwidth { };
#endif /* CONFIG_CGROUP_SCHED */
+/*
+ * u64_u32_load/u64_u32_store
+ *
+ * Use a copy of a u64 value to protect against data race. This is only
+ * applicable for 32-bits architectures.
+ */
+#ifdef CONFIG_64BIT
+# define u64_u32_load_copy(var, copy) var
+# define u64_u32_store_copy(var, copy, val) (var = val)
+#else
+# define u64_u32_load_copy(var, copy) \
+({ \
+ u64 __val, __val_copy; \
+ do { \
+ __val_copy = copy; \
+ /* \
+ * paired with u64_u32_store_copy(), ordering access \
+ * to var and copy. \
+ */ \
+ smp_rmb(); \
+ __val = var; \
+ } while (__val != __val_copy); \
+ __val; \
+})
+# define u64_u32_store_copy(var, copy, val) \
+do { \
+ typeof(val) __val = (val); \
+ var = __val; \
+ /* \
+ * paired with u64_u32_load_copy(), ordering access to var and \
+ * copy. \
+ */ \
+ smp_wmb(); \
+ copy = __val; \
+} while (0)
+#endif
+# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
+# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
@@ -560,7 +599,7 @@ struct cfs_rq {
*/
struct sched_avg avg;
#ifndef CONFIG_64BIT
- u64 load_last_update_time_copy;
+ u64 last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
@@ -609,6 +648,10 @@ struct cfs_rq {
int runtime_enabled;
s64 runtime_remaining;
+ u64 throttled_pelt_idle;
+#ifndef CONFIG_64BIT
+ u64 throttled_pelt_idle_copy;
+#endif
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
@@ -981,6 +1024,12 @@ struct rq {
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
unsigned long lost_idle_time;
+ u64 clock_pelt_idle;
+ u64 clock_idle;
+#ifndef CONFIG_64BIT
+ u64 clock_pelt_idle_copy;
+ u64 clock_idle_copy;
+#endif
atomic_t nr_iowait;
@@ -1815,15 +1864,6 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
return to_cpumask(sg->sgc->cpumask);
}
-/**
- * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
- * @group: The group whose first CPU is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
- return cpumask_first(sched_group_span(group));
-}
-
extern int group_balance_cpu(struct sched_group *sg);
#ifdef CONFIG_SCHED_DEBUG
@@ -2044,7 +2084,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
-#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2852,7 +2891,7 @@ enum cpu_util_type {
};
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum cpu_util_type type,
+ enum cpu_util_type type,
struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05b6c2ad90b9..8739c2a5a54e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2316,23 +2316,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/*
* For a single LLC per node, allow an
- * imbalance up to 25% of the node. This is an
- * arbitrary cutoff based on SMT-2 to balance
- * between memory bandwidth and avoiding
- * premature sharing of HT resources and SMT-4
- * or SMT-8 *may* benefit from a different
- * cutoff.
+ * imbalance up to 12.5% of the node. This is
+ * arbitrary cutoff based two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
*
* For multiple LLCs, allow an imbalance
* until multiple tasks would share an LLC
* on one node while LLCs on another node
- * remain idle.
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
*/
nr_llcs = sd->span_weight / child->span_weight;
if (nr_llcs == 1)
- imb = sd->span_weight >> 2;
+ imb = sd->span_weight >> 3;
else
imb = nr_llcs;
+ imb = max(1U, imb);
sd->imb_numa_nr = imb;
/* Set span based on the first NUMA domain. */
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index bb9962b33f95..59ddb00d6944 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -454,6 +454,33 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue)
rcu_assign_pointer(watch->queue, wqueue);
}
+static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue)
+{
+ const struct cred *cred;
+ struct watch *w;
+
+ hlist_for_each_entry(w, &wlist->watchers, list_node) {
+ struct watch_queue *wq = rcu_access_pointer(w->queue);
+ if (wqueue == wq && watch->id == w->id)
+ return -EBUSY;
+ }
+
+ cred = current_cred();
+ if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) {
+ atomic_dec(&cred->user->nr_watches);
+ return -EAGAIN;
+ }
+
+ watch->cred = get_cred(cred);
+ rcu_assign_pointer(watch->watch_list, wlist);
+
+ kref_get(&wqueue->usage);
+ kref_get(&watch->usage);
+ hlist_add_head(&watch->queue_node, &wqueue->watches);
+ hlist_add_head_rcu(&watch->list_node, &wlist->watchers);
+ return 0;
+}
+
/**
* add_watch_to_object - Add a watch on an object to a watch list
* @watch: The watch to add
@@ -468,34 +495,21 @@ void init_watch(struct watch *watch, struct watch_queue *wqueue)
*/
int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
{
- struct watch_queue *wqueue = rcu_access_pointer(watch->queue);
- struct watch *w;
-
- hlist_for_each_entry(w, &wlist->watchers, list_node) {
- struct watch_queue *wq = rcu_access_pointer(w->queue);
- if (wqueue == wq && watch->id == w->id)
- return -EBUSY;
- }
-
- watch->cred = get_current_cred();
- rcu_assign_pointer(watch->watch_list, wlist);
+ struct watch_queue *wqueue;
+ int ret = -ENOENT;
- if (atomic_inc_return(&watch->cred->user->nr_watches) >
- task_rlimit(current, RLIMIT_NOFILE)) {
- atomic_dec(&watch->cred->user->nr_watches);
- put_cred(watch->cred);
- return -EAGAIN;
- }
+ rcu_read_lock();
+ wqueue = rcu_access_pointer(watch->queue);
if (lock_wqueue(wqueue)) {
- kref_get(&wqueue->usage);
- kref_get(&watch->usage);
- hlist_add_head(&watch->queue_node, &wqueue->watches);
+ spin_lock(&wlist->lock);
+ ret = add_one_watch(watch, wlist, wqueue);
+ spin_unlock(&wlist->lock);
unlock_wqueue(wqueue);
}
- hlist_add_head(&watch->list_node, &wlist->watchers);
- return 0;
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL(add_watch_to_object);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ea50f6be843..aa8a82bc6738 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5001,7 +5001,10 @@ static void unbind_workers(int cpu)
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, -1);
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
}
mutex_unlock(&wq_pool_attach_mutex);
diff --git a/mm/gup.c b/mm/gup.c
index 551264407624..e2a39e30756d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -87,7 +87,8 @@ retry:
* belongs to this folio.
*/
if (unlikely(page_folio(page) != folio)) {
- folio_put_refs(folio, refs);
+ if (!put_devmap_managed_page_refs(&folio->page, refs))
+ folio_put_refs(folio, refs);
goto retry;
}
@@ -176,7 +177,8 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
refs *= GUP_PIN_COUNTING_BIAS;
}
- folio_put_refs(folio, refs);
+ if (!put_devmap_managed_page_refs(&folio->page, refs))
+ folio_put_refs(folio, refs);
}
/**
diff --git a/mm/hmm.c b/mm/hmm.c
index 3fd3242c5e50..f2aa63b94d9b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -212,14 +212,6 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline bool hmm_is_device_private_entry(struct hmm_range *range,
- swp_entry_t entry)
-{
- return is_device_private_entry(entry) &&
- pfn_swap_entry_to_page(entry)->pgmap->owner ==
- range->dev_private_owner;
-}
-
static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
pte_t pte)
{
@@ -252,10 +244,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
swp_entry_t entry = pte_to_swp_entry(pte);
/*
- * Never fault in device private pages, but just report
- * the PFN even if not present.
+ * Don't fault in device private pages owned by the caller,
+ * just report the PFN.
*/
- if (hmm_is_device_private_entry(range, entry)) {
+ if (is_device_private_entry(entry) &&
+ pfn_swap_entry_to_page(entry)->pgmap->owner ==
+ range->dev_private_owner) {
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
@@ -273,6 +267,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
if (!non_swap_entry(entry))
goto fault;
+ if (is_device_private_entry(entry))
+ goto fault;
+
if (is_device_exclusive_entry(entry))
goto fault;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..a18c071c294e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4788,8 +4788,13 @@ again:
* sharing with another vma.
*/
;
- } else if (unlikely(is_hugetlb_entry_migration(entry) ||
- is_hugetlb_entry_hwpoisoned(entry))) {
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
+ bool uffd_wp = huge_pte_uffd_wp(entry);
+
+ if (!userfaultfd_wp(dst_vma) && uffd_wp)
+ entry = huge_pte_clear_uffd_wp(entry);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else if (unlikely(is_hugetlb_entry_migration(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
bool uffd_wp = huge_pte_uffd_wp(entry);
@@ -5947,6 +5952,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page)) {
+ put_page(*pagep);
ret = -ENOMEM;
*pagep = NULL;
goto out;
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 5fe598ecd9b7..8652426282cc 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -11,29 +11,35 @@
#include <linux/io.h>
#include <linux/export.h>
-void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
{
unsigned long offset, vaddr;
phys_addr_t last_addr;
struct vm_struct *area;
/* Disallow wrap-around or zero size */
- last_addr = addr + size - 1;
- if (!size || last_addr < addr)
+ last_addr = phys_addr + size - 1;
+ if (!size || last_addr < phys_addr)
return NULL;
/* Page-align mappings */
- offset = addr & (~PAGE_MASK);
- addr -= offset;
+ offset = phys_addr & (~PAGE_MASK);
+ phys_addr -= offset;
size = PAGE_ALIGN(size + offset);
+ if (!ioremap_allowed(phys_addr, size, prot))
+ return NULL;
+
area = get_vm_area_caller(size, VM_IOREMAP,
__builtin_return_address(0));
if (!area)
return NULL;
vaddr = (unsigned long)area->addr;
+ area->phys_addr = phys_addr;
- if (ioremap_page_range(vaddr, vaddr + size, addr, __pgprot(prot))) {
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr,
+ __pgprot(prot))) {
free_vm_area(area);
return NULL;
}
@@ -44,6 +50,12 @@ EXPORT_SYMBOL(ioremap_prot);
void iounmap(volatile void __iomem *addr)
{
- vunmap((void *)((unsigned long)addr & PAGE_MASK));
+ void *vaddr = (void *)((unsigned long)addr & PAGE_MASK);
+
+ if (!iounmap_allowed(vaddr))
+ return;
+
+ if (is_vmalloc_addr(vaddr))
+ vunmap(vaddr);
}
EXPORT_SYMBOL(iounmap);
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index c40c0e7b3b5f..78be2beb7453 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -108,9 +108,10 @@ void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
return;
tag = kasan_random_tag();
+ kasan_unpoison(set_tag(page_address(page), tag),
+ PAGE_SIZE << order, init);
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
}
void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 4b5e5a3d3a63..6aff49f6b79e 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -603,14 +603,6 @@ static unsigned long kfence_init_pool(void)
addr += 2 * PAGE_SIZE;
}
- /*
- * The pool is live and will never be deallocated from this point on.
- * Remove the pool object from the kmemleak object tree, as it would
- * otherwise overlap with allocations returned by kfence_alloc(), which
- * are registered with kmemleak through the slab post-alloc hook.
- */
- kmemleak_free(__kfence_pool);
-
return 0;
}
@@ -623,8 +615,16 @@ static bool __init kfence_init_pool_early(void)
addr = kfence_init_pool();
- if (!addr)
+ if (!addr) {
+ /*
+ * The pool is live and will never be deallocated from this point on.
+ * Ignore the pool object from the kmemleak phys object tree, as it would
+ * otherwise overlap with allocations returned by kfence_alloc(), which
+ * are registered with kmemleak through the slab post-alloc hook.
+ */
+ kmemleak_ignore_phys(__pa(__kfence_pool));
return true;
+ }
/*
* Only release unprotected pages, and do not try to go back and change
diff --git a/mm/memory.c b/mm/memory.c
index 4cf7d4b6c950..1c6027adc542 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3043,7 +3043,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
pte_t entry;
VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
- VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page));
+ VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
/*
* Clear the pages cpupid information as the existing
@@ -4369,9 +4369,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
}
- /* See comment in handle_pte_fault() */
+ /*
+ * See comment in handle_pte_fault() for how this scenario happens, we
+ * need to return NOPAGE so that we drop this page.
+ */
if (pmd_devmap_trans_unstable(vmf->pmd))
- return 0;
+ return VM_FAULT_NOPAGE;
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
diff --git a/mm/memremap.c b/mm/memremap.c
index b870a659eee6..745eea0f99c3 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -499,7 +499,7 @@ void free_zone_device_page(struct page *page)
}
#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_page(struct page *page)
+bool __put_devmap_managed_page_refs(struct page *page, int refs)
{
if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
return false;
@@ -509,9 +509,9 @@ bool __put_devmap_managed_page(struct page *page)
* refcount is 1, then the page is free and the refcount is
* stable because nobody holds a reference on the page.
*/
- if (page_ref_dec_return(page) == 1)
+ if (page_ref_sub_return(page, refs) == 1)
wake_up_var(&page->_refcount);
return true;
}
-EXPORT_SYMBOL(__put_devmap_managed_page);
+EXPORT_SYMBOL(__put_devmap_managed_page_refs);
#endif /* CONFIG_FS_DAX */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e008a3df0485..b0bcab50f0a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2361,7 +2361,7 @@ static inline bool check_new_pcp(struct page *page, unsigned int order)
}
#endif /* CONFIG_DEBUG_VM */
-static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
+static inline bool should_skip_kasan_unpoison(gfp_t flags)
{
/* Don't skip if a software KASAN mode is enabled. */
if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
@@ -2373,12 +2373,10 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
return true;
/*
- * With hardware tag-based KASAN enabled, skip if either:
- *
- * 1. Memory tags have already been cleared via tag_clear_highpage().
- * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON.
+ * With hardware tag-based KASAN enabled, skip if this has been
+ * requested via __GFP_SKIP_KASAN_UNPOISON.
*/
- return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON);
+ return flags & __GFP_SKIP_KASAN_UNPOISON;
}
static inline bool should_skip_init(gfp_t flags)
@@ -2397,6 +2395,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ int i;
set_page_private(page, 0);
set_page_refcounted(page);
@@ -2422,8 +2421,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
* should be initialized as well).
*/
if (init_tags) {
- int i;
-
/* Initialize both memory and tags. */
for (i = 0; i != 1 << order; ++i)
tag_clear_highpage(page + i);
@@ -2431,13 +2428,17 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
/* Note that memory is already initialized by the loop above. */
init = false;
}
- if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) {
+ if (!should_skip_kasan_unpoison(gfp_flags)) {
/* Unpoison shadow memory or set memory tags. */
kasan_unpoison_pages(page, order, init);
/* Note that memory is already initialized by KASAN. */
if (kasan_has_integrated_init())
init = false;
+ } else {
+ /* Ensure page_address() dereferencing does not fault. */
+ for (i = 0; i != 1 << order; ++i)
+ page_kasan_tag_reset(page + i);
}
/* If memory is still not initialized, do it now. */
if (init)
@@ -3968,11 +3969,15 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* need to be calculated.
*/
if (!order) {
- long fast_free;
+ long usable_free;
+ long reserved;
+
+ usable_free = free_pages;
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
- fast_free = free_pages;
- fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
- if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
+ /* reserved may over estimate high-atomic reserves. */
+ usable_free -= min(usable_free, reserved);
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
return true;
}
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 206ed6b40c1d..f06279d6190a 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -55,22 +55,28 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
gfp_t gfp = vmf->gfp_mask;
unsigned long addr;
struct page *page;
+ vm_fault_t ret;
int err;
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return vmf_error(-EINVAL);
+ filemap_invalidate_lock_shared(mapping);
+
retry:
page = find_lock_page(mapping, offset);
if (!page) {
page = alloc_page(gfp | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
err = set_direct_map_invalid_noflush(page);
if (err) {
put_page(page);
- return vmf_error(err);
+ ret = vmf_error(err);
+ goto out;
}
__SetPageUptodate(page);
@@ -86,7 +92,8 @@ retry:
if (err == -EEXIST)
goto retry;
- return vmf_error(err);
+ ret = vmf_error(err);
+ goto out;
}
addr = (unsigned long)page_address(page);
@@ -94,7 +101,11 @@ retry:
}
vmf->page = page;
- return VM_FAULT_LOCKED;
+ ret = VM_FAULT_LOCKED;
+
+out:
+ filemap_invalidate_unlock_shared(mapping);
+ return ret;
}
static const struct vm_operations_struct secretmem_vm_ops = {
@@ -162,12 +173,20 @@ static int secretmem_setattr(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
+ struct address_space *mapping = inode->i_mapping;
unsigned int ia_valid = iattr->ia_valid;
+ int ret;
+
+ filemap_invalidate_lock(mapping);
if ((ia_valid & ATTR_SIZE) && inode->i_size)
- return -EINVAL;
+ ret = -EINVAL;
+ else
+ ret = simple_setattr(mnt_userns, dentry, iattr);
- return simple_setattr(mnt_userns, dentry, iattr);
+ filemap_invalidate_unlock(mapping);
+
+ return ret;
}
static const struct inode_operations secretmem_iops = {
diff --git a/mm/shmem.c b/mm/shmem.c
index a6f565308133..b7f2d4a56867 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3392,7 +3392,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_nr_blocks:
ctx->blocks = memparse(param->string, &rest);
- if (*rest)
+ if (*rest || ctx->blocks > S64_MAX)
goto bad_value;
ctx->seen |= SHMEM_SEEN_BLOCKS;
break;
@@ -3514,10 +3514,7 @@ static int shmem_reconfigure(struct fs_context *fc)
raw_spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (ctx->blocks > S64_MAX) {
- err = "Number of blocks too large";
- goto out;
- }
+
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
err = "Cannot retroactively limit size";
diff --git a/mm/slab.c b/mm/slab.c
index f8cd00f4ba13..5e73e2d80222 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3230,7 +3230,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
}
/* ___cache_alloc_node can fall back to other nodes */
ptr = ____cache_alloc_node(cachep, flags, nodeid);
- out:
+out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
init = slab_want_init_on_alloc(flags, cachep);
@@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
if (!objp)
objp = ____cache_alloc_node(cache, flags, numa_mem_id());
- out:
+out:
return objp;
}
#else
@@ -3406,9 +3406,10 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
{
bool init;
+ memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1);
+
if (is_kfence_address(objp)) {
kmemleak_free_recursive(objp, cachep->flags);
- memcg_slab_free_hook(cachep, &objp, 1);
__kfence_free(objp);
return;
}
@@ -3441,7 +3442,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
- memcg_slab_free_hook(cachep, &objp, 1);
/*
* Skip calling cache_free_alien() when the platform is not numa.
@@ -3478,7 +3478,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
{
void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc(_RET_IP_, ret,
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep,
cachep->object_size, cachep->size, flags);
return ret;
@@ -3553,7 +3553,7 @@ error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
slab_post_alloc_hook(s, objcg, flags, i, p, false);
- __kmem_cache_free_bulk(s, i, p);
+ kmem_cache_free_bulk(s, i, p);
return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -3567,7 +3567,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_);
ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(_RET_IP_, ret,
+ trace_kmalloc(_RET_IP_, ret, cachep,
size, cachep->size, flags);
return ret;
}
@@ -3592,7 +3592,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc_node(_RET_IP_, ret,
+ trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep,
cachep->object_size, cachep->size,
flags, nodeid);
@@ -3611,7 +3611,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(_RET_IP_, ret, cachep,
size, cachep->size,
flags, nodeid);
return ret;
@@ -3694,7 +3694,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
ret = slab_alloc(cachep, NULL, flags, size, caller);
ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(caller, ret,
+ trace_kmalloc(caller, ret, cachep,
size, cachep->size, flags);
return ret;
diff --git a/mm/slab.h b/mm/slab.h
index db9fb5c8dae7..4ec82bec15ec 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -380,15 +380,6 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
-/*
- * Generic implementation of bulk operations
- * These are useful for situations in which the allocator cannot
- * perform optimizations. In that case segments of the object listed
- * may be allocated or freed using these operations.
- */
-void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
-int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-
static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
{
return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -547,36 +538,22 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
obj_cgroup_put(objcg);
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects)
{
- struct kmem_cache *s;
struct obj_cgroup **objcgs;
- struct obj_cgroup *objcg;
- struct slab *slab;
- unsigned int off;
int i;
if (!memcg_kmem_enabled())
return;
- for (i = 0; i < objects; i++) {
- if (unlikely(!p[i]))
- continue;
-
- slab = virt_to_slab(p[i]);
- /* we could be given a kmalloc_large() object, skip those */
- if (!slab)
- continue;
-
- objcgs = slab_objcgs(slab);
- if (!objcgs)
- continue;
+ objcgs = slab_objcgs(slab);
+ if (!objcgs)
+ return;
- if (!s_orig)
- s = slab->slab_cache;
- else
- s = s_orig;
+ for (i = 0; i < objects; i++) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
off = obj_to_index(s, slab, p[i]);
objcg = objcgs[off];
@@ -628,7 +605,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
{
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s,
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects)
{
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 77c3adf40e50..17996649cfe3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,13 +26,12 @@
#include <linux/memcontrol.h>
#include <linux/stackdepot.h>
-#define CREATE_TRACE_POINTS
-#include <trace/events/kmem.h>
-
#include "internal.h"
-
#include "slab.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
@@ -105,33 +104,6 @@ static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
}
#endif
-void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
-{
- size_t i;
-
- for (i = 0; i < nr; i++) {
- if (s)
- kmem_cache_free(s, p[i]);
- else
- kfree(p[i]);
- }
-}
-
-int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
- void **p)
-{
- size_t i;
-
- for (i = 0; i < nr; i++) {
- void *x = p[i] = kmem_cache_alloc(s, flags);
- if (!x) {
- __kmem_cache_free_bulk(s, i, p);
- return 0;
- }
- }
- return i;
-}
-
/*
* Figure out what the alignment of the objects will be given a set of
* flags, a user specified alignment and the size of the objects.
@@ -959,7 +931,7 @@ EXPORT_SYMBOL(kmalloc_order);
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
void *ret = kmalloc_order(size, flags, order);
- trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+ trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags);
return ret;
}
EXPORT_SYMBOL(kmalloc_order_trace);
diff --git a/mm/slob.c b/mm/slob.c
index f47811f09aca..2bd4f476c340 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -507,7 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
*m = size;
ret = (void *)m + minalign;
- trace_kmalloc_node(caller, ret,
+ trace_kmalloc_node(caller, ret, NULL,
size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -516,7 +516,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
gfp |= __GFP_COMP;
ret = slob_new_pages(gfp, order, node);
- trace_kmalloc_node(caller, ret,
+ trace_kmalloc_node(caller, ret, NULL,
size, PAGE_SIZE << order, gfp, node);
}
@@ -616,12 +616,12 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node, 0);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+ trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
} else {
b = slob_new_pages(flags, get_order(c->size), node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+ trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
PAGE_SIZE << get_order(c->size),
flags, node);
}
@@ -692,16 +692,33 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
}
EXPORT_SYMBOL(kmem_cache_free);
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
{
- __kmem_cache_free_bulk(s, size, p);
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ if (s)
+ kmem_cache_free(s, p[i]);
+ else
+ kfree(p[i]);
+ }
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
void **p)
{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ void *x = p[i] = kmem_cache_alloc(s, flags);
+
+ if (!x) {
+ kmem_cache_free_bulk(s, i, p);
+ return 0;
+ }
+ }
+ return i;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
diff --git a/mm/slub.c b/mm/slub.c
index b1281b8654bd..862dbd9af4f5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3257,7 +3257,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
{
void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size,
s->size, gfpflags);
return ret;
@@ -3280,7 +3280,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_lru);
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size);
- trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+ trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags);
ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
@@ -3292,7 +3292,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc_node(_RET_IP_, ret,
+ trace_kmem_cache_alloc_node(_RET_IP_, ret, s,
s->object_size, s->size, gfpflags, node);
return ret;
@@ -3306,7 +3306,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
{
void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(_RET_IP_, ret, s,
size, s->size, gfpflags, node);
ret = kasan_kmalloc(s, ret, size, gfpflags);
@@ -3464,9 +3464,6 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long tid;
- /* memcg_slab_free_hook() is already called for bulk free. */
- if (!tail)
- memcg_slab_free_hook(s, &head, 1);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3526,9 +3523,10 @@ redo:
}
static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
- void *head, void *tail, int cnt,
+ void *head, void *tail, void **p, int cnt,
unsigned long addr)
{
+ memcg_slab_free_hook(s, slab, p, cnt);
/*
* With KASAN enabled slab_free_freelist_hook modifies the freelist
* to remove objects, whose reuse must be delayed.
@@ -3550,7 +3548,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
if (!s)
return;
trace_kmem_cache_free(_RET_IP_, x, s->name);
- slab_free(s, virt_to_slab(x), x, NULL, 1, _RET_IP_);
+ slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3591,88 +3589,67 @@ static inline
int build_detached_freelist(struct kmem_cache *s, size_t size,
void **p, struct detached_freelist *df)
{
- size_t first_skipped_index = 0;
int lookahead = 3;
void *object;
struct folio *folio;
- struct slab *slab;
-
- /* Always re-init detached_freelist */
- df->slab = NULL;
-
- do {
- object = p[--size];
- /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
- } while (!object && size);
-
- if (!object)
- return 0;
+ size_t same;
+ object = p[--size];
folio = virt_to_folio(object);
if (!s) {
/* Handle kalloc'ed objects */
if (unlikely(!folio_test_slab(folio))) {
free_large_kmalloc(folio, object);
- p[size] = NULL; /* mark object processed */
+ df->slab = NULL;
return size;
}
/* Derive kmem_cache from object */
- slab = folio_slab(folio);
- df->s = slab->slab_cache;
+ df->slab = folio_slab(folio);
+ df->s = df->slab->slab_cache;
} else {
- slab = folio_slab(folio);
+ df->slab = folio_slab(folio);
df->s = cache_from_obj(s, object); /* Support for memcg */
}
- if (is_kfence_address(object)) {
- slab_free_hook(df->s, object, false);
- __kfence_free(object);
- p[size] = NULL; /* mark object processed */
- return size;
- }
-
/* Start new detached freelist */
- df->slab = slab;
- set_freepointer(df->s, object, NULL);
df->tail = object;
df->freelist = object;
- p[size] = NULL; /* mark object processed */
df->cnt = 1;
+ if (is_kfence_address(object))
+ return size;
+
+ set_freepointer(df->s, object, NULL);
+
+ same = size;
while (size) {
object = p[--size];
- if (!object)
- continue; /* Skip processed objects */
-
/* df->slab is always set at this point */
if (df->slab == virt_to_slab(object)) {
/* Opportunity build freelist */
set_freepointer(df->s, object, df->freelist);
df->freelist = object;
df->cnt++;
- p[size] = NULL; /* mark object processed */
-
+ same--;
+ if (size != same)
+ swap(p[size], p[same]);
continue;
}
/* Limit look ahead search */
if (!--lookahead)
break;
-
- if (!first_skipped_index)
- first_skipped_index = size + 1;
}
- return first_skipped_index;
+ return same;
}
/* Note that interrupts must be enabled when calling this function. */
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
- if (WARN_ON(!size))
+ if (!size)
return;
- memcg_slab_free_hook(s, p, size);
do {
struct detached_freelist df;
@@ -3680,7 +3657,8 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
if (!df.slab)
continue;
- slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_);
+ slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt,
+ _RET_IP_);
} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
@@ -3760,7 +3738,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
error:
slub_put_cpu_ptr(s->cpu_slab);
slab_post_alloc_hook(s, objcg, flags, i, p, false);
- __kmem_cache_free_bulk(s, i, p);
+ kmem_cache_free_bulk(s, i, p);
return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -4441,7 +4419,7 @@ void *__kmalloc(size_t size, gfp_t flags)
ret = slab_alloc(s, NULL, flags, _RET_IP_, size);
- trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
+ trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags);
ret = kasan_kmalloc(s, ret, size, flags);
@@ -4475,7 +4453,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
ret = kmalloc_large_node(size, flags, node);
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(_RET_IP_, ret, NULL,
size, PAGE_SIZE << get_order(size),
flags, node);
@@ -4489,7 +4467,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size);
- trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
+ trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node);
ret = kasan_kmalloc(s, ret, size, flags);
@@ -4581,7 +4559,7 @@ void kfree(const void *x)
return;
}
slab = folio_slab(folio);
- slab_free(slab->slab_cache, slab, object, NULL, 1, _RET_IP_);
+ slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
@@ -4890,6 +4868,9 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
+ if (sysfs_slab_alias(s, name))
+ return NULL;
+
s->refcount++;
/*
@@ -4898,11 +4879,6 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
*/
s->object_size = max(s->object_size, size);
s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
-
- if (sysfs_slab_alias(s, name)) {
- s->refcount--;
- s = NULL;
- }
}
return s;
@@ -4948,7 +4924,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
ret = slab_alloc(s, NULL, gfpflags, caller, size);
/* Honor the call site pointer we received. */
- trace_kmalloc(caller, ret, size, s->size, gfpflags);
+ trace_kmalloc(caller, ret, s, size, s->size, gfpflags);
return ret;
}
@@ -4964,7 +4940,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
ret = kmalloc_large_node(size, gfpflags, node);
- trace_kmalloc_node(caller, ret,
+ trace_kmalloc_node(caller, ret, NULL,
size, PAGE_SIZE << get_order(size),
gfpflags, node);
@@ -4979,7 +4955,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size);
/* Honor the call site pointer we received. */
- trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
+ trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node);
return ret;
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 2a65a89b5b4d..10b94d64cc25 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -307,7 +307,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
entry.val = 0;
if (folio_test_large(folio)) {
- if (IS_ENABLED(CONFIG_THP_SWAP))
+ if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
get_swap_pages(1, &entry, folio_nr_pages(folio));
goto out;
}
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 1739e8cb3291..c17021642234 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -4973,6 +4973,9 @@ int hci_suspend_sync(struct hci_dev *hdev)
return err;
}
+ /* Update event mask so only the allowed event can wakeup the host */
+ hci_set_event_mask_sync(hdev);
+
/* Only configure accept list if disconnect succeeded and wake
* isn't being prevented.
*/
@@ -4984,9 +4987,6 @@ int hci_suspend_sync(struct hci_dev *hdev)
/* Unpause to take care of updating scanning params */
hdev->scanning_paused = false;
- /* Update event mask so only the allowed event can wakeup the host */
- hci_set_event_mask_sync(hdev);
-
/* Enable event filter for paired devices */
hci_update_event_filter_sync(hdev);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index ae78490ecd3d..52668662ae8d 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -111,7 +111,8 @@ static struct l2cap_chan *__l2cap_get_chan_by_scid(struct l2cap_conn *conn,
}
/* Find channel with given SCID.
- * Returns locked channel. */
+ * Returns a reference locked channel.
+ */
static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn,
u16 cid)
{
@@ -119,15 +120,19 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn,
mutex_lock(&conn->chan_lock);
c = __l2cap_get_chan_by_scid(conn, cid);
- if (c)
- l2cap_chan_lock(c);
+ if (c) {
+ /* Only lock if chan reference is not 0 */
+ c = l2cap_chan_hold_unless_zero(c);
+ if (c)
+ l2cap_chan_lock(c);
+ }
mutex_unlock(&conn->chan_lock);
return c;
}
/* Find channel with given DCID.
- * Returns locked channel.
+ * Returns a reference locked channel.
*/
static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
u16 cid)
@@ -136,8 +141,12 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
mutex_lock(&conn->chan_lock);
c = __l2cap_get_chan_by_dcid(conn, cid);
- if (c)
- l2cap_chan_lock(c);
+ if (c) {
+ /* Only lock if chan reference is not 0 */
+ c = l2cap_chan_hold_unless_zero(c);
+ if (c)
+ l2cap_chan_lock(c);
+ }
mutex_unlock(&conn->chan_lock);
return c;
@@ -162,8 +171,12 @@ static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn,
mutex_lock(&conn->chan_lock);
c = __l2cap_get_chan_by_ident(conn, ident);
- if (c)
- l2cap_chan_lock(c);
+ if (c) {
+ /* Only lock if chan reference is not 0 */
+ c = l2cap_chan_hold_unless_zero(c);
+ if (c)
+ l2cap_chan_lock(c);
+ }
mutex_unlock(&conn->chan_lock);
return c;
@@ -497,6 +510,16 @@ void l2cap_chan_hold(struct l2cap_chan *c)
kref_get(&c->kref);
}
+struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c)
+{
+ BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
+
+ if (!kref_get_unless_zero(&c->kref))
+ return NULL;
+
+ return c;
+}
+
void l2cap_chan_put(struct l2cap_chan *c)
{
BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
@@ -1968,7 +1991,10 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
src_match = !bacmp(&c->src, src);
dst_match = !bacmp(&c->dst, dst);
if (src_match && dst_match) {
- l2cap_chan_hold(c);
+ c = l2cap_chan_hold_unless_zero(c);
+ if (!c)
+ continue;
+
read_unlock(&chan_list_lock);
return c;
}
@@ -1983,7 +2009,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
}
if (c1)
- l2cap_chan_hold(c1);
+ c1 = l2cap_chan_hold_unless_zero(c1);
read_unlock(&chan_list_lock);
@@ -4463,6 +4489,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
unlock:
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return err;
}
@@ -4577,6 +4604,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
done:
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return err;
}
@@ -5304,6 +5332,7 @@ send_move_response:
l2cap_send_move_chan_rsp(chan, result);
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return 0;
}
@@ -5396,6 +5425,7 @@ static void l2cap_move_continue(struct l2cap_conn *conn, u16 icid, u16 result)
}
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
}
static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid,
@@ -5425,6 +5455,7 @@ static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid,
l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
}
static int l2cap_move_channel_rsp(struct l2cap_conn *conn,
@@ -5488,6 +5519,7 @@ static int l2cap_move_channel_confirm(struct l2cap_conn *conn,
l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid);
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return 0;
}
@@ -5523,6 +5555,7 @@ static inline int l2cap_move_channel_confirm_rsp(struct l2cap_conn *conn,
}
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return 0;
}
@@ -5895,12 +5928,11 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn,
if (credits > max_credits) {
BT_ERR("LE credits overflow");
l2cap_send_disconn_req(chan, ECONNRESET);
- l2cap_chan_unlock(chan);
/* Return 0 so that we don't trigger an unnecessary
* command reject packet.
*/
- return 0;
+ goto unlock;
}
chan->tx_credits += credits;
@@ -5911,7 +5943,9 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn,
if (chan->tx_credits)
chan->ops->resume(chan);
+unlock:
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return 0;
}
@@ -7597,6 +7631,7 @@ drop:
done:
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
}
static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm,
@@ -8085,7 +8120,7 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c,
if (src_type != c->src_type)
continue;
- l2cap_chan_hold(c);
+ c = l2cap_chan_hold_unless_zero(c);
read_unlock(&chan_list_lock);
return c;
}
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index ae758ab1b558..2f91a8c2b678 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4723,7 +4723,6 @@ static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
else
status = MGMT_STATUS_FAILED;
- mgmt_pending_remove(cmd);
goto unlock;
}
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index bb01776d2d88..c96509c442a5 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -589,9 +589,13 @@ static int br_fill_ifinfo(struct sk_buff *skb,
}
done:
+ if (af) {
+ if (nlmsg_get_pos(skb) - (void *)af > nla_attr_size(0))
+ nla_nest_end(skb, af);
+ else
+ nla_nest_cancel(skb, af);
+ }
- if (af)
- nla_nest_end(skb, af);
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 251e666ba9a2..748be7253248 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -47,7 +47,7 @@ enum caif_states {
struct caifsock {
struct sock sk; /* must be first member */
struct cflayer layer;
- u32 flow_state;
+ unsigned long flow_state;
struct caif_connect_request conn_req;
struct mutex readlock;
struct dentry *debugfs_socket_dir;
@@ -56,38 +56,32 @@ struct caifsock {
static int rx_flow_is_on(struct caifsock *cf_sk)
{
- return test_bit(RX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ return test_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static int tx_flow_is_on(struct caifsock *cf_sk)
{
- return test_bit(TX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ return test_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void set_rx_flow_off(struct caifsock *cf_sk)
{
- clear_bit(RX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ clear_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void set_rx_flow_on(struct caifsock *cf_sk)
{
- set_bit(RX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ set_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void set_tx_flow_off(struct caifsock *cf_sk)
{
- clear_bit(TX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ clear_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void set_tx_flow_on(struct caifsock *cf_sk)
{
- set_bit(TX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ set_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void caif_read_lock(struct sock *sk)
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index dc92a67baea3..7d542eb46172 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -480,8 +480,8 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf
sk->sk_family = PF_DECnet;
sk->sk_protocol = 0;
sk->sk_allocation = gfp;
- sk->sk_sndbuf = sysctl_decnet_wmem[1];
- sk->sk_rcvbuf = sysctl_decnet_rmem[1];
+ sk->sk_sndbuf = READ_ONCE(sysctl_decnet_wmem[1]);
+ sk->sk_rcvbuf = READ_ONCE(sysctl_decnet_rmem[1]);
/* Initialization of DECnet Session Control Port */
scp = DN_SK(sk);
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 2b56218fc57c..4dfd68cf61c5 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -344,6 +344,7 @@ static int dsa_switch_do_lag_fdb_add(struct dsa_switch *ds, struct dsa_lag *lag,
ether_addr_copy(a->addr, addr);
a->vid = vid;
+ a->db = db;
refcount_set(&a->refcount, 1);
list_add_tail(&a->list, &lag->fdbs);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 46e8a5125853..452ff177e4da 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1042,6 +1042,7 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
{
+ u8 fib_notify_on_flag_change;
struct fib_alias *fa_match;
struct sk_buff *skb;
int err;
@@ -1063,14 +1064,16 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
WRITE_ONCE(fa_match->offload, fri->offload);
WRITE_ONCE(fa_match->trap, fri->trap);
+ fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change);
+
/* 2 means send notifications only if offload_failed was changed. */
- if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 &&
+ if (fib_notify_on_flag_change == 2 &&
READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
goto out;
WRITE_ONCE(fa_match->offload_failed, fri->offload_failed);
- if (!net->ipv4.sysctl_fib_notify_on_flag_change)
+ if (!fib_notify_on_flag_change)
goto out;
skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2faaaaf540ac..766881775abb 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -452,8 +452,8 @@ void tcp_init_sock(struct sock *sk)
icsk->icsk_sync_mss = tcp_sync_mss;
- WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
- WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
+ WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
+ WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
sk_sockets_allocated_inc(sk);
}
@@ -686,7 +686,7 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
int size_goal)
{
return skb->len < size_goal &&
- sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
!tcp_rtx_queue_empty(sk) &&
refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
tcp_skb_can_collapse_to(skb);
@@ -1724,7 +1724,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
cap = sk->sk_rcvbuf >> 1;
else
- cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
+ cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
val = min(val, cap);
WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
@@ -4459,9 +4459,18 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
}
- /* check the signature */
- genhash = tp->af_specific->calc_md5_hash(newhash, hash_expected,
- NULL, skb);
+ /* Check the signature.
+ * To support dual stack listeners, we need to handle
+ * IPv4-mapped case.
+ */
+ if (family == AF_INET)
+ genhash = tcp_v4_md5_hash_skb(newhash,
+ hash_expected,
+ NULL, skb);
+ else
+ genhash = tp->af_specific->calc_md5_hash(newhash,
+ hash_expected,
+ NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 07dbcbae7782..b1637990d570 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -426,7 +426,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
if (sk->sk_sndbuf < sndmem)
WRITE_ONCE(sk->sk_sndbuf,
- min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
+ min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -461,7 +461,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
/* Optimize this! */
int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
- int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
+ int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
@@ -534,7 +534,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
*/
static void tcp_init_buffer_space(struct sock *sk)
{
- int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
+ int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
@@ -574,16 +574,17 @@ static void tcp_clamp_window(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct net *net = sock_net(sk);
+ int rmem2;
icsk->icsk_ack.quick = 0;
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
- if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
+ if (sk->sk_rcvbuf < rmem2 &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
WRITE_ONCE(sk->sk_rcvbuf,
- min(atomic_read(&sk->sk_rmem_alloc),
- net->ipv4.sysctl_tcp_rmem[2]));
+ min(atomic_read(&sk->sk_rmem_alloc), rmem2));
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -724,7 +725,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
* <prev RTT . ><current RTT .. ><next RTT .... >
*/
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvmem, rcvbuf;
u64 rcvwin, grow;
@@ -745,7 +746,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
do_div(rcvwin, tp->advmss);
rcvbuf = min_t(u64, rcvwin * rcvmem,
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) {
WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
@@ -910,9 +911,9 @@ static void tcp_update_pacing_rate(struct sock *sk)
* end of slow start and should slow down.
*/
if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2)
- rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
+ rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio);
else
- rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
+ rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio);
rate *= max(tcp_snd_cwnd(tp), tp->packets_out);
@@ -2175,7 +2176,7 @@ void tcp_enter_loss(struct sock *sk)
* loss recovery is underway except recurring timeout(s) on
* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
*/
- tp->frto = net->ipv4.sysctl_tcp_frto &&
+ tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
(new_recovery || icsk->icsk_retransmits) &&
!inet_csk(sk)->icsk_mtup.probe_size;
}
@@ -3058,7 +3059,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
{
- u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
+ u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
struct tcp_sock *tp = tcp_sk(sk);
if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
@@ -3581,7 +3582,8 @@ static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
if (*last_oow_ack_time) {
s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
- if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
+ if (0 <= elapsed &&
+ elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
NET_INC_STATS(net, mib_idx);
return true; /* rate-limited: don't send yet! */
}
@@ -3629,7 +3631,7 @@ static void tcp_send_challenge_ack(struct sock *sk)
/* Then check host-wide RFC 5961 rate limit. */
now = jiffies / HZ;
if (now != challenge_timestamp) {
- u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
+ u32 ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
u32 half = (ack_limit + 1) >> 1;
challenge_timestamp = now;
@@ -4426,7 +4428,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
int mib_idx;
if (before(seq, tp->rcv_nxt))
@@ -4473,7 +4475,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
tcp_rcv_spurious_retrans(sk, skb);
@@ -5519,7 +5521,7 @@ send_now:
}
if (!tcp_is_sack(tp) ||
- tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
+ tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
goto send_now;
if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
@@ -5540,11 +5542,12 @@ send_now:
if (tp->srtt_us && tp->srtt_us < rtt)
rtt = tp->srtt_us;
- delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
+ delay = min_t(unsigned long,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
rtt * (NSEC_PER_USEC >> 3)/20);
sock_hold(sk);
hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
- sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
HRTIMER_MODE_REL_PINNED_SOFT);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d16e6e40f47b..586c102ce152 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1006,7 +1006,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
- tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
(inet_sk(sk)->tos & INET_ECN_MASK) :
inet_sk(sk)->tos;
@@ -1526,7 +1526,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
/* Set ToS of the new socket based upon the value of incoming SYN.
* ECT bits are set later in tcp_init_transfer().
*/
- if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
if (!dst) {
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index a501150deaa3..d58e672be31c 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -329,7 +329,7 @@ void tcp_update_metrics(struct sock *sk)
int m;
sk_dst_confirm(sk);
- if (net->ipv4.sysctl_tcp_nometrics_save || !dst)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst)
return;
rcu_read_lock();
@@ -385,7 +385,7 @@ void tcp_update_metrics(struct sock *sk)
if (tcp_in_initial_slowstart(tp)) {
/* Slow start still did not finish. */
- if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val && (tcp_snd_cwnd(tp) >> 1) > val)
@@ -401,7 +401,7 @@ void tcp_update_metrics(struct sock *sk)
} else if (!tcp_in_slow_start(tp) &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
- if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh));
@@ -418,7 +418,7 @@ void tcp_update_metrics(struct sock *sk)
tcp_metric_set(tm, TCP_METRIC_CWND,
(val + tp->snd_ssthresh) >> 1);
}
- if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val && tp->snd_ssthresh > val)
@@ -463,7 +463,7 @@ void tcp_init_metrics(struct sock *sk)
if (tcp_metric_locked(tm, TCP_METRIC_CWND))
tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
- val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ?
+ val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val) {
tp->snd_ssthresh = val;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c38e07b50639..4c376b6d8764 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -167,16 +167,13 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
- /* If this is the first data packet sent in response to the
- * previous received data,
- * and it is a reply for ato after last received packet,
- * increase pingpong count.
- */
- if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
- (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
- inet_csk_inc_pingpong_cnt(sk);
-
tp->lsndtime = now;
+
+ /* If it is a reply for ato after last received
+ * packet, enter pingpong mode.
+ */
+ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ inet_csk_enter_pingpong_mode(sk);
}
/* Account for an ACK we sent. */
@@ -230,7 +227,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
* which we interpret as a sign the remote TCP is not
* misinterpreting the window field as a signed quantity.
*/
- if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
(*rcv_wnd) = min_t(u32, space, U16_MAX);
@@ -241,7 +238,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
*rcv_wscale = 0;
if (wscale_ok) {
/* Set window scaling on max possible window */
- space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
*rcv_wscale = clamp_t(int, ilog2(space) - 15,
@@ -285,7 +282,7 @@ static u16 tcp_select_window(struct sock *sk)
* scaled window.
*/
if (!tp->rx_opt.rcv_wscale &&
- sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -1976,7 +1973,7 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
- r = tcp_min_rtt(tcp_sk(sk)) >> sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log;
+ r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
bytes += sk->sk_gso_max_size >> r;
@@ -1995,7 +1992,7 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
min_tso = ca_ops->min_tso_segs ?
ca_ops->min_tso_segs(sk) :
- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
return min_t(u32, tso_segs, sk->sk_gso_max_segs);
@@ -2507,7 +2504,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
if (sk->sk_pacing_status == SK_PACING_NONE)
limit = min_t(unsigned long, limit,
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
limit <<= factor;
if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 7f695c39d9a8..87c699d57b36 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1522,7 +1522,6 @@ static void mld_query_work(struct work_struct *work)
if (++cnt >= MLD_MAX_QUEUE) {
rework = true;
- schedule_delayed_work(&idev->mc_query_work, 0);
break;
}
}
@@ -1533,8 +1532,10 @@ static void mld_query_work(struct work_struct *work)
__mld_query_work(skb);
mutex_unlock(&idev->mc_lock);
- if (!rework)
- in6_dev_put(idev);
+ if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0))
+ return;
+
+ in6_dev_put(idev);
}
/* called with rcu_read_lock() */
@@ -1624,7 +1625,6 @@ static void mld_report_work(struct work_struct *work)
if (++cnt >= MLD_MAX_QUEUE) {
rework = true;
- schedule_delayed_work(&idev->mc_report_work, 0);
break;
}
}
@@ -1635,8 +1635,10 @@ static void mld_report_work(struct work_struct *work)
__mld_report_work(skb);
mutex_unlock(&idev->mc_lock);
- if (!rework)
- in6_dev_put(idev);
+ if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0))
+ return;
+
+ in6_dev_put(idev);
}
static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index ecf3a553a0dc..8c6c2d82c1cd 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -22,6 +22,11 @@
#include <linux/proc_fs.h>
#include <net/ping.h>
+static void ping_v6_destroy(struct sock *sk)
+{
+ inet6_destroy_sock(sk);
+}
+
/* Compatibility glue so we can support IPv6 when it's compiled as a module */
static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
int *addr_len)
@@ -181,6 +186,7 @@ struct proto pingv6_prot = {
.owner = THIS_MODULE,
.init = ping_init_sock,
.close = ping_close,
+ .destroy = ping_v6_destroy,
.connect = ip6_datagram_connect_v6_only,
.disconnect = __udp_disconnect,
.setsockopt = ipv6_setsockopt,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9d3ede293258..be09941fe6d9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -546,7 +546,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
if (np->repflow && ireq->pktopts)
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
- tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+ tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
(tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
(np->tclass & INET_ECN_MASK) :
np->tclass;
@@ -1314,7 +1314,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
/* Set ToS of the new socket based upon the value of incoming SYN.
* ECT bits are set later in tcp_init_transfer().
*/
- if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
/* Clone native IPv6 options from listening socket (if any)
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 15a73b7fdd75..1a9ada411879 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -377,9 +377,8 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
bool cancel_scan;
struct cfg80211_nan_func *func;
- spin_lock_bh(&local->fq.lock);
clear_bit(SDATA_STATE_RUNNING, &sdata->state);
- spin_unlock_bh(&local->fq.lock);
+ synchronize_rcu(); /* flush _ieee80211_wake_txqs() */
cancel_scan = rcu_access_pointer(local->scan_sdata) == sdata;
if (cancel_scan)
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index bd8f0f425be4..30d289044e71 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1271,7 +1271,7 @@ raise_win:
if (unlikely(th->syn))
new_win = min(new_win, 65535U) << tp->rx_opt.rcv_wscale;
if (!tp->rx_opt.rcv_wscale &&
- sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows)
+ READ_ONCE(sock_net(ssk)->ipv4.sysctl_tcp_workaround_signed_windows))
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 21a3ed64226e..7e1518bb6115 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1908,7 +1908,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvmem, rcvbuf;
u64 rcvwin, grow;
@@ -1926,7 +1926,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
do_div(rcvwin, advmss);
rcvbuf = min_t(u64, rcvwin * rcvmem,
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
if (rcvbuf > sk->sk_rcvbuf) {
u32 window_clamp;
@@ -2669,8 +2669,8 @@ static int mptcp_init_sock(struct sock *sk)
mptcp_ca_reset(sk);
sk_sockets_allocated_inc(sk);
- sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
+ sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
+ sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]);
return 0;
}
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 63e8892ec807..af28f3b60389 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1533,7 +1533,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
mptcp_sock_graft(ssk, sk->sk_socket);
iput(SOCK_INODE(sf));
WRITE_ONCE(msk->allow_infinite_fallback, false);
- return err;
+ return 0;
failed_unlink:
list_del(&subflow->node);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 646d5fd53604..9f976b11d896 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3340,6 +3340,8 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
if (err < 0)
return err;
}
+
+ cond_resched();
}
return 0;
@@ -9367,9 +9369,13 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
break;
}
}
+
+ cond_resched();
}
list_for_each_entry(set, &ctx->table->sets, list) {
+ cond_resched();
+
if (!nft_is_active_next(ctx->net, set))
continue;
if (!(set->flags & NFT_SET_MAP) ||
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index a364f8e5e698..87a9009d5234 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -843,11 +843,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
}
static int
-nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
+nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
struct sk_buff *nskb;
if (diff < 0) {
+ unsigned int min_len = skb_transport_offset(e->skb);
+
+ if (data_len < min_len)
+ return -EINVAL;
+
if (pskb_trim(e->skb, data_len))
return -ENOMEM;
} else if (diff > 0) {
diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c
index 15e4b7640dc0..da29e92c03e2 100644
--- a/net/netfilter/nft_queue.c
+++ b/net/netfilter/nft_queue.c
@@ -68,6 +68,31 @@ static void nft_queue_sreg_eval(const struct nft_expr *expr,
regs->verdict.code = ret;
}
+static int nft_queue_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ static const unsigned int supported_hooks = ((1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD) |
+ (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING));
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ case NFPROTO_INET:
+ case NFPROTO_BRIDGE:
+ break;
+ case NFPROTO_NETDEV: /* lacks okfn */
+ fallthrough;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, supported_hooks);
+}
+
static const struct nla_policy nft_queue_policy[NFTA_QUEUE_MAX + 1] = {
[NFTA_QUEUE_NUM] = { .type = NLA_U16 },
[NFTA_QUEUE_TOTAL] = { .type = NLA_U16 },
@@ -164,6 +189,7 @@ static const struct nft_expr_ops nft_queue_ops = {
.eval = nft_queue_eval,
.init = nft_queue_init,
.dump = nft_queue_dump,
+ .validate = nft_queue_validate,
.reduce = NFT_REDUCE_READONLY,
};
@@ -173,6 +199,7 @@ static const struct nft_expr_ops nft_queue_sreg_ops = {
.eval = nft_queue_sreg_eval,
.init = nft_queue_sreg_init,
.dump = nft_queue_sreg_dump,
+ .validate = nft_queue_validate,
.reduce = NFT_REDUCE_READONLY,
};
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index be29da09cc7a..3460abceba44 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -229,9 +229,8 @@ static struct sctp_association *sctp_association_init(
if (!sctp_ulpq_init(&asoc->ulpq, asoc))
goto fail_init;
- if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams,
- 0, gfp))
- goto fail_init;
+ if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp))
+ goto stream_free;
/* Initialize default path MTU. */
asoc->pathmtu = sp->pathmtu;
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index 6dc95dcc0ff4..ef9fceadef8d 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -137,7 +137,7 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
ret = sctp_stream_alloc_out(stream, outcnt, gfp);
if (ret)
- goto out_err;
+ return ret;
for (i = 0; i < stream->outcnt; i++)
SCTP_SO(stream, i)->state = SCTP_STREAM_OPEN;
@@ -145,22 +145,9 @@ int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt,
handle_in:
sctp_stream_interleave_init(stream);
if (!incnt)
- goto out;
-
- ret = sctp_stream_alloc_in(stream, incnt, gfp);
- if (ret)
- goto in_err;
-
- goto out;
+ return 0;
-in_err:
- sched->free(stream);
- genradix_free(&stream->in);
-out_err:
- genradix_free(&stream->out);
- stream->outcnt = 0;
-out:
- return ret;
+ return sctp_stream_alloc_in(stream, incnt, gfp);
}
int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid)
diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c
index 518b1b9bf89d..1ad565ed5627 100644
--- a/net/sctp/stream_sched.c
+++ b/net/sctp/stream_sched.c
@@ -160,7 +160,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc,
if (!SCTP_SO(&asoc->stream, i)->ext)
continue;
- ret = n->init_sid(&asoc->stream, i, GFP_KERNEL);
+ ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC);
if (ret)
goto err;
}
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 43509c7e90fc..f1c3b8eb4b3d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -517,7 +517,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
timer_setup(&sk->sk_timer, tipc_sk_timeout, 0);
sk->sk_shutdown = 0;
sk->sk_backlog_rcv = tipc_sk_backlog_rcv;
- sk->sk_rcvbuf = sysctl_tipc_rmem[1];
+ sk->sk_rcvbuf = READ_ONCE(sysctl_tipc_rmem[1]);
sk->sk_data_ready = tipc_data_ready;
sk->sk_write_space = tipc_write_space;
sk->sk_destruct = tipc_sock_destruct;
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 879b9024678e..9975df34d9c2 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -1376,8 +1376,13 @@ static int tls_device_down(struct net_device *netdev)
* by tls_device_free_ctx. rx_conf and tx_conf stay in TLS_HW.
* Now release the ref taken above.
*/
- if (refcount_dec_and_test(&ctx->refcount))
+ if (refcount_dec_and_test(&ctx->refcount)) {
+ /* sk_destruct ran after tls_device_down took a ref, and
+ * it returned early. Complete the destruction here.
+ */
+ list_del(&ctx->list);
tls_device_free_ctx(ctx);
+ }
}
up_write(&device_offload_lock);
diff --git a/scripts/remove-stale-files b/scripts/remove-stale-files
index 7adab4618035..379e86c71bed 100755
--- a/scripts/remove-stale-files
+++ b/scripts/remove-stale-files
@@ -41,3 +41,5 @@ if [ -n "${building_out_of_srctree}" ]; then
fi
rm -f scripts/extract-cert
+
+rm -f arch/x86/purgatory/kexec-purgatory.c
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index cc88f02c7562..93e8bc047a73 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -755,13 +755,14 @@ void evm_inode_post_removexattr(struct dentry *dentry, const char *xattr_name)
evm_update_evmxattr(dentry, xattr_name, NULL, 0);
}
-static int evm_attr_change(struct dentry *dentry, struct iattr *attr)
+static int evm_attr_change(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_backing_inode(dentry);
unsigned int ia_valid = attr->ia_valid;
- if ((!(ia_valid & ATTR_UID) || uid_eq(attr->ia_uid, inode->i_uid)) &&
- (!(ia_valid & ATTR_GID) || gid_eq(attr->ia_gid, inode->i_gid)) &&
+ if (!i_uid_needs_update(mnt_userns, attr, inode) &&
+ !i_gid_needs_update(mnt_userns, attr, inode) &&
(!(ia_valid & ATTR_MODE) || attr->ia_mode == inode->i_mode))
return 0;
@@ -775,7 +776,8 @@ static int evm_attr_change(struct dentry *dentry, struct iattr *attr)
* Permit update of file attributes when files have a valid EVM signature,
* except in the case of them having an immutable portable signature.
*/
-int evm_inode_setattr(struct dentry *dentry, struct iattr *attr)
+int evm_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
{
unsigned int ia_valid = attr->ia_valid;
enum integrity_status evm_status;
@@ -801,7 +803,7 @@ int evm_inode_setattr(struct dentry *dentry, struct iattr *attr)
return 0;
if (evm_status == INTEGRITY_PASS_IMMUTABLE &&
- !evm_attr_change(dentry, attr))
+ !evm_attr_change(mnt_userns, dentry, attr))
return 0;
integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
index 13753136f03f..419dc405c831 100644
--- a/security/integrity/ima/ima_kexec.c
+++ b/security/integrity/ima/ima_kexec.c
@@ -137,7 +137,7 @@ void ima_add_kexec_buffer(struct kimage *image)
/*
* Restore the measurement list from the previous kernel.
*/
-void ima_load_kexec_buffer(void)
+void __init ima_load_kexec_buffer(void)
{
void *kexec_buffer = NULL;
size_t kexec_buffer_size = 0;
diff --git a/security/security.c b/security/security.c
index 188b8f782220..f85afb02ea1c 100644
--- a/security/security.c
+++ b/security/security.c
@@ -1324,7 +1324,8 @@ int security_inode_permission(struct inode *inode, int mask)
return call_int_hook(inode_permission, 0, inode, mask);
}
-int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
+int security_inode_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
int ret;
@@ -1333,7 +1334,7 @@ int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
ret = call_int_hook(inode_setattr, 0, dentry, attr);
if (ret)
return ret;
- return evm_inode_setattr(dentry, attr);
+ return evm_inode_setattr(mnt_userns, dentry, attr);
}
EXPORT_SYMBOL_GPL(security_inode_setattr);
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index 00f5227c8459..a77b915d36a8 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -302,6 +302,7 @@
#define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */
#define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */
#define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */
+#define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h
index 0197042b7dfb..1ecdb911add8 100644
--- a/tools/include/uapi/asm-generic/fcntl.h
+++ b/tools/include/uapi/asm-generic/fcntl.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _ASM_GENERIC_FCNTL_H
#define _ASM_GENERIC_FCNTL_H
@@ -90,7 +91,7 @@
/* a horrid kludge trying to make sure that this will fail on old kernels */
#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
-#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)
+#define O_TMPFILE_MASK (__O_TMPFILE | O_DIRECTORY | O_CREAT)
#ifndef O_NDELAY
#define O_NDELAY O_NONBLOCK
@@ -115,11 +116,13 @@
#define F_GETSIG 11 /* for sockets. */
#endif
+#if __BITS_PER_LONG == 32 || defined(__KERNEL__)
#ifndef F_GETLK64
#define F_GETLK64 12 /* using 'struct flock64' */
#define F_SETLK64 13
#define F_SETLKW64 14
#endif
+#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */
#ifndef F_SETOWN_EX
#define F_SETOWN_EX 15
@@ -178,6 +181,10 @@ struct f_owner_ex {
blocking */
#define LOCK_UN 8 /* remove lock */
+/*
+ * LOCK_MAND support has been removed from the kernel. We leave the symbols
+ * here to not break legacy builds, but these should not be used in new code.
+ */
#define LOCK_MAND 32 /* This is a mandatory flock ... */
#define LOCK_READ 64 /* which allows concurrent read operations */
#define LOCK_WRITE 128 /* which allows concurrent write operations */
@@ -185,6 +192,7 @@ struct f_owner_ex {
#define F_LINUX_SPECIFIC_BASE 1024
+#ifndef HAVE_ARCH_STRUCT_FLOCK
struct flock {
short l_type;
short l_whence;
@@ -209,5 +217,6 @@ struct flock64 {
__ARCH_FLOCK64_PAD
#endif
};
+#endif /* HAVE_ARCH_STRUCT_FLOCK */
#endif /* _ASM_GENERIC_FCNTL_H */
diff --git a/tools/perf/scripts/python/arm-cs-trace-disasm.py b/tools/perf/scripts/python/arm-cs-trace-disasm.py
index 5f57d9829956..4339692a8d0b 100755
--- a/tools/perf/scripts/python/arm-cs-trace-disasm.py
+++ b/tools/perf/scripts/python/arm-cs-trace-disasm.py
@@ -61,7 +61,7 @@ def get_optional(perf_dict, field):
def get_offset(perf_dict, field):
if field in perf_dict:
- return f"+0x{perf_dict[field]:x}"
+ return "+%#x" % perf_dict[field]
return ""
def get_dso_file_path(dso_name, dso_build_id):
@@ -76,7 +76,7 @@ def get_dso_file_path(dso_name, dso_build_id):
else:
append = "/elf"
- dso_path = f"{os.environ['PERF_BUILDID_DIR']}/{dso_name}/{dso_build_id}{append}"
+ dso_path = os.environ['PERF_BUILDID_DIR'] + "/" + dso_name + "/" + dso_build_id + append;
# Replace duplicate slash chars to single slash char
dso_path = dso_path.replace('//', '/', 1)
return dso_path
@@ -94,8 +94,8 @@ def read_disam(dso_fname, dso_start, start_addr, stop_addr):
start_addr = start_addr - dso_start;
stop_addr = stop_addr - dso_start;
disasm = [ options.objdump_name, "-d", "-z",
- f"--start-address=0x{start_addr:x}",
- f"--stop-address=0x{stop_addr:x}" ]
+ "--start-address="+format(start_addr,"#x"),
+ "--stop-address="+format(stop_addr,"#x") ]
disasm += [ dso_fname ]
disasm_output = check_output(disasm).decode('utf-8').split('\n')
disasm_cache[addr_range] = disasm_output
@@ -109,12 +109,14 @@ def print_disam(dso_fname, dso_start, start_addr, stop_addr):
m = disasm_re.search(line)
if m is None:
continue
- print(f"\t{line}")
+ print("\t" + line)
def print_sample(sample):
- print(f"Sample = {{ cpu: {sample['cpu']:04} addr: 0x{sample['addr']:016x} " \
- f"phys_addr: 0x{sample['phys_addr']:016x} ip: 0x{sample['ip']:016x} " \
- f"pid: {sample['pid']} tid: {sample['tid']} period: {sample['period']} time: {sample['time']} }}")
+ print("Sample = { cpu: %04d addr: 0x%016x phys_addr: 0x%016x ip: 0x%016x " \
+ "pid: %d tid: %d period: %d time: %d }" % \
+ (sample['cpu'], sample['addr'], sample['phys_addr'], \
+ sample['ip'], sample['pid'], sample['tid'], \
+ sample['period'], sample['time']))
def trace_begin():
print('ARM CoreSight Trace Data Assembler Dump')
@@ -131,7 +133,7 @@ def common_start_str(comm, sample):
cpu = sample["cpu"]
pid = sample["pid"]
tid = sample["tid"]
- return f"{comm:>16} {pid:>5}/{tid:<5} [{cpu:04}] {sec:9}.{ns:09} "
+ return "%16s %5u/%-5u [%04u] %9u.%09u " % (comm, pid, tid, cpu, sec, ns)
# This code is copied from intel-pt-events.py for printing source code
# line and symbols.
@@ -171,7 +173,7 @@ def print_srccode(comm, param_dict, sample, symbol, dso):
glb_line_number = line_number
glb_source_file_name = source_file_name
- print(f"{start_str}{src_str}")
+ print(start_str, src_str)
def process_event(param_dict):
global cache_size
@@ -188,7 +190,7 @@ def process_event(param_dict):
symbol = get_optional(param_dict, "symbol")
if (options.verbose == True):
- print(f"Event type: {name}")
+ print("Event type: %s" % name)
print_sample(sample)
# If cannot find dso so cannot dump assembler, bail out
@@ -197,7 +199,7 @@ def process_event(param_dict):
# Validate dso start and end addresses
if ((dso_start == '[unknown]') or (dso_end == '[unknown]')):
- print(f"Failed to find valid dso map for dso {dso}")
+ print("Failed to find valid dso map for dso %s" % dso)
return
if (name[0:12] == "instructions"):
@@ -244,15 +246,15 @@ def process_event(param_dict):
# Handle CS_ETM_TRACE_ON packet if start_addr=0 and stop_addr=4
if (start_addr == 0 and stop_addr == 4):
- print(f"CPU{cpu}: CS_ETM_TRACE_ON packet is inserted")
+ print("CPU%d: CS_ETM_TRACE_ON packet is inserted" % cpu)
return
if (start_addr < int(dso_start) or start_addr > int(dso_end)):
- print(f"Start address 0x{start_addr:x} is out of range [ 0x{dso_start:x} .. 0x{dso_end:x} ] for dso {dso}")
+ print("Start address 0x%x is out of range [ 0x%x .. 0x%x ] for dso %s" % (start_addr, int(dso_start), int(dso_end), dso))
return
if (stop_addr < int(dso_start) or stop_addr > int(dso_end)):
- print(f"Stop address 0x{stop_addr:x} is out of range [ 0x{dso_start:x} .. 0x{dso_end:x} ] for dso {dso}")
+ print("Stop address 0x%x is out of range [ 0x%x .. 0x%x ] for dso %s" % (stop_addr, int(dso_start), int(dso_end), dso))
return
if (options.objdump_name != None):
@@ -267,6 +269,6 @@ def process_event(param_dict):
if path.exists(dso_fname):
print_disam(dso_fname, dso_vm_start, start_addr, stop_addr)
else:
- print(f"Failed to find dso {dso} for address range [ 0x{start_addr:x} .. 0x{stop_addr:x} ]")
+ print("Failed to find dso %s for address range [ 0x%x .. 0x%x ]" % (dso, start_addr, stop_addr))
print_srccode(comm, param_dict, sample, symbol, dso)
diff --git a/tools/perf/util/bpf-loader.c b/tools/perf/util/bpf-loader.c
index f8ad581ea247..cdd6463a5b68 100644
--- a/tools/perf/util/bpf-loader.c
+++ b/tools/perf/util/bpf-loader.c
@@ -63,20 +63,16 @@ static struct hashmap *bpf_map_hash;
static struct bpf_perf_object *
bpf_perf_object__next(struct bpf_perf_object *prev)
{
- struct bpf_perf_object *next;
-
- if (!prev)
- next = list_first_entry(&bpf_objects_list,
- struct bpf_perf_object,
- list);
- else
- next = list_next_entry(prev, list);
+ if (!prev) {
+ if (list_empty(&bpf_objects_list))
+ return NULL;
- /* Empty list is noticed here so don't need checking on entry. */
- if (&next->list == &bpf_objects_list)
+ return list_first_entry(&bpf_objects_list, struct bpf_perf_object, list);
+ }
+ if (list_is_last(&prev->list, &bpf_objects_list))
return NULL;
- return next;
+ return list_next_entry(prev, list);
}
#define bpf_perf_object__for_each(perf_obj, tmp) \
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index ecd377938eea..b3be5b1d9dbb 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -233,6 +233,33 @@ Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
return NULL;
}
+static int elf_read_program_header(Elf *elf, u64 vaddr, GElf_Phdr *phdr)
+{
+ size_t i, phdrnum;
+ u64 sz;
+
+ if (elf_getphdrnum(elf, &phdrnum))
+ return -1;
+
+ for (i = 0; i < phdrnum; i++) {
+ if (gelf_getphdr(elf, i, phdr) == NULL)
+ return -1;
+
+ if (phdr->p_type != PT_LOAD)
+ continue;
+
+ sz = max(phdr->p_memsz, phdr->p_filesz);
+ if (!sz)
+ continue;
+
+ if (vaddr >= phdr->p_vaddr && (vaddr < phdr->p_vaddr + sz))
+ return 0;
+ }
+
+ /* Not found any valid program header */
+ return -1;
+}
+
static bool want_demangle(bool is_kernel_sym)
{
return is_kernel_sym ? symbol_conf.demangle_kernel : symbol_conf.demangle;
@@ -1209,6 +1236,7 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
sym.st_value);
used_opd = true;
}
+
/*
* When loading symbols in a data mapping, ABS symbols (which
* has a value of SHN_ABS in its st_shndx) failed at
@@ -1227,6 +1255,17 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
gelf_getshdr(sec, &shdr);
+ /*
+ * If the attribute bit SHF_ALLOC is not set, the section
+ * doesn't occupy memory during process execution.
+ * E.g. ".gnu.warning.*" section is used by linker to generate
+ * warnings when calling deprecated functions, the symbols in
+ * the section aren't loaded to memory during process execution,
+ * so skip them.
+ */
+ if (!(shdr.sh_flags & SHF_ALLOC))
+ continue;
+
secstrs = secstrs_sym;
/*
@@ -1262,11 +1301,20 @@ dso__load_sym_internal(struct dso *dso, struct map *map, struct symsrc *syms_ss,
goto out_elf_end;
} else if ((used_opd && runtime_ss->adjust_symbols) ||
(!used_opd && syms_ss->adjust_symbols)) {
+ GElf_Phdr phdr;
+
+ if (elf_read_program_header(syms_ss->elf,
+ (u64)sym.st_value, &phdr)) {
+ pr_warning("%s: failed to find program header for "
+ "symbol: %s st_value: %#" PRIx64 "\n",
+ __func__, elf_name, (u64)sym.st_value);
+ continue;
+ }
pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " "
- "sh_addr: %#" PRIx64 " sh_offset: %#" PRIx64 "\n", __func__,
- (u64)sym.st_value, (u64)shdr.sh_addr,
- (u64)shdr.sh_offset);
- sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+ "p_vaddr: %#" PRIx64 " p_offset: %#" PRIx64 "\n",
+ __func__, (u64)sym.st_value, (u64)phdr.p_vaddr,
+ (u64)phdr.p_offset);
+ sym.st_value -= phdr.p_vaddr - phdr.p_offset;
}
demangled = demangle_sym(dso, kmodule, elf_name);
diff --git a/tools/testing/selftests/rseq/rseq-riscv.h b/tools/testing/selftests/rseq/rseq-riscv.h
index b86642f90d7f..3a391c9bf468 100644
--- a/tools/testing/selftests/rseq/rseq-riscv.h
+++ b/tools/testing/selftests/rseq/rseq-riscv.h
@@ -86,7 +86,7 @@ do { \
#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
RSEQ_INJECT_ASM(1) \
- "la "RSEQ_ASM_TMP_REG_1 ", " __rseq_str(cs_label) "\n" \
+ "la " RSEQ_ASM_TMP_REG_1 ", " __rseq_str(cs_label) "\n" \
REG_S RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(rseq_cs) "]\n" \
__rseq_str(label) ":\n"
@@ -103,17 +103,17 @@ do { \
#define RSEQ_ASM_OP_CMPEQ(var, expect, label) \
REG_L RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \
- "bne "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
+ "bne " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
__rseq_str(label) "\n"
#define RSEQ_ASM_OP_CMPEQ32(var, expect, label) \
- "lw "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \
- "bne "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
+ "lw " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \
+ "bne " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
__rseq_str(label) "\n"
#define RSEQ_ASM_OP_CMPNE(var, expect, label) \
REG_L RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n" \
- "beq "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
+ "beq " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(expect) "] ," \
__rseq_str(label) "\n"
#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \
@@ -127,12 +127,12 @@ do { \
REG_S RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(var) "]\n"
#define RSEQ_ASM_OP_R_LOAD_OFF(offset) \
- "add "RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(offset) "], " \
+ "add " RSEQ_ASM_TMP_REG_1 ", %[" __rseq_str(offset) "], " \
RSEQ_ASM_TMP_REG_1 "\n" \
REG_L RSEQ_ASM_TMP_REG_1 ", (" RSEQ_ASM_TMP_REG_1 ")\n"
#define RSEQ_ASM_OP_R_ADD(count) \
- "add "RSEQ_ASM_TMP_REG_1 ", " RSEQ_ASM_TMP_REG_1 \
+ "add " RSEQ_ASM_TMP_REG_1 ", " RSEQ_ASM_TMP_REG_1 \
", %[" __rseq_str(count) "]\n"
#define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label) \
@@ -194,8 +194,8 @@ int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[v] "m" (*v),
[expect] "r" (expect),
[newv] "r" (newv)
@@ -251,8 +251,8 @@ int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[v] "m" (*v),
[expectnot] "r" (expectnot),
[load] "m" (*load),
@@ -301,8 +301,8 @@ int rseq_addv(intptr_t *v, intptr_t count, int cpu)
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[v] "m" (*v),
[count] "r" (count)
RSEQ_INJECT_INPUT
@@ -352,8 +352,8 @@ int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[expect] "r" (expect),
[v] "m" (*v),
[newv] "r" (newv),
@@ -411,8 +411,8 @@ int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[expect] "r" (expect),
[v] "m" (*v),
[newv] "r" (newv),
@@ -472,8 +472,8 @@ int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[v] "m" (*v),
[expect] "r" (expect),
[v2] "m" (*v2),
@@ -532,8 +532,8 @@ int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[expect] "r" (expect),
[v] "m" (*v),
[newv] "r" (newv),
@@ -593,8 +593,8 @@ int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[expect] "r" (expect),
[v] "m" (*v),
[newv] "r" (newv),
@@ -651,8 +651,8 @@ int rseq_offset_deref_addv(intptr_t *ptr, off_t off, intptr_t inc, int cpu)
RSEQ_ASM_DEFINE_ABORT(4, abort)
: /* gcc asm goto does not allow outputs */
: [cpu_id] "r" (cpu),
- [current_cpu_id] "m" (__rseq_abi.cpu_id),
- [rseq_cs] "m" (__rseq_abi.rseq_cs),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
[ptr] "r" (ptr),
[off] "er" (off),
[inc] "er" (inc)
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
index 986b9458efb2..4177f9507bbe 100644
--- a/tools/testing/selftests/rseq/rseq.c
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -111,7 +111,8 @@ void rseq_init(void)
libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
- if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p) {
+ if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
+ *libc_rseq_size_p != 0) {
/* rseq registration owned by glibc */
rseq_offset = *libc_rseq_offset_p;
rseq_size = *libc_rseq_size_p;
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c
index 9b68658b6bb8..5b98f3ee58a5 100644
--- a/tools/vm/slabinfo.c
+++ b/tools/vm/slabinfo.c
@@ -233,6 +233,24 @@ static unsigned long read_slab_obj(struct slabinfo *s, const char *name)
return l;
}
+static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name)
+{
+ char x[128];
+ FILE *f;
+ size_t l;
+
+ snprintf(x, 128, "/sys/kernel/debug/slab/%s/%s", s->name, name);
+ f = fopen(x, "r");
+ if (!f) {
+ buffer[0] = 0;
+ l = 0;
+ } else {
+ l = fread(buffer, 1, sizeof(buffer), f);
+ buffer[l] = 0;
+ fclose(f);
+ }
+ return l;
+}
/*
* Put a size string together
@@ -409,14 +427,18 @@ static void show_tracking(struct slabinfo *s)
{
printf("\n%s: Kernel object allocation\n", s->name);
printf("-----------------------------------------------------------------------\n");
- if (read_slab_obj(s, "alloc_calls"))
+ if (read_debug_slab_obj(s, "alloc_traces"))
+ printf("%s", buffer);
+ else if (read_slab_obj(s, "alloc_calls"))
printf("%s", buffer);
else
printf("No Data\n");
printf("\n%s: Kernel object freeing\n", s->name);
printf("------------------------------------------------------------------------\n");
- if (read_slab_obj(s, "free_calls"))
+ if (read_debug_slab_obj(s, "free_traces"))
+ printf("%s", buffer);
+ else if (read_slab_obj(s, "free_calls"))
printf("%s", buffer);
else
printf("No Data\n");