-rw-r--r-- CREDITS | 8
-rw-r--r-- Documentation/ABI/testing/sysfs-fs-f2fs | 21
-rw-r--r-- Documentation/DMA-API.txt | 55
-rw-r--r-- Documentation/admin-guide/LSM/tomoyo.rst | 24
-rw-r--r-- Documentation/conf.py | 62
-rw-r--r-- Documentation/devicetree/bindings/pwm/pwm-bcm2835.txt | 4
-rw-r--r-- Documentation/devicetree/bindings/pwm/pwm-mediatek.txt | 6
-rw-r--r-- Documentation/devicetree/bindings/pwm/pwm-rockchip.txt | 11
-rw-r--r-- Documentation/devicetree/bindings/pwm/pwm-tiecap.txt | 1
-rw-r--r-- Documentation/devicetree/bindings/pwm/pwm-zx.txt | 22
-rw-r--r-- Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt | 1
-rw-r--r-- Documentation/devicetree/bindings/thermal/mediatek-thermal.txt | 1
-rw-r--r-- Documentation/devicetree/bindings/thermal/rockchip-thermal.txt | 1
-rw-r--r-- Documentation/devicetree/bindings/thermal/uniphier-thermal.txt | 64
-rw-r--r-- Documentation/filesystems/Locking | 2
-rw-r--r-- Documentation/filesystems/f2fs.txt | 19
-rw-r--r-- Documentation/filesystems/vfs.txt | 2
-rw-r--r-- Documentation/translations/ko_KR/memory-barriers.txt | 132
-rw-r--r-- MAINTAINERS | 31
-rw-r--r-- arch/alpha/include/uapi/asm/siginfo.h | 14
-rw-r--r-- arch/alpha/kernel/traps.c | 6
-rw-r--r-- arch/arm/Kconfig | 4
-rw-r--r-- arch/arm/include/asm/smp_scu.h | 1
-rw-r--r-- arch/arm/include/asm/suspend.h | 2
-rw-r--r-- arch/arm/include/debug/omap2plus.S | 1
-rw-r--r-- arch/arm/kernel/entry-armv.S | 2
-rw-r--r-- arch/arm/kernel/entry-common.S | 44
-rw-r--r-- arch/arm/kernel/head.S | 2
-rw-r--r-- arch/arm/kernel/hyp-stub.S | 1
-rw-r--r-- arch/arm/kernel/iwmmxt.S | 1
-rw-r--r-- arch/arm/kernel/sleep.S | 1
-rw-r--r-- arch/arm/kernel/stacktrace.c | 1
-rw-r--r-- arch/arm/mach-exynos/sleep.S | 1
-rw-r--r-- arch/arm/mach-imx/mach-imx27_visstrim_m10.c | 44
-rw-r--r-- arch/arm/mach-imx/mach-mx31moboard.c | 12
-rw-r--r-- arch/arm/mach-omap2/sleep34xx.S | 2
-rw-r--r-- arch/arm/mach-omap2/sleep44xx.S | 1
-rw-r--r-- arch/arm/mach-pxa/mioa701_bootresume.S | 2
-rw-r--r-- arch/arm/mach-rockchip/sleep.S | 2
-rw-r--r-- arch/arm/mm/cache-v4wb.S | 1
-rw-r--r-- arch/arm/mm/fault.c | 5
-rw-r--r-- arch/arm/mm/proc-v7-3level.S | 3
-rw-r--r-- arch/arm/mm/proc-xscale.S | 1
-rw-r--r-- arch/arm64/kernel/signal32.c | 23
-rw-r--r-- arch/blackfin/include/uapi/asm/siginfo.h | 30
-rw-r--r-- arch/frv/include/uapi/asm/siginfo.h | 2
-rw-r--r-- arch/ia64/include/uapi/asm/siginfo.h | 21
-rw-r--r-- arch/ia64/kernel/signal.c | 17
-rw-r--r-- arch/ia64/kernel/traps.c | 4
-rw-r--r-- arch/metag/include/asm/dma-mapping.h | 2
-rw-r--r-- arch/mips/include/uapi/asm/siginfo.h | 4
-rw-r--r-- arch/mips/kernel/signal32.c | 19
-rw-r--r-- arch/mips/kernel/traps.c | 2
-rw-r--r-- arch/nios2/include/asm/dma-mapping.h | 2
-rw-r--r-- arch/parisc/kernel/signal32.c | 31
-rw-r--r-- arch/powerpc/kernel/signal_32.c | 20
-rw-r--r-- arch/s390/include/asm/ap.h | 126
-rw-r--r-- arch/s390/include/asm/mmu.h | 7
-rw-r--r-- arch/s390/include/asm/mmu_context.h | 10
-rw-r--r-- arch/s390/include/asm/tlbflush.h | 30
-rw-r--r-- arch/s390/kernel/compat_signal.c | 32
-rw-r--r-- arch/s390/mm/gmap.c | 8
-rw-r--r-- arch/s390/mm/pgalloc.c | 20
-rw-r--r-- arch/sh/drivers/pci/fixups-dreamcast.c | 3
-rw-r--r-- arch/sparc/include/uapi/asm/siginfo.h | 9
-rw-r--r-- arch/sparc/kernel/signal32.c | 16
-rw-r--r-- arch/sparc/kernel/traps_32.c | 2
-rw-r--r-- arch/sparc/kernel/traps_64.c | 2
-rw-r--r-- arch/tile/include/asm/dma-mapping.h | 4
-rw-r--r-- arch/tile/include/uapi/asm/siginfo.h | 4
-rw-r--r-- arch/tile/kernel/compat_signal.c | 18
-rw-r--r-- arch/tile/kernel/traps.c | 2
-rw-r--r-- arch/x86/Kconfig | 1
-rw-r--r-- arch/x86/include/asm/cacheflush.h | 2
-rw-r--r-- arch/x86/include/asm/mem_encrypt.h | 4
-rw-r--r-- arch/x86/include/asm/mpspec.h | 1
-rw-r--r-- arch/x86/kernel/apic/apic.c | 2
-rw-r--r-- arch/x86/kernel/signal_compat.c | 21
-rw-r--r-- arch/x86/mm/mem_encrypt.c | 2
-rw-r--r-- drivers/acpi/nfit/Kconfig | 2
-rw-r--r-- drivers/acpi/nfit/core.c | 50
-rw-r--r-- drivers/base/dma-coherent.c | 85
-rw-r--r-- drivers/base/dma-mapping.c | 7
-rw-r--r-- drivers/block/rbd.c | 2
-rw-r--r-- drivers/char/virtio_console.c | 3
-rw-r--r-- drivers/dax/super.c | 12
-rw-r--r-- drivers/firmware/efi/cper.c | 10
-rw-r--r-- drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c | 5
-rw-r--r-- drivers/net/ethernet/amd/au1000_eth.c | 18
-rw-r--r-- drivers/net/ethernet/i825xx/lasi_82596.c | 6
-rw-r--r-- drivers/net/ethernet/i825xx/lib82596.c | 9
-rw-r--r-- drivers/net/ethernet/i825xx/sni_82596.c | 6
-rw-r--r-- drivers/net/ethernet/seeq/sgiseeq.c | 8
-rw-r--r-- drivers/nvdimm/btt.c | 197
-rw-r--r-- drivers/nvdimm/btt.h | 11
-rw-r--r-- drivers/nvdimm/btt_devs.c | 4
-rw-r--r-- drivers/nvdimm/bus.c | 27
-rw-r--r-- drivers/nvdimm/claim.c | 9
-rw-r--r-- drivers/nvdimm/core.c | 10
-rw-r--r-- drivers/nvdimm/label.c | 30
-rw-r--r-- drivers/nvdimm/namespace_devs.c | 6
-rw-r--r-- drivers/nvdimm/nd.h | 16
-rw-r--r-- drivers/nvdimm/pfn_devs.c | 53
-rw-r--r-- drivers/nvdimm/pmem.h | 14
-rw-r--r-- drivers/nvdimm/region_devs.c | 6
-rw-r--r-- drivers/of/device.c | 48
-rw-r--r-- drivers/pwm/Kconfig | 13
-rw-r--r-- drivers/pwm/Makefile | 1
-rw-r--r-- drivers/pwm/pwm-bcm2835.c | 2
-rw-r--r-- drivers/pwm/pwm-hibvt.c | 2
-rw-r--r-- drivers/pwm/pwm-mediatek.c | 78
-rw-r--r-- drivers/pwm/pwm-meson.c | 2
-rw-r--r-- drivers/pwm/pwm-pca9685.c | 14
-rw-r--r-- drivers/pwm/pwm-renesas-tpu.c | 1
-rw-r--r-- drivers/pwm/pwm-rockchip.c | 281
-rw-r--r-- drivers/pwm/pwm-samsung.c | 70
-rw-r--r-- drivers/pwm/pwm-tegra.c | 2
-rw-r--r-- drivers/pwm/pwm-tiecap.c | 90
-rw-r--r-- drivers/pwm/pwm-tiehrpwm.c | 122
-rw-r--r-- drivers/pwm/pwm-vt8500.c | 1
-rw-r--r-- drivers/pwm/pwm-zx.c | 282
-rw-r--r-- drivers/s390/block/dasd.c | 331
-rw-r--r-- drivers/s390/block/dasd_devmap.c | 8
-rw-r--r-- drivers/s390/block/dasd_int.h | 19
-rw-r--r-- drivers/s390/crypto/ap_asm.h | 9
-rw-r--r-- drivers/s390/crypto/ap_bus.c | 49
-rw-r--r-- drivers/s390/crypto/ap_bus.h | 47
-rw-r--r-- drivers/s390/crypto/ap_queue.c | 26
-rw-r--r-- drivers/scsi/NCR_Q720.c | 3
-rw-r--r-- drivers/thermal/Kconfig | 12
-rw-r--r-- drivers/thermal/Makefile | 1
-rw-r--r-- drivers/thermal/broadcom/bcm2835_thermal.c | 2
-rw-r--r-- drivers/thermal/hisi_thermal.c | 2
-rw-r--r-- drivers/thermal/int340x_thermal/acpi_thermal_rel.c | 2
-rw-r--r-- drivers/thermal/int340x_thermal/acpi_thermal_rel.h | 8
-rw-r--r-- drivers/thermal/int340x_thermal/int3400_thermal.c | 43
-rw-r--r-- drivers/thermal/int340x_thermal/int3406_thermal.c | 96
-rw-r--r-- drivers/thermal/int340x_thermal/processor_thermal_device.c | 2
-rw-r--r-- drivers/thermal/intel_pch_thermal.c | 12
-rw-r--r-- drivers/thermal/mtk_thermal.c | 88
-rw-r--r-- drivers/thermal/qoriq_thermal.c | 2
-rw-r--r-- drivers/thermal/rcar_gen3_thermal.c | 2
-rw-r--r-- drivers/thermal/rockchip_thermal.c | 65
-rw-r--r-- drivers/thermal/samsung/exynos_tmu.c | 2
-rw-r--r-- drivers/thermal/thermal_core.c | 31
-rw-r--r-- drivers/thermal/thermal_core.h | 1
-rw-r--r-- drivers/thermal/thermal_sysfs.c | 29
-rw-r--r-- drivers/thermal/uniphier_thermal.c | 384
-rw-r--r-- drivers/thermal/zx2967_thermal.c | 2
-rw-r--r-- drivers/usb/host/ohci-sm501.c | 7
-rw-r--r-- drivers/usb/host/ohci-tmio.c | 9
-rw-r--r-- fs/ceph/addr.c | 403
-rw-r--r-- fs/ceph/cache.c | 2
-rw-r--r-- fs/ceph/caps.c | 40
-rw-r--r-- fs/ceph/debugfs.c | 2
-rw-r--r-- fs/ceph/dir.c | 6
-rw-r--r-- fs/ceph/file.c | 50
-rw-r--r-- fs/ceph/inode.c | 53
-rw-r--r-- fs/ceph/mds_client.c | 37
-rw-r--r-- fs/ceph/mdsmap.c | 6
-rw-r--r-- fs/ceph/snap.c | 37
-rw-r--r-- fs/ceph/super.c | 78
-rw-r--r-- fs/ceph/super.h | 16
-rw-r--r-- fs/ceph/xattr.c | 8
-rw-r--r-- fs/ext2/ext2.h | 1
-rw-r--r-- fs/ext2/inode.c | 11
-rw-r--r-- fs/ext2/super.c | 4
-rw-r--r-- fs/ext4/ext4.h | 1
-rw-r--r-- fs/ext4/inode.c | 11
-rw-r--r-- fs/ext4/super.c | 4
-rw-r--r-- fs/f2fs/acl.c | 5
-rw-r--r-- fs/f2fs/checkpoint.c | 60
-rw-r--r-- fs/f2fs/data.c | 177
-rw-r--r-- fs/f2fs/dir.c | 7
-rw-r--r-- fs/f2fs/f2fs.h | 285
-rw-r--r-- fs/f2fs/file.c | 360
-rw-r--r-- fs/f2fs/gc.c | 115
-rw-r--r-- fs/f2fs/gc.h | 27
-rw-r--r-- fs/f2fs/inline.c | 142
-rw-r--r-- fs/f2fs/inode.c | 132
-rw-r--r-- fs/f2fs/namei.c | 43
-rw-r--r-- fs/f2fs/node.c | 79
-rw-r--r-- fs/f2fs/recovery.c | 83
-rw-r--r-- fs/f2fs/segment.c | 292
-rw-r--r-- fs/f2fs/segment.h | 47
-rw-r--r-- fs/f2fs/super.c | 433
-rw-r--r-- fs/f2fs/sysfs.c | 251
-rw-r--r-- fs/f2fs/xattr.c | 8
-rw-r--r-- fs/fcntl.c | 13
-rw-r--r-- fs/fuse/cuse.c | 4
-rw-r--r-- fs/fuse/dev.c | 13
-rw-r--r-- fs/fuse/dir.c | 30
-rw-r--r-- fs/fuse/file.c | 52
-rw-r--r-- fs/fuse/fuse_i.h | 8
-rw-r--r-- fs/inode.c | 21
-rw-r--r-- fs/internal.h | 2
-rw-r--r-- fs/lockd/clntlock.c | 6
-rw-r--r-- fs/namespace.c | 64
-rw-r--r-- fs/nfs/callback_proc.c | 2
-rw-r--r-- fs/nfs/delegation.c | 2
-rw-r--r-- fs/nfs/dir.c | 4
-rw-r--r-- fs/nfs/direct.c | 4
-rw-r--r-- fs/nfs/file.c | 17
-rw-r--r-- fs/nfs/inode.c | 10
-rw-r--r-- fs/nfs/internal.h | 1
-rw-r--r-- fs/nfs/nfs4_fs.h | 11
-rw-r--r-- fs/nfs/nfs4proc.c | 274
-rw-r--r-- fs/nfs/pagelist.c | 170
-rw-r--r-- fs/nfs/pnfs.c | 43
-rw-r--r-- fs/nfs/pnfs.h | 2
-rw-r--r-- fs/nfs/pnfs_nfs.c | 44
-rw-r--r-- fs/nfs/read.c | 2
-rw-r--r-- fs/nfs/super.c | 12
-rw-r--r-- fs/nfs/write.c | 457
-rw-r--r-- fs/open.c | 8
-rw-r--r-- fs/overlayfs/dir.c | 11
-rw-r--r-- fs/overlayfs/inode.c | 14
-rw-r--r-- fs/overlayfs/overlayfs.h | 8
-rw-r--r-- fs/overlayfs/readdir.c | 383
-rw-r--r-- fs/overlayfs/super.c | 11
-rw-r--r-- fs/overlayfs/util.c | 24
-rw-r--r-- fs/signalfd.c | 22
-rw-r--r-- fs/xattr.c | 15
-rw-r--r-- fs/xfs/xfs_aops.c | 13
-rw-r--r-- fs/xfs/xfs_aops.h | 1
-rw-r--r-- fs/xfs/xfs_buf.c | 4
-rw-r--r-- fs/xfs/xfs_buf.h | 3
-rw-r--r-- fs/xfs/xfs_iomap.c | 10
-rw-r--r-- fs/xfs/xfs_linux.h | 9
-rw-r--r-- fs/xfs/xfs_super.c | 25
-rw-r--r-- include/linux/capability.h | 2
-rw-r--r-- include/linux/ceph/ceph_fs.h | 6
-rw-r--r-- include/linux/ceph/libceph.h | 11
-rw-r--r-- include/linux/ceph/mon_client.h | 4
-rw-r--r-- include/linux/ceph/rados.h | 1
-rw-r--r-- include/linux/cper.h | 94
-rw-r--r-- include/linux/cpuset.h | 6
-rw-r--r-- include/linux/dax.h | 6
-rw-r--r-- include/linux/dcache.h | 14
-rw-r--r-- include/linux/dma-mapping.h | 28
-rw-r--r-- include/linux/efi.h | 4
-rw-r--r-- include/linux/f2fs_fs.h | 40
-rw-r--r-- include/linux/fs.h | 2
-rw-r--r-- include/linux/libnvdimm.h | 16
-rw-r--r-- include/linux/lsm_audit.h | 2
-rw-r--r-- include/linux/lsm_hooks.h | 7
-rw-r--r-- include/linux/mem_encrypt.h | 13
-rw-r--r-- include/linux/nfs_fs.h | 6
-rw-r--r-- include/linux/nfs_page.h | 6
-rw-r--r-- include/linux/nfs_xdr.h | 2
-rw-r--r-- include/linux/security.h | 8
-rw-r--r-- include/linux/signal.h | 22
-rw-r--r-- include/linux/sunrpc/sched.h | 2
-rw-r--r-- include/linux/sunrpc/xdr.h | 13
-rw-r--r-- include/linux/sunrpc/xprt.h | 5
-rw-r--r-- include/linux/thermal.h | 1
-rw-r--r-- include/linux/user_namespace.h | 9
-rw-r--r-- include/trace/events/f2fs.h | 113
-rw-r--r-- include/uapi/asm-generic/siginfo.h | 115
-rw-r--r-- include/uapi/linux/capability.h | 22
-rw-r--r-- kernel/cgroup/cpuset.c | 16
-rw-r--r-- kernel/exit.c | 4
-rw-r--r-- kernel/fork.c | 4
-rw-r--r-- kernel/irq/irqdesc.c | 24
-rw-r--r-- kernel/irq/msi.c | 5
-rw-r--r-- kernel/pid_namespace.c | 4
-rw-r--r-- kernel/power/process.c | 5
-rw-r--r-- kernel/ptrace.c | 6
-rw-r--r-- kernel/sched/core.c | 7
-rw-r--r-- kernel/sched/fair.c | 4
-rw-r--r-- kernel/signal.c | 72
-rw-r--r-- kernel/sys.c | 8
-rw-r--r-- kernel/user_namespace.c | 20
-rw-r--r-- lib/Kconfig | 3
-rw-r--r-- net/bluetooth/l2cap_core.c | 80
-rw-r--r-- net/ceph/mon_client.c | 6
-rw-r--r-- net/ceph/osd_client.c | 5
-rw-r--r-- net/sunrpc/backchannel_rqst.c | 4
-rw-r--r-- net/sunrpc/clnt.c | 12
-rw-r--r-- net/sunrpc/svcsock.c | 6
-rw-r--r-- net/sunrpc/xprt.c | 57
-rw-r--r-- net/sunrpc/xprtrdma/backchannel.c | 71
-rw-r--r-- net/sunrpc/xprtrdma/fmr_ops.c | 10
-rw-r--r-- net/sunrpc/xprtrdma/frwr_ops.c | 12
-rw-r--r-- net/sunrpc/xprtrdma/rpc_rdma.c | 902
-rw-r--r-- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 9
-rw-r--r-- net/sunrpc/xprtrdma/transport.c | 8
-rw-r--r-- net/sunrpc/xprtrdma/verbs.c | 22
-rw-r--r-- net/sunrpc/xprtrdma/xprt_rdma.h | 63
-rw-r--r-- net/sunrpc/xprtsock.c | 90
-rw-r--r-- scripts/selinux/genheaders/genheaders.c | 7
-rwxr-xr-x scripts/sphinx-pre-install | 1
-rw-r--r-- security/commoncap.c | 277
-rw-r--r-- security/lsm_audit.c | 2
-rw-r--r-- security/security.c | 5
-rw-r--r-- security/selinux/avc.c | 16
-rw-r--r-- security/selinux/hooks.c | 54
-rw-r--r-- security/selinux/include/avc.h | 2
-rw-r--r-- security/selinux/include/avc_ss.h | 2
-rw-r--r-- security/selinux/include/classmap.h | 2
-rw-r--r-- security/selinux/include/objsec.h | 2
-rw-r--r-- security/selinux/include/security.h | 4
-rw-r--r-- security/selinux/ss/avtab.c | 2
-rw-r--r-- security/selinux/ss/avtab.h | 2
-rw-r--r-- security/selinux/ss/constraint.h | 2
-rw-r--r-- security/selinux/ss/context.h | 2
-rw-r--r-- security/selinux/ss/ebitmap.c | 2
-rw-r--r-- security/selinux/ss/ebitmap.h | 2
-rw-r--r-- security/selinux/ss/hashtab.c | 2
-rw-r--r-- security/selinux/ss/hashtab.h | 2
-rw-r--r-- security/selinux/ss/mls.c | 2
-rw-r--r-- security/selinux/ss/mls.h | 2
-rw-r--r-- security/selinux/ss/mls_types.h | 2
-rw-r--r-- security/selinux/ss/policydb.c | 2
-rw-r--r-- security/selinux/ss/policydb.h | 2
-rw-r--r-- security/selinux/ss/services.c | 9
-rw-r--r-- security/selinux/ss/services.h | 2
-rw-r--r-- security/selinux/ss/sidtab.c | 2
-rw-r--r-- security/selinux/ss/sidtab.h | 2
-rw-r--r-- security/selinux/ss/symtab.c | 2
-rw-r--r-- security/selinux/ss/symtab.h | 2
-rw-r--r-- tools/include/uapi/linux/perf_event.h | 4
-rw-r--r-- tools/perf/Documentation/intel-pt.txt | 2
-rw-r--r-- tools/perf/Documentation/perf-mem.txt | 4
-rw-r--r-- tools/perf/Documentation/perf-record.txt | 5
-rw-r--r-- tools/perf/Documentation/perf-report.txt | 1
-rw-r--r-- tools/perf/Documentation/perf-script.txt | 2
-rw-r--r-- tools/perf/Documentation/perf-trace.txt | 2
-rw-r--r-- tools/perf/builtin-mem.c | 97
-rw-r--r-- tools/perf/builtin-record.c | 2
-rw-r--r-- tools/perf/builtin-script.c | 15
-rw-r--r-- tools/perf/builtin-stat.c | 2
-rw-r--r-- tools/perf/builtin-trace.c | 39
-rw-r--r-- tools/perf/perf.h | 1
-rw-r--r-- tools/perf/pmu-events/arch/powerpc/power9/frontend.json | 7
-rw-r--r-- tools/perf/pmu-events/arch/powerpc/power9/other.json | 120
-rw-r--r-- tools/perf/pmu-events/arch/powerpc/power9/pipeline.json | 7
-rw-r--r-- tools/perf/pmu-events/arch/powerpc/power9/pmc.json | 7
-rw-r--r-- tools/perf/tests/code-reading.c | 5
-rw-r--r-- tools/perf/tests/sample-parsing.c | 6
-rw-r--r-- tools/perf/ui/browsers/annotate.c | 3
-rw-r--r-- tools/perf/ui/browsers/hists.c | 8
-rw-r--r-- tools/perf/ui/stdio/hist.c | 10
-rw-r--r-- tools/perf/util/callchain.c | 49
-rw-r--r-- tools/perf/util/callchain.h | 9
-rw-r--r-- tools/perf/util/event.h | 1
-rw-r--r-- tools/perf/util/evsel.c | 19
-rw-r--r-- tools/perf/util/evsel.h | 1
-rw-r--r-- tools/perf/util/hist.c | 4
-rw-r--r-- tools/perf/util/hist.h | 1
-rw-r--r-- tools/perf/util/machine.c | 96
-rw-r--r-- tools/perf/util/parse-events.c | 24
-rw-r--r-- tools/perf/util/session.c | 3
-rw-r--r-- tools/perf/util/sort.c | 42
-rw-r--r-- tools/perf/util/sort.h | 1
-rw-r--r-- tools/perf/util/symbol.h | 1
-rw-r--r-- tools/perf/util/syscalltbl.c | 33
-rw-r--r-- tools/perf/util/syscalltbl.h | 3
-rw-r--r-- tools/testing/nvdimm/test/nfit.c | 4
-rw-r--r-- tools/testing/selftests/x86/mpx-mini-test.c | 3
-rw-r--r-- tools/testing/selftests/x86/protection_keys.c | 13
361 files changed, 8234 insertions, 4138 deletions
diff --git a/CREDITS b/CREDITS
index 0d2d60de5a25..9fbd2c77b546 100644
--- a/CREDITS
+++ b/CREDITS
@@ -2606,11 +2606,9 @@ E: tmolina@cablespeed.com
D: bug fixes, documentation, minor hackery
N: Paul Moore
-E: paul.moore@hp.com
-D: NetLabel author
-S: Hewlett-Packard
-S: 110 Spit Brook Road
-S: Nashua, NH 03062
+E: paul@paul-moore.com
+W: http://www.paul-moore.com
+D: NetLabel, SELinux, audit
N: James Morris
E: jmorris@namei.org
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 84c606fb3ca4..11b7f4ebea7c 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -57,6 +57,15 @@ Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description:
Controls the issue rate of small discard commands.
+What: /sys/fs/f2fs/<disk>/discard_granularity
+Date: July 2017
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description:
+ Controls the discard granularity of the inner discard thread; the
+ thread will not issue discards smaller than this granularity. The
+ unit size is one block; currently only values in the range
+ of [1, 512] are supported.
+
What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@@ -130,3 +139,15 @@ Date: June 2017
Contact: "Chao Yu" <yuchao0@huawei.com>
Description:
Controls current reserved blocks in system.
+
+What: /sys/fs/f2fs/<disk>/gc_urgent
+Date: August 2017
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+ Do background GC aggressively
+
+What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
+Date: August 2017
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+ Controls the sleep time of GC urgent mode
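
For context, both new entries are plain integer sysfs files, so urgent-mode GC
can be driven from userspace with ordinary file I/O. A minimal sketch in C,
assuming an f2fs volume on a device named sda1 (the device name is
illustrative, not part of the patch):

    /* Hypothetical helper: enable f2fs urgent-mode background GC. */
    #include <stdio.h>

    int main(void)
    {
        FILE *f;

        /* Wake the GC thread every 50 ms while urgent mode is active. */
        f = fopen("/sys/fs/f2fs/sda1/gc_urgent_sleep_time", "w");
        if (!f)
            return 1;
        fprintf(f, "50\n");
        fclose(f);

        /* 1 = do background GC aggressively, 0 = back to default. */
        f = fopen("/sys/fs/f2fs/sda1/gc_urgent", "w");
        if (!f)
            return 1;
        fprintf(f, "1\n");
        fclose(f);
        return 0;
    }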
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 45b29326d719..ac66ae2509a9 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -515,14 +515,15 @@ API at all.
::
void *
- dma_alloc_noncoherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
+ dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flag, unsigned long attrs)
-Identical to dma_alloc_coherent() except that the platform will
-choose to return either consistent or non-consistent memory as it sees
-fit. By using this API, you are guaranteeing to the platform that you
-have all the correct and necessary sync points for this memory in the
-driver should it choose to return non-consistent memory.
+Identical to dma_alloc_coherent() except that when the
+DMA_ATTR_NON_CONSISTENT flag is passed in the attrs argument, the
+platform will choose to return either consistent or non-consistent memory
+as it sees fit. By using this API, you are guaranteeing to the platform
+that you have all the correct and necessary sync points for this memory
+in the driver should it choose to return non-consistent memory.
Note: where the platform can return consistent memory, it will
guarantee that the sync points become nops.
@@ -535,12 +536,13 @@ that simply cannot make consistent memory.
::
void
- dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
- dma_addr_t dma_handle)
+ dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_handle, unsigned long attrs)
-Free memory allocated by the nonconsistent API. All parameters must
-be identical to those passed in (and returned by
-dma_alloc_noncoherent()).
+Free memory allocated by dma_alloc_attrs(). All common parameters must
+be identical to those otherwise passed to dma_free_coherent(), and the
+attrs argument must be identical to the attrs passed to
+dma_alloc_attrs().
::
@@ -564,8 +566,8 @@ memory or doing partial flushes.
dma_cache_sync(struct device *dev, void *vaddr, size_t size,
enum dma_data_direction direction)
-Do a partial sync of memory that was allocated by
-dma_alloc_noncoherent(), starting at virtual address vaddr and
+Do a partial sync of memory that was allocated by dma_alloc_attrs() with
+the DMA_ATTR_NON_CONSISTENT flag starting at virtual address vaddr and
continuing on for size. Again, you *must* observe the cache line
boundaries when doing this.
@@ -590,34 +592,11 @@ size is the size of the area (must be multiples of PAGE_SIZE).
flags can be ORed together and are:
-- DMA_MEMORY_MAP - request that the memory returned from
- dma_alloc_coherent() be directly writable.
-
-- DMA_MEMORY_IO - request that the memory returned from
- dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc.
-
-One or both of these flags must be present.
-
-- DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
- dma_alloc_coherent of any child devices of this one (for memory residing
- on a bridge).
-
- DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
Do not allow dma_alloc_coherent() to fall back to system memory when
it's out of memory in the declared region.
-The return value will be either DMA_MEMORY_MAP or DMA_MEMORY_IO and
-must correspond to a passed in flag (i.e. no returning DMA_MEMORY_IO
-if only DMA_MEMORY_MAP were passed in) for success or zero for
-failure.
-
-Note, for DMA_MEMORY_IO returns, all subsequent memory returned by
-dma_alloc_coherent() may no longer be accessed directly, but instead
-must be accessed using the correct bus functions. If your driver
-isn't prepared to handle this contingency, it should not specify
-DMA_MEMORY_IO in the input flags.
-
-As a simplification for the platforms, only **one** such region of
+As a simplification for the platforms, only *one* such region of
memory may be declared per device.
For reasons of efficiency, most platforms choose to track the declared
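
Taken together, the hunks above replace the old dma_alloc_noncoherent() pair
with dma_alloc_attrs() plus DMA_ATTR_NON_CONSISTENT. A minimal driver-side
sketch of the documented calls, assuming a valid struct device *dev and a
made-up 4 KiB buffer size (a sketch, not a definitive implementation):

    #include <linux/dma-mapping.h>
    #include <linux/string.h>

    static int example_noncoherent(struct device *dev)
    {
        dma_addr_t handle;
        size_t size = 4096;        /* placeholder size */
        void *vaddr;

        vaddr = dma_alloc_attrs(dev, size, &handle, GFP_KERNEL,
                                DMA_ATTR_NON_CONSISTENT);
        if (!vaddr)
            return -ENOMEM;

        /* CPU fills the buffer, then makes it visible to the device. */
        memset(vaddr, 0, size);
        dma_cache_sync(dev, vaddr, size, DMA_TO_DEVICE);

        dma_free_attrs(dev, size, vaddr, handle, DMA_ATTR_NON_CONSISTENT);
        return 0;
    }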
diff --git a/Documentation/admin-guide/LSM/tomoyo.rst b/Documentation/admin-guide/LSM/tomoyo.rst
index a5947218fa64..e2d6b6e15082 100644
--- a/Documentation/admin-guide/LSM/tomoyo.rst
+++ b/Documentation/admin-guide/LSM/tomoyo.rst
@@ -9,8 +9,8 @@ TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel.
LiveCD-based tutorials are available at
-http://tomoyo.sourceforge.jp/1.7/1st-step/ubuntu10.04-live/
-http://tomoyo.sourceforge.jp/1.7/1st-step/centos5-live/
+http://tomoyo.sourceforge.jp/1.8/ubuntu12.04-live.html
+http://tomoyo.sourceforge.jp/1.8/centos6-live.html
Though these tutorials use non-LSM version of TOMOYO, they are useful for you
to know what TOMOYO is.
@@ -21,35 +21,35 @@ How to enable TOMOYO?
Build the kernel with ``CONFIG_SECURITY_TOMOYO=y`` and pass ``security=tomoyo`` on
kernel's command line.
-Please see http://tomoyo.sourceforge.jp/2.3/ for details.
+Please see http://tomoyo.osdn.jp/2.5/ for details.
Where is documentation?
=======================
User <-> Kernel interface documentation is available at
-http://tomoyo.sourceforge.jp/2.3/policy-reference.html .
+http://tomoyo.osdn.jp/2.5/policy-specification/index.html .
Materials we prepared for seminars and symposiums are available at
-http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 .
+http://osdn.jp/projects/tomoyo/docs/?category_id=532&language_id=1 .
Below lists are chosen from three aspects.
What is TOMOYO?
TOMOYO Linux Overview
- http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf
+ http://osdn.jp/projects/tomoyo/docs/lca2009-takeda.pdf
TOMOYO Linux: pragmatic and manageable security for Linux
- http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf
+ http://osdn.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf
TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box
- http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf
+ http://osdn.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf
What can TOMOYO do?
Deep inside TOMOYO Linux
- http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf
+ http://osdn.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf
The role of "pathname based access control" in security.
- http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf
+ http://osdn.jp/projects/tomoyo/docs/lfj2008-bof.pdf
History of TOMOYO?
Realities of Mainlining
- http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf
+ http://osdn.jp/projects/tomoyo/docs/lfj2008.pdf
What is future plan?
====================
@@ -60,6 +60,6 @@ multiple LSM modules at the same time. We feel sorry that you have to give up
SELinux/SMACK/AppArmor etc. when you want to use TOMOYO.
We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM
-version of TOMOYO, available at http://tomoyo.sourceforge.jp/1.7/ .
+version of TOMOYO, available at http://tomoyo.osdn.jp/1.8/ .
LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning
to port non-LSM version's functionalities to LSM versions.
diff --git a/Documentation/conf.py b/Documentation/conf.py
index f9054ab60cb1..63857d33778c 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -271,10 +271,29 @@ latex_elements = {
# Additional stuff for the LaTeX preamble.
'preamble': '''
- \\usepackage{ifthen}
+ % Use some font with UTF-8 support with XeLaTeX
+ \\usepackage{fontspec}
+ \\setsansfont{DejaVu Serif}
+ \\setromanfont{DejaVu Sans}
+ \\setmonofont{DejaVu Sans Mono}
+
+ '''
+}
+
+# Fix reference escape troubles with Sphinx 1.4.x
+if major == 1 and minor > 3:
+ latex_elements['preamble'] += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
+
+if major == 1 and minor <= 4:
+ latex_elements['preamble'] += '\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}'
+elif major == 1 and (minor > 5 or (minor == 5 and patch >= 3)):
+ latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=1in'
+ latex_elements['preamble'] += '\\fvset{fontsize=auto}\n'
- % Allow generate some pages in landscape
- \\usepackage{lscape}
+# Customize notice background colors on Sphinx < 1.6:
+if major == 1 and minor < 6:
+ latex_elements['preamble'] += '''
+ \\usepackage{ifthen}
% Put notes in color and let them be inside a table
\\definecolor{NoteColor}{RGB}{204,255,255}
@@ -325,27 +344,26 @@ latex_elements = {
}
\\makeatother
- % Use some font with UTF-8 support with XeLaTeX
- \\usepackage{fontspec}
- \\setsansfont{DejaVu Serif}
- \\setromanfont{DejaVu Sans}
- \\setmonofont{DejaVu Sans Mono}
-
- % To allow adjusting table sizes
- \\usepackage{adjustbox}
-
'''
-}
-
-# Fix reference escape troubles with Sphinx 1.4.x
-if major == 1 and minor > 3:
- latex_elements['preamble'] += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
-if major == 1 and minor <= 4:
- latex_elements['preamble'] += '\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}'
-elif major == 1 and (minor > 5 or (minor == 5 and patch >= 3)):
- latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=1in'
- latex_elements['preamble'] += '\\fvset{fontsize=auto}\n'
+# With Sphinx 1.6, it is possible to change the background color directly
+# by using:
+# \definecolor{sphinxnoteBgColor}{RGB}{204,255,255}
+# \definecolor{sphinxwarningBgColor}{RGB}{255,204,204}
+# \definecolor{sphinxattentionBgColor}{RGB}{255,255,204}
+# \definecolor{sphinximportantBgColor}{RGB}{192,255,204}
+#
+# However, it requires using the sphinx heavy box with:
+#
+# \renewenvironment{sphinxlightbox} {%
+# \\begin{sphinxheavybox}
+# }
+# \\end{sphinxheavybox}
+# }
+#
+# Unfortunately, the implementation is buggy: if a note is inside a
+# table, it isn't displayed well. So, for now, let's use boring
+# black and white notes.
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
diff --git a/Documentation/devicetree/bindings/pwm/pwm-bcm2835.txt b/Documentation/devicetree/bindings/pwm/pwm-bcm2835.txt
index cf573e85b11d..8cf87d1bfca5 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-bcm2835.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-bcm2835.txt
@@ -6,7 +6,7 @@ Required properties:
- clocks: This clock defines the base clock frequency of the PWM hardware
system, the period and the duty_cycle of the PWM signal is a multiple of
the base period.
-- #pwm-cells: Should be 2. See pwm.txt in this directory for a description of
+- #pwm-cells: Should be 3. See pwm.txt in this directory for a description of
the cells format.
Examples:
@@ -15,7 +15,7 @@ pwm@2020c000 {
compatible = "brcm,bcm2835-pwm";
reg = <0x2020c000 0x28>;
clocks = <&clk_pwm>;
- #pwm-cells = <2>;
+ #pwm-cells = <3>;
};
clocks {
diff --git a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
index 54c59b0560ad..ef8bd3cb67ab 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
@@ -2,6 +2,8 @@ MediaTek PWM controller
Required properties:
- compatible: should be "mediatek,<name>-pwm":
+ - "mediatek,mt2712-pwm": found on mt2712 SoC.
+ - "mediatek,mt7622-pwm": found on mt7622 SoC.
- "mediatek,mt7623-pwm": found on mt7623 SoC.
- reg: physical base address and length of the controller's registers.
- #pwm-cells: must be 2. See pwm.txt in this directory for a description of
@@ -10,7 +12,9 @@ Required properties:
- clock-names: must contain the following:
- "top": the top clock generator
- "main": clock used by the PWM core
- - "pwm1-5": the five per PWM clocks
+ - "pwm1-8": the eight per PWM clocks for mt2712
+ - "pwm1-6": the six per PWM clocks for mt7622
+ - "pwm1-5": the five per PWM clocks for mt7623
- pinctrl-names: Must contain a "default" entry.
- pinctrl-0: One property must exist for each entry in pinctrl-names.
See pinctrl/pinctrl-bindings.txt for details of the property values.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt
index b8be3d09ee26..2c5e52a5bede 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt
@@ -3,10 +3,17 @@ Rockchip PWM controller
Required properties:
- compatible: should be "rockchip,<name>-pwm"
"rockchip,rk2928-pwm": found on RK29XX,RK3066 and RK3188 SoCs
- "rockchip,rk3288-pwm": found on RK3288 SoC
+ "rockchip,rk3288-pwm": found on RK3288 SOC
+ "rockchip,rv1108-pwm", "rockchip,rk3288-pwm": found on RV1108 SoC
"rockchip,vop-pwm": found integrated in VOP on RK3288 SoC
- reg: physical base address and length of the controller's registers
- - clocks: phandle and clock specifier of the PWM reference clock
+ - clocks: See ../clock/clock-bindings.txt
+ - For older hardware (rk2928, rk3066, rk3188, rk3228, rk3288, rk3399):
+ - There is one clock that's used both to derive the functional clock
+ for the device and as the bus clock.
+ - For newer hardware (rk3328 and future SoCs): specified by name
+ - "pwm": This is used to derive the functional clock.
+ - "pclk": This is the APB bus clock.
- #pwm-cells: must be 2 (rk2928) or 3 (rk3288). See pwm.txt in this directory
for a description of the cell format.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt b/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt
index 8007e839a716..06a363d9ccef 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt
@@ -6,6 +6,7 @@ Required properties:
for am4372 - compatible = "ti,am4372-ecap", "ti,am3352-ecap", "ti,am33xx-ecap";
for da850 - compatible = "ti,da850-ecap", "ti,am3352-ecap", "ti,am33xx-ecap";
for dra746 - compatible = "ti,dra746-ecap", "ti,am3352-ecap";
+ for 66ak2g - compatible = "ti,k2g-ecap", "ti,am3352-ecap";
- #pwm-cells: should be 3. See pwm.txt in this directory for a description of
the cells format. The PWM channel index ranges from 0 to 4. The only third
cell flag supported by this binding is PWM_POLARITY_INVERTED.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-zx.txt b/Documentation/devicetree/bindings/pwm/pwm-zx.txt
new file mode 100644
index 000000000000..a6bcc75c9164
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/pwm-zx.txt
@@ -0,0 +1,22 @@
+ZTE ZX PWM controller
+
+Required properties:
+ - compatible: Should be "zte,zx296718-pwm".
+ - reg: Physical base address and length of the controller's registers.
+ - clocks : The phandle and specifier referencing the controller's clocks.
+ - clock-names: "pclk" for PCLK, "wclk" for WCLK to the PWM controller. The
+ PCLK is for register access, while WCLK is the reference clock for
+ calculating period and duty cycles.
+ - #pwm-cells: Should be 3. See pwm.txt in this directory for a description of
+ the cells format.
+
+Example:
+
+ pwm: pwm@1439000 {
+ compatible = "zte,zx296718-pwm";
+ reg = <0x1439000 0x1000>;
+ clocks = <&lsp1crm LSP1_PWM_PCLK>,
+ <&lsp1crm LSP1_PWM_WCLK>;
+ clock-names = "pclk", "wclk";
+ #pwm-cells = <3>;
+ };
diff --git a/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt b/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt
index b067e84a94b5..1aadc804dae4 100644
--- a/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt
+++ b/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt
@@ -6,7 +6,6 @@ Required Properties:
- "renesas,tpu-r8a73a4": for R8A77A4 (R-Mobile APE6) compatible PWM controller.
- "renesas,tpu-r8a7740": for R8A7740 (R-Mobile A1) compatible PWM controller.
- "renesas,tpu-r8a7790": for R8A7790 (R-Car H2) compatible PWM controller.
- - "renesas,tpu-sh7372": for SH7372 (SH-Mobile AP4) compatible PWM controller.
- "renesas,tpu": for generic R-Car TPU PWM controller.
- reg: Base address and length of each memory resource used by the PWM
diff --git a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt
index e2f494d74d8a..0d73ea5e9c0c 100644
--- a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt
@@ -11,6 +11,7 @@ Required properties:
- compatible:
- "mediatek,mt8173-thermal" : For MT8173 family of SoCs
- "mediatek,mt2701-thermal" : For MT2701 family of SoCs
+ - "mediatek,mt2712-thermal" : For MT2712 family of SoCs
- reg: Address range of the thermal controller
- interrupts: IRQ for the thermal controller
- clocks, clock-names: Clocks needed for the thermal controller. required
diff --git a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
index 43003aec94bd..e3a6234fb1ac 100644
--- a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
@@ -4,6 +4,7 @@ Required properties:
- compatible : should be "rockchip,<name>-tsadc"
"rockchip,rk3228-tsadc": found on RK3228 SoCs
"rockchip,rk3288-tsadc": found on RK3288 SoCs
+ "rockchip,rk3328-tsadc": found on RK3328 SoCs
"rockchip,rk3368-tsadc": found on RK3368 SoCs
"rockchip,rk3399-tsadc": found on RK3399 SoCs
- reg : physical base address of the controller and length of memory mapped
diff --git a/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt b/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt
new file mode 100644
index 000000000000..686c0b42ed3f
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt
@@ -0,0 +1,64 @@
+* UniPhier Thermal bindings
+
+This describes the devicetree bindings for the thermal monitor supported by
+the PVT (Process, Voltage and Temperature) monitoring unit implemented on
+Socionext UniPhier SoCs.
+
+Required properties:
+- compatible :
+ - "socionext,uniphier-pxs2-thermal" : For UniPhier PXs2 SoC
+ - "socionext,uniphier-ld20-thermal" : For UniPhier LD20 SoC
+- interrupts : IRQ for the temperature alarm
+- #thermal-sensor-cells : Should be 0. See ./thermal.txt for details.
+
+Optional properties:
+- socionext,tmod-calibration: A pair of calibrated values referenced by the PVT,
+ in case the values aren't set on the SoC,
+ e.g. on a reference board.
+
+Example:
+
+ sysctrl@61840000 {
+ compatible = "socionext,uniphier-ld20-sysctrl",
+ "simple-mfd", "syscon";
+ reg = <0x61840000 0x10000>;
+ ...
+ pvtctl: pvtctl {
+ compatible = "socionext,uniphier-ld20-thermal";
+ interrupts = <0 3 1>;
+ #thermal-sensor-cells = <0>;
+ };
+ ...
+ };
+
+ thermal-zones {
+ cpu_thermal {
+ polling-delay-passive = <250>; /* 250ms */
+ polling-delay = <1000>; /* 1000ms */
+ thermal-sensors = <&pvtctl>;
+
+ trips {
+ cpu_crit: cpu_crit {
+ temperature = <110000>; /* 110C */
+ hysteresis = <2000>;
+ type = "critical";
+ };
+ cpu_alert: cpu_alert {
+ temperature = <100000>; /* 100C */
+ hysteresis = <2000>;
+ type = "passive";
+ };
+ };
+
+ cooling-maps {
+ map0 {
+ trip = <&cpu_alert>;
+ cooling-device = <&cpu0 (-1) (-1)>;
+ };
+ map1 {
+ trip = <&cpu_alert>;
+ cooling-device = <&cpu2 (-1) (-1)>;
+ };
+ };
+ };
+ };
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index fe25787ff6d4..75d2d57e2c44 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -22,7 +22,7 @@ prototypes:
struct vfsmount *(*d_automount)(struct path *path);
int (*d_manage)(const struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
- unsigned int);
+ unsigned int, unsigned int);
locking rules:
rename_lock ->d_lock may block rcu-walk
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index 273ccb26885e..13c2ff034348 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -164,6 +164,16 @@ io_bits=%u Set the bit size of write IO requests. It should be set
with "mode=lfs".
usrquota Enable plain user disk quota accounting.
grpquota Enable plain group disk quota accounting.
+prjquota Enable plain project quota accounting.
+usrjquota=<file> Appoint specified file and type during mount, so that quota
+grpjquota=<file> information can be properly updated during recovery flow,
+prjjquota=<file> <quota file>: must be in root directory;
+jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1].
+offusrjquota Turn off user journalled quota.
+offgrpjquota Turn off group journalled quota.
+offprjjquota Turn off project journalled quota.
+quota Enable plain user disk quota accounting.
+noquota Disable all plain disk quota options.
================================================================================
DEBUGFS ENTRIES
@@ -209,6 +219,15 @@ Files in /sys/fs/f2fs/<devname>
gc_idle = 1 will select the Cost Benefit approach
& setting gc_idle = 2 will select the greedy approach.
+ gc_urgent This parameter controls whether background GC is
+ triggered urgently. Setting gc_urgent = 0 [default]
+ restores the default behavior, while setting it to
+ 1 makes the background thread start doing GC at the
+ given gc_urgent_sleep_time interval.
+
+ gc_urgent_sleep_time This parameter controls the sleep time for gc_urgent.
+ The default is 500 ms. See gc_urgent above.
+
reclaim_segments This parameter controls the number of prefree
segments to be reclaimed. If the number of prefree
segments is larger than the number of segments
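
The quota options above are ordinary mount options. An illustrative sketch
using mount(2), with made-up device and mount-point names:

    #include <sys/mount.h>

    int mount_f2fs_with_quota(void)
    {
        /* Hypothetical paths; the option values follow the table above. */
        return mount("/dev/sdb1", "/mnt/f2fs", "f2fs", 0,
                     "prjquota,usrjquota=aquota.user,jqfmt=vfsv1");
    }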
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 405a3df759b3..5fd325df59e2 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -988,7 +988,7 @@ struct dentry_operations {
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(const struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
- unsigned int);
+ unsigned int, unsigned int);
};
d_revalidate: called when the VFS needs to revalidate a dentry. This
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index bc80fc0e210f..a7a813258013 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -523,11 +523,11 @@ There are some minimal guarantees that may be expected of a CPU:
in other words, ACQUIRE acts as a minimal "acquire" operation and RELEASE acts
as a minimal "release" operation.
-A subset of the atomic operations described in core-api/atomic_ops.rst have
-ACQUIRE and RELEASE variants in addition to fully-ordered and relaxed (no
-barrier semantics) definitions. For compound atomics performing both a load
-and a store, ACQUIRE semantics apply only to the load and RELEASE semantics
-apply only to the store portion of the operation.
+A subset of the atomic operations described in atomic_t.txt have ACQUIRE and
+RELEASE variants in addition to fully-ordered and relaxed (no barrier
+semantics) definitions. For compound atomics performing both a load and a
+store, ACQUIRE semantics apply only to the load and RELEASE semantics apply
+only to the store portion of the operation.
Memory barriers are only required where there's a possibility of interaction
between two CPUs or between a CPU and a device. If it can be guaranteed that
@@ -617,7 +617,22 @@ RELEASE variants also exist. For compound atomics performing both a load and
This enforces the occurrence of one of the first two outcomes, preventing the
third possibility from arising.
-The data dependency barrier also orders dependent writes:
+
+[!] Note that this extremely counterintuitive situation arises most easily on
+machines with split caches, for example where one cache bank processes
+even-numbered cache lines and the other processes odd-numbered lines. The
+pointer P might be stored in an even-numbered cache line and the variable B
+in an odd-numbered cache line. Then, if the odd-numbered bank of the reading
+CPU's cache is very busy while the even-numbered bank is idle, one can see
+the new value of the pointer P (&B), but the old value of the variable B (2).
+
+
+A data dependency barrier is not required to order dependent writes, because
+the CPUs that the Linux kernel supports don't do writes until they are
+certain (1) that the write will actually happen, (2) of the location of the
+write, and (3) of the value to be written. But please carefully read the
+"CONTROL DEPENDENCIES" section and Documentation/RCU/rcu_dereference.txt:
+the compiler can break dependencies in a great many highly creative ways.
CPU 1 CPU 2
=============== ===============
@@ -626,28 +641,19 @@ RELEASE variants also exist. For compound atomics performing both a load and
<write barrier>
WRITE_ONCE(P, &B);
Q = READ_ONCE(P);
- <data dependency barrier>
- *Q = 5;
+ WRITE_ONCE(*Q, 5);
-This data dependency barrier orders the read into Q with the store into *Q.
-It prohibits this outcome:
+Therefore, no data dependency barrier is required to order the read into Q
+with the write into *Q. In other words, even without a data dependency
+barrier, the following outcome does not occur:
(Q == &B) && (B == 4)
Please note that this pattern should be rare. After all, the whole point of
dependency ordering is to -prevent- writes to the data structure, along with
the expensive cache misses associated with those writes. This pattern can be
-used to record rare error conditions and the like, and the ordering imposed
-by the barrier prevents such records from being lost.
-
-
-[!] Note that this extremely counterintuitive situation arises most easily on
-machines with split caches, for example where one cache bank processes
-even-numbered cache lines and the other processes odd-numbered lines. The
-pointer P might be stored in an odd-numbered cache line and the variable B
-in an even-numbered cache line. Then, if the even-numbered bank of the
-reading CPU's cache is very busy while the odd-numbered bank is idle, one
-can see the new value of the pointer P (&B), but the old value of B (2).
+used to record rare error conditions and the like, and the CPUs' naturally
+occurring ordering prevents such records from being lost.
The data dependency barrier is very important to, for example, the RCU system.
@@ -1848,8 +1854,7 @@ Mandatory barriers enforce the SMP effects on both SMP and UP systems, and
This code ensures that the object's updated death mark is perceived to be set
*before* the reference counter is decremented.
- See the Documentation/core-api/atomic_ops.rst document, and its "Atomic
- operations" subsection if you wonder where to use these,
+ See Documentation/atomic_{t,bitops}.txt
for more information.
@@ -2468,86 +2473,7 @@ does _not_.
some of which imply full memory barriers and some of which don't, but they are
relied on very heavily throughout the kernel.
-Any atomic operation that modifies some state in memory and returns
-information about the state (old or new) implies an SMP-conditional general
-memory barrier (smp_mb()) on each side of the actual operation. These
-operations include:
-
- xchg();
- atomic_xchg(); atomic_long_xchg();
- atomic_inc_return(); atomic_long_inc_return();
- atomic_dec_return(); atomic_long_dec_return();
- atomic_add_return(); atomic_long_add_return();
- atomic_sub_return(); atomic_long_sub_return();
- atomic_inc_and_test(); atomic_long_inc_and_test();
- atomic_dec_and_test(); atomic_long_dec_and_test();
- atomic_sub_and_test(); atomic_long_sub_and_test();
- atomic_add_negative(); atomic_long_add_negative();
- test_and_set_bit();
- test_and_clear_bit();
- test_and_change_bit();
-
- /* when the exchange succeeds */
- cmpxchg();
- atomic_cmpxchg(); atomic_long_cmpxchg();
- atomic_add_unless(); atomic_long_add_unless();
-
-These are used for such things as implementing ACQUIRE-class and
-RELEASE-class operations that need memory-barrier effects, for adjusting
-reference counters towards object destruction, and anywhere else implicit
-
-
-The following operations are potentially problematic as they do _not_ imply
-memory barriers, but might be used for implementing such things as
-RELEASE-class operations:
-
- atomic_set();
- set_bit();
- clear_bit();
- change_bit();
-
-With these the appropriate explicit memory barrier (for example
-smp_mb__before_atomic()) should be used if necessary.
-
-
-The following also do _not_ imply memory barriers, and so may require
-explicit barriers (smp_mb__before_atomic() for instance) in some environments:
-
- atomic_add();
- atomic_sub();
- atomic_inc();
- atomic_dec();
-
-If they're used for statistics generation, then they probably don't need
-memory barriers, unless there's a coupling between statistical data.
-
-If they're used as reference counters to manage an object's lifetime, memory
-barriers are probably not needed, because the reference count will either be
-adjusted inside a locked section or the caller will already hold sufficient references.
-
-If they're used for constructing some kind of lock, memory barriers may be
-needed, as lock operations generally have to proceed in a specific order.
-
-Basically, each usage case has to be carefully considered as to whether
-memory barriers are needed or not.
-
-The following operations are special locking primitives:
-
- test_and_set_bit_lock();
- clear_bit_unlock();
- __clear_bit_unlock();
-
-These implement ACQUIRE-class and RELEASE-class operations. They should be
-preferred to other operations when implementing locking primitives, because
-their implementations can be optimised on many architectures.
-
-[!] Note that special memory barrier primitives are available for these
-situations, but on some CPUs the atomic instructions themselves imply a
-memory barrier, making a paired barrier instruction redundant; in such
-cases the special barrier primitives are no-ops and effectively do
-nothing.
-
-See Documentation/core-api/atomic_ops.rst for more information.
+See Documentation/atomic_t.txt for more information.
ACCESSING DEVICES
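
As a self-contained illustration of the dependent-write pattern this hunk
documents (the names mirror the example; this is a sketch, not kernel code
taken from the patch):

    #include <linux/compiler.h>
    #include <asm/barrier.h>

    static int A = 1, B = 2, C = 3;
    static int *P = &A, *Q = &C;

    /* CPU 1: publish B through P, ordering the store to B first. */
    static void cpu1_writer(void)
    {
        B = 4;
        smp_wmb();              /* the <write barrier> in the example */
        WRITE_ONCE(P, &B);
    }

    /* CPU 2: the store through Q depends on the value loaded from P,
     * so no data dependency barrier is needed between the two. */
    static void cpu2_reader(void)
    {
        Q = READ_ONCE(P);       /* may observe &A or &B */
        WRITE_ONCE(*Q, 5);
    }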
diff --git a/MAINTAINERS b/MAINTAINERS
index fbb269415f06..f46a3225e398 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4219,7 +4219,7 @@ DMA MAPPING HELPERS
M: Christoph Hellwig <hch@lst.de>
M: Marek Szyprowski <m.szyprowski@samsung.com>
R: Robin Murphy <robin.murphy@arm.com>
-L: linux-kernel@vger.kernel.org
+L: iommu@lists.linux-foundation.org
T: git git://git.infradead.org/users/hch/dma-mapping.git
W: http://git.infradead.org/users/hch/dma-mapping.git
S: Supported
@@ -9298,15 +9298,6 @@ F: net/*/netfilter/
F: net/netfilter/
F: net/bridge/br_netfilter*.c
-NETLABEL
-M: Paul Moore <paul@paul-moore.com>
-W: http://netlabel.sf.net
-L: netdev@vger.kernel.org
-S: Maintained
-F: Documentation/netlabel/
-F: include/net/netlabel.h
-F: net/netlabel/
-
NETROM NETWORK LAYER
M: Ralf Baechle <ralf@linux-mips.org>
L: linux-hams@vger.kernel.org
@@ -9434,10 +9425,23 @@ F: net/ipv6/
F: include/net/ip*
F: arch/x86/net/*
-NETWORKING [LABELED] (NetLabel, CIPSO, Labeled IPsec, SECMARK)
+NETWORKING [LABELED] (NetLabel, Labeled IPsec, SECMARK)
M: Paul Moore <paul@paul-moore.com>
+W: https://github.com/netlabel
L: netdev@vger.kernel.org
+L: linux-security-module@vger.kernel.org
S: Maintained
+F: Documentation/netlabel/
+F: include/net/calipso.h
+F: include/net/cipso_ipv4.h
+F: include/net/netlabel.h
+F: include/uapi/linux/netfilter/xt_SECMARK.h
+F: include/uapi/linux/netfilter/xt_CONNSECMARK.h
+F: net/netlabel/
+F: net/ipv4/cipso_ipv4.c
+F: net/ipv6/calipso.c
+F: net/netfilter/xt_CONNSECMARK.c
+F: net/netfilter/xt_SECMARK.c
NETWORKING [TLS]
M: Ilya Lesokhin <ilyal@mellanox.com>
@@ -12023,8 +12027,9 @@ M: Paul Moore <paul@paul-moore.com>
M: Stephen Smalley <sds@tycho.nsa.gov>
M: Eric Paris <eparis@parisplace.org>
L: selinux@tycho.nsa.gov (moderated for non-subscribers)
-W: http://selinuxproject.org
-T: git git://git.infradead.org/users/pcmoore/selinux
+W: https://selinuxproject.org
+W: https://github.com/SELinuxProject
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git
S: Supported
F: include/linux/selinux*
F: security/selinux/
diff --git a/arch/alpha/include/uapi/asm/siginfo.h b/arch/alpha/include/uapi/asm/siginfo.h
index 9822362a8424..70494d1d8f29 100644
--- a/arch/alpha/include/uapi/asm/siginfo.h
+++ b/arch/alpha/include/uapi/asm/siginfo.h
@@ -6,4 +6,18 @@
#include <asm-generic/siginfo.h>
+/*
+ * SIGFPE si_codes
+ */
+#ifdef __KERNEL__
+#define FPE_FIXME 0 /* Broken dup of SI_USER */
+#endif /* __KERNEL__ */
+
+/*
+ * SIGTRAP si_codes
+ */
+#ifdef __KERNEL__
+#define TRAP_FIXME 0 /* Broken dup of SI_USER */
+#endif /* __KERNEL__ */
+
#endif
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index ddb89a18cf26..49d3b1e63ce5 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -280,7 +280,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
case 1: /* bugcheck */
info.si_signo = SIGTRAP;
info.si_errno = 0;
- info.si_code = __SI_FAULT;
+ info.si_code = TRAP_FIXME;
info.si_addr = (void __user *) regs->pc;
info.si_trapno = 0;
send_sig_info(SIGTRAP, &info, current);
@@ -320,7 +320,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
break;
case GEN_ROPRAND:
signo = SIGFPE;
- code = __SI_FAULT;
+ code = FPE_FIXME;
break;
case GEN_DECOVF:
@@ -342,7 +342,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
case GEN_SUBRNG7:
default:
signo = SIGTRAP;
- code = __SI_FAULT;
+ code = TRAP_FIXME;
break;
}
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f1b3f1d575d4..7888c9803eb0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1531,7 +1531,6 @@ config THUMB2_KERNEL
bool "Compile the kernel in Thumb-2 mode" if !CPU_THUMBONLY
depends on (CPU_V7 || CPU_V7M) && !CPU_V6 && !CPU_V6K
default y if CPU_THUMBONLY
- select AEABI
select ARM_ASM_UNIFIED
select ARM_UNWIND
help
@@ -1594,7 +1593,8 @@ config ARM_PATCH_IDIV
code to do integer division.
config AEABI
- bool "Use the ARM EABI to compile the kernel"
+ bool "Use the ARM EABI to compile the kernel" if !CPU_V7 && !CPU_V7M && !CPU_V6 && !CPU_V6K
+ default CPU_V7 || CPU_V7M || CPU_V6 || CPU_V6K
help
This option allows for the kernel to be compiled using the latest
ARM ABI (aka EABI). This is only useful if you are using a user
diff --git a/arch/arm/include/asm/smp_scu.h b/arch/arm/include/asm/smp_scu.h
index bfe163c40024..5983f6bc62d5 100644
--- a/arch/arm/include/asm/smp_scu.h
+++ b/arch/arm/include/asm/smp_scu.h
@@ -7,6 +7,7 @@
#ifndef __ASSEMBLER__
+#include <linux/errno.h>
#include <asm/cputype.h>
static inline bool scu_a9_has_base(void)
diff --git a/arch/arm/include/asm/suspend.h b/arch/arm/include/asm/suspend.h
index 6c7182f32cef..a61905c86732 100644
--- a/arch/arm/include/asm/suspend.h
+++ b/arch/arm/include/asm/suspend.h
@@ -1,6 +1,8 @@
#ifndef __ASM_ARM_SUSPEND_H
#define __ASM_ARM_SUSPEND_H
+#include <linux/types.h>
+
struct sleep_save_sp {
u32 *save_ptr_stash;
u32 save_ptr_stash_phys;
diff --git a/arch/arm/include/debug/omap2plus.S b/arch/arm/include/debug/omap2plus.S
index 8be08d907a16..192a7583999c 100644
--- a/arch/arm/include/debug/omap2plus.S
+++ b/arch/arm/include/debug/omap2plus.S
@@ -22,6 +22,7 @@
#define UART_OFFSET(addr) ((addr) & 0x00ffffff)
.pushsection .data
+ .align 2
omap_uart_phys: .word 0
omap_uart_virt: .word 0
omap_uart_lsr: .word 0
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index c731f0d2b2af..fbc707626b3e 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -721,6 +721,7 @@ do_fpe:
*/
.pushsection .data
+ .align 2
ENTRY(fp_enter)
.word no_fp
.popsection
@@ -1224,6 +1225,7 @@ vector_addrexcptn:
W(b) vector_fiq
.data
+ .align 2
.globl cr_alignment
cr_alignment:
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index e33c32d56193..ca3614dc6938 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -27,6 +27,14 @@
#include "entry-header.S"
+saved_psr .req r8
+#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING)
+saved_pc .req r9
+#define TRACE(x...) x
+#else
+saved_pc .req lr
+#define TRACE(x...)
+#endif
.align 5
#if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING))
@@ -146,16 +154,17 @@ ENTRY(vector_swi)
ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr
THUMB( mov r8, sp )
THUMB( store_user_sp_lr r8, r10, S_SP ) @ calling sp, lr
- mrs r8, spsr @ called from non-FIQ mode, so ok.
- str lr, [sp, #S_PC] @ Save calling PC
- str r8, [sp, #S_PSR] @ Save CPSR
+ mrs saved_psr, spsr @ called from non-FIQ mode, so ok.
+ TRACE( mov saved_pc, lr )
+ str saved_pc, [sp, #S_PC] @ Save calling PC
+ str saved_psr, [sp, #S_PSR] @ Save CPSR
str r0, [sp, #S_OLD_R0] @ Save OLD_R0
#endif
zero_fp
alignment_trap r10, ip, __cr_alignment
- enable_irq
- ct_user_exit
- get_thread_info tsk
+ asm_trace_hardirqs_on save=0
+ enable_irq_notrace
+ ct_user_exit save=0
/*
* Get the system call number.
@@ -168,11 +177,11 @@ ENTRY(vector_swi)
* value to determine if it is an EABI or an old ABI call.
*/
#ifdef CONFIG_ARM_THUMB
- tst r8, #PSR_T_BIT
+ tst saved_psr, #PSR_T_BIT
movne r10, #0 @ no thumb OABI emulation
- USER( ldreq r10, [lr, #-4] ) @ get SWI instruction
+ USER( ldreq r10, [saved_pc, #-4] ) @ get SWI instruction
#else
- USER( ldr r10, [lr, #-4] ) @ get SWI instruction
+ USER( ldr r10, [saved_pc, #-4] ) @ get SWI instruction
#endif
ARM_BE8(rev r10, r10) @ little endian instruction
@@ -183,15 +192,17 @@ ENTRY(vector_swi)
*/
#elif defined(CONFIG_ARM_THUMB)
/* Legacy ABI only, possibly thumb mode. */
- tst r8, #PSR_T_BIT @ this is SPSR from save_user_regs
+ tst saved_psr, #PSR_T_BIT @ this is SPSR from save_user_regs
addne scno, r7, #__NR_SYSCALL_BASE @ put OS number in
- USER( ldreq scno, [lr, #-4] )
+ USER( ldreq scno, [saved_pc, #-4] )
#else
/* Legacy ABI only. */
- USER( ldr scno, [lr, #-4] ) @ get SWI instruction
+ USER( ldr scno, [saved_pc, #-4] ) @ get SWI instruction
#endif
+ /* saved_psr and saved_pc are now dead */
+
uaccess_disable tbl
adr tbl, sys_call_table @ load syscall table pointer
@@ -210,6 +221,12 @@ ENTRY(vector_swi)
bic scno, scno, #0xff000000 @ mask off SWI op-code
eor scno, scno, #__NR_SYSCALL_BASE @ check OS number
#endif
+ get_thread_info tsk
+ /*
+ * Reload the registers that may have been corrupted on entry to
+ * the syscall assembly (by tracing or context tracking.)
+ */
+ TRACE( ldmia sp, {r0 - r3} )
local_restart:
ldr r10, [tsk, #TI_FLAGS] @ check for syscall tracing
@@ -239,8 +256,9 @@ local_restart:
* current task.
*/
9001:
- sub lr, lr, #4
+ sub lr, saved_pc, #4
str lr, [sp, #S_PC]
+ get_thread_info tsk
b ret_fast_syscall
#endif
ENDPROC(vector_swi)
diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index 04286fd9e09c..6b1148cafffd 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -556,6 +556,7 @@ ENDPROC(__fixup_smp)
.word __smpalt_end
.pushsection .data
+ .align 2
.globl smp_on_up
smp_on_up:
ALT_SMP(.long 1)
@@ -716,6 +717,7 @@ ENTRY(fixup_pv_table)
ENDPROC(fixup_pv_table)
.data
+ .align 2
.globl __pv_phys_pfn_offset
.type __pv_phys_pfn_offset, %object
__pv_phys_pfn_offset:
diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S
index ec7e7377d423..60146e32619a 100644
--- a/arch/arm/kernel/hyp-stub.S
+++ b/arch/arm/kernel/hyp-stub.S
@@ -31,6 +31,7 @@
* zeroing of .bss would clobber it.
*/
.data
+ .align 2
ENTRY(__boot_cpu_mode)
.long 0
.text
diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S
index 49fadbda8c63..81cd4d43b3ec 100644
--- a/arch/arm/kernel/iwmmxt.S
+++ b/arch/arm/kernel/iwmmxt.S
@@ -367,6 +367,7 @@ ENTRY(iwmmxt_task_release)
ENDPROC(iwmmxt_task_release)
.data
+ .align 2
concan_owner:
.word 0
diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S
index 0f6c1000582c..9f08d214d05a 100644
--- a/arch/arm/kernel/sleep.S
+++ b/arch/arm/kernel/sleep.S
@@ -171,6 +171,7 @@ mpidr_hash_ptr:
.long mpidr_hash - . @ mpidr_hash struct offset
.data
+ .align 2
.type sleep_save_sp, #object
ENTRY(sleep_save_sp)
.space SLEEP_SAVE_SP_SZ @ struct sleep_save_sp
diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c
index 3a2fa203637a..65228bf4c6df 100644
--- a/arch/arm/kernel/stacktrace.c
+++ b/arch/arm/kernel/stacktrace.c
@@ -171,6 +171,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
__save_stack_trace(tsk, trace, 1);
}
+EXPORT_SYMBOL(save_stack_trace_tsk);
void save_stack_trace(struct stack_trace *trace)
{
diff --git a/arch/arm/mach-exynos/sleep.S b/arch/arm/mach-exynos/sleep.S
index cf950790fbdc..4292cae43f3c 100644
--- a/arch/arm/mach-exynos/sleep.S
+++ b/arch/arm/mach-exynos/sleep.S
@@ -124,6 +124,7 @@ _cp15_save_diag:
#endif /* CONFIG_CACHE_L2X0 */
.data
+ .align 2
.globl cp15_save_diag
cp15_save_diag:
.long 0 @ cp15 diagnostic
diff --git a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c
index dd75a4756761..5169dfba9718 100644
--- a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c
+++ b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c
@@ -245,7 +245,6 @@ static phys_addr_t mx2_camera_base __initdata;
static void __init visstrim_analog_camera_init(void)
{
struct platform_device *pdev;
- int dma;
gpio_set_value(TVP5150_PWDN, 1);
ndelay(1);
@@ -258,12 +257,9 @@ static void __init visstrim_analog_camera_init(void)
if (IS_ERR(pdev))
return;
- dma = dma_declare_coherent_memory(&pdev->dev,
- mx2_camera_base, mx2_camera_base,
- MX2_CAMERA_BUF_SIZE,
- DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE);
- if (!(dma & DMA_MEMORY_MAP))
- return;
+ dma_declare_coherent_memory(&pdev->dev, mx2_camera_base,
+ mx2_camera_base, MX2_CAMERA_BUF_SIZE,
+ DMA_MEMORY_EXCLUSIVE);
}
static void __init visstrim_reserve(void)
@@ -444,16 +440,13 @@ static const struct imx_ssi_platform_data visstrim_m10_ssi_pdata __initconst = {
static void __init visstrim_coda_init(void)
{
struct platform_device *pdev;
- int dma;
pdev = imx27_add_coda();
- dma = dma_declare_coherent_memory(&pdev->dev,
- mx2_camera_base + MX2_CAMERA_BUF_SIZE,
- mx2_camera_base + MX2_CAMERA_BUF_SIZE,
- MX2_CAMERA_BUF_SIZE,
- DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE);
- if (!(dma & DMA_MEMORY_MAP))
- return;
+ dma_declare_coherent_memory(&pdev->dev,
+ mx2_camera_base + MX2_CAMERA_BUF_SIZE,
+ mx2_camera_base + MX2_CAMERA_BUF_SIZE,
+ MX2_CAMERA_BUF_SIZE,
+ DMA_MEMORY_EXCLUSIVE);
}
/* DMA deinterlace */
@@ -466,24 +459,21 @@ static void __init visstrim_deinterlace_init(void)
{
int ret = -ENOMEM;
struct platform_device *pdev = &visstrim_deinterlace;
- int dma;
ret = platform_device_register(pdev);
- dma = dma_declare_coherent_memory(&pdev->dev,
- mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE,
- mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE,
- MX2_CAMERA_BUF_SIZE,
- DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE);
- if (!(dma & DMA_MEMORY_MAP))
- return;
+ dma_declare_coherent_memory(&pdev->dev,
+ mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE,
+ mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE,
+ MX2_CAMERA_BUF_SIZE,
+ DMA_MEMORY_EXCLUSIVE);
}
/* Emma-PrP for format conversion */
static void __init visstrim_emmaprp_init(void)
{
struct platform_device *pdev;
- int dma;
+ int ret;
pdev = imx27_add_mx2_emmaprp();
if (IS_ERR(pdev))
@@ -493,11 +483,11 @@ static void __init visstrim_emmaprp_init(void)
* Use the same memory area as the analog camera since both
* devices are, by nature, exclusive.
*/
- dma = dma_declare_coherent_memory(&pdev->dev,
+ ret = dma_declare_coherent_memory(&pdev->dev,
mx2_camera_base, mx2_camera_base,
MX2_CAMERA_BUF_SIZE,
- DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE);
- if (!(dma & DMA_MEMORY_MAP))
+ DMA_MEMORY_EXCLUSIVE);
+ if (ret)
pr_err("Failed to declare memory for emmaprp\n");
}
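With the DMA_MEMORY_MAP/DMA_MEMORY_IO split gone, dma_declare_coherent_memory() reports 0 or a negative errno instead of a flags value, which is why the emmaprp conversion above can simply test the return. A minimal caller under the new convention (a sketch; phys_base and buf_size are placeholders):

	int ret;

	ret = dma_declare_coherent_memory(&pdev->dev, phys_base, phys_base,
					  buf_size, DMA_MEMORY_EXCLUSIVE);
	if (ret)
		dev_err(&pdev->dev, "no coherent memory area: %d\n", ret);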
diff --git a/arch/arm/mach-imx/mach-mx31moboard.c b/arch/arm/mach-imx/mach-mx31moboard.c
index bde9a9af6714..7716f83aecdd 100644
--- a/arch/arm/mach-imx/mach-mx31moboard.c
+++ b/arch/arm/mach-imx/mach-mx31moboard.c
@@ -475,7 +475,7 @@ static phys_addr_t mx3_camera_base __initdata;
static int __init mx31moboard_init_cam(void)
{
- int dma, ret = -ENOMEM;
+ int ret;
struct platform_device *pdev;
imx31_add_ipu_core();
@@ -484,11 +484,11 @@ static int __init mx31moboard_init_cam(void)
if (IS_ERR(pdev))
return PTR_ERR(pdev);
- dma = dma_declare_coherent_memory(&pdev->dev,
- mx3_camera_base, mx3_camera_base,
- MX3_CAMERA_BUF_SIZE,
- DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE);
- if (!(dma & DMA_MEMORY_MAP))
+ ret = dma_declare_coherent_memory(&pdev->dev,
+ mx3_camera_base, mx3_camera_base,
+ MX3_CAMERA_BUF_SIZE,
+ DMA_MEMORY_EXCLUSIVE);
+ if (ret)
goto err;
ret = platform_device_add(pdev);
diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S
index 1b9f0520dea9..fa5fd24f524c 100644
--- a/arch/arm/mach-omap2/sleep34xx.S
+++ b/arch/arm/mach-omap2/sleep34xx.S
@@ -530,10 +530,12 @@ l2dis_3630_offset:
.long l2dis_3630 - .
.data
+ .align 2
l2dis_3630:
.word 0
.data
+ .align 2
l2_inv_api_params:
.word 0x1, 0x00
diff --git a/arch/arm/mach-omap2/sleep44xx.S b/arch/arm/mach-omap2/sleep44xx.S
index c7a3b4aab4b5..56dfa2d5d0a8 100644
--- a/arch/arm/mach-omap2/sleep44xx.S
+++ b/arch/arm/mach-omap2/sleep44xx.S
@@ -385,6 +385,7 @@ ppa_zero_params_offset:
ENDPROC(omap_do_wfi)
.data
+ .align 2
ppa_zero_params:
.word 0
diff --git a/arch/arm/mach-pxa/mioa701_bootresume.S b/arch/arm/mach-pxa/mioa701_bootresume.S
index 81591491ab94..42d93f40a59f 100644
--- a/arch/arm/mach-pxa/mioa701_bootresume.S
+++ b/arch/arm/mach-pxa/mioa701_bootresume.S
@@ -16,6 +16,7 @@
* insist on it to be truly read-only.
*/
.data
+ .align 2
ENTRY(mioa701_bootstrap)
0:
b 1f
@@ -34,4 +35,5 @@ ENTRY(mioa701_jumpaddr)
ENTRY(mioa701_bootstrap_lg)
.data
+ .align 2
.word 2b-0b
diff --git a/arch/arm/mach-rockchip/sleep.S b/arch/arm/mach-rockchip/sleep.S
index 2eec9a341f05..9927f06f52fe 100644
--- a/arch/arm/mach-rockchip/sleep.S
+++ b/arch/arm/mach-rockchip/sleep.S
@@ -23,7 +23,7 @@
* ddr to sram for system resuming.
* so it is ".data section".
*/
-.align
+ .align 2
ENTRY(rockchip_slp_cpu_resume)
setmode PSR_I_BIT | PSR_F_BIT | SVC_MODE, r1 @ set svc, irqs off
diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S
index 2522f8c8fbb1..a5084ec70c6e 100644
--- a/arch/arm/mm/cache-v4wb.S
+++ b/arch/arm/mm/cache-v4wb.S
@@ -47,6 +47,7 @@
#define CACHE_DLIMIT (CACHE_DSIZE * 4)
.data
+ .align 2
flush_base:
.long FLUSH_BASE
.text
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index ff8b0aa2dfde..42f585379e19 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -315,8 +315,11 @@ retry:
* signal first. We do not need to release the mmap_sem because
* it would already be released in __lock_page_or_retry in
* mm/filemap.c. */
- if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
+ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+ if (!user_mode(regs))
+ goto no_context;
return 0;
+ }
/*
* Major/minor page fault accounting is only done on the
diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S
index 5e5720e8bc5f..7d16bbc4102b 100644
--- a/arch/arm/mm/proc-v7-3level.S
+++ b/arch/arm/mm/proc-v7-3level.S
@@ -129,8 +129,7 @@ ENDPROC(cpu_v7_set_pte_ext)
.macro v7_ttb_setup, zero, ttbr0l, ttbr0h, ttbr1, tmp
ldr \tmp, =swapper_pg_dir @ swapper_pg_dir virtual address
cmp \ttbr1, \tmp, lsr #12 @ PHYS_OFFSET > PAGE_OFFSET?
- mrc p15, 0, \tmp, c2, c0, 2 @ TTB control register
- orr \tmp, \tmp, #TTB_EAE
+ mov \tmp, #TTB_EAE @ for TTB control register
ALT_SMP(orr \tmp, \tmp, #TTB_FLAGS_SMP)
ALT_UP(orr \tmp, \tmp, #TTB_FLAGS_UP)
ALT_SMP(orr \tmp, \tmp, #TTB_FLAGS_SMP << 16)
diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S
index b6bbfdb6dfdc..3d75b7972fd1 100644
--- a/arch/arm/mm/proc-xscale.S
+++ b/arch/arm/mm/proc-xscale.S
@@ -104,6 +104,7 @@
.endm
.data
+ .align 2
clean_addr: .word CLEAN_ADDR
.text
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index 4e5a664be04b..e09bf5d15606 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -142,25 +142,25 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
*/
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
+ err |= __put_user(from->si_code, &to->si_code);
if (from->si_code < 0)
err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad,
SI_PAD_SIZE);
- else switch (from->si_code & __SI_MASK) {
- case __SI_KILL:
+ else switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_TIMER:
+ case SIL_TIMER:
err |= __put_user(from->si_tid, &to->si_tid);
err |= __put_user(from->si_overrun, &to->si_overrun);
err |= __put_user(from->si_int, &to->si_int);
break;
- case __SI_POLL:
+ case SIL_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_FAULT:
+ case SIL_FAULT:
err |= __put_user((compat_uptr_t)(unsigned long)from->si_addr,
&to->si_addr);
#ifdef BUS_MCEERR_AO
@@ -173,29 +173,24 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
#endif
break;
- case __SI_CHLD:
+ case SIL_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_status, &to->si_status);
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
break;
- case __SI_RT: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ: /* But this is */
+ case SIL_RT:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_int, &to->si_int);
break;
- case __SI_SYS:
+ case SIL_SYS:
err |= __put_user((compat_uptr_t)(unsigned long)
from->si_call_addr, &to->si_call_addr);
err |= __put_user(from->si_syscall, &to->si_syscall);
err |= __put_user(from->si_arch, &to->si_arch);
break;
- default: /* this is just in case for now ... */
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- break;
}
return err;
}
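The switch above now selects the union member via siginfo_layout(), and si_code is copied without the old (short) truncation. For reference, a self-contained userspace sketch (not from this series) showing the SIL_KILL layout a handler observes for a plain kill():

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static volatile sig_atomic_t got;
	static siginfo_t last;

	static void handler(int sig, siginfo_t *si, void *uctx)
	{
		last = *si;
		got = 1;
	}

	int main(void)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGUSR1, &sa, NULL);

		kill(getpid(), SIGUSR1);	/* si_code = SI_USER (0) */
		while (!got)
			;			/* delivery is immediate here */

		/* SIL_KILL layout: si_pid/si_uid are the valid members */
		printf("si_code=%d si_pid=%d si_uid=%d\n",
		       last.si_code, (int)last.si_pid, (int)last.si_uid);
		return 0;
	}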
diff --git a/arch/blackfin/include/uapi/asm/siginfo.h b/arch/blackfin/include/uapi/asm/siginfo.h
index c72f4e6e386f..79dfe3979123 100644
--- a/arch/blackfin/include/uapi/asm/siginfo.h
+++ b/arch/blackfin/include/uapi/asm/siginfo.h
@@ -14,28 +14,36 @@
#define si_uid16 _sifields._kill._uid
-#define ILL_ILLPARAOP (__SI_FAULT|2) /* illegal opcode combine ********** */
-#define ILL_ILLEXCPT (__SI_FAULT|4) /* unrecoverable exception ********** */
-#define ILL_CPLB_VI (__SI_FAULT|9) /* D/I CPLB protect violation ******** */
-#define ILL_CPLB_MISS (__SI_FAULT|10) /* D/I CPLB miss ******** */
-#define ILL_CPLB_MULHIT (__SI_FAULT|11) /* D/I CPLB multiple hit ******** */
+#define ILL_ILLPARAOP 2 /* illegal opcode combine ********** */
+#define ILL_ILLEXCPT 4 /* unrecoverable exception ********** */
+#define ILL_CPLB_VI 9 /* D/I CPLB protect violation ******** */
+#define ILL_CPLB_MISS 10 /* D/I CPLB miss ******** */
+#define ILL_CPLB_MULHIT 11 /* D/I CPLB multiple hit ******** */
+#undef NSIGILL
+#define NSIGILL 11
/*
* SIGBUS si_codes
*/
-#define BUS_OPFETCH (__SI_FAULT|4) /* error from instruction fetch ******** */
+#define BUS_OPFETCH 4 /* error from instruction fetch ******** */
+#undef NSIGBUS
+#define NSIGBUS 4
/*
* SIGTRAP si_codes
*/
-#define TRAP_STEP (__SI_FAULT|1) /* single-step breakpoint************* */
-#define TRAP_TRACEFLOW (__SI_FAULT|2) /* trace buffer overflow ************* */
-#define TRAP_WATCHPT (__SI_FAULT|3) /* watchpoint match ************* */
-#define TRAP_ILLTRAP (__SI_FAULT|4) /* illegal trap ************* */
+#define TRAP_STEP 1 /* single-step breakpoint************* */
+#define TRAP_TRACEFLOW 2 /* trace buffer overflow ************* */
+#define TRAP_WATCHPT 3 /* watchpoint match ************* */
+#define TRAP_ILLTRAP 4 /* illegal trap ************* */
+#undef NSIGTRAP
+#define NSIGTRAP 4
/*
* SIGSEGV si_codes
*/
-#define SEGV_STACKFLOW (__SI_FAULT|3) /* stack overflow */
+#define SEGV_STACKFLOW 3 /* stack overflow */
+#undef NSIGSEGV
+#define NSIGSEGV 3
#endif /* _UAPI_BFIN_SIGINFO_H */
diff --git a/arch/frv/include/uapi/asm/siginfo.h b/arch/frv/include/uapi/asm/siginfo.h
index d3fd1ca45653..f55d9e0e9068 100644
--- a/arch/frv/include/uapi/asm/siginfo.h
+++ b/arch/frv/include/uapi/asm/siginfo.h
@@ -4,7 +4,7 @@
#include <linux/types.h>
#include <asm-generic/siginfo.h>
-#define FPE_MDAOVF (__SI_FAULT|9) /* media overflow */
+#define FPE_MDAOVF 9 /* media overflow */
#undef NSIGFPE
#define NSIGFPE 9
diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h
index 4694c64252d6..33389fc36f23 100644
--- a/arch/ia64/include/uapi/asm/siginfo.h
+++ b/arch/ia64/include/uapi/asm/siginfo.h
@@ -98,27 +98,30 @@ typedef struct siginfo {
/*
* SIGILL si_codes
*/
-#define ILL_BADIADDR (__SI_FAULT|9) /* unimplemented instruction address */
-#define __ILL_BREAK (__SI_FAULT|10) /* illegal break */
-#define __ILL_BNDMOD (__SI_FAULT|11) /* bundle-update (modification) in progress */
+#define ILL_BADIADDR 9 /* unimplemented instruction address */
+#define __ILL_BREAK 10 /* illegal break */
+#define __ILL_BNDMOD 11 /* bundle-update (modification) in progress */
#undef NSIGILL
#define NSIGILL 11
/*
* SIGFPE si_codes
*/
-#define __FPE_DECOVF (__SI_FAULT|9) /* decimal overflow */
-#define __FPE_DECDIV (__SI_FAULT|10) /* decimal division by zero */
-#define __FPE_DECERR (__SI_FAULT|11) /* packed decimal error */
-#define __FPE_INVASC (__SI_FAULT|12) /* invalid ASCII digit */
-#define __FPE_INVDEC (__SI_FAULT|13) /* invalid decimal digit */
+#ifdef __KERNEL__
+#define FPE_FIXME 0 /* Broken dup of SI_USER */
+#endif /* __KERNEL__ */
+#define __FPE_DECOVF 9 /* decimal overflow */
+#define __FPE_DECDIV 10 /* decimal division by zero */
+#define __FPE_DECERR 11 /* packed decimal error */
+#define __FPE_INVASC 12 /* invalid ASCII digit */
+#define __FPE_INVDEC 13 /* invalid decimal digit */
#undef NSIGFPE
#define NSIGFPE 13
/*
* SIGSEGV si_codes
*/
-#define __SEGV_PSTKOVF (__SI_FAULT|4) /* paragraph stack overflow */
+#define __SEGV_PSTKOVF 4 /* paragraph stack overflow */
#undef NSIGSEGV
#define NSIGSEGV 4
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 5db52c6813c4..6146d53b6ad7 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -124,31 +124,30 @@ copy_siginfo_to_user (siginfo_t __user *to, const siginfo_t *from)
*/
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
- switch (from->si_code >> 16) {
- case __SI_FAULT >> 16:
+ err |= __put_user(from->si_code, &to->si_code);
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_FAULT:
err |= __put_user(from->si_flags, &to->si_flags);
err |= __put_user(from->si_isr, &to->si_isr);
- case __SI_POLL >> 16:
+ case SIL_POLL:
err |= __put_user(from->si_addr, &to->si_addr);
err |= __put_user(from->si_imm, &to->si_imm);
break;
- case __SI_TIMER >> 16:
+ case SIL_TIMER:
err |= __put_user(from->si_tid, &to->si_tid);
err |= __put_user(from->si_overrun, &to->si_overrun);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
- case __SI_RT >> 16: /* Not generated by the kernel as of now. */
- case __SI_MESGQ >> 16:
+ case SIL_RT:
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
- case __SI_CHLD >> 16:
+ case SIL_CHLD:
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
err |= __put_user(from->si_status, &to->si_status);
- default:
+ case SIL_KILL:
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_pid, &to->si_pid);
break;
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 7b1fe9462158..3cb17cf9b362 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -349,7 +349,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
}
siginfo.si_signo = SIGFPE;
siginfo.si_errno = 0;
- siginfo.si_code = __SI_FAULT; /* default code */
+ siginfo.si_code = FPE_FIXME; /* default code */
siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
if (isr & 0x11) {
siginfo.si_code = FPE_FLTINV;
@@ -373,7 +373,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr)
/* raise exception */
siginfo.si_signo = SIGFPE;
siginfo.si_errno = 0;
- siginfo.si_code = __SI_FAULT; /* default code */
+ siginfo.si_code = FPE_FIXME; /* default code */
siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri);
if (isr & 0x880) {
siginfo.si_code = FPE_FLTOVF;
diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h
index fad3dc3cb210..ea573be2b6d0 100644
--- a/arch/metag/include/asm/dma-mapping.h
+++ b/arch/metag/include/asm/dma-mapping.h
@@ -9,7 +9,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
}
/*
- * dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to
+ * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to
* do any flushing here.
*/
static inline void
diff --git a/arch/mips/include/uapi/asm/siginfo.h b/arch/mips/include/uapi/asm/siginfo.h
index 8069cf766603..cf6113bbcb98 100644
--- a/arch/mips/include/uapi/asm/siginfo.h
+++ b/arch/mips/include/uapi/asm/siginfo.h
@@ -120,7 +120,7 @@ typedef struct siginfo {
#undef SI_TIMER
#undef SI_MESGQ
#define SI_ASYNCIO -2 /* sent by AIO completion */
-#define SI_TIMER __SI_CODE(__SI_TIMER, -3) /* sent by timer expiration */
-#define SI_MESGQ __SI_CODE(__SI_MESGQ, -4) /* sent by real time mesq state change */
+#define SI_TIMER -3 /* sent by timer expiration */
+#define SI_MESGQ -4 /* sent by real time mesq state change */
#endif /* _UAPI_ASM_SIGINFO_H */
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index 84165f2b31ff..cf5c7c05e5a3 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -93,38 +93,37 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
at the same time. */
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
+ err |= __put_user(from->si_code, &to->si_code);
if (from->si_code < 0)
err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE);
else {
- switch (from->si_code >> 16) {
- case __SI_TIMER >> 16:
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_TIMER:
err |= __put_user(from->si_tid, &to->si_tid);
err |= __put_user(from->si_overrun, &to->si_overrun);
err |= __put_user(from->si_int, &to->si_int);
break;
- case __SI_CHLD >> 16:
+ case SIL_CHLD:
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
err |= __put_user(from->si_status, &to->si_status);
- default:
+ case SIL_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_FAULT >> 16:
+ case SIL_FAULT:
err |= __put_user((unsigned long)from->si_addr, &to->si_addr);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_RT >> 16: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ >> 16:
+ case SIL_RT:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_int, &to->si_int);
break;
- case __SI_SYS >> 16:
+ case SIL_SYS:
err |= __copy_to_user(&to->si_call_addr, &from->si_call_addr,
sizeof(compat_uptr_t));
err |= __put_user(from->si_syscall, &to->si_syscall);
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index b68b4d0726d3..2bf414993347 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -735,7 +735,7 @@ void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr,
else if (fcr31 & FPU_CSR_INE_X)
si.si_code = FPE_FLTRES;
else
- si.si_code = __SI_FAULT;
+ return; /* Broken hardware? */
force_sig_info(SIGFPE, &si, tsk);
}
diff --git a/arch/nios2/include/asm/dma-mapping.h b/arch/nios2/include/asm/dma-mapping.h
index 7b3c6f280293..f8dc62222741 100644
--- a/arch/nios2/include/asm/dma-mapping.h
+++ b/arch/nios2/include/asm/dma-mapping.h
@@ -18,7 +18,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
}
/*
- * dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to
+ * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to
* do any flushing here.
*/
static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c
index 70aaabb8b3cb..9e0cb6a577d6 100644
--- a/arch/parisc/kernel/signal32.c
+++ b/arch/parisc/kernel/signal32.c
@@ -290,25 +290,25 @@ copy_siginfo_from_user32 (siginfo_t *to, compat_siginfo_t __user *from)
if (to->si_code < 0)
err |= __copy_from_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE);
else {
- switch (to->si_code >> 16) {
- case __SI_CHLD >> 16:
+ switch (siginfo_layout(to->si_signo, to->si_code)) {
+ case SIL_CHLD:
err |= __get_user(to->si_utime, &from->si_utime);
err |= __get_user(to->si_stime, &from->si_stime);
err |= __get_user(to->si_status, &from->si_status);
default:
+ case SIL_KILL:
err |= __get_user(to->si_pid, &from->si_pid);
err |= __get_user(to->si_uid, &from->si_uid);
break;
- case __SI_FAULT >> 16:
+ case SIL_FAULT:
err |= __get_user(addr, &from->si_addr);
to->si_addr = compat_ptr(addr);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
err |= __get_user(to->si_band, &from->si_band);
err |= __get_user(to->si_fd, &from->si_fd);
break;
- case __SI_RT >> 16: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ >> 16:
+ case SIL_RT:
err |= __get_user(to->si_pid, &from->si_pid);
err |= __get_user(to->si_uid, &from->si_uid);
err |= __get_user(to->si_int, &from->si_int);
@@ -337,41 +337,40 @@ copy_siginfo_to_user32 (compat_siginfo_t __user *to, const siginfo_t *from)
at the same time. */
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
+ err |= __put_user(from->si_code, &to->si_code);
if (from->si_code < 0)
err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE);
else {
- switch (from->si_code >> 16) {
- case __SI_CHLD >> 16:
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_CHLD:
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
err |= __put_user(from->si_status, &to->si_status);
- default:
+ case SIL_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_FAULT >> 16:
+ case SIL_FAULT:
addr = ptr_to_compat(from->si_addr);
err |= __put_user(addr, &to->si_addr);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_TIMER >> 16:
+ case SIL_TIMER:
err |= __put_user(from->si_tid, &to->si_tid);
err |= __put_user(from->si_overrun, &to->si_overrun);
val = (compat_int_t)from->si_int;
err |= __put_user(val, &to->si_int);
break;
- case __SI_RT >> 16: /* Not generated by the kernel as of now. */
- case __SI_MESGQ >> 16:
+ case SIL_RT:
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_pid, &to->si_pid);
val = (compat_int_t)from->si_int;
err |= __put_user(val, &to->si_int);
break;
- case __SI_SYS >> 16:
+ case SIL_SYS:
err |= __put_user(ptr_to_compat(from->si_call_addr), &to->si_call_addr);
err |= __put_user(from->si_syscall, &to->si_syscall);
err |= __put_user(from->si_arch, &to->si_arch);
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 97bb1385e771..92fb1c8dbbd8 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -913,42 +913,40 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s)
*/
err = __put_user(s->si_signo, &d->si_signo);
err |= __put_user(s->si_errno, &d->si_errno);
- err |= __put_user((short)s->si_code, &d->si_code);
+ err |= __put_user(s->si_code, &d->si_code);
if (s->si_code < 0)
err |= __copy_to_user(&d->_sifields._pad, &s->_sifields._pad,
SI_PAD_SIZE32);
- else switch(s->si_code >> 16) {
- case __SI_CHLD >> 16:
+ else switch(siginfo_layout(s->si_signo, s->si_code)) {
+ case SIL_CHLD:
err |= __put_user(s->si_pid, &d->si_pid);
err |= __put_user(s->si_uid, &d->si_uid);
err |= __put_user(s->si_utime, &d->si_utime);
err |= __put_user(s->si_stime, &d->si_stime);
err |= __put_user(s->si_status, &d->si_status);
break;
- case __SI_FAULT >> 16:
+ case SIL_FAULT:
err |= __put_user((unsigned int)(unsigned long)s->si_addr,
&d->si_addr);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
err |= __put_user(s->si_band, &d->si_band);
err |= __put_user(s->si_fd, &d->si_fd);
break;
- case __SI_TIMER >> 16:
+ case SIL_TIMER:
err |= __put_user(s->si_tid, &d->si_tid);
err |= __put_user(s->si_overrun, &d->si_overrun);
err |= __put_user(s->si_int, &d->si_int);
break;
- case __SI_SYS >> 16:
+ case SIL_SYS:
err |= __put_user(ptr_to_compat(s->si_call_addr), &d->si_call_addr);
err |= __put_user(s->si_syscall, &d->si_syscall);
err |= __put_user(s->si_arch, &d->si_arch);
break;
- case __SI_RT >> 16: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ >> 16:
+ case SIL_RT:
err |= __put_user(s->si_int, &d->si_int);
/* fallthrough */
- case __SI_KILL >> 16:
- default:
+ case SIL_KILL:
err |= __put_user(s->si_pid, &d->si_pid);
err |= __put_user(s->si_uid, &d->si_uid);
break;
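Note the SIL_RT case above falls through into SIL_KILL, so a queued payload always travels with si_pid/si_uid, while negative si_code values (SI_QUEUE and friends) are handled by the blanket _pad copy before the switch. A companion userspace sketch exercising that path with sigqueue() (again not from the series):

	#include <signal.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static volatile sig_atomic_t got;
	static siginfo_t last;

	static void handler(int sig, siginfo_t *si, void *uctx)
	{
		last = *si;
		got = 1;
	}

	int main(void)
	{
		struct sigaction sa;
		union sigval val = { .sival_int = 42 };

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGRTMIN, &sa, NULL);

		sigqueue(getpid(), SIGRTMIN, val);	/* si_code = SI_QUEUE (-1) */
		while (!got)
			;

		printf("si_code=%d si_int=%d si_pid=%d\n", last.si_code,
		       last.si_value.sival_int, (int)last.si_pid);
		return 0;
	}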
diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h
new file mode 100644
index 000000000000..c02f4aba88a6
--- /dev/null
+++ b/arch/s390/include/asm/ap.h
@@ -0,0 +1,126 @@
+/*
+ * Adjunct processor (AP) interfaces
+ *
+ * Copyright IBM Corp. 2017
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): Tony Krowiak <akrowia@linux.vnet.ibm.com>
+ * Martin Schwidefsky <schwidefsky@de.ibm.com>
+ * Harald Freudenberger <freude@de.ibm.com>
+ */
+
+#ifndef _ASM_S390_AP_H_
+#define _ASM_S390_AP_H_
+
+/**
+ * The ap_qid_t identifier of an ap queue.
+ * If the AP facilities test (APFT) facility is available,
+ * card and queue index are 8 bit values, otherwise
+ * card index is 6 bit and queue index a 4 bit value.
+ */
+typedef unsigned int ap_qid_t;
+
+#define AP_MKQID(_card, _queue) (((_card) & 63) << 8 | ((_queue) & 255))
+#define AP_QID_CARD(_qid) (((_qid) >> 8) & 63)
+#define AP_QID_QUEUE(_qid) ((_qid) & 255)
+
+/**
+ * struct ap_queue_status - Holds the AP queue status.
+ * @queue_empty: Shows if queue is empty
+ * @replies_waiting: Waiting replies
+ * @queue_full: Is 1 if the queue is full
+ * @irq_enabled: Shows if interrupts are enabled for the AP
+ * @response_code: Holds the 8 bit response code
+ *
+ * The ap queue status word is returned by all three AP functions
+ * (PQAP, NQAP and DQAP). There's a set of flags in the first
+ * byte, followed by a 1 byte response code.
+ */
+struct ap_queue_status {
+ unsigned int queue_empty : 1;
+ unsigned int replies_waiting : 1;
+ unsigned int queue_full : 1;
+ unsigned int _pad1 : 4;
+ unsigned int irq_enabled : 1;
+ unsigned int response_code : 8;
+ unsigned int _pad2 : 16;
+};
+
+/**
+ * ap_test_queue(): Test adjunct processor queue.
+ * @qid: The AP queue number
+ * @tbit: Test facilities bit
+ * @info: Pointer to queue descriptor
+ *
+ * Returns AP queue status structure.
+ */
+struct ap_queue_status ap_test_queue(ap_qid_t qid,
+ int tbit,
+ unsigned long *info);
+
+struct ap_config_info {
+ unsigned int apsc : 1; /* S bit */
+ unsigned int apxa : 1; /* N bit */
+ unsigned int qact : 1; /* C bit */
+ unsigned int rc8a : 1; /* R bit */
+ unsigned char _reserved1 : 4;
+ unsigned char _reserved2[3];
+ unsigned char Na; /* max # of APs - 1 */
+ unsigned char Nd; /* max # of Domains - 1 */
+ unsigned char _reserved3[10];
+ unsigned int apm[8]; /* AP ID mask */
+ unsigned int aqm[8]; /* AP queue mask */
+ unsigned int adm[8]; /* AP domain mask */
+ unsigned char _reserved4[16];
+} __aligned(8);
+
+/*
+ * ap_query_configuration(): Fetch cryptographic config info
+ *
+ * Returns the ap configuration info fetched via PQAP(QCI).
+ * On success 0 is returned, on failure a negative errno
+ * is returned, e.g. if the PQAP(QCI) instruction is not
+ * available, the return value will be -EOPNOTSUPP.
+ */
+int ap_query_configuration(struct ap_config_info *info);
+
+/*
+ * struct ap_qirq_ctrl - convenient struct for easy invocation
+ * of the ap_queue_irq_ctrl() function. This struct is passed
+ * as GR1 parameter to the PQAP(AQIC) instruction. For details
+ * please see the AR documentation.
+ */
+struct ap_qirq_ctrl {
+ unsigned int _res1 : 8;
+ unsigned int zone : 8; /* zone info */
+ unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */
+ unsigned int _res2 : 4;
+ unsigned int gisc : 3; /* guest isc field */
+ unsigned int _res3 : 6;
+ unsigned int gf : 2; /* gisa format */
+ unsigned int _res4 : 1;
+ unsigned int gisa : 27; /* gisa origin */
+ unsigned int _res5 : 1;
+ unsigned int isc : 3; /* irq sub class */
+};
+
+/**
+ * ap_queue_irq_ctrl(): Control interruption on an AP queue.
+ * @qid: The AP queue number
+ * @qirqctrl: struct ap_qirq_ctrl, see above
+ * @ind: The notification indicator byte
+ *
+ * Returns AP queue status.
+ *
+ * Control interruption on the given AP queue.
+ * Just a simple wrapper function for the low level PQAP(AQIC)
+ * instruction available for other kernel modules.
+ */
+struct ap_queue_status ap_queue_irq_ctrl(ap_qid_t qid,
+ struct ap_qirq_ctrl qirqctrl,
+ void *ind);
+
+#endif /* _ASM_S390_AP_H_ */
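For orientation, a sketch of how a kernel-side consumer might use the new header; the card/domain numbers are invented:

	#include <linux/kernel.h>
	#include <asm/ap.h>

	static void ap_probe_sketch(void)
	{
		struct ap_config_info info;
		ap_qid_t qid;

		if (ap_query_configuration(&info))
			return;		/* e.g. -EOPNOTSUPP without PQAP(QCI) */

		qid = AP_MKQID(0, 1);	/* card 0, queue (domain) 1 */
		pr_info("qid 0x%04x: card %u queue %u, up to %u APs\n",
			qid, AP_QID_CARD(qid), AP_QID_QUEUE(qid),
			info.Na + 1);
	}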
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index bd6f30304518..3f46a6577b8d 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -5,12 +5,11 @@
#include <linux/errno.h>
typedef struct {
+ spinlock_t lock;
cpumask_t cpu_attach_mask;
atomic_t flush_count;
unsigned int flush_mm;
- spinlock_t pgtable_lock;
struct list_head pgtable_list;
- spinlock_t gmap_lock;
struct list_head gmap_list;
unsigned long gmap_asce;
unsigned long asce;
@@ -27,10 +26,8 @@ typedef struct {
} mm_context_t;
#define INIT_MM_CONTEXT(name) \
- .context.pgtable_lock = \
- __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \
+ .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \
.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
- .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
static inline int tprot(unsigned long addr)
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 72e9ca83a668..3c9abedc323c 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -17,9 +17,8 @@
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
- spin_lock_init(&mm->context.pgtable_lock);
+ spin_lock_init(&mm->context.lock);
INIT_LIST_HEAD(&mm->context.pgtable_list);
- spin_lock_init(&mm->context.gmap_lock);
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.flush_count, 0);
@@ -103,7 +102,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
if (prev == next)
return;
cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
- cpumask_set_cpu(cpu, mm_cpumask(next));
/* Clear old ASCE by loading the kernel ASCE. */
__ctl_load(S390_lowcore.kernel_asce, 1, 1);
__ctl_load(S390_lowcore.kernel_asce, 7, 7);
@@ -121,9 +119,8 @@ static inline void finish_arch_post_lock_switch(void)
preempt_disable();
while (atomic_read(&mm->context.flush_count))
cpu_relax();
-
- if (mm->context.flush_mm)
- __tlb_flush_mm(mm);
+ cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
+ __tlb_flush_mm_lazy(mm);
preempt_enable();
}
set_fs(current->thread.mm_segment);
@@ -136,6 +133,7 @@ static inline void activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
switch_mm(prev, next, current);
+ cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
set_user_asce(next);
}
diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h
index 4d759f8f4bc7..b08d5bc2666e 100644
--- a/arch/s390/include/asm/tlbflush.h
+++ b/arch/s390/include/asm/tlbflush.h
@@ -48,23 +48,6 @@ static inline void __tlb_flush_global(void)
* Flush TLB entries for a specific mm on all CPUs (in case gmap is used
* this implicates multiple ASCEs!).
*/
-static inline void __tlb_flush_full(struct mm_struct *mm)
-{
- preempt_disable();
- atomic_inc(&mm->context.flush_count);
- if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
- /* Local TLB flush */
- __tlb_flush_local();
- } else {
- /* Global TLB flush */
- __tlb_flush_global();
- /* Reset TLB flush mask */
- cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask);
- }
- atomic_dec(&mm->context.flush_count);
- preempt_enable();
-}
-
static inline void __tlb_flush_mm(struct mm_struct *mm)
{
unsigned long gmap_asce;
@@ -76,16 +59,18 @@ static inline void __tlb_flush_mm(struct mm_struct *mm)
*/
preempt_disable();
atomic_inc(&mm->context.flush_count);
+ /* Reset TLB flush mask */
+ cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask);
+ barrier();
gmap_asce = READ_ONCE(mm->context.gmap_asce);
if (MACHINE_HAS_IDTE && gmap_asce != -1UL) {
if (gmap_asce)
__tlb_flush_idte(gmap_asce);
__tlb_flush_idte(mm->context.asce);
} else {
- __tlb_flush_full(mm);
+ /* Global TLB flush */
+ __tlb_flush_global();
}
- /* Reset TLB flush mask */
- cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask);
atomic_dec(&mm->context.flush_count);
preempt_enable();
}
@@ -99,7 +84,6 @@ static inline void __tlb_flush_kernel(void)
}
#else
#define __tlb_flush_global() __tlb_flush_local()
-#define __tlb_flush_full(mm) __tlb_flush_local()
/*
* Flush TLB entries for a specific ASCE on all CPUs.
@@ -117,10 +101,12 @@ static inline void __tlb_flush_kernel(void)
static inline void __tlb_flush_mm_lazy(struct mm_struct * mm)
{
+ spin_lock(&mm->context.lock);
if (mm->context.flush_mm) {
- __tlb_flush_mm(mm);
mm->context.flush_mm = 0;
+ __tlb_flush_mm(mm);
}
+ spin_unlock(&mm->context.lock);
}
/*
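Clearing flush_mm before the flush, with both steps under the single context.lock, is what closes the race: a flush request raised concurrently after the clear is kept for the next lazy flush instead of being wiped out. The shape of the pattern, as a generic sketch:

	spin_lock(&ctx->lock);
	if (ctx->flush_pending) {
		ctx->flush_pending = 0;	/* clear first ... */
		do_flush(ctx);		/* ... then flush; a request that
					 * races in after the clear survives */
	}
	spin_unlock(&ctx->lock);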
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index c620049c61f2..f549c4657376 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -75,35 +75,34 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
at the same time. */
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
+ err |= __put_user(from->si_code, &to->si_code);
if (from->si_code < 0)
err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE);
else {
- switch (from->si_code >> 16) {
- case __SI_RT >> 16: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ >> 16:
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_RT:
err |= __put_user(from->si_int, &to->si_int);
/* fallthrough */
- case __SI_KILL >> 16:
+ case SIL_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_CHLD >> 16:
+ case SIL_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
err |= __put_user(from->si_status, &to->si_status);
break;
- case __SI_FAULT >> 16:
+ case SIL_FAULT:
err |= __put_user((unsigned long) from->si_addr,
&to->si_addr);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_TIMER >> 16:
+ case SIL_TIMER:
err |= __put_user(from->si_tid, &to->si_tid);
err |= __put_user(from->si_overrun, &to->si_overrun);
err |= __put_user(from->si_int, &to->si_int);
@@ -127,32 +126,31 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
if (to->si_code < 0)
err |= __copy_from_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE);
else {
- switch (to->si_code >> 16) {
- case __SI_RT >> 16: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ >> 16:
+ switch (siginfo_layout(to->si_signo, to->si_code)) {
+ case SIL_RT:
err |= __get_user(to->si_int, &from->si_int);
/* fallthrough */
- case __SI_KILL >> 16:
+ case SIL_KILL:
err |= __get_user(to->si_pid, &from->si_pid);
err |= __get_user(to->si_uid, &from->si_uid);
break;
- case __SI_CHLD >> 16:
+ case SIL_CHLD:
err |= __get_user(to->si_pid, &from->si_pid);
err |= __get_user(to->si_uid, &from->si_uid);
err |= __get_user(to->si_utime, &from->si_utime);
err |= __get_user(to->si_stime, &from->si_stime);
err |= __get_user(to->si_status, &from->si_status);
break;
- case __SI_FAULT >> 16:
+ case SIL_FAULT:
err |= __get_user(tmp, &from->si_addr);
to->si_addr = (void __force __user *)
(u64) (tmp & PSW32_ADDR_INSN);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
err |= __get_user(to->si_band, &from->si_band);
err |= __get_user(to->si_fd, &from->si_fd);
break;
- case __SI_TIMER >> 16:
+ case SIL_TIMER:
err |= __get_user(to->si_tid, &from->si_tid);
err |= __get_user(to->si_overrun, &from->si_overrun);
err |= __get_user(to->si_int, &from->si_int);
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 9e1494e3d849..2f66290c9b92 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -100,14 +100,14 @@ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
if (!gmap)
return NULL;
gmap->mm = mm;
- spin_lock(&mm->context.gmap_lock);
+ spin_lock(&mm->context.lock);
list_add_rcu(&gmap->list, &mm->context.gmap_list);
if (list_is_singular(&mm->context.gmap_list))
gmap_asce = gmap->asce;
else
gmap_asce = -1UL;
WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
- spin_unlock(&mm->context.gmap_lock);
+ spin_unlock(&mm->context.lock);
return gmap;
}
EXPORT_SYMBOL_GPL(gmap_create);
@@ -248,7 +248,7 @@ void gmap_remove(struct gmap *gmap)
spin_unlock(&gmap->shadow_lock);
}
/* Remove gmap from the pre-mm list */
- spin_lock(&gmap->mm->context.gmap_lock);
+ spin_lock(&gmap->mm->context.lock);
list_del_rcu(&gmap->list);
if (list_empty(&gmap->mm->context.gmap_list))
gmap_asce = 0;
@@ -258,7 +258,7 @@ void gmap_remove(struct gmap *gmap)
else
gmap_asce = -1UL;
WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
- spin_unlock(&gmap->mm->context.gmap_lock);
+ spin_unlock(&gmap->mm->context.lock);
synchronize_rcu();
/* Put reference */
gmap_put(gmap);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index c5b74dd61197..05f1f27e6708 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -83,7 +83,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
int rc, notify;
/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
- BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
+ VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
if (end >= TASK_SIZE_MAX)
return -ENOMEM;
rc = 0;
@@ -124,7 +124,7 @@ void crst_table_downgrade(struct mm_struct *mm)
pgd_t *pgd;
/* downgrade should only happen from 3 to 2 levels (compat only) */
- BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
+ VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
if (current->active_mm == mm) {
clear_user_asce();
@@ -188,7 +188,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Try to get a fragment of a 4K page as a 2K page table */
if (!mm_alloc_pgste(mm)) {
table = NULL;
- spin_lock_bh(&mm->context.pgtable_lock);
+ spin_lock_bh(&mm->context.lock);
if (!list_empty(&mm->context.pgtable_list)) {
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
@@ -203,7 +203,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
list_del(&page->lru);
}
}
- spin_unlock_bh(&mm->context.pgtable_lock);
+ spin_unlock_bh(&mm->context.lock);
if (table)
return table;
}
@@ -227,9 +227,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Return the first 2K fragment of the page */
atomic_set(&page->_mapcount, 1);
clear_table(table, _PAGE_INVALID, PAGE_SIZE);
- spin_lock_bh(&mm->context.pgtable_lock);
+ spin_lock_bh(&mm->context.lock);
list_add(&page->lru, &mm->context.pgtable_list);
- spin_unlock_bh(&mm->context.pgtable_lock);
+ spin_unlock_bh(&mm->context.lock);
}
return table;
}
@@ -243,13 +243,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.pgtable_lock);
+ spin_lock_bh(&mm->context.lock);
mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
if (mask & 3)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.pgtable_lock);
+ spin_unlock_bh(&mm->context.lock);
if (mask != 0)
return;
}
@@ -275,13 +275,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
return;
}
bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.pgtable_lock);
+ spin_lock_bh(&mm->context.lock);
mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
if (mask & 3)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.pgtable_lock);
+ spin_unlock_bh(&mm->context.lock);
table = (unsigned long *) (__pa(table) | (1U << bit));
tlb_remove_table(tlb, table);
}
diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c
index 9d597f7ab8dd..48aaefd8f5d6 100644
--- a/arch/sh/drivers/pci/fixups-dreamcast.c
+++ b/arch/sh/drivers/pci/fixups-dreamcast.c
@@ -63,11 +63,10 @@ static void gapspci_fixup_resources(struct pci_dev *dev)
res.end = GAPSPCI_DMA_BASE + GAPSPCI_DMA_SIZE - 1;
res.flags = IORESOURCE_MEM;
pcibios_resource_to_bus(dev->bus, &region, &res);
- BUG_ON(!dma_declare_coherent_memory(&dev->dev,
+ BUG_ON(dma_declare_coherent_memory(&dev->dev,
res.start,
region.start,
resource_size(&res),
- DMA_MEMORY_MAP |
DMA_MEMORY_EXCLUSIVE));
break;
default:
diff --git a/arch/sparc/include/uapi/asm/siginfo.h b/arch/sparc/include/uapi/asm/siginfo.h
index 2d9b79ccaa50..157f46fe374f 100644
--- a/arch/sparc/include/uapi/asm/siginfo.h
+++ b/arch/sparc/include/uapi/asm/siginfo.h
@@ -17,9 +17,16 @@
#define SI_NOINFO 32767 /* no information in siginfo_t */
/*
+ * SIGFPE si_codes
+ */
+#ifdef __KERNEL__
+#define FPE_FIXME 0 /* Broken dup of SI_USER */
+#endif /* __KERNEL__ */
+
+/*
* SIGEMT si_codes
*/
-#define EMT_TAGOVF (__SI_FAULT|1) /* tag overflow */
+#define EMT_TAGOVF 1 /* tag overflow */
#define NSIGEMT 1
#endif /* _UAPI__SPARC_SIGINFO_H */
diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c
index b4096bb665b2..0e4c08c45a37 100644
--- a/arch/sparc/kernel/signal32.c
+++ b/arch/sparc/kernel/signal32.c
@@ -85,34 +85,34 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
at the same time. */
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
+ err |= __put_user(from->si_code, &to->si_code);
if (from->si_code < 0)
err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE);
else {
- switch (from->si_code >> 16) {
- case __SI_TIMER >> 16:
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_TIMER:
err |= __put_user(from->si_tid, &to->si_tid);
err |= __put_user(from->si_overrun, &to->si_overrun);
err |= __put_user(from->si_int, &to->si_int);
break;
- case __SI_CHLD >> 16:
+ case SIL_CHLD:
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
err |= __put_user(from->si_status, &to->si_status);
default:
+ case SIL_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_FAULT >> 16:
+ case SIL_FAULT:
err |= __put_user(from->si_trapno, &to->si_trapno);
err |= __put_user((unsigned long)from->si_addr, &to->si_addr);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_RT >> 16: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ >> 16:
+ case SIL_RT:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_int, &to->si_int);
diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c
index 466d4aed06c7..581cf35ee7e3 100644
--- a/arch/sparc/kernel/traps_32.c
+++ b/arch/sparc/kernel/traps_32.c
@@ -306,7 +306,7 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc,
info.si_errno = 0;
info.si_addr = (void __user *)pc;
info.si_trapno = 0;
- info.si_code = __SI_FAULT;
+ info.si_code = FPE_FIXME;
if ((fsr & 0x1c000) == (1 << 14)) {
if (fsr & 0x10)
info.si_code = FPE_FLTINV;
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index c74f2dffcc13..0a56dc257cb9 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -2303,7 +2303,7 @@ static void do_fpe_common(struct pt_regs *regs)
info.si_errno = 0;
info.si_addr = (void __user *)regs->tpc;
info.si_trapno = 0;
- info.si_code = __SI_FAULT;
+ info.si_code = FPE_FIXME;
if ((fsr & 0x1c000) == (1 << 14)) {
if (fsr & 0x10)
info.si_code = FPE_FLTINV;
diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h
index bbc71a29b2c6..7061dc8af43a 100644
--- a/arch/tile/include/asm/dma-mapping.h
+++ b/arch/tile/include/asm/dma-mapping.h
@@ -68,8 +68,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
int dma_set_mask(struct device *dev, u64 mask);
/*
- * dma_alloc_noncoherent() is #defined to return coherent memory,
- * so there's no need to do any flushing here.
+ * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to
+ * do any flushing here.
*/
static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
enum dma_data_direction direction)
diff --git a/arch/tile/include/uapi/asm/siginfo.h b/arch/tile/include/uapi/asm/siginfo.h
index 56d661bb010b..e83f931aa1f0 100644
--- a/arch/tile/include/uapi/asm/siginfo.h
+++ b/arch/tile/include/uapi/asm/siginfo.h
@@ -26,8 +26,8 @@
/*
* Additional Tile-specific SIGILL si_codes
*/
-#define ILL_DBLFLT (__SI_FAULT|9) /* double fault */
-#define ILL_HARDWALL (__SI_FAULT|10) /* user networks hardwall violation */
+#define ILL_DBLFLT 9 /* double fault */
+#define ILL_HARDWALL 10 /* user networks hardwall violation */
#undef NSIGILL
#define NSIGILL 10
diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c
index 0e863f1ee08c..971d87a1d8cf 100644
--- a/arch/tile/kernel/compat_signal.c
+++ b/arch/tile/kernel/compat_signal.c
@@ -64,7 +64,7 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *fr
3 ints plus the relevant union member. */
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
+ err |= __put_user(from->si_code, &to->si_code);
if (from->si_code < 0) {
err |= __put_user(from->si_pid, &to->si_pid);
@@ -77,28 +77,26 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *fr
*/
err |= __put_user(from->_sifields._pad[0],
&to->_sifields._pad[0]);
- switch (from->si_code >> 16) {
- case __SI_FAULT >> 16:
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_FAULT:
break;
- case __SI_CHLD >> 16:
+ case SIL_CHLD:
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
err |= __put_user(from->si_status, &to->si_status);
/* FALL THROUGH */
default:
- case __SI_KILL >> 16:
+ case SIL_KILL:
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_TIMER >> 16:
+ case SIL_TIMER:
err |= __put_user(from->si_overrun, &to->si_overrun);
err |= __put_user(from->si_int, &to->si_int);
break;
- /* This is not generated by the kernel as of now. */
- case __SI_RT >> 16:
- case __SI_MESGQ >> 16:
+ case SIL_RT:
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_int, &to->si_int);
break;
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index 54804866f238..9b08c6055f15 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -188,7 +188,7 @@ static int special_ill(tile_bundle_bits bundle, int *sigp, int *codep)
/* Make it the requested signal. */
*sigp = sig;
- *codep = code | __SI_FAULT;
+ *codep = code;
return 1;
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a3e6e6136a47..971feac13506 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -53,7 +53,6 @@ config X86
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOV if X86_64
- select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_PMEM_API if X86_64
# Causing hangs/crashes, see the commit that added this change for details.
select ARCH_HAS_REFCOUNT if BROKEN
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 8b4140f6724f..cb9a1af109b4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -7,6 +7,4 @@
void clflush_cache_range(void *addr, unsigned int size);
-#define mmio_flush_range(addr, size) clflush_cache_range(addr, size)
-
#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 8e618fcf1f7c..6a77c63540f7 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -21,7 +21,7 @@
#ifdef CONFIG_AMD_MEM_ENCRYPT
-extern unsigned long sme_me_mask;
+extern u64 sme_me_mask;
void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
unsigned long decrypted_kernel_vaddr,
@@ -49,7 +49,7 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
#else /* !CONFIG_AMD_MEM_ENCRYPT */
-#define sme_me_mask 0UL
+#define sme_me_mask 0ULL
static inline void __init sme_early_encrypt(resource_size_t paddr,
unsigned long size) { }
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 831eb7895535..c471ca1f9412 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -86,7 +86,6 @@ static inline void e820__memblock_alloc_reserved_mpc_new(void) { }
#endif
int generic_processor_info(int apicid, int version);
-int __generic_processor_info(int apicid, int version, bool enabled);
#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 8315e2f517a7..d705c769f77d 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2130,7 +2130,7 @@ int generic_processor_info(int apicid, int version)
* Since fixing handling of boot_cpu_physical_apicid requires
* another discussion and tests on each platform, we leave it
* for now and here we use read_apic_id() directly in this
- * function, __generic_processor_info().
+ * function, generic_processor_info().
*/
if (disabled_cpu_apicid != BAD_APICID &&
disabled_cpu_apicid != read_apic_id() &&
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index 71beb28600d4..ab9feb5887b1 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -129,7 +129,7 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
3 ints plus the relevant union member. */
put_user_ex(from->si_signo, &to->si_signo);
put_user_ex(from->si_errno, &to->si_errno);
- put_user_ex((short)from->si_code, &to->si_code);
+ put_user_ex(from->si_code, &to->si_code);
if (from->si_code < 0) {
put_user_ex(from->si_pid, &to->si_pid);
@@ -142,8 +142,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
*/
put_user_ex(from->_sifields._pad[0],
&to->_sifields._pad[0]);
- switch (from->si_code >> 16) {
- case __SI_FAULT >> 16:
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_FAULT:
if (from->si_signo == SIGBUS &&
(from->si_code == BUS_MCEERR_AR ||
from->si_code == BUS_MCEERR_AO))
@@ -160,11 +160,11 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
put_user_ex(from->si_pkey, &to->si_pkey);
}
break;
- case __SI_SYS >> 16:
+ case SIL_SYS:
put_user_ex(from->si_syscall, &to->si_syscall);
put_user_ex(from->si_arch, &to->si_arch);
break;
- case __SI_CHLD >> 16:
+ case SIL_CHLD:
if (!x32_ABI) {
put_user_ex(from->si_utime, &to->si_utime);
put_user_ex(from->si_stime, &to->si_stime);
@@ -174,21 +174,18 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
}
put_user_ex(from->si_status, &to->si_status);
/* FALL THROUGH */
- default:
- case __SI_KILL >> 16:
+ case SIL_KILL:
put_user_ex(from->si_uid, &to->si_uid);
break;
- case __SI_POLL >> 16:
+ case SIL_POLL:
put_user_ex(from->si_fd, &to->si_fd);
break;
- case __SI_TIMER >> 16:
+ case SIL_TIMER:
put_user_ex(from->si_overrun, &to->si_overrun);
put_user_ex(ptr_to_compat(from->si_ptr),
&to->si_ptr);
break;
- /* This is not generated by the kernel as of now. */
- case __SI_RT >> 16:
- case __SI_MESGQ >> 16:
+ case SIL_RT:
put_user_ex(from->si_uid, &to->si_uid);
put_user_ex(from->si_int, &to->si_int);
break;
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 0fbd09269757..3fcc8e01683b 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -37,7 +37,7 @@ static char sme_cmdline_off[] __initdata = "off";
* reside in the .data section so as not to be zeroed out when the .bss
* section is later cleared.
*/
-unsigned long sme_me_mask __section(.data) = 0;
+u64 sme_me_mask __section(.data) = 0;
EXPORT_SYMBOL_GPL(sme_me_mask);
/* Buffer used for early in-place encryption by BSP, no locking needed */
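The switch to u64 matters because the mask is applied to 64-bit page table entries and physical addresses; as an unsigned long it truncates them on 32-bit builds. A runnable illustration of the 0UL vs 0ULL difference (the first result assumes a target with 32-bit long):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t phys = 0x123456789aULL;

		/* With 32-bit long, ~0UL widens to 0x00000000ffffffff and
		 * masks off the upper half; ~0ULL keeps all 64 bits. */
		printf("%llx\n", (unsigned long long)(phys & ~0UL));
		printf("%llx\n", (unsigned long long)(phys & ~0ULL));
		return 0;
	}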
diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig
index 6d3351452ea2..929ba4da0b30 100644
--- a/drivers/acpi/nfit/Kconfig
+++ b/drivers/acpi/nfit/Kconfig
@@ -2,7 +2,7 @@ config ACPI_NFIT
tristate "ACPI NVDIMM Firmware Interface Table (NFIT)"
depends on PHYS_ADDR_T_64BIT
depends on BLK_DEV
- depends on ARCH_HAS_MMIO_FLUSH
+ depends on ARCH_HAS_PMEM_API
select LIBNVDIMM
help
Infrastructure to probe ACPI 6 compliant platforms for
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 1893e416e7c0..9c2c49b6a240 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -228,6 +228,10 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
if (cmd == ND_CMD_CALL) {
call_pkg = buf;
func = call_pkg->nd_command;
+
+ for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++)
+ if (call_pkg->nd_reserved2[i])
+ return -EINVAL;
}
if (nvdimm) {
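The check added above rejects ND_CMD_CALL packages whose reserved bytes are nonzero, keeping the field available for future extension. A hedged userspace sketch of a conforming caller — struct, constant and ioctl names as assumed from include/uapi/linux/ndctl.h:

	struct nd_cmd_pkg pkg;

	memset(&pkg, 0, sizeof(pkg));		/* keeps nd_reserved2[] zeroed */
	pkg.nd_family = NVDIMM_FAMILY_INTEL;	/* assumed family constant */
	pkg.nd_command = 1;			/* vendor-defined function number */
	if (ioctl(fd, ND_IOCTL_CALL, &pkg) < 0)
		perror("ND_IOCTL_CALL");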
@@ -1674,8 +1678,19 @@ static ssize_t range_index_show(struct device *dev,
}
static DEVICE_ATTR_RO(range_index);
+static ssize_t ecc_unit_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nd_region *nd_region = to_nd_region(dev);
+ struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
+
+ return sprintf(buf, "%d\n", nfit_spa->clear_err_unit);
+}
+static DEVICE_ATTR_RO(ecc_unit_size);
+
static struct attribute *acpi_nfit_region_attributes[] = {
&dev_attr_range_index.attr,
+ &dev_attr_ecc_unit_size.attr,
NULL,
};
@@ -1804,6 +1819,7 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc,
spa->range_index, i);
+ struct acpi_nfit_control_region *dcr = nfit_mem->dcr;
if (!memdev || !nfit_mem->dcr) {
dev_err(dev, "%s: failed to find DCR\n", __func__);
@@ -1811,13 +1827,13 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
}
map->region_offset = memdev->region_offset;
- map->serial_number = nfit_mem->dcr->serial_number;
+ map->serial_number = dcr->serial_number;
map2->region_offset = memdev->region_offset;
- map2->serial_number = nfit_mem->dcr->serial_number;
- map2->vendor_id = nfit_mem->dcr->vendor_id;
- map2->manufacturing_date = nfit_mem->dcr->manufacturing_date;
- map2->manufacturing_location = nfit_mem->dcr->manufacturing_location;
+ map2->serial_number = dcr->serial_number;
+ map2->vendor_id = dcr->vendor_id;
+ map2->manufacturing_date = dcr->manufacturing_date;
+ map2->manufacturing_location = dcr->manufacturing_location;
}
/* v1.1 namespaces */
@@ -1835,6 +1851,28 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc,
cmp_map_compat, NULL);
nd_set->altcookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0);
+ /* record the result of the sort for the mapping position */
+ for (i = 0; i < nr; i++) {
+ struct nfit_set_info_map2 *map2 = &info2->mapping[i];
+ int j;
+
+ for (j = 0; j < nr; j++) {
+ struct nd_mapping_desc *mapping = &ndr_desc->mapping[j];
+ struct nvdimm *nvdimm = mapping->nvdimm;
+ struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+ struct acpi_nfit_control_region *dcr = nfit_mem->dcr;
+
+ if (map2->serial_number == dcr->serial_number &&
+ map2->vendor_id == dcr->vendor_id &&
+ map2->manufacturing_date == dcr->manufacturing_date &&
+ map2->manufacturing_location
+ == dcr->manufacturing_location) {
+ mapping->position = i;
+ break;
+ }
+ }
+ }
+
ndr_desc->nd_set = nd_set;
devm_kfree(dev, info);
devm_kfree(dev, info2);
@@ -1930,7 +1968,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk,
memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c);
else {
if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH)
- mmio_flush_range((void __force *)
+ arch_invalidate_pmem((void __force *)
mmio->addr.aperture + offset, c);
memcpy(iobuf + copied, mmio->addr.aperture + offset, c);
diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c
index 1c152aed6b82..a39b2166b145 100644
--- a/drivers/base/dma-coherent.c
+++ b/drivers/base/dma-coherent.c
@@ -37,7 +37,7 @@ static inline dma_addr_t dma_get_device_base(struct device *dev,
return mem->device_base;
}
-static bool dma_init_coherent_memory(
+static int dma_init_coherent_memory(
phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags,
struct dma_coherent_mem **mem)
{
@@ -45,25 +45,28 @@ static bool dma_init_coherent_memory(
void __iomem *mem_base = NULL;
int pages = size >> PAGE_SHIFT;
int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+ int ret;
- if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
- goto out;
- if (!size)
+ if (!size) {
+ ret = -EINVAL;
goto out;
+ }
- if (flags & DMA_MEMORY_MAP)
- mem_base = memremap(phys_addr, size, MEMREMAP_WC);
- else
- mem_base = ioremap(phys_addr, size);
- if (!mem_base)
+ mem_base = memremap(phys_addr, size, MEMREMAP_WC);
+ if (!mem_base) {
+ ret = -EINVAL;
goto out;
-
+ }
dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
- if (!dma_mem)
+ if (!dma_mem) {
+ ret = -ENOMEM;
goto out;
+ }
dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dma_mem->bitmap)
+ if (!dma_mem->bitmap) {
+ ret = -ENOMEM;
goto out;
+ }
dma_mem->virt_base = mem_base;
dma_mem->device_base = device_addr;
@@ -73,17 +76,13 @@ static bool dma_init_coherent_memory(
spin_lock_init(&dma_mem->spinlock);
*mem = dma_mem;
- return true;
+ return 0;
out:
kfree(dma_mem);
- if (mem_base) {
- if (flags & DMA_MEMORY_MAP)
- memunmap(mem_base);
- else
- iounmap(mem_base);
- }
- return false;
+ if (mem_base)
+ memunmap(mem_base);
+ return ret;
}
static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
@@ -91,10 +90,7 @@ static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
if (!mem)
return;
- if (mem->flags & DMA_MEMORY_MAP)
- memunmap(mem->virt_base);
- else
- iounmap(mem->virt_base);
+ memunmap(mem->virt_base);
kfree(mem->bitmap);
kfree(mem);
}
@@ -109,8 +105,6 @@ static int dma_assign_coherent_memory(struct device *dev,
return -EBUSY;
dev->dma_mem = mem;
- /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
return 0;
}
@@ -118,16 +112,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
dma_addr_t device_addr, size_t size, int flags)
{
struct dma_coherent_mem *mem;
+ int ret;
- if (!dma_init_coherent_memory(phys_addr, device_addr, size, flags,
- &mem))
- return 0;
-
- if (dma_assign_coherent_memory(dev, mem) == 0)
- return flags & DMA_MEMORY_MAP ? DMA_MEMORY_MAP : DMA_MEMORY_IO;
+ ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem);
+ if (ret)
+ return ret;
- dma_release_coherent_memory(mem);
- return 0;
+ ret = dma_assign_coherent_memory(dev, mem);
+ if (ret)
+ dma_release_coherent_memory(mem);
+ return ret;
}
EXPORT_SYMBOL(dma_declare_coherent_memory);
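
Note the calling-convention flip this rework implies: dma_declare_coherent_memory() used to return the granted flags (nonzero on success, 0 on failure) and now returns 0 or a negative errno, so every caller's success test inverts. A sketch of a converted call site (compare the sh_mobile_ceu_camera hunk further below):

	ret = dma_declare_coherent_memory(dev, phys, device_addr, size,
					  DMA_MEMORY_EXCLUSIVE);
	if (ret)		/* was: if (!ret) return -ENXIO; */
		return ret;	/* the errno now propagates from the core */
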
@@ -171,7 +165,6 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
int order = get_order(size);
unsigned long flags;
int pageno;
- int dma_memory_map;
void *ret;
spin_lock_irqsave(&mem->spinlock, flags);
@@ -188,15 +181,9 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
*/
*dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
ret = mem->virt_base + (pageno << PAGE_SHIFT);
- dma_memory_map = (mem->flags & DMA_MEMORY_MAP);
spin_unlock_irqrestore(&mem->spinlock, flags);
- if (dma_memory_map)
- memset(ret, 0, size);
- else
- memset_io(ret, 0, size);
-
+ memset(ret, 0, size);
return ret;
-
err:
spin_unlock_irqrestore(&mem->spinlock, flags);
return NULL;
@@ -359,14 +346,18 @@ static struct reserved_mem *dma_reserved_default_memory __initdata;
static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
{
struct dma_coherent_mem *mem = rmem->priv;
+ int ret;
- if (!mem &&
- !dma_init_coherent_memory(rmem->base, rmem->base, rmem->size,
- DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE,
- &mem)) {
+ if (!mem)
+ return -ENODEV;
+
+ ret = dma_init_coherent_memory(rmem->base, rmem->base, rmem->size,
+ DMA_MEMORY_EXCLUSIVE, &mem);
+
+ if (ret) {
pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n",
&rmem->base, (unsigned long)rmem->size / SZ_1M);
- return -ENODEV;
+ return ret;
}
mem->use_dev_dma_pfn_offset = true;
rmem->priv = mem;
diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c
index b555ff9dd8fc..e584eddef0a7 100644
--- a/drivers/base/dma-mapping.c
+++ b/drivers/base/dma-mapping.c
@@ -176,13 +176,10 @@ int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size,
flags);
- if (rc) {
+ if (!rc)
devres_add(dev, res);
- rc = 0;
- } else {
+ else
devres_free(res);
- rc = -ENOMEM;
- }
return rc;
}
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index b008b6a98098..b640ad8a6d20 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3435,7 +3435,7 @@ static void rbd_acquire_lock(struct work_struct *work)
struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
struct rbd_device, lock_dwork);
enum rbd_lock_state lock_state;
- int ret;
+ int ret = 0;
dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 23f33f95d4a6..d1aed2513bd9 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -451,9 +451,6 @@ static struct port_buffer *alloc_buf(struct virtqueue *vq, size_t buf_size,
* device is created by remoteproc, the DMA memory is
* associated with the grandparent device:
* vdev => rproc => platform-dev.
- * The code here would have been less quirky if
- * DMA_MEMORY_INCLUDES_CHILDREN had been supported
- * in dma-coherent.c
*/
if (!vq->vdev->dev.parent || !vq->vdev->dev.parent->parent)
goto free_buf;
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 938eb4868f7f..3600ff786646 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -46,6 +46,8 @@ void dax_read_unlock(int id)
EXPORT_SYMBOL_GPL(dax_read_unlock);
#ifdef CONFIG_BLOCK
+#include <linux/blkdev.h>
+
int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
pgoff_t *pgoff)
{
@@ -59,6 +61,16 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
}
EXPORT_SYMBOL(bdev_dax_pgoff);
+#if IS_ENABLED(CONFIG_FS_DAX)
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+{
+ if (!blk_queue_dax(bdev->bd_queue))
+ return NULL;
+ return fs_dax_get_by_host(bdev->bd_disk->disk_name);
+}
+EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
+#endif
+
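
A filesystem would typically use the new helper at mount time; a hedged usage sketch (the error choice and surrounding code are hypothetical):

	struct dax_device *dax_dev;

	dax_dev = fs_dax_get_by_bdev(bdev);
	if (!dax_dev)			/* queue does not advertise DAX */
		return -EOPNOTSUPP;
	/* ... issue dax_direct_access() etc. against dax_dev ... */
	put_dax(dax_dev);		/* drop the reference when finished */
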
/**
* __bdev_dax_supported() - Check if the device supports dax for filesystem
* @sb: The superblock of the device
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index bf3672a81e49..d2fcafcea07e 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -534,7 +534,7 @@ static void
cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
int sec_no)
{
- uuid_le *sec_type = (uuid_le *)gdata->section_type;
+ guid_t *sec_type = (guid_t *)gdata->section_type;
__u16 severity;
char newpfx[64];
@@ -545,12 +545,12 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
printk("%s""Error %d, type: %s\n", pfx, sec_no,
cper_severity_str(severity));
if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
- printk("%s""fru_id: %pUl\n", pfx, (uuid_le *)gdata->fru_id);
+ printk("%s""fru_id: %pUl\n", pfx, gdata->fru_id);
if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT)
printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text);
snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP);
- if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) {
+ if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) {
struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata);
printk("%s""section_type: general processor error\n", newpfx);
@@ -558,7 +558,7 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
cper_print_proc_generic(newpfx, proc_err);
else
goto err_section_too_small;
- } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) {
+ } else if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
printk("%s""section_type: memory error\n", newpfx);
@@ -568,7 +568,7 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
gdata->error_data_length);
else
goto err_section_too_small;
- } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) {
+ } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
struct cper_sec_pcie *pcie = acpi_hest_get_payload(gdata);
printk("%s""section_type: PCIe error\n", newpfx);
diff --git a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c b/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c
index 96dc01750bc0..36762ec954e7 100644
--- a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c
+++ b/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c
@@ -1708,11 +1708,10 @@ static int sh_mobile_ceu_probe(struct platform_device *pdev)
err = dma_declare_coherent_memory(&pdev->dev, res->start,
res->start,
resource_size(res),
- DMA_MEMORY_MAP |
DMA_MEMORY_EXCLUSIVE);
- if (!err) {
+ if (err) {
dev_err(&pdev->dev, "Unable to declare CEU memory.\n");
- return -ENXIO;
+ return err;
}
pcdev->video_limit = resource_size(res);
diff --git a/drivers/net/ethernet/amd/au1000_eth.c b/drivers/net/ethernet/amd/au1000_eth.c
index a3c90fe5de00..73ca8879ada7 100644
--- a/drivers/net/ethernet/amd/au1000_eth.c
+++ b/drivers/net/ethernet/amd/au1000_eth.c
@@ -1180,9 +1180,10 @@ static int au1000_probe(struct platform_device *pdev)
/* Allocate the data buffers
* Snooping works fine with eth on all au1xxx
*/
- aup->vaddr = (u32)dma_alloc_noncoherent(NULL, MAX_BUF_SIZE *
- (NUM_TX_BUFFS + NUM_RX_BUFFS),
- &aup->dma_addr, 0);
+ aup->vaddr = (u32)dma_alloc_attrs(NULL, MAX_BUF_SIZE *
+ (NUM_TX_BUFFS + NUM_RX_BUFFS),
+ &aup->dma_addr, 0,
+ DMA_ATTR_NON_CONSISTENT);
if (!aup->vaddr) {
dev_err(&pdev->dev, "failed to allocate data buffers\n");
err = -ENOMEM;
@@ -1361,8 +1362,9 @@ err_remap3:
err_remap2:
iounmap(aup->mac);
err_remap1:
- dma_free_noncoherent(NULL, MAX_BUF_SIZE * (NUM_TX_BUFFS + NUM_RX_BUFFS),
- (void *)aup->vaddr, aup->dma_addr);
+ dma_free_attrs(NULL, MAX_BUF_SIZE * (NUM_TX_BUFFS + NUM_RX_BUFFS),
+ (void *)aup->vaddr, aup->dma_addr,
+ DMA_ATTR_NON_CONSISTENT);
err_vaddr:
free_netdev(dev);
err_alloc:
@@ -1394,9 +1396,9 @@ static int au1000_remove(struct platform_device *pdev)
if (aup->tx_db_inuse[i])
au1000_ReleaseDB(aup, aup->tx_db_inuse[i]);
- dma_free_noncoherent(NULL, MAX_BUF_SIZE *
- (NUM_TX_BUFFS + NUM_RX_BUFFS),
- (void *)aup->vaddr, aup->dma_addr);
+ dma_free_attrs(NULL, MAX_BUF_SIZE * (NUM_TX_BUFFS + NUM_RX_BUFFS),
+ (void *)aup->vaddr, aup->dma_addr,
+ DMA_ATTR_NON_CONSISTENT);
iounmap(aup->macdma);
iounmap(aup->mac);
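
This driver and the i825xx/sgiseeq conversions below are the same mechanical substitution: the dma_{alloc,free}_noncoherent() helpers are replaced by the generic attrs API with DMA_ATTR_NON_CONSISTENT, which preserves the non-coherent semantics. In sketch form:

	/* before:
	 *	vaddr = dma_alloc_noncoherent(dev, size, &handle, GFP_KERNEL);
	 *	dma_free_noncoherent(dev, size, vaddr, handle);
	 * after: */
	vaddr = dma_alloc_attrs(dev, size, &handle, GFP_KERNEL,
				DMA_ATTR_NON_CONSISTENT);
	dma_free_attrs(dev, size, vaddr, handle, DMA_ATTR_NON_CONSISTENT);
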
diff --git a/drivers/net/ethernet/i825xx/lasi_82596.c b/drivers/net/ethernet/i825xx/lasi_82596.c
index aa22e108f09b..b69c622ba8b2 100644
--- a/drivers/net/ethernet/i825xx/lasi_82596.c
+++ b/drivers/net/ethernet/i825xx/lasi_82596.c
@@ -96,8 +96,6 @@
#define OPT_SWAP_PORT 0x0001 /* Need to wordswp on the MPU port */
-#define DMA_ALLOC dma_alloc_noncoherent
-#define DMA_FREE dma_free_noncoherent
#define DMA_WBACK(ndev, addr, len) \
do { dma_cache_sync((ndev)->dev.parent, (void *)addr, len, DMA_TO_DEVICE); } while (0)
@@ -200,8 +198,8 @@ static int __exit lan_remove_chip(struct parisc_device *pdev)
struct i596_private *lp = netdev_priv(dev);
unregister_netdev (dev);
- DMA_FREE(&pdev->dev, sizeof(struct i596_private),
- (void *)lp->dma, lp->dma_addr);
+ dma_free_attrs(&pdev->dev, sizeof(struct i596_private), lp->dma,
+ lp->dma_addr, DMA_ATTR_NON_CONSISTENT);
free_netdev (dev);
return 0;
}
diff --git a/drivers/net/ethernet/i825xx/lib82596.c b/drivers/net/ethernet/i825xx/lib82596.c
index 8449c58f01fd..f00a1dc2128c 100644
--- a/drivers/net/ethernet/i825xx/lib82596.c
+++ b/drivers/net/ethernet/i825xx/lib82596.c
@@ -1063,8 +1063,9 @@ static int i82596_probe(struct net_device *dev)
if (!dev->base_addr || !dev->irq)
return -ENODEV;
- dma = (struct i596_dma *) DMA_ALLOC(dev->dev.parent,
- sizeof(struct i596_dma), &lp->dma_addr, GFP_KERNEL);
+ dma = dma_alloc_attrs(dev->dev.parent, sizeof(struct i596_dma),
+ &lp->dma_addr, GFP_KERNEL,
+ DMA_ATTR_NON_CONSISTENT);
if (!dma) {
printk(KERN_ERR "%s: Couldn't get shared memory\n", __FILE__);
return -ENOMEM;
@@ -1085,8 +1086,8 @@ static int i82596_probe(struct net_device *dev)
i = register_netdev(dev);
if (i) {
- DMA_FREE(dev->dev.parent, sizeof(struct i596_dma),
- (void *)dma, lp->dma_addr);
+ dma_free_attrs(dev->dev.parent, sizeof(struct i596_dma),
+ dma, lp->dma_addr, DMA_ATTR_NON_CONSISTENT);
return i;
}
diff --git a/drivers/net/ethernet/i825xx/sni_82596.c b/drivers/net/ethernet/i825xx/sni_82596.c
index 2af7f77345fb..b2c04a789744 100644
--- a/drivers/net/ethernet/i825xx/sni_82596.c
+++ b/drivers/net/ethernet/i825xx/sni_82596.c
@@ -23,8 +23,6 @@
static const char sni_82596_string[] = "snirm_82596";
-#define DMA_ALLOC dma_alloc_coherent
-#define DMA_FREE dma_free_coherent
#define DMA_WBACK(priv, addr, len) do { } while (0)
#define DMA_INV(priv, addr, len) do { } while (0)
#define DMA_WBACK_INV(priv, addr, len) do { } while (0)
@@ -152,8 +150,8 @@ static int sni_82596_driver_remove(struct platform_device *pdev)
struct i596_private *lp = netdev_priv(dev);
unregister_netdev(dev);
- DMA_FREE(dev->dev.parent, sizeof(struct i596_private),
- lp->dma, lp->dma_addr);
+ dma_free_attrs(dev->dev.parent, sizeof(struct i596_private), lp->dma,
+ lp->dma_addr, DMA_ATTR_NON_CONSISTENT);
iounmap(lp->ca);
iounmap(lp->mpu_port);
free_netdev (dev);
diff --git a/drivers/net/ethernet/seeq/sgiseeq.c b/drivers/net/ethernet/seeq/sgiseeq.c
index 70347720fdf9..573691bc3b71 100644
--- a/drivers/net/ethernet/seeq/sgiseeq.c
+++ b/drivers/net/ethernet/seeq/sgiseeq.c
@@ -737,8 +737,8 @@ static int sgiseeq_probe(struct platform_device *pdev)
sp = netdev_priv(dev);
/* Make private data page aligned */
- sr = dma_alloc_noncoherent(&pdev->dev, sizeof(*sp->srings),
- &sp->srings_dma, GFP_KERNEL);
+ sr = dma_alloc_attrs(&pdev->dev, sizeof(*sp->srings), &sp->srings_dma,
+ GFP_KERNEL, DMA_ATTR_NON_CONSISTENT);
if (!sr) {
printk(KERN_ERR "Sgiseeq: Page alloc failed, aborting.\n");
err = -ENOMEM;
@@ -813,8 +813,8 @@ static int sgiseeq_remove(struct platform_device *pdev)
struct sgiseeq_private *sp = netdev_priv(dev);
unregister_netdev(dev);
- dma_free_noncoherent(&pdev->dev, sizeof(*sp->srings), sp->srings,
- sp->srings_dma);
+ dma_free_attrs(&pdev->dev, sizeof(*sp->srings), sp->srings,
+ sp->srings_dma, DMA_ATTR_NON_CONSISTENT);
free_netdev(dev);
return 0;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 60491641a8d6..d5612bd1cc81 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -31,6 +31,16 @@ enum log_ent_request {
LOG_OLD_ENT
};
+static struct device *to_dev(struct arena_info *arena)
+{
+ return &arena->nd_btt->dev;
+}
+
+static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset)
+{
+ return offset + nd_btt->initial_offset;
+}
+
static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
void *buf, size_t n, unsigned long flags)
{
@@ -38,7 +48,7 @@ static int arena_read_bytes(struct arena_info *arena, resource_size_t offset,
struct nd_namespace_common *ndns = nd_btt->ndns;
/* arena offsets may be shifted from the base of the device */
- offset += arena->nd_btt->initial_offset;
+ offset = adjust_initial_offset(nd_btt, offset);
return nvdimm_read_bytes(ndns, offset, buf, n, flags);
}
@@ -49,7 +59,7 @@ static int arena_write_bytes(struct arena_info *arena, resource_size_t offset,
struct nd_namespace_common *ndns = nd_btt->ndns;
/* arena offsets may be shifted from the base of the device */
- offset += arena->nd_btt->initial_offset;
+ offset = adjust_initial_offset(nd_btt, offset);
return nvdimm_write_bytes(ndns, offset, buf, n, flags);
}
@@ -62,8 +72,10 @@ static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
* We rely on that to make sure rw_bytes does error clearing
* correctly, so make sure that is the case.
*/
- WARN_ON_ONCE(!IS_ALIGNED(arena->infooff, 512));
- WARN_ON_ONCE(!IS_ALIGNED(arena->info2off, 512));
+ dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512),
+ "arena->infooff: %#llx is unaligned\n", arena->infooff);
+ dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512),
+ "arena->info2off: %#llx is unaligned\n", arena->info2off);
ret = arena_write_bytes(arena, arena->info2off, super,
sizeof(struct btt_sb), 0);
@@ -76,7 +88,6 @@ static int btt_info_write(struct arena_info *arena, struct btt_sb *super)
static int btt_info_read(struct arena_info *arena, struct btt_sb *super)
{
- WARN_ON(!super);
return arena_read_bytes(arena, arena->infooff, super,
sizeof(struct btt_sb), 0);
}
@@ -92,7 +103,10 @@ static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping,
{
u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
- WARN_ON(lba >= arena->external_nlba);
+ if (unlikely(lba >= arena->external_nlba))
+ dev_err_ratelimited(to_dev(arena),
+ "%s: lba %#x out of range (max: %#x)\n",
+ __func__, lba, arena->external_nlba);
return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags);
}
@@ -106,7 +120,7 @@ static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
* This 'mapping' is supposed to be just the LBA mapping, without
* any flags set, so strip the flag bits.
*/
- mapping &= MAP_LBA_MASK;
+ mapping = ent_lba(mapping);
ze = (z_flag << 1) + e_flag;
switch (ze) {
@@ -131,7 +145,8 @@ static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping,
* construed as a valid 'normal' case, but we decide not to,
* to avoid confusion
*/
- WARN_ONCE(1, "Invalid use of Z and E flags\n");
+ dev_err_ratelimited(to_dev(arena),
+ "Invalid use of Z and E flags\n");
return -EIO;
}
@@ -147,7 +162,10 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
u32 raw_mapping, postmap, ze, z_flag, e_flag;
u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE);
- WARN_ON(lba >= arena->external_nlba);
+ if (unlikely(lba >= arena->external_nlba))
+ dev_err_ratelimited(to_dev(arena),
+ "%s: lba %#x out of range (max: %#x)\n",
+ __func__, lba, arena->external_nlba);
ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags);
if (ret)
@@ -155,10 +173,10 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
raw_mapping = le32_to_cpu(in);
- z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT;
- e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT;
+ z_flag = ent_z_flag(raw_mapping);
+ e_flag = ent_e_flag(raw_mapping);
ze = (z_flag << 1) + e_flag;
- postmap = raw_mapping & MAP_LBA_MASK;
+ postmap = ent_lba(raw_mapping);
/* Reuse the {z,e}_flag variables for *trim and *error */
z_flag = 0;
@@ -195,7 +213,6 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping,
static int btt_log_read_pair(struct arena_info *arena, u32 lane,
struct log_entry *ent)
{
- WARN_ON(!ent);
return arena_read_bytes(arena,
arena->logoff + (2 * lane * LOG_ENT_SIZE), ent,
2 * LOG_ENT_SIZE, 0);
@@ -299,11 +316,6 @@ static int btt_log_get_old(struct log_entry *ent)
return old;
}
-static struct device *to_dev(struct arena_info *arena)
-{
- return &arena->nd_btt->dev;
-}
-
/*
* This function copies the desired (old/new) log entry into ent if
* it is not NULL. It returns the sub-slot number (0 or 1)
@@ -381,7 +393,9 @@ static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub,
arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
if (++(arena->freelist[lane].seq) == 4)
arena->freelist[lane].seq = 1;
- arena->freelist[lane].block = le32_to_cpu(ent->old_map);
+ if (ent_e_flag(ent->old_map))
+ arena->freelist[lane].has_err = 1;
+ arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map));
return ret;
}
@@ -407,12 +421,14 @@ static int btt_map_init(struct arena_info *arena)
* make sure rw_bytes does error clearing correctly, so make sure that
* is the case.
*/
- WARN_ON_ONCE(!IS_ALIGNED(arena->mapoff, 512));
+ dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512),
+ "arena->mapoff: %#llx is unaligned\n", arena->mapoff);
while (mapsize) {
size_t size = min(mapsize, chunk_size);
- WARN_ON_ONCE(size < 512);
+ dev_WARN_ONCE(to_dev(arena), size < 512,
+ "chunk size: %#zx is unaligned\n", size);
ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf,
size, 0);
if (ret)
@@ -449,12 +465,14 @@ static int btt_log_init(struct arena_info *arena)
* make sure rw_bytes does error clearing correctly, so make sure that
* is the case.
*/
- WARN_ON_ONCE(!IS_ALIGNED(arena->logoff, 512));
+ dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512),
+ "arena->logoff: %#llx is unaligned\n", arena->logoff);
while (logsize) {
size_t size = min(logsize, chunk_size);
- WARN_ON_ONCE(size < 512);
+ dev_WARN_ONCE(to_dev(arena), size < 512,
+ "chunk size: %#zx is unaligned\n", size);
ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf,
size, 0);
if (ret)
@@ -480,6 +498,40 @@ static int btt_log_init(struct arena_info *arena)
return ret;
}
+static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
+{
+ return arena->dataoff + ((u64)lba * arena->internal_lbasize);
+}
+
+static int arena_clear_freelist_error(struct arena_info *arena, u32 lane)
+{
+ int ret = 0;
+
+ if (arena->freelist[lane].has_err) {
+ void *zero_page = page_address(ZERO_PAGE(0));
+ u32 lba = arena->freelist[lane].block;
+ u64 nsoff = to_namespace_offset(arena, lba);
+ unsigned long len = arena->sector_size;
+
+ mutex_lock(&arena->err_lock);
+
+ while (len) {
+ unsigned long chunk = min(len, PAGE_SIZE);
+
+ ret = arena_write_bytes(arena, nsoff, zero_page,
+ chunk, 0);
+ if (ret)
+ break;
+ len -= chunk;
+ nsoff += chunk;
+ if (len == 0)
+ arena->freelist[lane].has_err = 0;
+ }
+ mutex_unlock(&arena->err_lock);
+ }
+ return ret;
+}
+
static int btt_freelist_init(struct arena_info *arena)
{
int old, new, ret;
@@ -505,6 +557,17 @@ static int btt_freelist_init(struct arena_info *arena)
arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq));
arena->freelist[i].block = le32_to_cpu(log_new.old_map);
+ /*
+ * FIXME: if error clearing fails during init, we want to make
+ * the BTT read-only
+ */
+ if (ent_e_flag(log_new.old_map)) {
+ ret = arena_clear_freelist_error(arena, i);
+ if (ret)
+ dev_err_ratelimited(to_dev(arena),
+ "Unable to clear known errors\n");
+ }
+
/* This implies a newly created or untouched flog entry */
if (log_new.old_map == log_new.new_map)
continue;
@@ -525,7 +588,6 @@ static int btt_freelist_init(struct arena_info *arena)
if (ret)
return ret;
}
-
}
return 0;
@@ -566,6 +628,7 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size,
if (!arena)
return NULL;
arena->nd_btt = btt->nd_btt;
+ arena->sector_size = btt->sector_size;
if (!size)
return arena;
@@ -694,6 +757,7 @@ static int discover_arenas(struct btt *btt)
arena->external_lba_start = cur_nlba;
parse_arena_meta(arena, super, cur_off);
+ mutex_init(&arena->err_lock);
ret = btt_freelist_init(arena);
if (ret)
goto out;
@@ -904,11 +968,6 @@ static void unlock_map(struct arena_info *arena, u32 premap)
spin_unlock(&arena->map_locks[idx].lock);
}
-static u64 to_namespace_offset(struct arena_info *arena, u64 lba)
-{
- return arena->dataoff + ((u64)lba * arena->internal_lbasize);
-}
-
static int btt_data_read(struct arena_info *arena, struct page *page,
unsigned int off, u32 lba, u32 len)
{
@@ -1032,6 +1091,7 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
*/
while (1) {
u32 new_map;
+ int new_t, new_e;
if (t_flag) {
zero_fill_data(page, off, cur_len);
@@ -1050,20 +1110,29 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
*/
barrier();
- ret = btt_map_read(arena, premap, &new_map, &t_flag,
- &e_flag, NVDIMM_IO_ATOMIC);
+ ret = btt_map_read(arena, premap, &new_map, &new_t,
+ &new_e, NVDIMM_IO_ATOMIC);
if (ret)
goto out_rtt;
- if (postmap == new_map)
+ if ((postmap == new_map) && (t_flag == new_t) &&
+ (e_flag == new_e))
break;
postmap = new_map;
+ t_flag = new_t;
+ e_flag = new_e;
}
ret = btt_data_read(arena, page, off, postmap, cur_len);
- if (ret)
+ if (ret) {
+ int rc;
+
+ /* Media error - set the e_flag */
+ rc = btt_map_write(arena, premap, postmap, 0, 1,
+ NVDIMM_IO_ATOMIC);
goto out_rtt;
+ }
if (bip) {
ret = btt_rw_integrity(btt, bip, arena, postmap, READ);
@@ -1088,6 +1157,21 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip,
return ret;
}
+/*
+ * Normally, arena_{read,write}_bytes will take care of the initial offset
+ * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem,
+ * we need the final, raw namespace offset here
+ */
+static bool btt_is_badblock(struct btt *btt, struct arena_info *arena,
+ u32 postmap)
+{
+ u64 nsoff = adjust_initial_offset(arena->nd_btt,
+ to_namespace_offset(arena, postmap));
+ sector_t phys_sector = nsoff >> 9;
+
+ return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize);
+}
+
static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
sector_t sector, struct page *page, unsigned int off,
unsigned int len)
@@ -1100,7 +1184,9 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
while (len) {
u32 cur_len;
+ int e_flag;
+ retry:
lane = nd_region_acquire_lane(btt->nd_region);
ret = lba_to_arena(btt, sector, &premap, &arena);
@@ -1113,6 +1199,21 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
goto out_lane;
}
+ if (btt_is_badblock(btt, arena, arena->freelist[lane].block))
+ arena->freelist[lane].has_err = 1;
+
+ if (mutex_is_locked(&arena->err_lock)
+ || arena->freelist[lane].has_err) {
+ nd_region_release_lane(btt->nd_region, lane);
+
+ ret = arena_clear_freelist_error(arena, lane);
+ if (ret)
+ return ret;
+
+ /* OK to acquire a different lane/free block */
+ goto retry;
+ }
+
new_postmap = arena->freelist[lane].block;
/* Wait if the new block is being read from */
@@ -1138,7 +1239,7 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
}
lock_map(arena, premap);
- ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL,
+ ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag,
NVDIMM_IO_ATOMIC);
if (ret)
goto out_map;
@@ -1146,6 +1247,8 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
ret = -EIO;
goto out_map;
}
+ if (e_flag)
+ set_e_flag(old_postmap);
log.lba = cpu_to_le32(premap);
log.old_map = cpu_to_le32(old_postmap);
@@ -1156,13 +1259,20 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
if (ret)
goto out_map;
- ret = btt_map_write(arena, premap, new_postmap, 0, 0, 0);
+ ret = btt_map_write(arena, premap, new_postmap, 0, 0,
+ NVDIMM_IO_ATOMIC);
if (ret)
goto out_map;
unlock_map(arena, premap);
nd_region_release_lane(btt->nd_region, lane);
+ if (e_flag) {
+ ret = arena_clear_freelist_error(arena, lane);
+ if (ret)
+ return ret;
+ }
+
len -= cur_len;
off += cur_len;
sector += btt->sector_size >> SECTOR_SHIFT;
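
Taken together, the read and write paths now close a self-healing loop: a failed btt_data_read() stamps the E flag into the map entry; the next write to that LBA sees the flag via btt_map_read(), records it in the flog so btt_flog_write() marks the freed block with has_err, and arena_clear_freelist_error() zeroes the block (clearing the poison) before the freelist hands it out again.
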
@@ -1211,11 +1321,13 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
- BUG_ON(len > PAGE_SIZE);
- /* Make sure len is in multiples of sector size. */
- /* XXX is this right? */
- BUG_ON(len < btt->sector_size);
- BUG_ON(len % btt->sector_size);
+ if (len > PAGE_SIZE || len < btt->sector_size ||
+ len % btt->sector_size) {
+ dev_err_ratelimited(&btt->nd_btt->dev,
+ "unaligned bio segment (len: %d)\n", len);
+ bio->bi_status = BLK_STS_IOERR;
+ break;
+ }
err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
op_is_write(bio_op(bio)), iter.bi_sector);
@@ -1345,6 +1457,7 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
{
int ret;
struct btt *btt;
+ struct nd_namespace_io *nsio;
struct device *dev = &nd_btt->dev;
btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL);
@@ -1358,6 +1471,8 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize,
INIT_LIST_HEAD(&btt->arena_list);
mutex_init(&btt->init_lock);
btt->nd_region = nd_region;
+ nsio = to_nd_namespace_io(&nd_btt->ndns->dev);
+ btt->phys_bb = &nsio->bb;
ret = discover_arenas(btt);
if (ret) {
@@ -1431,6 +1546,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
}
btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
+ if (!btt_sb)
+ return -ENOMEM;
/*
* If this returns < 0, that is ok as it just means there wasn't
diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h
index 888e862907a0..578c2057524d 100644
--- a/drivers/nvdimm/btt.h
+++ b/drivers/nvdimm/btt.h
@@ -15,6 +15,7 @@
#ifndef _LINUX_BTT_H
#define _LINUX_BTT_H
+#include <linux/badblocks.h>
#include <linux/types.h>
#define BTT_SIG_LEN 16
@@ -38,6 +39,11 @@
#define IB_FLAG_ERROR 0x00000001
#define IB_FLAG_ERROR_MASK 0x00000001
+#define ent_lba(ent) (ent & MAP_LBA_MASK)
+#define ent_e_flag(ent) (!!(ent & MAP_ERR_MASK))
+#define ent_z_flag(ent) (!!(ent & MAP_TRIM_MASK))
+#define set_e_flag(ent) (ent |= MAP_ERR_MASK)
+
enum btt_init_state {
INIT_UNCHECKED = 0,
INIT_NOTFOUND,
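
Given a raw 32-bit map entry, the new helpers decompose it as below (sketch only; the local names are illustrative):

	static void decode_map_entry(__le32 in)
	{
		u32 raw = le32_to_cpu(in);

		u32 post_lba = ent_lba(raw);	/* MAP_LBA_MASK bits */
		int trimmed = ent_z_flag(raw);	/* Z: block zeroed/trimmed */
		int poisoned = ent_e_flag(raw);	/* E: block has a media error */
	}
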
@@ -78,6 +84,7 @@ struct free_entry {
u32 block;
u8 sub;
u8 seq;
+ u8 has_err;
};
struct aligned_lock {
@@ -104,6 +111,7 @@ struct aligned_lock {
* handle incoming writes.
* @version_major: Metadata layout version major.
* @version_minor: Metadata layout version minor.
+ * @sector_size: The Linux sector size - 512 or 4096
* @nextoff: Offset in bytes to the start of the next arena.
* @infooff: Offset in bytes to the info block of this arena.
* @dataoff: Offset in bytes to the data area of this arena.
@@ -131,6 +139,7 @@ struct arena_info {
u32 nfree;
u16 version_major;
u16 version_minor;
+ u32 sector_size;
/* Byte offsets to the different on-media structures */
u64 nextoff;
u64 infooff;
@@ -147,6 +156,7 @@ struct arena_info {
struct dentry *debugfs_dir;
/* Arena flags */
u32 flags;
+ struct mutex err_lock;
};
/**
@@ -181,6 +191,7 @@ struct btt {
struct mutex init_lock;
int init_state;
int num_arenas;
+ struct badblocks *phys_bb;
};
bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super);
diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c
index 3e359d282f8e..d58925295aa7 100644
--- a/drivers/nvdimm/btt_devs.c
+++ b/drivers/nvdimm/btt_devs.c
@@ -61,7 +61,7 @@ static ssize_t sector_size_show(struct device *dev,
{
struct nd_btt *nd_btt = to_nd_btt(dev);
- return nd_sector_size_show(nd_btt->lbasize, btt_lbasize_supported, buf);
+ return nd_size_select_show(nd_btt->lbasize, btt_lbasize_supported, buf);
}
static ssize_t sector_size_store(struct device *dev,
@@ -72,7 +72,7 @@ static ssize_t sector_size_store(struct device *dev,
device_lock(dev);
nvdimm_bus_lock(dev);
- rc = nd_sector_size_store(dev, buf, &nd_btt->lbasize,
+ rc = nd_size_select_store(dev, buf, &nd_btt->lbasize,
btt_lbasize_supported);
dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
rc, buf, buf[len - 1] == '\n' ? "" : "\n");
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 937fafa1886a..baf283986a7e 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -11,6 +11,7 @@
* General Public License for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/sched/mm.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
#include <linux/module.h>
@@ -234,6 +235,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
struct nd_cmd_clear_error clear_err;
struct nd_cmd_ars_cap ars_cap;
u32 clear_err_unit, mask;
+ unsigned int noio_flag;
int cmd_rc, rc;
if (!nvdimm_bus)
@@ -250,8 +252,10 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
memset(&ars_cap, 0, sizeof(ars_cap));
ars_cap.address = phys;
ars_cap.length = len;
+ noio_flag = memalloc_noio_save();
rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, &ars_cap,
sizeof(ars_cap), &cmd_rc);
+ memalloc_noio_restore(noio_flag);
if (rc < 0)
return rc;
if (cmd_rc < 0)
@@ -266,8 +270,10 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
memset(&clear_err, 0, sizeof(clear_err));
clear_err.address = phys;
clear_err.length = len;
+ noio_flag = memalloc_noio_save();
rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_CLEAR_ERROR, &clear_err,
sizeof(clear_err), &cmd_rc);
+ memalloc_noio_restore(noio_flag);
if (rc < 0)
return rc;
if (cmd_rc < 0)
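
memalloc_noio_save()/memalloc_noio_restore() (from <linux/sched/mm.h>) scope GFP_NOIO onto every allocation the current task makes, so the firmware call cannot recurse into block I/O (and back into the device being repaired) while poison is being cleared. The general pattern:

	unsigned int noio_flag;

	noio_flag = memalloc_noio_save();
	/* allocations in here implicitly behave as GFP_NOIO */
	rc = do_firmware_call();	/* hypothetical allocating callee */
	memalloc_noio_restore(noio_flag);
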
@@ -905,19 +911,20 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
int read_only, unsigned int ioctl_cmd, unsigned long arg)
{
struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
- size_t buf_len = 0, in_len = 0, out_len = 0;
static char out_env[ND_CMD_MAX_ENVELOPE];
static char in_env[ND_CMD_MAX_ENVELOPE];
const struct nd_cmd_desc *desc = NULL;
unsigned int cmd = _IOC_NR(ioctl_cmd);
- unsigned int func = cmd;
- void __user *p = (void __user *) arg;
struct device *dev = &nvdimm_bus->dev;
- struct nd_cmd_pkg pkg;
+ void __user *p = (void __user *) arg;
const char *cmd_name, *dimm_name;
+ u32 in_len = 0, out_len = 0;
+ unsigned int func = cmd;
unsigned long cmd_mask;
- void *buf;
+ struct nd_cmd_pkg pkg;
int rc, i, cmd_rc;
+ u64 buf_len = 0;
+ void *buf;
if (nvdimm) {
desc = nd_cmd_dimm_desc(cmd);
@@ -977,13 +984,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
if (cmd == ND_CMD_CALL) {
func = pkg.nd_command;
- dev_dbg(dev, "%s:%s, idx: %llu, in: %zu, out: %zu, len %zu\n",
+ dev_dbg(dev, "%s:%s, idx: %llu, in: %u, out: %u, len %llu\n",
__func__, dimm_name, pkg.nd_command,
in_len, out_len, buf_len);
-
- for (i = 0; i < ARRAY_SIZE(pkg.nd_reserved2); i++)
- if (pkg.nd_reserved2[i])
- return -EINVAL;
}
/* process an output envelope */
@@ -1007,9 +1010,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
out_len += out_size;
}
- buf_len = out_len + in_len;
+ buf_len = (u64) out_len + (u64) in_len;
if (buf_len > ND_IOCTL_MAX_BUFLEN) {
- dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__,
+ dev_dbg(dev, "%s:%s cmd: %s buf_len: %llu > %d\n", __func__,
dimm_name, cmd_name, buf_len,
ND_IOCTL_MAX_BUFLEN);
return -EINVAL;
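
Promoting buf_len to u64 (with both operands cast) closes a u32 wrap: two large userspace-supplied envelope sizes could otherwise sum past 2^32 and sneak under ND_IOCTL_MAX_BUFLEN. Worked example:

	u32 in_len = 0xffffffff, out_len = 2;		/* hostile inputs */
	u32 bad = in_len + out_len;			/* wraps to 1: passes */
	u64 good = (u64) in_len + (u64) out_len;	/* 0x100000001: rejected */
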
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index 47770460f3d3..b2fc29b8279b 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -280,18 +280,11 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
}
if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) {
- /*
- * FIXME: nsio_rw_bytes() may be called from atomic
- * context in the btt case and the ACPI DSM path for
- * clearing the error takes sleeping locks and allocates
- * memory. An explicit error clearing path, and support
- * for tracking badblocks in BTT metadata is needed to
- * work around this collision.
- */
if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512)
&& !(flags & NVDIMM_IO_ATOMIC)) {
long cleared;
+ might_sleep();
cleared = nvdimm_clear_poison(&ndns->dev,
nsio->res.start + offset, size);
if (cleared < size)
diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c
index 75bc08c6838c..bb71f0cf8f5d 100644
--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -277,14 +277,14 @@ int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
return 0;
}
-ssize_t nd_sector_size_show(unsigned long current_lbasize,
+ssize_t nd_size_select_show(unsigned long current_size,
const unsigned long *supported, char *buf)
{
ssize_t len = 0;
int i;
for (i = 0; supported[i]; i++)
- if (current_lbasize == supported[i])
+ if (current_size == supported[i])
len += sprintf(buf + len, "[%ld] ", supported[i]);
else
len += sprintf(buf + len, "%ld ", supported[i]);
@@ -292,8 +292,8 @@ ssize_t nd_sector_size_show(unsigned long current_lbasize,
return len;
}
-ssize_t nd_sector_size_store(struct device *dev, const char *buf,
- unsigned long *current_lbasize, const unsigned long *supported)
+ssize_t nd_size_select_store(struct device *dev, const char *buf,
+ unsigned long *current_size, const unsigned long *supported)
{
unsigned long lbasize;
int rc, i;
@@ -310,7 +310,7 @@ ssize_t nd_sector_size_store(struct device *dev, const char *buf,
break;
if (supported[i]) {
- *current_lbasize = lbasize;
+ *current_size = lbasize;
return 0;
} else {
return -EINVAL;
diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 87796f840777..9c5f108910e3 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -45,12 +45,14 @@ unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd)
return ndd->nslabel_size;
}
-size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
+int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd)
{
- u32 index_span;
+ return ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1);
+}
- if (ndd->nsindex_size)
- return ndd->nsindex_size;
+size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
+{
+ u32 nslot, space, size;
/*
* The minimum index space is 512 bytes, with that amount of
@@ -60,16 +62,16 @@ size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd)
* starts to waste space at larger config_sizes, but it's
* unlikely we'll ever see anything but 128K.
*/
- index_span = ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1);
- index_span /= NSINDEX_ALIGN * 2;
- ndd->nsindex_size = index_span * NSINDEX_ALIGN;
-
- return ndd->nsindex_size;
-}
-
-int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd)
-{
- return ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1);
+ nslot = nvdimm_num_label_slots(ndd);
+ space = ndd->nsarea.config_size - nslot * sizeof_namespace_label(ndd);
+ size = ALIGN(sizeof(struct nd_namespace_index) + DIV_ROUND_UP(nslot, 8),
+ NSINDEX_ALIGN) * 2;
+ if (size <= space)
+ return size / 2;
+
+ dev_err(ndd->dev, "label area (%d) too small to host (%d byte) labels\n",
+ ndd->nsarea.config_size, sizeof_namespace_label(ndd));
+ return 0;
}
static int __nd_label_validate(struct nvdimm_drvdata *ndd)
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index 5f1c6756e57c..1427a386a033 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1313,14 +1313,14 @@ static ssize_t sector_size_show(struct device *dev,
if (is_namespace_blk(dev)) {
struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
- return nd_sector_size_show(nsblk->lbasize,
+ return nd_size_select_show(nsblk->lbasize,
blk_lbasize_supported, buf);
}
if (is_namespace_pmem(dev)) {
struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
- return nd_sector_size_show(nspm->lbasize,
+ return nd_size_select_show(nspm->lbasize,
pmem_lbasize_supported, buf);
}
return -ENXIO;
@@ -1352,7 +1352,7 @@ static ssize_t sector_size_store(struct device *dev,
if (to_ndns(dev)->claim)
rc = -EBUSY;
if (rc >= 0)
- rc = nd_sector_size_store(dev, buf, lbasize, supported);
+ rc = nd_size_select_store(dev, buf, lbasize, supported);
if (rc >= 0)
rc = nd_namespace_label_update(nd_region, dev);
dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__,
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index a87f793f2945..9c758a91372b 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -42,7 +42,7 @@ struct nd_poison {
struct nvdimm_drvdata {
struct device *dev;
- int nsindex_size, nslabel_size;
+ int nslabel_size;
struct nd_cmd_get_config_size nsarea;
void *data;
int ns_current, ns_next;
@@ -134,6 +134,7 @@ struct nd_mapping {
struct nvdimm *nvdimm;
u64 start;
u64 size;
+ int position;
struct list_head labels;
struct mutex lock;
/*
@@ -233,10 +234,10 @@ void nd_device_unregister(struct device *dev, enum nd_async_mode mode);
void nd_device_notify(struct device *dev, enum nvdimm_event event);
int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf,
size_t len);
-ssize_t nd_sector_size_show(unsigned long current_lbasize,
+ssize_t nd_size_select_show(unsigned long current_size,
const unsigned long *supported, char *buf);
-ssize_t nd_sector_size_store(struct device *dev, const char *buf,
- unsigned long *current_lbasize, const unsigned long *supported);
+ssize_t nd_size_select_store(struct device *dev, const char *buf,
+ unsigned long *current_size, const unsigned long *supported);
int __init nvdimm_init(void);
int __init nd_region_init(void);
int __init nd_label_init(void);
@@ -285,6 +286,13 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region)
struct nd_pfn *to_nd_pfn(struct device *dev);
#if IS_ENABLED(CONFIG_NVDIMM_PFN)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE
+#else
+#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE
+#endif
+
int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns);
bool is_nd_pfn(struct device *dev);
struct device *nd_pfn_create(struct nd_region *nd_region);
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 5fcb6f5b22a2..9576c444f0ab 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -111,24 +111,27 @@ static ssize_t align_show(struct device *dev,
return sprintf(buf, "%ld\n", nd_pfn->align);
}
-static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf)
+static const unsigned long *nd_pfn_supported_alignments(void)
{
- unsigned long val;
- int rc;
-
- rc = kstrtoul(buf, 0, &val);
- if (rc)
- return rc;
-
- if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G)
- return -EINVAL;
+ /*
+ * This needs to be a non-static variable because the *_SIZE
+ * macros aren't always constants.
+ */
+ const unsigned long supported_alignments[] = {
+ PAGE_SIZE,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ HPAGE_PMD_SIZE,
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+ HPAGE_PUD_SIZE,
+#endif
+#endif
+ 0,
+ };
+ static unsigned long data[ARRAY_SIZE(supported_alignments)];
- if (nd_pfn->dev.driver)
- return -EBUSY;
- else
- nd_pfn->align = val;
+ memcpy(data, supported_alignments, sizeof(data));
- return 0;
+ return data;
}
static ssize_t align_store(struct device *dev,
@@ -139,7 +142,8 @@ static ssize_t align_store(struct device *dev,
device_lock(dev);
nvdimm_bus_lock(dev);
- rc = __align_store(nd_pfn, buf);
+ rc = nd_size_select_store(dev, buf, &nd_pfn->align,
+ nd_pfn_supported_alignments());
dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__,
rc, buf, buf[len - 1] == '\n' ? "" : "\n");
nvdimm_bus_unlock(dev);
@@ -260,6 +264,13 @@ static ssize_t size_show(struct device *dev,
}
static DEVICE_ATTR_RO(size);
+static ssize_t supported_alignments_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return nd_size_select_show(0, nd_pfn_supported_alignments(), buf);
+}
+static DEVICE_ATTR_RO(supported_alignments);
+
static struct attribute *nd_pfn_attributes[] = {
&dev_attr_mode.attr,
&dev_attr_namespace.attr,
@@ -267,6 +278,7 @@ static struct attribute *nd_pfn_attributes[] = {
&dev_attr_align.attr,
&dev_attr_resource.attr,
&dev_attr_size.attr,
+ &dev_attr_supported_alignments.attr,
NULL,
};
@@ -290,7 +302,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn,
return NULL;
nd_pfn->mode = PFN_MODE_NONE;
- nd_pfn->align = HPAGE_SIZE;
+ nd_pfn->align = PFN_DEFAULT_ALIGNMENT;
dev = &nd_pfn->dev;
device_initialize(&nd_pfn->dev);
if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) {
@@ -638,11 +650,12 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
/ PAGE_SIZE);
if (nd_pfn->mode == PFN_MODE_PMEM) {
/*
- * vmemmap_populate_hugepages() allocates the memmap array in
- * HPAGE_SIZE chunks.
+ * The altmap should be padded out to the block size used
+ * when populating the vmemmap. This *should* be equal to
+ * PMD_SIZE for most architectures.
*/
offset = ALIGN(start + SZ_8K + 64 * npfns + dax_label_reserve,
- max(nd_pfn->align, HPAGE_SIZE)) - start;
+ max(nd_pfn->align, PMD_SIZE)) - start;
} else if (nd_pfn->mode == PFN_MODE_RAM)
offset = ALIGN(start + SZ_8K + dax_label_reserve,
nd_pfn->align) - start;
diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h
index 5434321cad67..c5917f040fa7 100644
--- a/drivers/nvdimm/pmem.h
+++ b/drivers/nvdimm/pmem.h
@@ -5,20 +5,6 @@
#include <linux/pfn_t.h>
#include <linux/fs.h>
-#ifdef CONFIG_ARCH_HAS_PMEM_API
-#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
-void arch_wb_cache_pmem(void *addr, size_t size);
-void arch_invalidate_pmem(void *addr, size_t size);
-#else
-#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
-static inline void arch_wb_cache_pmem(void *addr, size_t size)
-{
-}
-static inline void arch_invalidate_pmem(void *addr, size_t size)
-{
-}
-#endif
-
/* this definition is in its own header for tools/testing/nvdimm to consume */
struct pmem_device {
/* One contiguous memory region per device */
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 5954cfbea3fc..829d760f651c 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -723,8 +723,9 @@ static ssize_t mappingN(struct device *dev, char *buf, int n)
nd_mapping = &nd_region->mapping[n];
nvdimm = nd_mapping->nvdimm;
- return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev),
- nd_mapping->start, nd_mapping->size);
+ return sprintf(buf, "%s,%llu,%llu,%d\n", dev_name(&nvdimm->dev),
+ nd_mapping->start, nd_mapping->size,
+ nd_mapping->position);
}
#define REGION_MAPPING(idx) \
@@ -965,6 +966,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
nd_region->mapping[i].nvdimm = nvdimm;
nd_region->mapping[i].start = mapping->start;
nd_region->mapping[i].size = mapping->size;
+ nd_region->mapping[i].position = mapping->position;
INIT_LIST_HEAD(&nd_region->mapping[i].labels);
mutex_init(&nd_region->mapping[i].lock);
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 17b66e9715d2..64b710265d39 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -9,6 +9,9 @@
#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/slab.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/amba/bus.h>
#include <asm/errno.h>
#include "of_private.h"
@@ -84,31 +87,28 @@ int of_device_add(struct platform_device *ofdev)
*/
int of_dma_configure(struct device *dev, struct device_node *np)
{
- u64 dma_addr, paddr, size;
+ u64 dma_addr, paddr, size = 0;
int ret;
bool coherent;
unsigned long offset;
const struct iommu_ops *iommu;
u64 mask;
- /*
- * Set default coherent_dma_mask to 32 bit. Drivers are expected to
- * setup the correct supported mask.
- */
- if (!dev->coherent_dma_mask)
- dev->coherent_dma_mask = DMA_BIT_MASK(32);
-
- /*
- * Set it to coherent_dma_mask by default if the architecture
- * code has not set it.
- */
- if (!dev->dma_mask)
- dev->dma_mask = &dev->coherent_dma_mask;
-
ret = of_dma_get_range(np, &dma_addr, &paddr, &size);
if (ret < 0) {
+ /*
+ * For legacy reasons, we have to assume some devices need
+ * DMA configuration regardless of whether "dma-ranges" is
+ * correctly specified or not.
+ */
+ if (!dev_is_pci(dev) &&
+#ifdef CONFIG_ARM_AMBA
+ dev->bus != &amba_bustype &&
+#endif
+ dev->bus != &platform_bus_type)
+ return ret == -ENODEV ? 0 : ret;
+
dma_addr = offset = 0;
- size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1);
} else {
offset = PFN_DOWN(paddr - dma_addr);
@@ -129,6 +129,22 @@ int of_dma_configure(struct device *dev, struct device_node *np)
dev_dbg(dev, "dma_pfn_offset(%#08lx)\n", offset);
}
+ /*
+ * Set default coherent_dma_mask to 32 bit. Drivers are expected to
+ * setup the correct supported mask.
+ */
+ if (!dev->coherent_dma_mask)
+ dev->coherent_dma_mask = DMA_BIT_MASK(32);
+ /*
+ * Set it to coherent_dma_mask by default if the architecture
+ * code has not set it.
+ */
+ if (!dev->dma_mask)
+ dev->dma_mask = &dev->coherent_dma_mask;
+
+ if (!size)
+ size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1);
+
dev->dma_pfn_offset = offset;
/*
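
The max(mask, mask + 1) expression retained here converts an inclusive DMA mask into a byte size while surviving overflow: for an all-ones 64-bit mask, mask + 1 wraps to zero and max() falls back to the mask itself. Sketch:

	u64 mask, size;

	mask = DMA_BIT_MASK(32);	/* 0x00000000ffffffff */
	size = max(mask, mask + 1);	/* 0x0000000100000000 */

	mask = DMA_BIT_MASK(64);	/* 0xffffffffffffffff */
	size = max(mask, mask + 1);	/* mask + 1 wraps to 0, so mask wins */
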
diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig
index 7cb982b54c8c..763ee50ea57d 100644
--- a/drivers/pwm/Kconfig
+++ b/drivers/pwm/Kconfig
@@ -300,7 +300,7 @@ config PWM_MEDIATEK
Generic PWM framework driver for Mediatek ARM SoC.
To compile this driver as a module, choose M here: the module
- will be called pwm-mxs.
+ will be called pwm-mediatek.
config PWM_MXS
tristate "Freescale MXS PWM support"
@@ -456,7 +456,7 @@ config PWM_TEGRA
config PWM_TIECAP
tristate "ECAP PWM support"
- depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX
+ depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX || ARCH_KEYSTONE
help
PWM driver support for the ECAP APWM controller found on AM33XX
TI SOC
@@ -510,4 +510,13 @@ config PWM_VT8500
To compile this driver as a module, choose M here: the module
will be called pwm-vt8500.
+config PWM_ZX
+ tristate "ZTE ZX PWM support"
+ depends on ARCH_ZX
+ help
+ Generic PWM framework driver for ZTE ZX family SoCs.
+
+ To compile this driver as a module, choose M here: the module
+ will be called pwm-zx.
+
endif
diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile
index a3a4beef6daa..ebefba5f528b 100644
--- a/drivers/pwm/Makefile
+++ b/drivers/pwm/Makefile
@@ -50,3 +50,4 @@ obj-$(CONFIG_PWM_TIPWMSS) += pwm-tipwmss.o
obj-$(CONFIG_PWM_TWL) += pwm-twl.o
obj-$(CONFIG_PWM_TWL_LED) += pwm-twl-led.o
obj-$(CONFIG_PWM_VT8500) += pwm-vt8500.o
+obj-$(CONFIG_PWM_ZX) += pwm-zx.o
diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c
index c5dbf16d810b..db001cba937f 100644
--- a/drivers/pwm/pwm-bcm2835.c
+++ b/drivers/pwm/pwm-bcm2835.c
@@ -167,6 +167,8 @@ static int bcm2835_pwm_probe(struct platform_device *pdev)
pc->chip.dev = &pdev->dev;
pc->chip.ops = &bcm2835_pwm_ops;
pc->chip.npwm = 2;
+ pc->chip.of_xlate = of_pwm_xlate_with_flags;
+ pc->chip.of_pwm_n_cells = 3;
platform_set_drvdata(pdev, pc);
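
With of_pwm_xlate_with_flags() and of_pwm_n_cells = 3, a consumer node can now request inverted polarity directly from the device tree, e.g. pwms = <&pwm 0 10000 PWM_POLARITY_INVERTED>; (node name and values illustrative).
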
diff --git a/drivers/pwm/pwm-hibvt.c b/drivers/pwm/pwm-hibvt.c
index 8dadc58d6cdf..27c107e78d59 100644
--- a/drivers/pwm/pwm-hibvt.c
+++ b/drivers/pwm/pwm-hibvt.c
@@ -208,7 +208,7 @@ static int hibvt_pwm_probe(struct platform_device *pdev)
if (ret < 0)
return ret;
- pwm_chip->rstc = devm_reset_control_get(&pdev->dev, NULL);
+ pwm_chip->rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL);
if (IS_ERR(pwm_chip->rstc)) {
clk_disable_unprepare(pwm_chip->clk);
return PTR_ERR(pwm_chip->rstc);
diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c
index 5c11bc708a3c..b52f3afb2ba1 100644
--- a/drivers/pwm/pwm-mediatek.c
+++ b/drivers/pwm/pwm-mediatek.c
@@ -2,6 +2,7 @@
* Mediatek Pulse Width Modulator driver
*
* Copyright (C) 2015 John Crispin <blogic@openwrt.org>
+ * Copyright (C) 2017 Zhi Mao <zhi.mao@mediatek.com>
*
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
@@ -29,6 +30,8 @@
#define PWMDWIDTH 0x2c
#define PWMTHRES 0x30
+#define PWM_CLK_DIV_MAX 7
+
enum {
MTK_CLK_MAIN = 0,
MTK_CLK_TOP,
@@ -61,6 +64,42 @@ static inline struct mtk_pwm_chip *to_mtk_pwm_chip(struct pwm_chip *chip)
return container_of(chip, struct mtk_pwm_chip, chip);
}
+static int mtk_pwm_clk_enable(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+ struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip);
+ int ret;
+
+ ret = clk_prepare_enable(pc->clks[MTK_CLK_TOP]);
+ if (ret < 0)
+ return ret;
+
+ ret = clk_prepare_enable(pc->clks[MTK_CLK_MAIN]);
+ if (ret < 0)
+ goto disable_clk_top;
+
+ ret = clk_prepare_enable(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]);
+ if (ret < 0)
+ goto disable_clk_main;
+
+ return 0;
+
+disable_clk_main:
+ clk_disable_unprepare(pc->clks[MTK_CLK_MAIN]);
+disable_clk_top:
+ clk_disable_unprepare(pc->clks[MTK_CLK_TOP]);
+
+ return ret;
+}
+
+static void mtk_pwm_clk_disable(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+ struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip);
+
+ clk_disable_unprepare(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]);
+ clk_disable_unprepare(pc->clks[MTK_CLK_MAIN]);
+ clk_disable_unprepare(pc->clks[MTK_CLK_TOP]);
+}
+
static inline u32 mtk_pwm_readl(struct mtk_pwm_chip *chip, unsigned int num,
unsigned int offset)
{
@@ -80,6 +119,11 @@ static int mtk_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip);
struct clk *clk = pc->clks[MTK_CLK_PWM1 + pwm->hwpwm];
u32 resolution, clkdiv = 0;
+ int ret;
+
+ ret = mtk_pwm_clk_enable(chip, pwm);
+ if (ret < 0)
+ return ret;
resolution = NSEC_PER_SEC / clk_get_rate(clk);
@@ -88,13 +132,18 @@ static int mtk_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
clkdiv++;
}
- if (clkdiv > 7)
+ if (clkdiv > PWM_CLK_DIV_MAX) {
+ mtk_pwm_clk_disable(chip, pwm);
+ dev_err(chip->dev, "period %d not supported\n", period_ns);
return -EINVAL;
+ }
- mtk_pwm_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | BIT(3) | clkdiv);
+ mtk_pwm_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | clkdiv);
mtk_pwm_writel(pc, pwm->hwpwm, PWMDWIDTH, period_ns / resolution);
mtk_pwm_writel(pc, pwm->hwpwm, PWMTHRES, duty_ns / resolution);
+ mtk_pwm_clk_disable(chip, pwm);
+
return 0;
}
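
Since each clkdiv step halves the effective clock, PWM_CLK_DIV_MAX = 7 caps the divider at 2^7 = 128; a requested period more than 128 times what the undivided clock can represent now fails fast with a dev_err() and a balanced clock disable, instead of a bare -EINVAL.
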
@@ -104,7 +153,7 @@ static int mtk_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
u32 value;
int ret;
- ret = clk_prepare(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]);
+ ret = mtk_pwm_clk_enable(chip, pwm);
if (ret < 0)
return ret;
@@ -124,7 +173,7 @@ static void mtk_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
value &= ~BIT(pwm->hwpwm);
writel(value, pc->regs);
- clk_unprepare(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]);
+ mtk_pwm_clk_disable(chip, pwm);
}
static const struct pwm_ops mtk_pwm_ops = {
@@ -156,14 +205,6 @@ static int mtk_pwm_probe(struct platform_device *pdev)
return PTR_ERR(pc->clks[i]);
}
- ret = clk_prepare(pc->clks[MTK_CLK_TOP]);
- if (ret < 0)
- return ret;
-
- ret = clk_prepare(pc->clks[MTK_CLK_MAIN]);
- if (ret < 0)
- goto disable_clk_top;
-
platform_set_drvdata(pdev, pc);
pc->chip.dev = &pdev->dev;
@@ -174,26 +215,15 @@ static int mtk_pwm_probe(struct platform_device *pdev)
ret = pwmchip_add(&pc->chip);
if (ret < 0) {
dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret);
- goto disable_clk_main;
+ return ret;
}
return 0;
-
-disable_clk_main:
- clk_unprepare(pc->clks[MTK_CLK_MAIN]);
-disable_clk_top:
- clk_unprepare(pc->clks[MTK_CLK_TOP]);
-
- return ret;
}
static int mtk_pwm_remove(struct platform_device *pdev)
{
struct mtk_pwm_chip *pc = platform_get_drvdata(pdev);
- unsigned int i;
-
- for (i = 0; i < pc->chip.npwm; i++)
- pwm_disable(&pc->chip.pwms[i]);
return pwmchip_remove(&pc->chip);
}
diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c
index cb845edfe2b4..d589331d1884 100644
--- a/drivers/pwm/pwm-meson.c
+++ b/drivers/pwm/pwm-meson.c
@@ -441,7 +441,7 @@ static int meson_pwm_init_channels(struct meson_pwm *meson,
for (i = 0; i < meson->chip.npwm; i++) {
struct meson_pwm_channel *channel = &channels[i];
- snprintf(name, sizeof(name), "%s#mux%u", np->full_name, i);
+ snprintf(name, sizeof(name), "%pOF#mux%u", np, i);
init.name = name;
init.ops = &clk_mux_ops;
diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c
index 5f55cfab9b1c..a7eaf962a95b 100644
--- a/drivers/pwm/pwm-pca9685.c
+++ b/drivers/pwm/pwm-pca9685.c
@@ -241,11 +241,11 @@ static inline int pca9685_pwm_gpio_probe(struct pca9685 *pca)
}
#endif
-static void pca9685_set_sleep_mode(struct pca9685 *pca, int sleep)
+static void pca9685_set_sleep_mode(struct pca9685 *pca, bool enable)
{
regmap_update_bits(pca->regmap, PCA9685_MODE1,
- MODE1_SLEEP, sleep ? MODE1_SLEEP : 0);
- if (!sleep) {
+ MODE1_SLEEP, enable ? MODE1_SLEEP : 0);
+ if (!enable) {
/* Wait 500us for the oscillator to be back up */
udelay(500);
}
@@ -272,13 +272,13 @@ static int pca9685_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
* state is guaranteed active here.
*/
/* Put chip into sleep mode */
- pca9685_set_sleep_mode(pca, 1);
+ pca9685_set_sleep_mode(pca, true);
/* Change the chip-wide output frequency */
regmap_write(pca->regmap, PCA9685_PRESCALE, prescale);
/* Wake the chip up */
- pca9685_set_sleep_mode(pca, 0);
+ pca9685_set_sleep_mode(pca, false);
pca->period_ns = period_ns;
} else {
@@ -534,7 +534,7 @@ static int pca9685_pwm_runtime_suspend(struct device *dev)
struct i2c_client *client = to_i2c_client(dev);
struct pca9685 *pca = i2c_get_clientdata(client);
- pca9685_set_sleep_mode(pca, 1);
+ pca9685_set_sleep_mode(pca, true);
return 0;
}
@@ -543,7 +543,7 @@ static int pca9685_pwm_runtime_resume(struct device *dev)
struct i2c_client *client = to_i2c_client(dev);
struct pca9685 *pca = i2c_get_clientdata(client);
- pca9685_set_sleep_mode(pca, 0);
+ pca9685_set_sleep_mode(pca, false);
return 0;
}
#endif
diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c
index 075c1a764ba2..29267d12fb4c 100644
--- a/drivers/pwm/pwm-renesas-tpu.c
+++ b/drivers/pwm/pwm-renesas-tpu.c
@@ -455,7 +455,6 @@ static const struct of_device_id tpu_of_table[] = {
{ .compatible = "renesas,tpu-r8a73a4", },
{ .compatible = "renesas,tpu-r8a7740", },
{ .compatible = "renesas,tpu-r8a7790", },
- { .compatible = "renesas,tpu-sh7372", },
{ .compatible = "renesas,tpu", },
{ },
};
diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c
index 744d56197286..4d99d468df09 100644
--- a/drivers/pwm/pwm-rockchip.c
+++ b/drivers/pwm/pwm-rockchip.c
@@ -27,12 +27,15 @@
#define PWM_DUTY_NEGATIVE (0 << 3)
#define PWM_INACTIVE_NEGATIVE (0 << 4)
#define PWM_INACTIVE_POSITIVE (1 << 4)
+#define PWM_POLARITY_MASK (PWM_DUTY_POSITIVE | PWM_INACTIVE_POSITIVE)
#define PWM_OUTPUT_LEFT (0 << 5)
+#define PWM_LOCK_EN (1 << 6)
#define PWM_LP_DISABLE (0 << 8)
struct rockchip_pwm_chip {
struct pwm_chip chip;
struct clk *clk;
+ struct clk *pclk;
const struct rockchip_pwm_data *data;
void __iomem *base;
};
@@ -48,13 +51,8 @@ struct rockchip_pwm_data {
struct rockchip_pwm_regs regs;
unsigned int prescaler;
bool supports_polarity;
- const struct pwm_ops *ops;
-
- void (*set_enable)(struct pwm_chip *chip,
- struct pwm_device *pwm, bool enable,
- enum pwm_polarity polarity);
- void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm,
- struct pwm_state *state);
+ bool supports_lock;
+ u32 enable_conf;
};
static inline struct rockchip_pwm_chip *to_rockchip_pwm_chip(struct pwm_chip *c)
@@ -62,90 +60,18 @@ static inline struct rockchip_pwm_chip *to_rockchip_pwm_chip(struct pwm_chip *c)
return container_of(c, struct rockchip_pwm_chip, chip);
}
-static void rockchip_pwm_set_enable_v1(struct pwm_chip *chip,
- struct pwm_device *pwm, bool enable,
- enum pwm_polarity polarity)
-{
- struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
- u32 enable_conf = PWM_CTRL_OUTPUT_EN | PWM_CTRL_TIMER_EN;
- u32 val;
-
- val = readl_relaxed(pc->base + pc->data->regs.ctrl);
-
- if (enable)
- val |= enable_conf;
- else
- val &= ~enable_conf;
-
- writel_relaxed(val, pc->base + pc->data->regs.ctrl);
-}
-
-static void rockchip_pwm_get_state_v1(struct pwm_chip *chip,
- struct pwm_device *pwm,
- struct pwm_state *state)
-{
- struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
- u32 enable_conf = PWM_CTRL_OUTPUT_EN | PWM_CTRL_TIMER_EN;
- u32 val;
-
- val = readl_relaxed(pc->base + pc->data->regs.ctrl);
- if ((val & enable_conf) == enable_conf)
- state->enabled = true;
-}
-
-static void rockchip_pwm_set_enable_v2(struct pwm_chip *chip,
- struct pwm_device *pwm, bool enable,
- enum pwm_polarity polarity)
-{
- struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
- u32 enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE |
- PWM_CONTINUOUS;
- u32 val;
-
- if (polarity == PWM_POLARITY_INVERSED)
- enable_conf |= PWM_DUTY_NEGATIVE | PWM_INACTIVE_POSITIVE;
- else
- enable_conf |= PWM_DUTY_POSITIVE | PWM_INACTIVE_NEGATIVE;
-
- val = readl_relaxed(pc->base + pc->data->regs.ctrl);
-
- if (enable)
- val |= enable_conf;
- else
- val &= ~enable_conf;
-
- writel_relaxed(val, pc->base + pc->data->regs.ctrl);
-}
-
-static void rockchip_pwm_get_state_v2(struct pwm_chip *chip,
- struct pwm_device *pwm,
- struct pwm_state *state)
-{
- struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
- u32 enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE |
- PWM_CONTINUOUS;
- u32 val;
-
- val = readl_relaxed(pc->base + pc->data->regs.ctrl);
- if ((val & enable_conf) != enable_conf)
- return;
-
- state->enabled = true;
-
- if (!(val & PWM_DUTY_POSITIVE))
- state->polarity = PWM_POLARITY_INVERSED;
-}
-
static void rockchip_pwm_get_state(struct pwm_chip *chip,
struct pwm_device *pwm,
struct pwm_state *state)
{
struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
+ u32 enable_conf = pc->data->enable_conf;
unsigned long clk_rate;
u64 tmp;
+ u32 val;
int ret;
- ret = clk_enable(pc->clk);
+ ret = clk_enable(pc->pclk);
if (ret)
return;
@@ -157,19 +83,31 @@ static void rockchip_pwm_get_state(struct pwm_chip *chip,
tmp = readl_relaxed(pc->base + pc->data->regs.duty);
tmp *= pc->data->prescaler * NSEC_PER_SEC;
- state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, clk_rate);
+ state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, clk_rate);
+
+ val = readl_relaxed(pc->base + pc->data->regs.ctrl);
+	state->enabled = (val & enable_conf) == enable_conf;
- pc->data->get_state(chip, pwm, state);
+ if (pc->data->supports_polarity) {
+ if (!(val & PWM_DUTY_POSITIVE))
+ state->polarity = PWM_POLARITY_INVERSED;
+ }
- clk_disable(pc->clk);
+ clk_disable(pc->pclk);
}
-static int rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
- int duty_ns, int period_ns)
+static void rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
+ struct pwm_state *state)
{
struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
unsigned long period, duty;
u64 clk_rate, div;
+ u32 ctrl;
clk_rate = clk_get_rate(pc->clk);
@@ -178,26 +116,53 @@ static int rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
* bits, every possible input period can be obtained using the
* default prescaler value for all practical clock rate values.
*/
- div = clk_rate * period_ns;
+ div = clk_rate * state->period;
period = DIV_ROUND_CLOSEST_ULL(div,
pc->data->prescaler * NSEC_PER_SEC);
- div = clk_rate * duty_ns;
+ div = clk_rate * state->duty_cycle;
duty = DIV_ROUND_CLOSEST_ULL(div, pc->data->prescaler * NSEC_PER_SEC);
+ /*
+	 * Lock in the previous period and duty so that writing the new
+	 * duty and period below does not take effect immediately.
+ */
+ ctrl = readl_relaxed(pc->base + pc->data->regs.ctrl);
+ if (pc->data->supports_lock) {
+ ctrl |= PWM_LOCK_EN;
+ writel_relaxed(ctrl, pc->base + pc->data->regs.ctrl);
+ }
+
writel(period, pc->base + pc->data->regs.period);
writel(duty, pc->base + pc->data->regs.duty);
- return 0;
+ if (pc->data->supports_polarity) {
+ ctrl &= ~PWM_POLARITY_MASK;
+ if (state->polarity == PWM_POLARITY_INVERSED)
+ ctrl |= PWM_DUTY_NEGATIVE | PWM_INACTIVE_POSITIVE;
+ else
+ ctrl |= PWM_DUTY_POSITIVE | PWM_INACTIVE_NEGATIVE;
+ }
+
+ /*
+	 * Unlock and set the polarity at the same time, so that the
+	 * new duty, period and polarity configuration takes effect
+	 * together at the start of the next period.
+ */
+ if (pc->data->supports_lock)
+ ctrl &= ~PWM_LOCK_EN;
+
+ writel(ctrl, pc->base + pc->data->regs.ctrl);
}
static int rockchip_pwm_enable(struct pwm_chip *chip,
- struct pwm_device *pwm,
- bool enable,
- enum pwm_polarity polarity)
+ struct pwm_device *pwm,
+ bool enable)
{
struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
+ u32 enable_conf = pc->data->enable_conf;
int ret;
+ u32 val;
if (enable) {
ret = clk_enable(pc->clk);
@@ -205,7 +170,14 @@ static int rockchip_pwm_enable(struct pwm_chip *chip,
return ret;
}
- pc->data->set_enable(chip, pwm, enable, polarity);
+ val = readl_relaxed(pc->base + pc->data->regs.ctrl);
+
+ if (enable)
+ val |= enable_conf;
+ else
+ val &= ~enable_conf;
+
+ writel_relaxed(val, pc->base + pc->data->regs.ctrl);
if (!enable)
clk_disable(pc->clk);
@@ -219,33 +191,26 @@ static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip);
struct pwm_state curstate;
bool enabled;
- int ret;
+ int ret = 0;
- pwm_get_state(pwm, &curstate);
- enabled = curstate.enabled;
-
- ret = clk_enable(pc->clk);
+ ret = clk_enable(pc->pclk);
if (ret)
return ret;
- if (state->polarity != curstate.polarity && enabled) {
- ret = rockchip_pwm_enable(chip, pwm, false, state->polarity);
+ pwm_get_state(pwm, &curstate);
+ enabled = curstate.enabled;
+
+ if (state->polarity != curstate.polarity && enabled &&
+ !pc->data->supports_lock) {
+ ret = rockchip_pwm_enable(chip, pwm, false);
if (ret)
goto out;
enabled = false;
}
- ret = rockchip_pwm_config(chip, pwm, state->duty_cycle, state->period);
- if (ret) {
- if (enabled != curstate.enabled)
- rockchip_pwm_enable(chip, pwm, !enabled,
- state->polarity);
- goto out;
- }
-
+ rockchip_pwm_config(chip, pwm, state);
if (state->enabled != enabled) {
- ret = rockchip_pwm_enable(chip, pwm, state->enabled,
- state->polarity);
+ ret = rockchip_pwm_enable(chip, pwm, state->enabled);
if (ret)
goto out;
}
@@ -257,18 +222,12 @@ static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
rockchip_pwm_get_state(chip, pwm, state);
out:
- clk_disable(pc->clk);
+ clk_disable(pc->pclk);
return ret;
}
-static const struct pwm_ops rockchip_pwm_ops_v1 = {
- .get_state = rockchip_pwm_get_state,
- .apply = rockchip_pwm_apply,
- .owner = THIS_MODULE,
-};
-
-static const struct pwm_ops rockchip_pwm_ops_v2 = {
+static const struct pwm_ops rockchip_pwm_ops = {
.get_state = rockchip_pwm_get_state,
.apply = rockchip_pwm_apply,
.owner = THIS_MODULE,
@@ -282,9 +241,9 @@ static const struct rockchip_pwm_data pwm_data_v1 = {
.ctrl = 0x0c,
},
.prescaler = 2,
- .ops = &rockchip_pwm_ops_v1,
- .set_enable = rockchip_pwm_set_enable_v1,
- .get_state = rockchip_pwm_get_state_v1,
+ .supports_polarity = false,
+ .supports_lock = false,
+ .enable_conf = PWM_CTRL_OUTPUT_EN | PWM_CTRL_TIMER_EN,
};
static const struct rockchip_pwm_data pwm_data_v2 = {
@@ -296,9 +255,9 @@ static const struct rockchip_pwm_data pwm_data_v2 = {
},
.prescaler = 1,
.supports_polarity = true,
- .ops = &rockchip_pwm_ops_v2,
- .set_enable = rockchip_pwm_set_enable_v2,
- .get_state = rockchip_pwm_get_state_v2,
+ .supports_lock = false,
+ .enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE |
+ PWM_CONTINUOUS,
};
static const struct rockchip_pwm_data pwm_data_vop = {
@@ -310,15 +269,30 @@ static const struct rockchip_pwm_data pwm_data_vop = {
},
.prescaler = 1,
.supports_polarity = true,
- .ops = &rockchip_pwm_ops_v2,
- .set_enable = rockchip_pwm_set_enable_v2,
- .get_state = rockchip_pwm_get_state_v2,
+ .supports_lock = false,
+ .enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE |
+ PWM_CONTINUOUS,
+};
+
+static const struct rockchip_pwm_data pwm_data_v3 = {
+ .regs = {
+ .duty = 0x08,
+ .period = 0x04,
+ .cntr = 0x00,
+ .ctrl = 0x0c,
+ },
+ .prescaler = 1,
+ .supports_polarity = true,
+ .supports_lock = true,
+ .enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE |
+ PWM_CONTINUOUS,
};
static const struct of_device_id rockchip_pwm_dt_ids[] = {
{ .compatible = "rockchip,rk2928-pwm", .data = &pwm_data_v1},
{ .compatible = "rockchip,rk3288-pwm", .data = &pwm_data_v2},
{ .compatible = "rockchip,vop-pwm", .data = &pwm_data_vop},
+ { .compatible = "rockchip,rk3328-pwm", .data = &pwm_data_v3},
{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, rockchip_pwm_dt_ids);
@@ -328,7 +302,7 @@ static int rockchip_pwm_probe(struct platform_device *pdev)
const struct of_device_id *id;
struct rockchip_pwm_chip *pc;
struct resource *r;
- int ret;
+ int ret, count;
id = of_match_device(rockchip_pwm_dt_ids, &pdev->dev);
if (!id)
@@ -343,19 +317,49 @@ static int rockchip_pwm_probe(struct platform_device *pdev)
if (IS_ERR(pc->base))
return PTR_ERR(pc->base);
- pc->clk = devm_clk_get(&pdev->dev, NULL);
- if (IS_ERR(pc->clk))
- return PTR_ERR(pc->clk);
+ pc->clk = devm_clk_get(&pdev->dev, "pwm");
+ if (IS_ERR(pc->clk)) {
+ pc->clk = devm_clk_get(&pdev->dev, NULL);
+ if (IS_ERR(pc->clk)) {
+ ret = PTR_ERR(pc->clk);
+ if (ret != -EPROBE_DEFER)
+ dev_err(&pdev->dev, "Can't get bus clk: %d\n",
+ ret);
+ return ret;
+ }
+ }
+
+ count = of_count_phandle_with_args(pdev->dev.of_node,
+ "clocks", "#clock-cells");
+ if (count == 2)
+ pc->pclk = devm_clk_get(&pdev->dev, "pclk");
+ else
+ pc->pclk = pc->clk;
+
+ if (IS_ERR(pc->pclk)) {
+ ret = PTR_ERR(pc->pclk);
+ if (ret != -EPROBE_DEFER)
+ dev_err(&pdev->dev, "Can't get APB clk: %d\n", ret);
+ return ret;
+ }
ret = clk_prepare_enable(pc->clk);
- if (ret)
+ if (ret) {
+ dev_err(&pdev->dev, "Can't prepare enable bus clk: %d\n", ret);
return ret;
+ }
+
+ ret = clk_prepare(pc->pclk);
+ if (ret) {
+ dev_err(&pdev->dev, "Can't prepare APB clk: %d\n", ret);
+ goto err_clk;
+ }
platform_set_drvdata(pdev, pc);
pc->data = id->data;
pc->chip.dev = &pdev->dev;
- pc->chip.ops = pc->data->ops;
+ pc->chip.ops = &rockchip_pwm_ops;
pc->chip.base = -1;
pc->chip.npwm = 1;
@@ -368,12 +372,20 @@ static int rockchip_pwm_probe(struct platform_device *pdev)
if (ret < 0) {
clk_unprepare(pc->clk);
dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret);
+ goto err_pclk;
}
/* Keep the PWM clk enabled if the PWM appears to be up and running. */
if (!pwm_is_enabled(pc->chip.pwms))
clk_disable(pc->clk);
+ return 0;
+
+err_pclk:
+ clk_unprepare(pc->pclk);
+err_clk:
+ clk_disable_unprepare(pc->clk);
+
return ret;
}
@@ -395,6 +407,7 @@ static int rockchip_pwm_remove(struct platform_device *pdev)
if (pwm_is_enabled(pc->chip.pwms))
clk_disable(pc->clk);
+ clk_unprepare(pc->pclk);
clk_unprepare(pc->clk);
return pwmchip_remove(&pc->chip);
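
To make the period/duty arithmetic in rockchip_pwm_config() concrete, a worked example (assuming a 24 MHz PWM clock and prescaler 1; the actual rate is board-specific):

/*
 * period = clk_rate * period_ns / (prescaler * NSEC_PER_SEC)
 *        = 24000000 * 1000000 / (1 * 1000000000) = 24000 cycles
 * duty   = 24000000 *  250000 / (1 * 1000000000) =  6000 cycles
 *
 * i.e. a 1 ms period at 25 % duty programs 24000 and 6000 counter
 * cycles into the period and duty registers respectively.
 */
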
diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c
index f113cda47032..062f2cfc45ec 100644
--- a/drivers/pwm/pwm-samsung.c
+++ b/drivers/pwm/pwm-samsung.c
@@ -3,6 +3,7 @@
* Copyright (c) 2008 Simtec Electronics
* Ben Dooks <ben@simtec.co.uk>, <ben-linux@fluff.org>
* Copyright (c) 2013 Tomasz Figa <tomasz.figa@gmail.com>
+ * Copyright (c) 2017 Samsung Electronics Co., Ltd.
*
* PWM driver for Samsung SoCs
*
@@ -74,6 +75,7 @@ struct samsung_pwm_channel {
* @chip: generic PWM chip
* @variant: local copy of hardware variant data
* @inverter_mask: inverter status for all channels - one bit per channel
+ * @disabled_mask: disabled status for all channels - one bit per channel
* @base: base address of mapped PWM registers
* @base_clk: base clock used to drive the timers
* @tclk0: external clock 0 (can be ERR_PTR if not present)
@@ -83,6 +85,7 @@ struct samsung_pwm_chip {
struct pwm_chip chip;
struct samsung_pwm_variant variant;
u8 inverter_mask;
+ u8 disabled_mask;
void __iomem *base;
struct clk *base_clk;
@@ -257,6 +260,8 @@ static int pwm_samsung_enable(struct pwm_chip *chip, struct pwm_device *pwm)
tcon |= TCON_START(tcon_chan) | TCON_AUTORELOAD(tcon_chan);
writel(tcon, our_chip->base + REG_TCON);
+ our_chip->disabled_mask &= ~BIT(pwm->hwpwm);
+
spin_unlock_irqrestore(&samsung_pwm_lock, flags);
return 0;
@@ -275,6 +280,8 @@ static void pwm_samsung_disable(struct pwm_chip *chip, struct pwm_device *pwm)
tcon &= ~TCON_AUTORELOAD(tcon_chan);
writel(tcon, our_chip->base + REG_TCON);
+ our_chip->disabled_mask |= BIT(pwm->hwpwm);
+
spin_unlock_irqrestore(&samsung_pwm_lock, flags);
}
@@ -297,8 +304,8 @@ static void pwm_samsung_manual_update(struct samsung_pwm_chip *chip,
spin_unlock_irqrestore(&samsung_pwm_lock, flags);
}
-static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm,
- int duty_ns, int period_ns)
+static int __pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm,
+ int duty_ns, int period_ns, bool force_period)
{
struct samsung_pwm_chip *our_chip = to_samsung_pwm_chip(chip);
struct samsung_pwm_channel *chan = pwm_get_chip_data(pwm);
@@ -312,9 +319,6 @@ static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm,
if (period_ns > NSEC_PER_SEC)
return -ERANGE;
- if (period_ns == chan->period_ns && duty_ns == chan->duty_ns)
- return 0;
-
tcnt = readl(our_chip->base + REG_TCNTB(pwm->hwpwm));
oldtcmp = readl(our_chip->base + REG_TCMPB(pwm->hwpwm));
@@ -322,7 +326,7 @@ static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm,
++tcnt;
/* Check to see if we are changing the clock rate of the PWM. */
- if (chan->period_ns != period_ns) {
+ if (chan->period_ns != period_ns || force_period) {
unsigned long tin_rate;
u32 period;
@@ -381,6 +385,12 @@ static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm,
return 0;
}
+static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm,
+ int duty_ns, int period_ns)
+{
+ return __pwm_samsung_config(chip, pwm, duty_ns, period_ns, false);
+}
+
static void pwm_samsung_set_invert(struct samsung_pwm_chip *chip,
unsigned int channel, bool invert)
{
@@ -592,51 +602,41 @@ static int pwm_samsung_remove(struct platform_device *pdev)
}
#ifdef CONFIG_PM_SLEEP
-static int pwm_samsung_suspend(struct device *dev)
+static int pwm_samsung_resume(struct device *dev)
{
- struct samsung_pwm_chip *chip = dev_get_drvdata(dev);
+ struct samsung_pwm_chip *our_chip = dev_get_drvdata(dev);
+ struct pwm_chip *chip = &our_chip->chip;
unsigned int i;
- /*
- * No one preserves these values during suspend so reset them.
- * Otherwise driver leaves PWM unconfigured if same values are
- * passed to pwm_config() next time.
- */
- for (i = 0; i < SAMSUNG_PWM_NUM; ++i) {
- struct pwm_device *pwm = &chip->chip.pwms[i];
+ for (i = 0; i < SAMSUNG_PWM_NUM; i++) {
+ struct pwm_device *pwm = &chip->pwms[i];
struct samsung_pwm_channel *chan = pwm_get_chip_data(pwm);
if (!chan)
continue;
- chan->period_ns = 0;
- chan->duty_ns = 0;
- }
-
- return 0;
-}
+ if (our_chip->variant.output_mask & BIT(i))
+ pwm_samsung_set_invert(our_chip, i,
+ our_chip->inverter_mask & BIT(i));
-static int pwm_samsung_resume(struct device *dev)
-{
- struct samsung_pwm_chip *chip = dev_get_drvdata(dev);
- unsigned int chan;
+ if (chan->period_ns) {
+ __pwm_samsung_config(chip, pwm, chan->duty_ns,
+ chan->period_ns, true);
+ /* needed to make PWM disable work on Odroid-XU3 */
+ pwm_samsung_manual_update(our_chip, pwm);
+ }
- /*
- * Inverter setting must be preserved across suspend/resume
- * as nobody really seems to configure it more than once.
- */
- for (chan = 0; chan < SAMSUNG_PWM_NUM; ++chan) {
- if (chip->variant.output_mask & BIT(chan))
- pwm_samsung_set_invert(chip, chan,
- chip->inverter_mask & BIT(chan));
+ if (our_chip->disabled_mask & BIT(i))
+ pwm_samsung_disable(chip, pwm);
+ else
+ pwm_samsung_enable(chip, pwm);
}
return 0;
}
#endif
-static SIMPLE_DEV_PM_OPS(pwm_samsung_pm_ops, pwm_samsung_suspend,
- pwm_samsung_resume);
+static SIMPLE_DEV_PM_OPS(pwm_samsung_pm_ops, NULL, pwm_samsung_resume);
static struct platform_driver pwm_samsung_driver = {
.driver = {
diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c
index e9b33f09ff09..f8ebbece57b7 100644
--- a/drivers/pwm/pwm-tegra.c
+++ b/drivers/pwm/pwm-tegra.c
@@ -218,7 +218,7 @@ static int tegra_pwm_probe(struct platform_device *pdev)
*/
pwm->clk_rate = clk_get_rate(pwm->clk);
- pwm->rst = devm_reset_control_get(&pdev->dev, "pwm");
+ pwm->rst = devm_reset_control_get_exclusive(&pdev->dev, "pwm");
if (IS_ERR(pwm->rst)) {
ret = PTR_ERR(pwm->rst);
dev_err(&pdev->dev, "Reset control is not found: %d\n", ret);
diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c
index 6ec342dd3eea..34b228626bd5 100644
--- a/drivers/pwm/pwm-tiecap.c
+++ b/drivers/pwm/pwm-tiecap.c
@@ -39,15 +39,15 @@
#define ECCTL2_TSCTR_FREERUN BIT(4)
struct ecap_context {
- u32 cap3;
- u32 cap4;
- u16 ecctl2;
+ u32 cap3;
+ u32 cap4;
+ u16 ecctl2;
};
struct ecap_pwm_chip {
- struct pwm_chip chip;
- unsigned int clk_rate;
- void __iomem *mmio_base;
+ struct pwm_chip chip;
+ unsigned int clk_rate;
+ void __iomem *mmio_base;
struct ecap_context ctx;
};
@@ -64,9 +64,9 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
int duty_ns, int period_ns)
{
struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip);
+ u32 period_cycles, duty_cycles;
unsigned long long c;
- unsigned long period_cycles, duty_cycles;
- unsigned int reg_val;
+ u16 value;
if (period_ns > NSEC_PER_SEC)
return -ERANGE;
@@ -74,7 +74,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
c = pc->clk_rate;
c = c * period_ns;
do_div(c, NSEC_PER_SEC);
- period_cycles = (unsigned long)c;
+ period_cycles = (u32)c;
if (period_cycles < 1) {
period_cycles = 1;
@@ -83,17 +83,17 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
c = pc->clk_rate;
c = c * duty_ns;
do_div(c, NSEC_PER_SEC);
- duty_cycles = (unsigned long)c;
+ duty_cycles = (u32)c;
}
pm_runtime_get_sync(pc->chip.dev);
- reg_val = readw(pc->mmio_base + ECCTL2);
+ value = readw(pc->mmio_base + ECCTL2);
/* Configure APWM mode & disable sync option */
- reg_val |= ECCTL2_APWM_MODE | ECCTL2_SYNC_SEL_DISA;
+ value |= ECCTL2_APWM_MODE | ECCTL2_SYNC_SEL_DISA;
- writew(reg_val, pc->mmio_base + ECCTL2);
+ writew(value, pc->mmio_base + ECCTL2);
if (!pwm_is_enabled(pwm)) {
/* Update active registers if not running */
@@ -110,40 +110,45 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
}
if (!pwm_is_enabled(pwm)) {
- reg_val = readw(pc->mmio_base + ECCTL2);
+ value = readw(pc->mmio_base + ECCTL2);
/* Disable APWM mode to put APWM output Low */
- reg_val &= ~ECCTL2_APWM_MODE;
- writew(reg_val, pc->mmio_base + ECCTL2);
+ value &= ~ECCTL2_APWM_MODE;
+ writew(value, pc->mmio_base + ECCTL2);
}
pm_runtime_put_sync(pc->chip.dev);
+
return 0;
}
static int ecap_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm,
- enum pwm_polarity polarity)
+ enum pwm_polarity polarity)
{
struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip);
- unsigned short reg_val;
+ u16 value;
pm_runtime_get_sync(pc->chip.dev);
- reg_val = readw(pc->mmio_base + ECCTL2);
+
+ value = readw(pc->mmio_base + ECCTL2);
+
if (polarity == PWM_POLARITY_INVERSED)
/* Duty cycle defines LOW period of PWM */
- reg_val |= ECCTL2_APWM_POL_LOW;
+ value |= ECCTL2_APWM_POL_LOW;
else
/* Duty cycle defines HIGH period of PWM */
- reg_val &= ~ECCTL2_APWM_POL_LOW;
+ value &= ~ECCTL2_APWM_POL_LOW;
+
+ writew(value, pc->mmio_base + ECCTL2);
- writew(reg_val, pc->mmio_base + ECCTL2);
pm_runtime_put_sync(pc->chip.dev);
+
return 0;
}
static int ecap_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
{
struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip);
- unsigned int reg_val;
+ u16 value;
/* Leave clock enabled on enabling PWM */
pm_runtime_get_sync(pc->chip.dev);
@@ -152,24 +157,25 @@ static int ecap_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
* Enable 'Free run Time stamp counter mode' to start counter
* and 'APWM mode' to enable APWM output
*/
- reg_val = readw(pc->mmio_base + ECCTL2);
- reg_val |= ECCTL2_TSCTR_FREERUN | ECCTL2_APWM_MODE;
- writew(reg_val, pc->mmio_base + ECCTL2);
+ value = readw(pc->mmio_base + ECCTL2);
+ value |= ECCTL2_TSCTR_FREERUN | ECCTL2_APWM_MODE;
+ writew(value, pc->mmio_base + ECCTL2);
+
return 0;
}
static void ecap_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
{
struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip);
- unsigned int reg_val;
+ u16 value;
/*
* Disable 'Free run Time stamp counter mode' to stop counter
* and 'APWM mode' to put APWM output to low
*/
- reg_val = readw(pc->mmio_base + ECCTL2);
- reg_val &= ~(ECCTL2_TSCTR_FREERUN | ECCTL2_APWM_MODE);
- writew(reg_val, pc->mmio_base + ECCTL2);
+ value = readw(pc->mmio_base + ECCTL2);
+ value &= ~(ECCTL2_TSCTR_FREERUN | ECCTL2_APWM_MODE);
+ writew(value, pc->mmio_base + ECCTL2);
/* Disable clock on PWM disable */
pm_runtime_put_sync(pc->chip.dev);
@@ -184,12 +190,12 @@ static void ecap_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
}
static const struct pwm_ops ecap_pwm_ops = {
- .free = ecap_pwm_free,
- .config = ecap_pwm_config,
- .set_polarity = ecap_pwm_set_polarity,
- .enable = ecap_pwm_enable,
- .disable = ecap_pwm_disable,
- .owner = THIS_MODULE,
+ .free = ecap_pwm_free,
+ .config = ecap_pwm_config,
+ .set_polarity = ecap_pwm_set_polarity,
+ .enable = ecap_pwm_enable,
+ .disable = ecap_pwm_disable,
+ .owner = THIS_MODULE,
};
static const struct of_device_id ecap_of_match[] = {
@@ -202,10 +208,10 @@ MODULE_DEVICE_TABLE(of, ecap_of_match);
static int ecap_pwm_probe(struct platform_device *pdev)
{
struct device_node *np = pdev->dev.of_node;
- int ret;
+ struct ecap_pwm_chip *pc;
struct resource *r;
struct clk *clk;
- struct ecap_pwm_chip *pc;
+ int ret;
pc = devm_kzalloc(&pdev->dev, sizeof(*pc), GFP_KERNEL);
if (!pc)
@@ -248,9 +254,9 @@ static int ecap_pwm_probe(struct platform_device *pdev)
return ret;
}
+ platform_set_drvdata(pdev, pc);
pm_runtime_enable(&pdev->dev);
- platform_set_drvdata(pdev, pc);
return 0;
}
@@ -259,6 +265,7 @@ static int ecap_pwm_remove(struct platform_device *pdev)
struct ecap_pwm_chip *pc = platform_get_drvdata(pdev);
pm_runtime_disable(&pdev->dev);
+
return pwmchip_remove(&pc->chip);
}
@@ -311,14 +318,13 @@ static SIMPLE_DEV_PM_OPS(ecap_pwm_pm_ops, ecap_pwm_suspend, ecap_pwm_resume);
static struct platform_driver ecap_pwm_driver = {
.driver = {
- .name = "ecap",
+ .name = "ecap",
.of_match_table = ecap_of_match,
- .pm = &ecap_pwm_pm_ops,
+ .pm = &ecap_pwm_pm_ops,
},
.probe = ecap_pwm_probe,
.remove = ecap_pwm_remove,
};
-
module_platform_driver(ecap_pwm_driver);
MODULE_DESCRIPTION("ECAP PWM driver");
diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c
index b5c6b0636893..4c22cb395040 100644
--- a/drivers/pwm/pwm-tiehrpwm.c
+++ b/drivers/pwm/pwm-tiehrpwm.c
@@ -122,12 +122,12 @@ struct ehrpwm_context {
};
struct ehrpwm_pwm_chip {
- struct pwm_chip chip;
- unsigned int clk_rate;
- void __iomem *mmio_base;
+ struct pwm_chip chip;
+ unsigned long clk_rate;
+ void __iomem *mmio_base;
unsigned long period_cycles[NUM_PWM_CHANNEL];
enum pwm_polarity polarity[NUM_PWM_CHANNEL];
- struct clk *tbclk;
+ struct clk *tbclk;
struct ehrpwm_context ctx;
};
@@ -136,25 +136,26 @@ static inline struct ehrpwm_pwm_chip *to_ehrpwm_pwm_chip(struct pwm_chip *chip)
return container_of(chip, struct ehrpwm_pwm_chip, chip);
}
-static inline u16 ehrpwm_read(void __iomem *base, int offset)
+static inline u16 ehrpwm_read(void __iomem *base, unsigned int offset)
{
return readw(base + offset);
}
-static inline void ehrpwm_write(void __iomem *base, int offset, unsigned int val)
+static inline void ehrpwm_write(void __iomem *base, unsigned int offset,
+ u16 value)
{
- writew(val & 0xFFFF, base + offset);
+ writew(value, base + offset);
}
-static void ehrpwm_modify(void __iomem *base, int offset,
- unsigned short mask, unsigned short val)
+static void ehrpwm_modify(void __iomem *base, unsigned int offset, u16 mask,
+ u16 value)
{
- unsigned short regval;
+ unsigned short val;
- regval = readw(base + offset);
- regval &= ~mask;
- regval |= val & mask;
- writew(regval, base + offset);
+ val = readw(base + offset);
+ val &= ~mask;
+ val |= value & mask;
+ writew(val, base + offset);
}
/**
@@ -163,14 +164,13 @@ static void ehrpwm_modify(void __iomem *base, int offset,
* @prescale_div: prescaler value set
* @tb_clk_div: Time Base Control prescaler bits
*/
-static int set_prescale_div(unsigned long rqst_prescaler,
- unsigned short *prescale_div, unsigned short *tb_clk_div)
+static int set_prescale_div(unsigned long rqst_prescaler, u16 *prescale_div,
+ u16 *tb_clk_div)
{
unsigned int clkdiv, hspclkdiv;
for (clkdiv = 0; clkdiv <= CLKDIV_MAX; clkdiv++) {
for (hspclkdiv = 0; hspclkdiv <= HSPCLKDIV_MAX; hspclkdiv++) {
-
/*
* calculations for prescaler value :
* prescale_div = HSPCLKDIVIDER * CLKDIVIDER.
@@ -191,13 +191,14 @@ static int set_prescale_div(unsigned long rqst_prescaler,
}
}
}
+
return 1;
}
static void configure_polarity(struct ehrpwm_pwm_chip *pc, int chan)
{
- int aqctl_reg;
- unsigned short aqctl_val, aqctl_mask;
+ u16 aqctl_val, aqctl_mask;
+ unsigned int aqctl_reg;
/*
* Configure PWM output to HIGH/LOW level on counter
@@ -232,13 +233,13 @@ static void configure_polarity(struct ehrpwm_pwm_chip *pc, int chan)
* duty_ns = 10^9 * (ps_divval * duty_cycles) / PWM_CLK_RATE
*/
static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
- int duty_ns, int period_ns)
+ int duty_ns, int period_ns)
{
struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip);
+ u32 period_cycles, duty_cycles;
+ u16 ps_divval, tb_divval;
+ unsigned int i, cmp_reg;
unsigned long long c;
- unsigned long period_cycles, duty_cycles;
- unsigned short ps_divval, tb_divval;
- int i, cmp_reg;
if (period_ns > NSEC_PER_SEC)
return -ERANGE;
@@ -272,8 +273,9 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
if (i == pwm->hwpwm)
continue;
- dev_err(chip->dev, "Period value conflicts with channel %d\n",
- i);
+ dev_err(chip->dev,
+ "period value conflicts with channel %u\n",
+ i);
return -EINVAL;
}
}
@@ -282,7 +284,7 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
/* Configure clock prescaler to support Low frequency PWM wave */
if (set_prescale_div(period_cycles/PERIOD_MAX, &ps_divval,
- &tb_divval)) {
+ &tb_divval)) {
dev_err(chip->dev, "Unsupported values\n");
return -EINVAL;
}
@@ -303,7 +305,7 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
/* Configure ehrpwm counter for up-count mode */
ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CTRMODE_MASK,
- TBCTL_CTRMODE_UP);
+ TBCTL_CTRMODE_UP);
if (pwm->hwpwm == 1)
/* Channel 1 configured with compare B register */
@@ -315,23 +317,26 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
ehrpwm_write(pc->mmio_base, cmp_reg, duty_cycles);
pm_runtime_put_sync(chip->dev);
+
return 0;
}
static int ehrpwm_pwm_set_polarity(struct pwm_chip *chip,
- struct pwm_device *pwm, enum pwm_polarity polarity)
+ struct pwm_device *pwm,
+ enum pwm_polarity polarity)
{
struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip);
/* Configuration of polarity in hardware delayed, do at enable */
pc->polarity[pwm->hwpwm] = polarity;
+
return 0;
}
static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
{
struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip);
- unsigned short aqcsfrc_val, aqcsfrc_mask;
+ u16 aqcsfrc_val, aqcsfrc_mask;
int ret;
/* Leave clock enabled on enabling PWM */
@@ -348,7 +353,7 @@ static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
/* Changes to shadow mode */
ehrpwm_modify(pc->mmio_base, AQSFRC, AQSFRC_RLDCSF_MASK,
- AQSFRC_RLDCSF_ZRO);
+ AQSFRC_RLDCSF_ZRO);
ehrpwm_modify(pc->mmio_base, AQCSFRC, aqcsfrc_mask, aqcsfrc_val);
@@ -358,20 +363,21 @@ static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
/* Enable TBCLK before enabling PWM device */
ret = clk_enable(pc->tbclk);
if (ret) {
- dev_err(chip->dev, "Failed to enable TBCLK for %s\n",
- dev_name(pc->chip.dev));
+ dev_err(chip->dev, "Failed to enable TBCLK for %s: %d\n",
+ dev_name(pc->chip.dev), ret);
return ret;
}
/* Enable time counter for free_run */
ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_RUN_MASK, TBCTL_FREE_RUN);
+
return 0;
}
static void ehrpwm_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
{
struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip);
- unsigned short aqcsfrc_val, aqcsfrc_mask;
+ u16 aqcsfrc_val, aqcsfrc_mask;
/* Action Qualifier puts PWM output low forcefully */
if (pwm->hwpwm) {
@@ -387,7 +393,7 @@ static void ehrpwm_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
* Action Qualifier control on PWM output from next TBCLK
*/
ehrpwm_modify(pc->mmio_base, AQSFRC, AQSFRC_RLDCSF_MASK,
- AQSFRC_RLDCSF_IMDT);
+ AQSFRC_RLDCSF_IMDT);
ehrpwm_modify(pc->mmio_base, AQCSFRC, aqcsfrc_mask, aqcsfrc_val);
@@ -415,17 +421,17 @@ static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
}
static const struct pwm_ops ehrpwm_pwm_ops = {
- .free = ehrpwm_pwm_free,
- .config = ehrpwm_pwm_config,
- .set_polarity = ehrpwm_pwm_set_polarity,
- .enable = ehrpwm_pwm_enable,
- .disable = ehrpwm_pwm_disable,
- .owner = THIS_MODULE,
+ .free = ehrpwm_pwm_free,
+ .config = ehrpwm_pwm_config,
+ .set_polarity = ehrpwm_pwm_set_polarity,
+ .enable = ehrpwm_pwm_enable,
+ .disable = ehrpwm_pwm_disable,
+ .owner = THIS_MODULE,
};
static const struct of_device_id ehrpwm_of_match[] = {
- { .compatible = "ti,am3352-ehrpwm" },
- { .compatible = "ti,am33xx-ehrpwm" },
+ { .compatible = "ti,am3352-ehrpwm" },
+ { .compatible = "ti,am33xx-ehrpwm" },
{},
};
MODULE_DEVICE_TABLE(of, ehrpwm_of_match);
@@ -433,10 +439,10 @@ MODULE_DEVICE_TABLE(of, ehrpwm_of_match);
static int ehrpwm_pwm_probe(struct platform_device *pdev)
{
struct device_node *np = pdev->dev.of_node;
- int ret;
+ struct ehrpwm_pwm_chip *pc;
struct resource *r;
struct clk *clk;
- struct ehrpwm_pwm_chip *pc;
+ int ret;
pc = devm_kzalloc(&pdev->dev, sizeof(*pc), GFP_KERNEL);
if (!pc)
@@ -489,13 +495,18 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev)
ret = pwmchip_add(&pc->chip);
if (ret < 0) {
dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret);
- return ret;
+ goto err_clk_unprepare;
}
+ platform_set_drvdata(pdev, pc);
pm_runtime_enable(&pdev->dev);
- platform_set_drvdata(pdev, pc);
return 0;
+
+err_clk_unprepare:
+ clk_unprepare(pc->tbclk);
+
+ return ret;
}
static int ehrpwm_pwm_remove(struct platform_device *pdev)
@@ -504,8 +515,8 @@ static int ehrpwm_pwm_remove(struct platform_device *pdev)
clk_unprepare(pc->tbclk);
- pm_runtime_put_sync(&pdev->dev);
pm_runtime_disable(&pdev->dev);
+
return pwmchip_remove(&pc->chip);
}
@@ -513,6 +524,7 @@ static int ehrpwm_pwm_remove(struct platform_device *pdev)
static void ehrpwm_pwm_save_context(struct ehrpwm_pwm_chip *pc)
{
pm_runtime_get_sync(pc->chip.dev);
+
pc->ctx.tbctl = ehrpwm_read(pc->mmio_base, TBCTL);
pc->ctx.tbprd = ehrpwm_read(pc->mmio_base, TBPRD);
pc->ctx.cmpa = ehrpwm_read(pc->mmio_base, CMPA);
@@ -521,6 +533,7 @@ static void ehrpwm_pwm_save_context(struct ehrpwm_pwm_chip *pc)
pc->ctx.aqctlb = ehrpwm_read(pc->mmio_base, AQCTLB);
pc->ctx.aqsfrc = ehrpwm_read(pc->mmio_base, AQSFRC);
pc->ctx.aqcsfrc = ehrpwm_read(pc->mmio_base, AQCSFRC);
+
pm_runtime_put_sync(pc->chip.dev);
}
@@ -539,9 +552,10 @@ static void ehrpwm_pwm_restore_context(struct ehrpwm_pwm_chip *pc)
static int ehrpwm_pwm_suspend(struct device *dev)
{
struct ehrpwm_pwm_chip *pc = dev_get_drvdata(dev);
- int i;
+ unsigned int i;
ehrpwm_pwm_save_context(pc);
+
for (i = 0; i < pc->chip.npwm; i++) {
struct pwm_device *pwm = &pc->chip.pwms[i];
@@ -551,13 +565,14 @@ static int ehrpwm_pwm_suspend(struct device *dev)
/* Disable explicitly if PWM is running */
pm_runtime_put_sync(dev);
}
+
return 0;
}
static int ehrpwm_pwm_resume(struct device *dev)
{
struct ehrpwm_pwm_chip *pc = dev_get_drvdata(dev);
- int i;
+ unsigned int i;
for (i = 0; i < pc->chip.npwm; i++) {
struct pwm_device *pwm = &pc->chip.pwms[i];
@@ -568,24 +583,25 @@ static int ehrpwm_pwm_resume(struct device *dev)
/* Enable explicitly if PWM was running */
pm_runtime_get_sync(dev);
}
+
ehrpwm_pwm_restore_context(pc);
+
return 0;
}
#endif
static SIMPLE_DEV_PM_OPS(ehrpwm_pwm_pm_ops, ehrpwm_pwm_suspend,
- ehrpwm_pwm_resume);
+ ehrpwm_pwm_resume);
static struct platform_driver ehrpwm_pwm_driver = {
.driver = {
- .name = "ehrpwm",
+ .name = "ehrpwm",
.of_match_table = ehrpwm_of_match,
- .pm = &ehrpwm_pwm_pm_ops,
+ .pm = &ehrpwm_pwm_pm_ops,
},
.probe = ehrpwm_pwm_probe,
.remove = ehrpwm_pwm_remove,
};
-
module_platform_driver(ehrpwm_pwm_driver);
MODULE_DESCRIPTION("EHRPWM PWM driver");
diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c
index 8141a4984126..3a78dd09ac81 100644
--- a/drivers/pwm/pwm-vt8500.c
+++ b/drivers/pwm/pwm-vt8500.c
@@ -241,6 +241,7 @@ static int vt8500_pwm_probe(struct platform_device *pdev)
ret = pwmchip_add(&chip->chip);
if (ret < 0) {
dev_err(&pdev->dev, "failed to add PWM chip\n");
+ clk_unprepare(chip->clk);
return ret;
}
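
The vt8500 change follows the usual probe error-unwinding shape; a minimal sketch with hypothetical names (assuming clk_prepare() earlier in probe, as in this driver):

#include <linux/clk.h>
#include <linux/pwm.h>

struct example_chip {		/* hypothetical container */
	struct pwm_chip chip;
	struct clk *clk;
};

static int example_probe(struct example_chip *chip)
{
	int ret;

	ret = clk_prepare(chip->clk);
	if (ret < 0)
		return ret;

	ret = pwmchip_add(&chip->chip);
	if (ret < 0) {
		clk_unprepare(chip->clk);	/* the step that was leaked */
		return ret;
	}

	return 0;
}
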
diff --git a/drivers/pwm/pwm-zx.c b/drivers/pwm/pwm-zx.c
new file mode 100644
index 000000000000..5d27c16edfb1
--- /dev/null
+++ b/drivers/pwm/pwm-zx.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (C) 2017 Sanechips Technology Co., Ltd.
+ * Copyright 2017 Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/pwm.h>
+#include <linux/slab.h>
+
+#define ZX_PWM_MODE 0x0
+#define ZX_PWM_CLKDIV_SHIFT 2
+#define ZX_PWM_CLKDIV_MASK GENMASK(11, 2)
+#define ZX_PWM_CLKDIV(x) (((x) << ZX_PWM_CLKDIV_SHIFT) & \
+ ZX_PWM_CLKDIV_MASK)
+#define ZX_PWM_POLAR BIT(1)
+#define ZX_PWM_EN BIT(0)
+#define ZX_PWM_PERIOD 0x4
+#define ZX_PWM_DUTY 0x8
+
+#define ZX_PWM_CLKDIV_MAX 1023
+#define ZX_PWM_PERIOD_MAX 65535
+
+struct zx_pwm_chip {
+ struct pwm_chip chip;
+ struct clk *pclk;
+ struct clk *wclk;
+ void __iomem *base;
+};
+
+static inline struct zx_pwm_chip *to_zx_pwm_chip(struct pwm_chip *chip)
+{
+ return container_of(chip, struct zx_pwm_chip, chip);
+}
+
+static inline u32 zx_pwm_readl(struct zx_pwm_chip *zpc, unsigned int hwpwm,
+ unsigned int offset)
+{
+ return readl(zpc->base + (hwpwm + 1) * 0x10 + offset);
+}
+
+static inline void zx_pwm_writel(struct zx_pwm_chip *zpc, unsigned int hwpwm,
+ unsigned int offset, u32 value)
+{
+ writel(value, zpc->base + (hwpwm + 1) * 0x10 + offset);
+}
+
+static void zx_pwm_set_mask(struct zx_pwm_chip *zpc, unsigned int hwpwm,
+ unsigned int offset, u32 mask, u32 value)
+{
+ u32 data;
+
+ data = zx_pwm_readl(zpc, hwpwm, offset);
+ data &= ~mask;
+ data |= value & mask;
+ zx_pwm_writel(zpc, hwpwm, offset, data);
+}
+
+static void zx_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
+ struct pwm_state *state)
+{
+ struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip);
+ unsigned long rate;
+ unsigned int div;
+ u32 value;
+ u64 tmp;
+
+ value = zx_pwm_readl(zpc, pwm->hwpwm, ZX_PWM_MODE);
+
+ if (value & ZX_PWM_POLAR)
+ state->polarity = PWM_POLARITY_NORMAL;
+ else
+ state->polarity = PWM_POLARITY_INVERSED;
+
+ if (value & ZX_PWM_EN)
+ state->enabled = true;
+ else
+ state->enabled = false;
+
+ div = (value & ZX_PWM_CLKDIV_MASK) >> ZX_PWM_CLKDIV_SHIFT;
+ rate = clk_get_rate(zpc->wclk);
+
+ tmp = zx_pwm_readl(zpc, pwm->hwpwm, ZX_PWM_PERIOD);
+ tmp *= div * NSEC_PER_SEC;
+ state->period = DIV_ROUND_CLOSEST_ULL(tmp, rate);
+
+ tmp = zx_pwm_readl(zpc, pwm->hwpwm, ZX_PWM_DUTY);
+ tmp *= div * NSEC_PER_SEC;
+ state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, rate);
+}
+
+static int zx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
+ unsigned int duty_ns, unsigned int period_ns)
+{
+ struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip);
+ unsigned int period_cycles, duty_cycles;
+ unsigned long long c;
+ unsigned int div = 1;
+ unsigned long rate;
+
+ /* Find out the best divider */
+ rate = clk_get_rate(zpc->wclk);
+
+ while (1) {
+ c = rate / div;
+ c = c * period_ns;
+ do_div(c, NSEC_PER_SEC);
+
+ if (c < ZX_PWM_PERIOD_MAX)
+ break;
+
+ div++;
+
+ if (div > ZX_PWM_CLKDIV_MAX)
+ return -ERANGE;
+ }
+
+ /* Calculate duty cycles */
+ period_cycles = c;
+ c *= duty_ns;
+ do_div(c, period_ns);
+ duty_cycles = c;
+
+ /*
+	 * If the PWM is currently enabled, we have to temporarily
+	 * disable it before configuring the registers.
+ */
+ if (pwm_is_enabled(pwm))
+ zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, ZX_PWM_EN, 0);
+
+ /* Set up registers */
+ zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, ZX_PWM_CLKDIV_MASK,
+ ZX_PWM_CLKDIV(div));
+ zx_pwm_writel(zpc, pwm->hwpwm, ZX_PWM_PERIOD, period_cycles);
+ zx_pwm_writel(zpc, pwm->hwpwm, ZX_PWM_DUTY, duty_cycles);
+
+ /* Re-enable the PWM if needed */
+ if (pwm_is_enabled(pwm))
+ zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE,
+ ZX_PWM_EN, ZX_PWM_EN);
+
+ return 0;
+}
+
+static int zx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
+ struct pwm_state *state)
+{
+ struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip);
+ struct pwm_state cstate;
+ int ret;
+
+ pwm_get_state(pwm, &cstate);
+
+ if (state->polarity != cstate.polarity)
+ zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, ZX_PWM_POLAR,
+ (state->polarity == PWM_POLARITY_INVERSED) ?
+ 0 : ZX_PWM_POLAR);
+
+ if (state->period != cstate.period ||
+ state->duty_cycle != cstate.duty_cycle) {
+ ret = zx_pwm_config(chip, pwm, state->duty_cycle,
+ state->period);
+ if (ret)
+ return ret;
+ }
+
+ if (state->enabled != cstate.enabled) {
+ if (state->enabled) {
+ ret = clk_prepare_enable(zpc->wclk);
+ if (ret)
+ return ret;
+
+ zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE,
+ ZX_PWM_EN, ZX_PWM_EN);
+ } else {
+ zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE,
+ ZX_PWM_EN, 0);
+ clk_disable_unprepare(zpc->wclk);
+ }
+ }
+
+ return 0;
+}
+
+static const struct pwm_ops zx_pwm_ops = {
+ .apply = zx_pwm_apply,
+ .get_state = zx_pwm_get_state,
+ .owner = THIS_MODULE,
+};
+
+static int zx_pwm_probe(struct platform_device *pdev)
+{
+ struct zx_pwm_chip *zpc;
+ struct resource *res;
+ unsigned int i;
+ int ret;
+
+ zpc = devm_kzalloc(&pdev->dev, sizeof(*zpc), GFP_KERNEL);
+ if (!zpc)
+ return -ENOMEM;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ zpc->base = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(zpc->base))
+ return PTR_ERR(zpc->base);
+
+ zpc->pclk = devm_clk_get(&pdev->dev, "pclk");
+ if (IS_ERR(zpc->pclk))
+ return PTR_ERR(zpc->pclk);
+
+ zpc->wclk = devm_clk_get(&pdev->dev, "wclk");
+ if (IS_ERR(zpc->wclk))
+ return PTR_ERR(zpc->wclk);
+
+ ret = clk_prepare_enable(zpc->pclk);
+ if (ret)
+ return ret;
+
+ zpc->chip.dev = &pdev->dev;
+ zpc->chip.ops = &zx_pwm_ops;
+ zpc->chip.base = -1;
+ zpc->chip.npwm = 4;
+ zpc->chip.of_xlate = of_pwm_xlate_with_flags;
+ zpc->chip.of_pwm_n_cells = 3;
+
+ /*
+	 * PWM devices may have been left enabled by firmware, so
+	 * disable all of them initially to save power.
+ */
+ for (i = 0; i < zpc->chip.npwm; i++)
+ zx_pwm_set_mask(zpc, i, ZX_PWM_MODE, ZX_PWM_EN, 0);
+
+ ret = pwmchip_add(&zpc->chip);
+ if (ret < 0) {
+ dev_err(&pdev->dev, "failed to add PWM chip: %d\n", ret);
+ return ret;
+ }
+
+ platform_set_drvdata(pdev, zpc);
+
+ return 0;
+}
+
+static int zx_pwm_remove(struct platform_device *pdev)
+{
+ struct zx_pwm_chip *zpc = platform_get_drvdata(pdev);
+ int ret;
+
+ ret = pwmchip_remove(&zpc->chip);
+ clk_disable_unprepare(zpc->pclk);
+
+ return ret;
+}
+
+static const struct of_device_id zx_pwm_dt_ids[] = {
+ { .compatible = "zte,zx296718-pwm", },
+ { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, zx_pwm_dt_ids);
+
+static struct platform_driver zx_pwm_driver = {
+ .driver = {
+ .name = "zx-pwm",
+ .of_match_table = zx_pwm_dt_ids,
+ },
+ .probe = zx_pwm_probe,
+ .remove = zx_pwm_remove,
+};
+module_platform_driver(zx_pwm_driver);
+
+MODULE_ALIAS("platform:zx-pwm");
+MODULE_AUTHOR("Shawn Guo <shawn.guo@linaro.org>");
+MODULE_DESCRIPTION("ZTE ZX PWM Driver");
+MODULE_LICENSE("GPL v2");
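
A minimal consumer-side sketch (hypothetical, not part of this patch) of how the driver's atomic ->apply() path is reached through the PWM core:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/pwm.h>

static int example_pwm_consumer(struct device *dev)
{
	struct pwm_device *pwm;
	struct pwm_state state;

	pwm = devm_pwm_get(dev, NULL);	/* first "pwms" phandle in DT */
	if (IS_ERR(pwm))
		return PTR_ERR(pwm);

	pwm_init_state(pwm, &state);	/* seed period/polarity from DT */
	state.period = 1000000;		/* 1 ms */
	state.duty_cycle = 250000;	/* 25 % duty */
	state.enabled = true;

	return pwm_apply_state(pwm, &state);	/* ends in zx_pwm_apply() */
}
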
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 9c97ad1ee121..ea19b4ff87a2 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -62,7 +62,6 @@ MODULE_LICENSE("GPL");
static int dasd_alloc_queue(struct dasd_block *);
static void dasd_setup_queue(struct dasd_block *);
static void dasd_free_queue(struct dasd_block *);
-static void dasd_flush_request_queue(struct dasd_block *);
static int dasd_flush_block_queue(struct dasd_block *);
static void dasd_device_tasklet(struct dasd_device *);
static void dasd_block_tasklet(struct dasd_block *);
@@ -158,7 +157,6 @@ struct dasd_block *dasd_alloc_block(void)
/* open_count = 0 means device online but not in use */
atomic_set(&block->open_count, -1);
- spin_lock_init(&block->request_queue_lock);
atomic_set(&block->tasklet_scheduled, 0);
tasklet_init(&block->tasklet,
(void (*)(unsigned long)) dasd_block_tasklet,
@@ -391,7 +389,6 @@ static int dasd_state_ready_to_basic(struct dasd_device *device)
device->state = DASD_STATE_READY;
return rc;
}
- dasd_flush_request_queue(block);
dasd_destroy_partitions(block);
block->blocks = 0;
block->bp_block = 0;
@@ -1645,8 +1642,10 @@ void dasd_generic_handle_state_change(struct dasd_device *device)
dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING);
dasd_schedule_device_bh(device);
- if (device->block)
+ if (device->block) {
dasd_schedule_block_bh(device->block);
+ blk_mq_run_hw_queues(device->block->request_queue, true);
+ }
}
EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change);
@@ -2638,6 +2637,7 @@ static void dasd_block_timeout(unsigned long ptr)
dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING);
spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags);
dasd_schedule_block_bh(block);
+ blk_mq_run_hw_queues(block->request_queue, true);
}
/*
@@ -2677,115 +2677,11 @@ static void __dasd_process_erp(struct dasd_device *device,
erp_fn(cqr);
}
-/*
- * Fetch requests from the block device queue.
- */
-static void __dasd_process_request_queue(struct dasd_block *block)
-{
- struct request_queue *queue;
- struct request *req;
- struct dasd_ccw_req *cqr;
- struct dasd_device *basedev;
- unsigned long flags;
- queue = block->request_queue;
- basedev = block->base;
- /* No queue ? Then there is nothing to do. */
- if (queue == NULL)
- return;
-
- /*
- * We requeue request from the block device queue to the ccw
- * queue only in two states. In state DASD_STATE_READY the
- * partition detection is done and we need to requeue requests
- * for that. State DASD_STATE_ONLINE is normal block device
- * operation.
- */
- if (basedev->state < DASD_STATE_READY) {
- while ((req = blk_fetch_request(block->request_queue)))
- __blk_end_request_all(req, BLK_STS_IOERR);
- return;
- }
-
- /*
- * if device is stopped do not fetch new requests
- * except failfast is active which will let requests fail
- * immediately in __dasd_block_start_head()
- */
- if (basedev->stopped && !(basedev->features & DASD_FEATURE_FAILFAST))
- return;
-
- /* Now we try to fetch requests from the request queue */
- while ((req = blk_peek_request(queue))) {
- if (basedev->features & DASD_FEATURE_READONLY &&
- rq_data_dir(req) == WRITE) {
- DBF_DEV_EVENT(DBF_ERR, basedev,
- "Rejecting write request %p",
- req);
- blk_start_request(req);
- __blk_end_request_all(req, BLK_STS_IOERR);
- continue;
- }
- if (test_bit(DASD_FLAG_ABORTALL, &basedev->flags) &&
- (basedev->features & DASD_FEATURE_FAILFAST ||
- blk_noretry_request(req))) {
- DBF_DEV_EVENT(DBF_ERR, basedev,
- "Rejecting failfast request %p",
- req);
- blk_start_request(req);
- __blk_end_request_all(req, BLK_STS_TIMEOUT);
- continue;
- }
- cqr = basedev->discipline->build_cp(basedev, block, req);
- if (IS_ERR(cqr)) {
- if (PTR_ERR(cqr) == -EBUSY)
- break; /* normal end condition */
- if (PTR_ERR(cqr) == -ENOMEM)
- break; /* terminate request queue loop */
- if (PTR_ERR(cqr) == -EAGAIN) {
- /*
- * The current request cannot be build right
- * now, we have to try later. If this request
- * is the head-of-queue we stop the device
- * for 1/2 second.
- */
- if (!list_empty(&block->ccw_queue))
- break;
- spin_lock_irqsave(
- get_ccwdev_lock(basedev->cdev), flags);
- dasd_device_set_stop_bits(basedev,
- DASD_STOPPED_PENDING);
- spin_unlock_irqrestore(
- get_ccwdev_lock(basedev->cdev), flags);
- dasd_block_set_timer(block, HZ/2);
- break;
- }
- DBF_DEV_EVENT(DBF_ERR, basedev,
- "CCW creation failed (rc=%ld) "
- "on request %p",
- PTR_ERR(cqr), req);
- blk_start_request(req);
- __blk_end_request_all(req, BLK_STS_IOERR);
- continue;
- }
- /*
- * Note: callback is set to dasd_return_cqr_cb in
- * __dasd_block_start_head to cover erp requests as well
- */
- cqr->callback_data = (void *) req;
- cqr->status = DASD_CQR_FILLED;
- req->completion_data = cqr;
- blk_start_request(req);
- list_add_tail(&cqr->blocklist, &block->ccw_queue);
- INIT_LIST_HEAD(&cqr->devlist);
- dasd_profile_start(block, cqr, req);
- }
-}
-
static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr)
{
struct request *req;
- int status;
blk_status_t error = BLK_STS_OK;
+ int status;
req = (struct request *) cqr->callback_data;
dasd_profile_end(cqr->block, cqr, req);
@@ -2809,7 +2705,19 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr)
break;
}
}
- __blk_end_request_all(req, error);
+
+ /*
+	 * We need to take care of ETIMEDOUT errors here since the
+	 * complete callback does not get called in this case.
+	 * Handling all errors here directly avoids additional code
+	 * for transferring the error value to the complete callback.
+ */
+ if (error) {
+ blk_mq_end_request(req, error);
+ blk_mq_run_hw_queues(req->q, true);
+ } else {
+ blk_mq_complete_request(req);
+ }
}
/*
@@ -2938,27 +2846,30 @@ static void dasd_block_tasklet(struct dasd_block *block)
struct list_head final_queue;
struct list_head *l, *n;
struct dasd_ccw_req *cqr;
+ struct dasd_queue *dq;
atomic_set(&block->tasklet_scheduled, 0);
INIT_LIST_HEAD(&final_queue);
- spin_lock(&block->queue_lock);
+ spin_lock_irq(&block->queue_lock);
/* Finish off requests on ccw queue */
__dasd_process_block_ccw_queue(block, &final_queue);
- spin_unlock(&block->queue_lock);
+ spin_unlock_irq(&block->queue_lock);
+
/* Now call the callback function of requests with final status */
- spin_lock_irq(&block->request_queue_lock);
list_for_each_safe(l, n, &final_queue) {
cqr = list_entry(l, struct dasd_ccw_req, blocklist);
+ dq = cqr->dq;
+ spin_lock_irq(&dq->lock);
list_del_init(&cqr->blocklist);
__dasd_cleanup_cqr(cqr);
+ spin_unlock_irq(&dq->lock);
}
- spin_lock(&block->queue_lock);
- /* Get new request from the block device request queue */
- __dasd_process_request_queue(block);
+
+ spin_lock_irq(&block->queue_lock);
/* Now check if the head of the ccw queue needs to be started. */
__dasd_block_start_head(block);
- spin_unlock(&block->queue_lock);
- spin_unlock_irq(&block->request_queue_lock);
+ spin_unlock_irq(&block->queue_lock);
+
if (waitqueue_active(&shutdown_waitq))
wake_up(&shutdown_waitq);
dasd_put_device(block->base);
@@ -2977,14 +2888,13 @@ static int _dasd_requeue_request(struct dasd_ccw_req *cqr)
{
struct dasd_block *block = cqr->block;
struct request *req;
- unsigned long flags;
if (!block)
return -EINVAL;
- spin_lock_irqsave(&block->request_queue_lock, flags);
+ spin_lock_irq(&cqr->dq->lock);
req = (struct request *) cqr->callback_data;
- blk_requeue_request(block->request_queue, req);
- spin_unlock_irqrestore(&block->request_queue_lock, flags);
+ blk_mq_requeue_request(req, false);
+ spin_unlock_irq(&cqr->dq->lock);
return 0;
}
@@ -2999,6 +2909,7 @@ static int dasd_flush_block_queue(struct dasd_block *block)
struct dasd_ccw_req *cqr, *n;
int rc, i;
struct list_head flush_queue;
+ unsigned long flags;
INIT_LIST_HEAD(&flush_queue);
spin_lock_bh(&block->queue_lock);
@@ -3037,11 +2948,11 @@ restart_cb:
goto restart_cb;
}
/* call the callback function */
- spin_lock_irq(&block->request_queue_lock);
+ spin_lock_irqsave(&cqr->dq->lock, flags);
cqr->endclk = get_tod_clock();
list_del_init(&cqr->blocklist);
__dasd_cleanup_cqr(cqr);
- spin_unlock_irq(&block->request_queue_lock);
+ spin_unlock_irqrestore(&cqr->dq->lock, flags);
}
return rc;
}
@@ -3069,42 +2980,114 @@ EXPORT_SYMBOL(dasd_schedule_block_bh);
/*
* Dasd request queue function. Called from ll_rw_blk.c
*/
-static void do_dasd_request(struct request_queue *queue)
+static blk_status_t do_dasd_request(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *qd)
{
- struct dasd_block *block;
+ struct dasd_block *block = hctx->queue->queuedata;
+ struct dasd_queue *dq = hctx->driver_data;
+ struct request *req = qd->rq;
+ struct dasd_device *basedev;
+ struct dasd_ccw_req *cqr;
+ blk_status_t rc = BLK_STS_OK;
+
+ basedev = block->base;
+ spin_lock_irq(&dq->lock);
+ if (basedev->state < DASD_STATE_READY) {
+ DBF_DEV_EVENT(DBF_ERR, basedev,
+ "device not ready for request %p", req);
+ rc = BLK_STS_IOERR;
+ goto out;
+ }
+
+ /*
+	 * if the device is stopped do not fetch new requests, except
+	 * when failfast is active, which will let requests fail
+	 * immediately in __dasd_block_start_head()
+ */
+ if (basedev->stopped && !(basedev->features & DASD_FEATURE_FAILFAST)) {
+ DBF_DEV_EVENT(DBF_ERR, basedev,
+ "device stopped request %p", req);
+ rc = BLK_STS_RESOURCE;
+ goto out;
+ }
+
+ if (basedev->features & DASD_FEATURE_READONLY &&
+ rq_data_dir(req) == WRITE) {
+ DBF_DEV_EVENT(DBF_ERR, basedev,
+ "Rejecting write request %p", req);
+ rc = BLK_STS_IOERR;
+ goto out;
+ }
- block = queue->queuedata;
+ if (test_bit(DASD_FLAG_ABORTALL, &basedev->flags) &&
+ (basedev->features & DASD_FEATURE_FAILFAST ||
+ blk_noretry_request(req))) {
+ DBF_DEV_EVENT(DBF_ERR, basedev,
+ "Rejecting failfast request %p", req);
+ rc = BLK_STS_IOERR;
+ goto out;
+ }
+
+ cqr = basedev->discipline->build_cp(basedev, block, req);
+ if (IS_ERR(cqr)) {
+ if (PTR_ERR(cqr) == -EBUSY ||
+ PTR_ERR(cqr) == -ENOMEM ||
+ PTR_ERR(cqr) == -EAGAIN) {
+ rc = BLK_STS_RESOURCE;
+ goto out;
+ }
+ DBF_DEV_EVENT(DBF_ERR, basedev,
+ "CCW creation failed (rc=%ld) on request %p",
+ PTR_ERR(cqr), req);
+ rc = BLK_STS_IOERR;
+ goto out;
+ }
+ /*
+ * Note: callback is set to dasd_return_cqr_cb in
+ * __dasd_block_start_head to cover erp requests as well
+ */
+ cqr->callback_data = req;
+ cqr->status = DASD_CQR_FILLED;
+ cqr->dq = dq;
+ req->completion_data = cqr;
+ blk_mq_start_request(req);
spin_lock(&block->queue_lock);
- /* Get new request from the block device request queue */
- __dasd_process_request_queue(block);
- /* Now check if the head of the ccw queue needs to be started. */
- __dasd_block_start_head(block);
+ list_add_tail(&cqr->blocklist, &block->ccw_queue);
+ INIT_LIST_HEAD(&cqr->devlist);
+ dasd_profile_start(block, cqr, req);
+ dasd_schedule_block_bh(block);
spin_unlock(&block->queue_lock);
+
+out:
+ spin_unlock_irq(&dq->lock);
+ return rc;
}
/*
* Block timeout callback, called from the block layer
*
- * request_queue lock is held on entry.
- *
* Return values:
* BLK_EH_RESET_TIMER if the request should be left running
* BLK_EH_NOT_HANDLED if the request is handled or terminated
* by the driver.
*/
-enum blk_eh_timer_return dasd_times_out(struct request *req)
+enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved)
{
struct dasd_ccw_req *cqr = req->completion_data;
struct dasd_block *block = req->q->queuedata;
struct dasd_device *device;
+ unsigned long flags;
int rc = 0;
if (!cqr)
return BLK_EH_NOT_HANDLED;
+ spin_lock_irqsave(&cqr->dq->lock, flags);
device = cqr->startdev ? cqr->startdev : block->base;
- if (!device->blk_timeout)
+ if (!device->blk_timeout) {
+ spin_unlock_irqrestore(&cqr->dq->lock, flags);
return BLK_EH_RESET_TIMER;
+ }
DBF_DEV_EVENT(DBF_WARNING, device,
" dasd_times_out cqr %p status %x",
cqr, cqr->status);
@@ -3154,19 +3137,64 @@ enum blk_eh_timer_return dasd_times_out(struct request *req)
}
dasd_schedule_block_bh(block);
spin_unlock(&block->queue_lock);
+ spin_unlock_irqrestore(&cqr->dq->lock, flags);
return rc ? BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED;
}
+static int dasd_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int idx)
+{
+ struct dasd_queue *dq = kzalloc(sizeof(*dq), GFP_KERNEL);
+
+ if (!dq)
+ return -ENOMEM;
+
+ spin_lock_init(&dq->lock);
+ hctx->driver_data = dq;
+
+ return 0;
+}
+
+static void dasd_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int idx)
+{
+ kfree(hctx->driver_data);
+ hctx->driver_data = NULL;
+}
+
+static void dasd_request_done(struct request *req)
+{
+ blk_mq_end_request(req, 0);
+ blk_mq_run_hw_queues(req->q, true);
+}
+
+static struct blk_mq_ops dasd_mq_ops = {
+ .queue_rq = do_dasd_request,
+ .complete = dasd_request_done,
+ .timeout = dasd_times_out,
+ .init_hctx = dasd_init_hctx,
+ .exit_hctx = dasd_exit_hctx,
+};
+
/*
* Allocate and initialize request queue and default I/O scheduler.
*/
static int dasd_alloc_queue(struct dasd_block *block)
{
- block->request_queue = blk_init_queue(do_dasd_request,
- &block->request_queue_lock);
- if (block->request_queue == NULL)
- return -ENOMEM;
+ int rc;
+
+ block->tag_set.ops = &dasd_mq_ops;
+ block->tag_set.nr_hw_queues = DASD_NR_HW_QUEUES;
+ block->tag_set.queue_depth = DASD_MAX_LCU_DEV * DASD_REQ_PER_DEV;
+ block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+
+ rc = blk_mq_alloc_tag_set(&block->tag_set);
+ if (rc)
+ return rc;
+
+ block->request_queue = blk_mq_init_queue(&block->tag_set);
+ if (IS_ERR(block->request_queue))
+ return PTR_ERR(block->request_queue);
block->request_queue->queuedata = block;
@@ -3229,26 +3257,11 @@ static void dasd_free_queue(struct dasd_block *block)
{
if (block->request_queue) {
blk_cleanup_queue(block->request_queue);
+ blk_mq_free_tag_set(&block->tag_set);
block->request_queue = NULL;
}
}
-/*
- * Flush request on the request queue.
- */
-static void dasd_flush_request_queue(struct dasd_block *block)
-{
- struct request *req;
-
- if (!block->request_queue)
- return;
-
- spin_lock_irq(&block->request_queue_lock);
- while ((req = blk_fetch_request(block->request_queue)))
- __blk_end_request_all(req, BLK_STS_IOERR);
- spin_unlock_irq(&block->request_queue_lock);
-}
-
static int dasd_open(struct block_device *bdev, fmode_t mode)
{
struct dasd_device *base;
@@ -3744,8 +3757,10 @@ int dasd_generic_path_operational(struct dasd_device *device)
return 1;
}
dasd_schedule_device_bh(device);
- if (device->block)
+ if (device->block) {
dasd_schedule_block_bh(device->block);
+ blk_mq_run_hw_queues(device->block->request_queue, true);
+ }
if (!device->stopped)
wake_up(&generic_waitq);
@@ -4008,8 +4023,10 @@ int dasd_generic_restore_device(struct ccw_device *cdev)
*/
device->stopped |= DASD_UNRESUMED_PM;
- if (device->block)
+ if (device->block) {
dasd_schedule_block_bh(device->block);
+ blk_mq_run_hw_queues(device->block->request_queue, true);
+ }
clear_bit(DASD_FLAG_SUSPENDED, &device->flags);
dasd_put_device(device);
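
Worked out, the tag-set sizing chosen in dasd_alloc_queue() above:

/*
 * queue_depth  = DASD_MAX_LCU_DEV * DASD_REQ_PER_DEV = 256 * 4 = 1024
 * nr_hw_queues = DASD_NR_HW_QUEUES = 64
 *
 * i.e. each of the 64 hardware queues gets a tag space of 1024
 * requests per dasd_block's tag set.
 */
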
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index e38042ce94e6..c95a4784c191 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -1326,7 +1326,7 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr,
{
struct dasd_device *device;
struct request_queue *q;
- unsigned long val, flags;
+ unsigned long val;
device = dasd_device_from_cdev(to_ccwdev(dev));
if (IS_ERR(device) || !device->block)
@@ -1342,16 +1342,10 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr,
dasd_put_device(device);
return -ENODEV;
}
- spin_lock_irqsave(&device->block->request_queue_lock, flags);
- if (!val)
- blk_queue_rq_timed_out(q, NULL);
- else
- blk_queue_rq_timed_out(q, dasd_times_out);
device->blk_timeout = val;
blk_queue_rq_timeout(q, device->blk_timeout * HZ);
- spin_unlock_irqrestore(&device->block->request_queue_lock, flags);
dasd_put_device(device);
return count;
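
With the handler now fixed in dasd_mq_ops.timeout, only the timeout duration remains runtime-configurable, which is why the lock-protected blk_queue_rq_timed_out() toggling could be dropped. The old val == 0 case (handler removed entirely) has no direct blk-mq equivalent; a driver can emulate it inside the hook, as in this hedged sketch (hypothetical my_* names, matching the new two-argument signature from dasd_int.h below):

	static enum blk_eh_timer_return my_times_out(struct request *req,
						     bool reserved)
	{
		struct my_dev *dev = req->q->queuedata;

		if (!dev->blk_timeout)			/* 0 == timeouts disabled */
			return BLK_EH_RESET_TIMER;	/* just re-arm the timer */
		return BLK_EH_HANDLED;			/* claim the request */
	}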
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index f9e25fc03d6b..db470bd10175 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -56,6 +56,7 @@
#include <asm/dasd.h>
#include <asm/idals.h>
#include <linux/bitops.h>
+#include <linux/blk-mq.h>
/* DASD discipline magic */
#define DASD_ECKD_MAGIC 0xC5C3D2C4
@@ -185,6 +186,7 @@ struct dasd_ccw_req {
char status; /* status of this request */
short retries; /* A retry counter */
unsigned long flags; /* flags of this request */
+ struct dasd_queue *dq;
/* ... and how */
unsigned long starttime; /* jiffies time of request start */
@@ -248,6 +250,16 @@ struct dasd_ccw_req {
#define DASD_CQR_SUPPRESS_IL 6 /* Suppress 'Incorrect Length' error */
#define DASD_CQR_SUPPRESS_CR 7 /* Suppress 'Command Reject' error */
+/*
+ * There is no reliable way to determine the number of available CPUs
+ * on an LPAR, but there is also no big performance difference between
+ * 1 hardware queue and the maximum CPU number.
+ * 64 is a good trade-off performance-wise.
+ */
+#define DASD_NR_HW_QUEUES 64
+#define DASD_MAX_LCU_DEV 256
+#define DASD_REQ_PER_DEV 4
+
/* Signature for error recovery functions. */
typedef struct dasd_ccw_req *(*dasd_erp_fn_t) (struct dasd_ccw_req *);
@@ -539,6 +551,7 @@ struct dasd_block {
struct gendisk *gdp;
struct request_queue *request_queue;
spinlock_t request_queue_lock;
+ struct blk_mq_tag_set tag_set;
struct block_device *bdev;
atomic_t open_count;
@@ -563,6 +576,10 @@ struct dasd_attention_data {
__u8 lpum;
};
+struct dasd_queue {
+ spinlock_t lock;
+};
+
/* reasons why device (ccw_device_start) was stopped */
#define DASD_STOPPED_NOT_ACC 1 /* not accessible */
#define DASD_STOPPED_QUIESCE 2 /* Quiesced */
@@ -731,7 +748,7 @@ void dasd_free_device(struct dasd_device *);
struct dasd_block *dasd_alloc_block(void);
void dasd_free_block(struct dasd_block *);
-enum blk_eh_timer_return dasd_times_out(struct request *req);
+enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved);
void dasd_enable_device(struct dasd_device *);
void dasd_set_target_state(struct dasd_device *, int);
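
A quick sanity check on the new constants: the tag set sized in dasd_alloc_queue() above requests DASD_MAX_LCU_DEV * DASD_REQ_PER_DEV = 256 * 4 = 1024 tags per hardware queue, across DASD_NR_HW_QUEUES = 64 queues. If that product is meant to stay fixed, a compile-time check is cheap (illustrative only, not part of the patch):

	/* depth handed to blk_mq_alloc_tag_set() in dasd_alloc_queue() */
	BUILD_BUG_ON(DASD_MAX_LCU_DEV * DASD_REQ_PER_DEV != 1024);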
diff --git a/drivers/s390/crypto/ap_asm.h b/drivers/s390/crypto/ap_asm.h
index 287b4ad0999e..cd350345b3d2 100644
--- a/drivers/s390/crypto/ap_asm.h
+++ b/drivers/s390/crypto/ap_asm.h
@@ -69,16 +69,19 @@ static inline struct ap_queue_status ap_rapq(ap_qid_t qid)
}
/**
- * ap_aqic(): Enable interruption for a specific AP.
+ * ap_aqic(): Control interruption for a specific AP.
* @qid: The AP queue number
+ * @qirqctrl: struct ap_qirq_ctrl (64 bit value)
* @ind: The notification indicator byte
*
* Returns AP queue status.
*/
-static inline struct ap_queue_status ap_aqic(ap_qid_t qid, void *ind)
+static inline struct ap_queue_status ap_aqic(ap_qid_t qid,
+ struct ap_qirq_ctrl qirqctrl,
+ void *ind)
{
register unsigned long reg0 asm ("0") = qid | (3UL << 24);
- register unsigned long reg1_in asm ("1") = (8UL << 44) | AP_ISC;
+ register struct ap_qirq_ctrl reg1_in asm ("1") = qirqctrl;
register struct ap_queue_status reg1_out asm ("1");
register void *reg2 asm ("2") = ind;
diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index 6dee598979e7..5f0be2040272 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -166,26 +166,51 @@ static int ap_configuration_available(void)
}
/**
+ * ap_apft_available(): Test if AP facilities test (APFT)
+ * facility is available.
+ *
+ * Returns 1 if APFT is available.
+ */
+static int ap_apft_available(void)
+{
+ return test_facility(15);
+}
+
+/**
* ap_test_queue(): Test adjunct processor queue.
* @qid: The AP queue number
+ * @tbit: Test facilities bit
* @info: Pointer to queue descriptor
*
* Returns AP queue status structure.
*/
-static inline struct ap_queue_status
-ap_test_queue(ap_qid_t qid, unsigned long *info)
+struct ap_queue_status ap_test_queue(ap_qid_t qid,
+ int tbit,
+ unsigned long *info)
{
- if (test_facility(15))
- qid |= 1UL << 23; /* set APFT T bit*/
+ if (tbit)
+ qid |= 1UL << 23; /* set T bit */
return ap_tapq(qid, info);
}
+EXPORT_SYMBOL(ap_test_queue);
-static inline int ap_query_configuration(void)
+/*
+ * ap_query_configuration(): Fetch cryptographic config info
+ *
+ * Returns the ap configuration info fetched via PQAP(QCI).
+ * On success 0 is returned; on failure a negative errno is
+ * returned, e.g. -EOPNOTSUPP if the PQAP(QCI) instruction
+ * is not available.
+ */
+int ap_query_configuration(struct ap_config_info *info)
{
- if (!ap_configuration)
+ if (!ap_configuration_available())
return -EOPNOTSUPP;
- return ap_qci(ap_configuration);
+ if (!info)
+ return -EINVAL;
+ return ap_qci(info);
}
+EXPORT_SYMBOL(ap_query_configuration);
/**
* ap_init_configuration(): Allocate and query configuration array.
@@ -198,7 +223,7 @@ static void ap_init_configuration(void)
ap_configuration = kzalloc(sizeof(*ap_configuration), GFP_KERNEL);
if (!ap_configuration)
return;
- if (ap_query_configuration() != 0) {
+ if (ap_query_configuration(ap_configuration) != 0) {
kfree(ap_configuration);
ap_configuration = NULL;
return;
@@ -261,7 +286,7 @@ static int ap_query_queue(ap_qid_t qid, int *queue_depth, int *device_type,
if (!ap_test_config_card_id(AP_QID_CARD(qid)))
return -ENODEV;
- status = ap_test_queue(qid, &info);
+ status = ap_test_queue(qid, ap_apft_available(), &info);
switch (status.response_code) {
case AP_RESPONSE_NORMAL:
*queue_depth = (int)(info & 0xff);
@@ -940,7 +965,9 @@ static int ap_select_domain(void)
for (j = 0; j < AP_DEVICES; j++) {
if (!ap_test_config_card_id(j))
continue;
- status = ap_test_queue(AP_MKQID(j, i), NULL);
+ status = ap_test_queue(AP_MKQID(j, i),
+ ap_apft_available(),
+ NULL);
if (status.response_code != AP_RESPONSE_NORMAL)
continue;
count++;
@@ -993,7 +1020,7 @@ static void ap_scan_bus(struct work_struct *unused)
AP_DBF(DBF_DEBUG, "ap_scan_bus running\n");
- ap_query_configuration();
+ ap_query_configuration(ap_configuration);
if (ap_select_domain() != 0)
goto out;
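
Since ap_query_configuration() is now exported and takes the destination buffer as an argument, other modules can fetch the QCI info into their own storage instead of peeking at the bus-private ap_configuration. A hedged caller-side sketch (hypothetical my_* naming; mask fields as in the struct that moved to asm/ap.h):

	static int my_probe_ap(void)
	{
		struct ap_config_info info;
		int rc;

		rc = ap_query_configuration(&info);
		if (rc)		/* -EOPNOTSUPP if PQAP(QCI) is unavailable */
			return rc;
		/* info.apm / info.aqm / info.adm now hold the adapter,
		 * queue and domain masks */
		return 0;
	}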
diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 4dc7c88fb054..754cf2223cfb 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h
@@ -28,6 +28,7 @@
#include <linux/device.h>
#include <linux/types.h>
+#include <asm/ap.h>
#define AP_DEVICES 64 /* Number of AP devices. */
#define AP_DOMAINS 256 /* Number of AP domains. */
@@ -40,41 +41,6 @@ extern int ap_domain_index;
extern spinlock_t ap_list_lock;
extern struct list_head ap_card_list;
-/**
- * The ap_qid_t identifier of an ap queue. It contains a
- * 6 bit card index and a 4 bit queue index (domain).
- */
-typedef unsigned int ap_qid_t;
-
-#define AP_MKQID(_card, _queue) (((_card) & 63) << 8 | ((_queue) & 255))
-#define AP_QID_CARD(_qid) (((_qid) >> 8) & 63)
-#define AP_QID_QUEUE(_qid) ((_qid) & 255)
-
-/**
- * structy ap_queue_status - Holds the AP queue status.
- * @queue_empty: Shows if queue is empty
- * @replies_waiting: Waiting replies
- * @queue_full: Is 1 if the queue is full
- * @pad: A 4 bit pad
- * @int_enabled: Shows if interrupts are enabled for the AP
- * @response_code: Holds the 8 bit response code
- * @pad2: A 16 bit pad
- *
- * The ap queue status word is returned by all three AP functions
- * (PQAP, NQAP and DQAP). There's a set of flags in the first
- * byte, followed by a 1 byte response code.
- */
-struct ap_queue_status {
- unsigned int queue_empty : 1;
- unsigned int replies_waiting : 1;
- unsigned int queue_full : 1;
- unsigned int pad1 : 4;
- unsigned int int_enabled : 1;
- unsigned int response_code : 8;
- unsigned int pad2 : 16;
-} __packed;
-
-
static inline int ap_test_bit(unsigned int *ptr, unsigned int nr)
{
return (*ptr & (0x80000000u >> nr)) != 0;
@@ -238,17 +204,6 @@ struct ap_message {
struct ap_message *);
};
-struct ap_config_info {
- unsigned int special_command:1;
- unsigned int ap_extended:1;
- unsigned char reserved1:6;
- unsigned char reserved2[15];
- unsigned int apm[8]; /* AP ID mask */
- unsigned int aqm[8]; /* AP queue mask */
- unsigned int adm[8]; /* AP domain mask */
- unsigned char reserved4[16];
-} __packed;
-
/**
* ap_init_message() - Initialize ap_message.
* Initialize a message before using. Otherwise this might result in
diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index 0f1a5d02acb0..56b96edffd5b 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c
@@ -16,6 +16,25 @@
#include "ap_asm.h"
/**
+ * ap_queue_irq_ctrl(): Control interruption on an AP queue.
+ * @qid: The AP queue number
+ * @qirqctrl: struct ap_qirq_ctrl (64 bit value)
+ * @ind: The notification indicator byte
+ *
+ * Returns AP queue status.
+ *
+ * Control interruption on the given AP queue.
+ * Just a simple wrapper function for the low-level PQAP(AQIC)
+ * instruction available to other kernel modules.
+ */
+struct ap_queue_status ap_queue_irq_ctrl(ap_qid_t qid,
+ struct ap_qirq_ctrl qirqctrl,
+ void *ind)
+{
+ return ap_aqic(qid, qirqctrl, ind);
+}
+EXPORT_SYMBOL(ap_queue_irq_ctrl);
+
+/**
* ap_queue_enable_interruption(): Enable interruption on an AP queue.
* @qid: The AP queue number
* @ind: the notification indicator byte
@@ -27,8 +46,11 @@
static int ap_queue_enable_interruption(struct ap_queue *aq, void *ind)
{
struct ap_queue_status status;
+ struct ap_qirq_ctrl qirqctrl = { 0 };
- status = ap_aqic(aq->qid, ind);
+ qirqctrl.ir = 1;
+ qirqctrl.isc = AP_ISC;
+ status = ap_aqic(aq->qid, qirqctrl, ind);
switch (status.response_code) {
case AP_RESPONSE_NORMAL:
case AP_RESPONSE_OTHERWISE_CHANGED:
@@ -362,7 +384,7 @@ static enum ap_wait ap_sm_setirq_wait(struct ap_queue *aq)
/* Get the status with TAPQ */
status = ap_tapq(aq->qid, NULL);
- if (status.int_enabled == 1) {
+ if (status.irq_enabled == 1) {
/* Irqs are now enabled */
aq->interrupt = AP_INTR_ENABLED;
aq->state = (aq->queue_count > 0) ?
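
The exported ap_queue_irq_ctrl() gives other modules direct access to PQAP(AQIC). A usage sketch that mirrors ap_queue_enable_interruption() above (the ir/isc field names are taken from that function; treat everything else as assumptions):

	static int my_enable_ap_irq(ap_qid_t qid, void *ind)
	{
		struct ap_qirq_ctrl qirqctrl = { 0 };
		struct ap_queue_status status;

		qirqctrl.ir = 1;	/* enable interruptions */
		qirqctrl.isc = AP_ISC;	/* interruption subclass */
		status = ap_queue_irq_ctrl(qid, qirqctrl, ind);
		return status.response_code == AP_RESPONSE_NORMAL ? 0 : -EIO;
	}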
diff --git a/drivers/scsi/NCR_Q720.c b/drivers/scsi/NCR_Q720.c
index 05835bf1bf9c..54e7d26908ee 100644
--- a/drivers/scsi/NCR_Q720.c
+++ b/drivers/scsi/NCR_Q720.c
@@ -217,8 +217,7 @@ NCR_Q720_probe(struct device *dev)
}
if (dma_declare_coherent_memory(dev, base_addr, base_addr,
- mem_size, DMA_MEMORY_MAP)
- != DMA_MEMORY_MAP) {
+ mem_size, 0)) {
printk(KERN_ERR "NCR_Q720: DMA declare memory failed\n");
goto out_release_region;
}
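
The NCR_Q720 hunk tracks an API change: dma_declare_coherent_memory() used to return the granted DMA_MEMORY_* flags on success, and now returns 0 or a negative errno, so callers compare against 0 instead of the flag value. The same idiom appears in the ohci-sm501 and ohci-tmio hunks further down; condensed (sketch only):

	/* new convention: 0 on success, -errno on failure */
	rc = dma_declare_coherent_memory(dev, mem->start, mem->start,
					 resource_size(mem),
					 DMA_MEMORY_EXCLUSIVE);
	if (rc)
		goto err_unwind;	/* propagate rc, no invented -ENXIO/-EBUSY */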
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index b5b5facb8747..07002df4f83a 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -342,7 +342,7 @@ config X86_PKG_TEMP_THERMAL
config INTEL_SOC_DTS_IOSF_CORE
tristate
- depends on X86
+ depends on X86 && PCI
select IOSF_MBI
help
This is becoming a common feature for Intel SoCs to expose the additional
@@ -352,7 +352,7 @@ config INTEL_SOC_DTS_IOSF_CORE
config INTEL_SOC_DTS_THERMAL
tristate "Intel SoCs DTS thermal driver"
- depends on X86
+ depends on X86 && PCI
select INTEL_SOC_DTS_IOSF_CORE
select THERMAL_WRITABLE_TRIPS
help
@@ -473,4 +473,12 @@ config ZX2967_THERMAL
the primitive temperature sensor embedded in zx2967 SoCs.
This sensor generates the real time die temperature.
+config UNIPHIER_THERMAL
+ tristate "Socionext UniPhier thermal driver"
+ depends on ARCH_UNIPHIER || COMPILE_TEST
+ depends on THERMAL_OF && MFD_SYSCON
+ help
+ Enable this to plug the UniPhier on-chip PVT thermal driver into the
+ thermal framework. The driver supports CPU thermal zone temperature
+ reporting and a couple of trip points.
endif
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 094d7039981c..8b79bca23536 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -59,3 +59,4 @@ obj-$(CONFIG_HISI_THERMAL) += hisi_thermal.o
obj-$(CONFIG_MTK_THERMAL) += mtk_thermal.o
obj-$(CONFIG_GENERIC_ADC_THERMAL) += thermal-generic-adc.o
obj-$(CONFIG_ZX2967_THERMAL) += zx2967_thermal.o
+obj-$(CONFIG_UNIPHIER_THERMAL) += uniphier_thermal.o
diff --git a/drivers/thermal/broadcom/bcm2835_thermal.c b/drivers/thermal/broadcom/bcm2835_thermal.c
index e6863c841662..a4d6a0e2e993 100644
--- a/drivers/thermal/broadcom/bcm2835_thermal.c
+++ b/drivers/thermal/broadcom/bcm2835_thermal.c
@@ -145,7 +145,7 @@ static void bcm2835_thermal_debugfs(struct platform_device *pdev)
debugfs_create_regset32("regset", 0444, data->debugfsdir, regset);
}
-static struct thermal_zone_of_device_ops bcm2835_thermal_ops = {
+static const struct thermal_zone_of_device_ops bcm2835_thermal_ops = {
.get_temp = bcm2835_thermal_get_temp,
};
diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c
index 9c3ce341eb97..bd3572c41585 100644
--- a/drivers/thermal/hisi_thermal.c
+++ b/drivers/thermal/hisi_thermal.c
@@ -206,7 +206,7 @@ static int hisi_thermal_get_temp(void *_sensor, int *temp)
return 0;
}
-static struct thermal_zone_of_device_ops hisi_of_thermal_ops = {
+static const struct thermal_zone_of_device_ops hisi_of_thermal_ops = {
.get_temp = hisi_thermal_get_temp,
};
diff --git a/drivers/thermal/int340x_thermal/acpi_thermal_rel.c b/drivers/thermal/int340x_thermal/acpi_thermal_rel.c
index 51ceb80212a7..c719167e9f28 100644
--- a/drivers/thermal/int340x_thermal/acpi_thermal_rel.c
+++ b/drivers/thermal/int340x_thermal/acpi_thermal_rel.c
@@ -228,7 +228,7 @@ static void get_single_name(acpi_handle handle, char *name)
struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER};
if (ACPI_FAILURE(acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer)))
- pr_warn("Failed get name from handle\n");
+ pr_warn("Failed to get device name from acpi handle\n");
else {
memcpy(name, buffer.pointer, ACPI_NAME_SIZE);
kfree(buffer.pointer);
diff --git a/drivers/thermal/int340x_thermal/acpi_thermal_rel.h b/drivers/thermal/int340x_thermal/acpi_thermal_rel.h
index f00700bc9d79..65075b174329 100644
--- a/drivers/thermal/int340x_thermal/acpi_thermal_rel.h
+++ b/drivers/thermal/int340x_thermal/acpi_thermal_rel.h
@@ -34,10 +34,10 @@ struct trt {
acpi_handle target;
u64 influence;
u64 sample_period;
- u64 reverved1;
- u64 reverved2;
- u64 reverved3;
- u64 reverved4;
+ u64 reserved1;
+ u64 reserved2;
+ u64 reserved3;
+ u64 reserved4;
} __packed;
#define ACPI_NR_ART_ELEMENTS 13
diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c
index a9ec94ed7a42..8ee38f55c7f3 100644
--- a/drivers/thermal/int340x_thermal/int3400_thermal.c
+++ b/drivers/thermal/int340x_thermal/int3400_thermal.c
@@ -16,6 +16,8 @@
#include <linux/thermal.h>
#include "acpi_thermal_rel.h"
+#define INT3400_THERMAL_TABLE_CHANGED 0x83
+
enum int3400_thermal_uuid {
INT3400_THERMAL_PASSIVE_1,
INT3400_THERMAL_ACTIVE,
@@ -104,7 +106,7 @@ static struct attribute *uuid_attrs[] = {
NULL
};
-static struct attribute_group uuid_attribute_group = {
+static const struct attribute_group uuid_attribute_group = {
.attrs = uuid_attrs,
.name = "uuids"
};
@@ -185,6 +187,35 @@ static int int3400_thermal_run_osc(acpi_handle handle,
return result;
}
+static void int3400_notify(acpi_handle handle,
+ u32 event,
+ void *data)
+{
+ struct int3400_thermal_priv *priv = data;
+ char *thermal_prop[5];
+
+ if (!priv)
+ return;
+
+ switch (event) {
+ case INT3400_THERMAL_TABLE_CHANGED:
+ thermal_prop[0] = kasprintf(GFP_KERNEL, "NAME=%s",
+ priv->thermal->type);
+ thermal_prop[1] = kasprintf(GFP_KERNEL, "TEMP=%d",
+ priv->thermal->temperature);
+ thermal_prop[2] = kasprintf(GFP_KERNEL, "TRIP=");
+ thermal_prop[3] = kasprintf(GFP_KERNEL, "EVENT=%d",
+ THERMAL_TABLE_CHANGED);
+ thermal_prop[4] = NULL;
+ kobject_uevent_env(&priv->thermal->device.kobj, KOBJ_CHANGE,
+ thermal_prop);
+ break;
+ default:
+ dev_err(&priv->adev->dev, "Unsupported event [0x%x]\n", event);
+ break;
+ }
+}
+
static int int3400_thermal_get_temp(struct thermal_zone_device *thermal,
int *temp)
{
@@ -290,6 +321,12 @@ static int int3400_thermal_probe(struct platform_device *pdev)
if (result)
goto free_zone;
+ result = acpi_install_notify_handler(
+ priv->adev->handle, ACPI_DEVICE_NOTIFY, int3400_notify,
+ (void *)priv);
+ if (result)
+ goto free_zone;
+
return 0;
free_zone:
@@ -306,6 +343,10 @@ static int int3400_thermal_remove(struct platform_device *pdev)
{
struct int3400_thermal_priv *priv = platform_get_drvdata(pdev);
+ acpi_remove_notify_handler(
+ priv->adev->handle, ACPI_DEVICE_NOTIFY,
+ int3400_notify);
+
if (!priv->rel_misc_dev_res)
acpi_thermal_rel_misc_device_remove(priv->adev->handle);
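
One caveat in int3400_notify() above: the strings built with kasprintf() are not freed after kobject_uevent_env() returns, and since the uevent code copies them into its own buffer, each table-change event leaks four small allocations. A hedged fix would be:

	/* after kobject_uevent_env(): release the kasprintf() buffers */
	{
		int i;

		for (i = 0; i < 4; i++)
			kfree(thermal_prop[i]);
	}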
diff --git a/drivers/thermal/int340x_thermal/int3406_thermal.c b/drivers/thermal/int340x_thermal/int3406_thermal.c
index 1891f34ab7fc..f69ab026ba24 100644
--- a/drivers/thermal/int340x_thermal/int3406_thermal.c
+++ b/drivers/thermal/int340x_thermal/int3406_thermal.c
@@ -21,39 +21,33 @@
struct int3406_thermal_data {
int upper_limit;
- int upper_limit_index;
int lower_limit;
- int lower_limit_index;
acpi_handle handle;
struct acpi_video_device_brightness *br;
struct backlight_device *raw_bd;
struct thermal_cooling_device *cooling_dev;
};
-static int int3406_thermal_to_raw(int level, struct int3406_thermal_data *d)
-{
- int max_level = d->br->levels[d->br->count - 1];
- int raw_max = d->raw_bd->props.max_brightness;
-
- return level * raw_max / max_level;
-}
-
-static int int3406_thermal_to_acpi(int level, struct int3406_thermal_data *d)
-{
- int raw_max = d->raw_bd->props.max_brightness;
- int max_level = d->br->levels[d->br->count - 1];
-
- return level * max_level / raw_max;
-}
+/*
+ * According to the ACPI spec,
+ * "Each brightness level is represented by a number between 0 and 100,
+ * and can be thought of as a percentage. For example, 50 can be 50%
+ * power consumption or 50% brightness, as defined by the OEM."
+ *
+ * As the int3406 device uses this value to communicate with the
+ * native graphics driver, we make the assumption that it represents
+ * the percentage of brightness only.
+ */
+#define ACPI_TO_RAW(v, d) (d->raw_bd->props.max_brightness * v / 100)
+#define RAW_TO_ACPI(v, d) (v * 100 / d->raw_bd->props.max_brightness)
static int
int3406_thermal_get_max_state(struct thermal_cooling_device *cooling_dev,
unsigned long *state)
{
struct int3406_thermal_data *d = cooling_dev->devdata;
- int index = d->lower_limit_index ? d->lower_limit_index : 2;
- *state = d->br->count - 1 - index;
+ *state = d->upper_limit - d->lower_limit;
return 0;
}
@@ -62,19 +56,15 @@ int3406_thermal_set_cur_state(struct thermal_cooling_device *cooling_dev,
unsigned long state)
{
struct int3406_thermal_data *d = cooling_dev->devdata;
- int level, raw_level;
+ int acpi_level, raw_level;
- if (state > d->br->count - 3)
+ if (state > d->upper_limit - d->lower_limit)
return -EINVAL;
- state = d->br->count - 1 - state;
- level = d->br->levels[state];
+ acpi_level = d->br->levels[d->upper_limit - state];
- if ((d->upper_limit && level > d->upper_limit) ||
- (d->lower_limit && level < d->lower_limit))
- return -EINVAL;
+ raw_level = ACPI_TO_RAW(acpi_level, d);
- raw_level = int3406_thermal_to_raw(level, d);
return backlight_device_set_brightness(d->raw_bd, raw_level);
}
@@ -83,27 +73,22 @@ int3406_thermal_get_cur_state(struct thermal_cooling_device *cooling_dev,
unsigned long *state)
{
struct int3406_thermal_data *d = cooling_dev->devdata;
- int raw_level, level, i;
- int *levels = d->br->levels;
+ int acpi_level;
+ int index;
- raw_level = d->raw_bd->props.brightness;
- level = int3406_thermal_to_acpi(raw_level, d);
+ acpi_level = RAW_TO_ACPI(d->raw_bd->props.brightness, d);
/*
- * There is no 1:1 mapping between the firmware interface level with the
- * raw interface level, we will have to find one that is close enough.
+ * There is no 1:1 mapping between the firmware interface level
+ * and the raw interface level, so we have to find the one right
+ * above it.
*/
- for (i = 2; i < d->br->count; i++) {
- if (level < levels[i]) {
- if (i == 2)
- break;
- if ((level - levels[i - 1]) < (levels[i] - level))
- i--;
+ for (index = d->lower_limit; index < d->upper_limit; index++) {
+ if (acpi_level <= d->br->levels[index])
break;
- }
}
- *state = d->br->count - 1 - i;
+ *state = d->upper_limit - index;
return 0;
}
@@ -117,7 +102,7 @@ static int int3406_thermal_get_index(int *array, int nr, int value)
{
int i;
- for (i = 0; i < nr; i++) {
+ for (i = 2; i < nr; i++) {
if (array[i] == value)
break;
}
@@ -128,27 +113,20 @@ static void int3406_thermal_get_limit(struct int3406_thermal_data *d)
{
acpi_status status;
unsigned long long lower_limit, upper_limit;
- int index;
status = acpi_evaluate_integer(d->handle, "DDDL", NULL, &lower_limit);
- if (ACPI_SUCCESS(status)) {
- index = int3406_thermal_get_index(d->br->levels, d->br->count,
- lower_limit);
- if (index > 0) {
- d->lower_limit = (int)lower_limit;
- d->lower_limit_index = index;
- }
- }
+ if (ACPI_SUCCESS(status))
+ d->lower_limit = int3406_thermal_get_index(d->br->levels,
+ d->br->count, lower_limit);
status = acpi_evaluate_integer(d->handle, "DDPC", NULL, &upper_limit);
- if (ACPI_SUCCESS(status)) {
- index = int3406_thermal_get_index(d->br->levels, d->br->count,
- upper_limit);
- if (index > 0) {
- d->upper_limit = (int)upper_limit;
- d->upper_limit_index = index;
- }
- }
+ if (ACPI_SUCCESS(status))
+ d->upper_limit = int3406_thermal_get_index(d->br->levels,
+ d->br->count, upper_limit);
+
+ /* lower_limit and upper_limit should always be set */
+ d->lower_limit = d->lower_limit > 0 ? d->lower_limit : 2;
+ d->upper_limit = d->upper_limit > 0 ? d->upper_limit : d->br->count - 1;
}
static void int3406_notify(acpi_handle handle, u32 event, void *data)
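
To make the new macros concrete: with props.max_brightness == 255, an ACPI level of 50 (a percentage, per the spec quote above) converts as follows; note the round-trip is lossy due to integer division:

	int raw  = 255 * 50 / 100;	/* ACPI_TO_RAW(50, d)  -> 127 */
	int acpi = 127 * 100 / 255;	/* RAW_TO_ACPI(127, d) -> 49  */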
diff --git a/drivers/thermal/int340x_thermal/processor_thermal_device.c b/drivers/thermal/int340x_thermal/processor_thermal_device.c
index ff3b36f339e3..f02341f7134d 100644
--- a/drivers/thermal/int340x_thermal/processor_thermal_device.c
+++ b/drivers/thermal/int340x_thermal/processor_thermal_device.c
@@ -127,7 +127,7 @@ static struct attribute *power_limit_attrs[] = {
NULL
};
-static struct attribute_group power_limit_attribute_group = {
+static const struct attribute_group power_limit_attribute_group = {
.attrs = power_limit_attrs,
.name = "power_limits"
};
diff --git a/drivers/thermal/intel_pch_thermal.c b/drivers/thermal/intel_pch_thermal.c
index 2b49e8d0fe9e..c60b1cfcc64e 100644
--- a/drivers/thermal/intel_pch_thermal.c
+++ b/drivers/thermal/intel_pch_thermal.c
@@ -49,7 +49,7 @@
#define WPT_TSGPEN 0x84 /* General Purpose Event Enables */
/* Wildcat Point-LP PCH Thermal Register bit definitions */
-#define WPT_TEMP_TSR 0x00ff /* Temp TS Reading */
+#define WPT_TEMP_TSR 0x01ff /* Temp TS Reading */
#define WPT_TSC_CPDE 0x01 /* Catastrophic Power-Down Enable */
#define WPT_TSS_TSDSS 0x10 /* Thermal Sensor Dynamic Shutdown Status */
#define WPT_TSS_GPES 0x08 /* GPE status */
@@ -125,7 +125,7 @@ static int pch_wpt_init(struct pch_thermal_device *ptd, int *nr_trips)
*nr_trips = 0;
/* Check if BIOS has already enabled thermal sensor */
- if (WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS)) {
+ if (WPT_TSEL_ETS & readb(ptd->hw_base + WPT_TSEL)) {
ptd->bios_enabled = true;
goto read_trips;
}
@@ -141,7 +141,7 @@ static int pch_wpt_init(struct pch_thermal_device *ptd, int *nr_trips)
}
writeb(tsel|WPT_TSEL_ETS, ptd->hw_base + WPT_TSEL);
- if (!(WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS))) {
+ if (!(WPT_TSEL_ETS & readb(ptd->hw_base + WPT_TSEL))) {
dev_err(&ptd->pdev->dev, "Sensor can't be enabled\n");
return -ENODEV;
}
@@ -174,9 +174,9 @@ read_trips:
static int pch_wpt_get_temp(struct pch_thermal_device *ptd, int *temp)
{
- u8 wpt_temp;
+ u16 wpt_temp;
- wpt_temp = WPT_TEMP_TSR & readl(ptd->hw_base + WPT_TEMP);
+ wpt_temp = WPT_TEMP_TSR & readw(ptd->hw_base + WPT_TEMP);
/* Resolution of 1/2 degree C and an offset of -50C */
*temp = (wpt_temp * 1000 / 2 - 50000);
@@ -387,7 +387,7 @@ static int intel_pch_thermal_resume(struct device *device)
return ptd->ops->resume(ptd);
}
-static struct pci_device_id intel_pch_thermal_id[] = {
+static const struct pci_device_id intel_pch_thermal_id[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCH_THERMAL_DID_HSW_1),
.driver_data = board_hsw, },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCH_THERMAL_DID_HSW_2),
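
A worked example of the widened read in pch_wpt_get_temp(): with the 9-bit WPT_TEMP_TSR mask, a raw register value of 0x14a (330) decodes to 330 * 1000 / 2 - 50000 = 115000 m°C, i.e. 115 °C. Under the old 8-bit mask the same value would have been truncated to 0x4a (74) and reported as -13 °C:

	u16 raw  = 0x14a & 0x01ff;		/* 330 with the 9-bit mask */
	int temp = raw * 1000 / 2 - 50000;	/* 115000 m°C == 115 °C */
	/* old mask: (0x14a & 0xff) == 74 -> 74 * 500 - 50000 == -13000 */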
diff --git a/drivers/thermal/mtk_thermal.c b/drivers/thermal/mtk_thermal.c
index 7737f14846f9..1e61c09153c9 100644
--- a/drivers/thermal/mtk_thermal.c
+++ b/drivers/thermal/mtk_thermal.c
@@ -3,6 +3,7 @@
* Author: Hanyi Wu <hanyi.wu@mediatek.com>
* Sascha Hauer <s.hauer@pengutronix.de>
* Dawei Chien <dawei.chien@mediatek.com>
+ * Louis Yu <louis.yu@mediatek.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -111,9 +112,10 @@
/*
* Layout of the fuses providing the calibration data
- * These macros could be used for both MT8173 and MT2701.
- * MT8173 has five sensors and need five VTS calibration data,
- * and MT2701 has three sensors and need three VTS calibration data.
+ * These macros could be used for MT8173, MT2701, and MT2712.
+ * MT8173 has 5 sensors and needs 5 VTS calibration data.
+ * MT2701 has 3 sensors and needs 3 VTS calibration data.
+ * MT2712 has 4 sensors and needs 4 VTS calibration data.
*/
#define MT8173_CALIB_BUF0_VALID BIT(0)
#define MT8173_CALIB_BUF1_ADC_GE(x) (((x) >> 22) & 0x3ff)
@@ -124,6 +126,8 @@
#define MT8173_CALIB_BUF2_VTS_TSABB(x) (((x) >> 14) & 0x1ff)
#define MT8173_CALIB_BUF0_DEGC_CALI(x) (((x) >> 1) & 0x3f)
#define MT8173_CALIB_BUF0_O_SLOPE(x) (((x) >> 26) & 0x3f)
+#define MT8173_CALIB_BUF0_O_SLOPE_SIGN(x) (((x) >> 7) & 0x1)
+#define MT8173_CALIB_BUF1_ID(x) (((x) >> 9) & 0x1)
/* MT2701 thermal sensors */
#define MT2701_TS1 0
@@ -136,11 +140,26 @@
/* The total number of temperature sensors in the MT2701 */
#define MT2701_NUM_SENSORS 3
-#define THERMAL_NAME "mtk-thermal"
-
/* The number of sensing points per bank */
#define MT2701_NUM_SENSORS_PER_ZONE 3
+/* MT2712 thermal sensors */
+#define MT2712_TS1 0
+#define MT2712_TS2 1
+#define MT2712_TS3 2
+#define MT2712_TS4 3
+
+/* AUXADC channel 11 is used for the temperature sensors */
+#define MT2712_TEMP_AUXADC_CHANNEL 11
+
+/* The total number of temperature sensors in the MT2712 */
+#define MT2712_NUM_SENSORS 4
+
+/* The number of sensing points per bank */
+#define MT2712_NUM_SENSORS_PER_ZONE 4
+
+#define THERMAL_NAME "mtk-thermal"
+
struct mtk_thermal;
struct thermal_bank_cfg {
@@ -215,6 +234,21 @@ static const int mt2701_adcpnp[MT2701_NUM_SENSORS_PER_ZONE] = {
static const int mt2701_mux_values[MT2701_NUM_SENSORS] = { 0, 1, 16 };
+/* MT2712 thermal sensor data */
+static const int mt2712_bank_data[MT2712_NUM_SENSORS] = {
+ MT2712_TS1, MT2712_TS2, MT2712_TS3, MT2712_TS4
+};
+
+static const int mt2712_msr[MT2712_NUM_SENSORS_PER_ZONE] = {
+ TEMP_MSR0, TEMP_MSR1, TEMP_MSR2, TEMP_MSR3
+};
+
+static const int mt2712_adcpnp[MT2712_NUM_SENSORS_PER_ZONE] = {
+ TEMP_ADCPNP0, TEMP_ADCPNP1, TEMP_ADCPNP2, TEMP_ADCPNP3
+};
+
+static const int mt2712_mux_values[MT2712_NUM_SENSORS] = { 0, 1, 2, 3 };
+
/**
* The MT8173 thermal controller has four banks. Each bank can read up to
* four temperature sensors simultaneously. The MT8173 has a total of 5
@@ -278,6 +312,31 @@ static const struct mtk_thermal_data mt2701_thermal_data = {
};
/**
+ * The MT2712 thermal controller has one bank, which can read up to
+ * four temperature sensors simultaneously. The MT2712 has a total of 4
+ * temperature sensors.
+ *
+ * The thermal core only gets the maximum temperature of this one bank,
+ * so the bank concept wouldn't be necessary here. However, the SVS (Smart
+ * Voltage Scaling) unit makes its decisions based on the same bank
+ * data.
+ */
+static const struct mtk_thermal_data mt2712_thermal_data = {
+ .auxadc_channel = MT2712_TEMP_AUXADC_CHANNEL,
+ .num_banks = 1,
+ .num_sensors = MT2712_NUM_SENSORS,
+ .bank_data = {
+ {
+ .num_sensors = 4,
+ .sensors = mt2712_bank_data,
+ },
+ },
+ .msr = mt2712_msr,
+ .adcpnp = mt2712_adcpnp,
+ .sensor_mux_values = mt2712_mux_values,
+};
+
+/**
* raw_to_mcelsius - convert a raw ADC value to mcelsius
* @mt: The thermal controller
* @raw: raw ADC value
@@ -552,7 +611,11 @@ static int mtk_thermal_get_calibration_data(struct device *dev,
mt->vts[MT8173_TS4] = MT8173_CALIB_BUF2_VTS_TS4(buf[2]);
mt->vts[MT8173_TSABB] = MT8173_CALIB_BUF2_VTS_TSABB(buf[2]);
mt->degc_cali = MT8173_CALIB_BUF0_DEGC_CALI(buf[0]);
- mt->o_slope = MT8173_CALIB_BUF0_O_SLOPE(buf[0]);
+ if (MT8173_CALIB_BUF1_ID(buf[1]) &
+ MT8173_CALIB_BUF0_O_SLOPE_SIGN(buf[0]))
+ mt->o_slope = -MT8173_CALIB_BUF0_O_SLOPE(buf[0]);
+ else
+ mt->o_slope = MT8173_CALIB_BUF0_O_SLOPE(buf[0]);
} else {
dev_info(dev, "Device not calibrated, using default calibration values\n");
}
@@ -571,6 +634,10 @@ static const struct of_device_id mtk_thermal_of_match[] = {
{
.compatible = "mediatek,mt2701-thermal",
.data = (void *)&mt2701_thermal_data,
+ },
+ {
+ .compatible = "mediatek,mt2712-thermal",
+ .data = (void *)&mt2712_thermal_data,
}, {
},
};
@@ -645,16 +712,16 @@ static int mtk_thermal_probe(struct platform_device *pdev)
return -EINVAL;
}
+ ret = device_reset(&pdev->dev);
+ if (ret)
+ return ret;
+
ret = clk_prepare_enable(mt->clk_auxadc);
if (ret) {
dev_err(&pdev->dev, "Can't enable auxadc clk: %d\n", ret);
return ret;
}
- ret = device_reset(&pdev->dev);
- if (ret)
- goto err_disable_clk_auxadc;
-
ret = clk_prepare_enable(mt->clk_peri_therm);
if (ret) {
dev_err(&pdev->dev, "Can't enable peri clk: %d\n", ret);
@@ -705,6 +772,7 @@ static struct platform_driver mtk_thermal_driver = {
module_platform_driver(mtk_thermal_driver);
+MODULE_AUTHOR("Louis Yu <louis.yu@mediatek.com>");
MODULE_AUTHOR("Dawei Chien <dawei.chien@mediatek.com>");
MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>");
MODULE_AUTHOR("Hanyi Wu <hanyi.wu@mediatek.com>");
diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
index 4362a69ac88d..c866cc165960 100644
--- a/drivers/thermal/qoriq_thermal.c
+++ b/drivers/thermal/qoriq_thermal.c
@@ -188,7 +188,7 @@ static void qoriq_tmu_init_device(struct qoriq_tmu_data *data)
tmu_write(data, TMR_DISABLE, &data->regs->tmr);
}
-static struct thermal_zone_of_device_ops tmu_tz_ops = {
+static const struct thermal_zone_of_device_ops tmu_tz_ops = {
.get_temp = tmu_get_temp,
};
diff --git a/drivers/thermal/rcar_gen3_thermal.c b/drivers/thermal/rcar_gen3_thermal.c
index 37fcefd06d9f..203aca44a2bb 100644
--- a/drivers/thermal/rcar_gen3_thermal.c
+++ b/drivers/thermal/rcar_gen3_thermal.c
@@ -225,7 +225,7 @@ static int rcar_gen3_thermal_set_trips(void *devdata, int low, int high)
return 0;
}
-static struct thermal_zone_of_device_ops rcar_gen3_tz_of_ops = {
+static const struct thermal_zone_of_device_ops rcar_gen3_tz_of_ops = {
.get_temp = rcar_gen3_thermal_get_temp,
.set_trips = rcar_gen3_thermal_set_trips,
};
diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c
index 4c7796512453..206035139110 100644
--- a/drivers/thermal/rockchip_thermal.c
+++ b/drivers/thermal/rockchip_thermal.c
@@ -320,6 +320,44 @@ static const struct tsadc_table rk3288_code_table[] = {
{0, 125000},
};
+static const struct tsadc_table rk3328_code_table[] = {
+ {0, -40000},
+ {296, -40000},
+ {304, -35000},
+ {313, -30000},
+ {331, -20000},
+ {340, -15000},
+ {349, -10000},
+ {359, -5000},
+ {368, 0},
+ {378, 5000},
+ {388, 10000},
+ {398, 15000},
+ {408, 20000},
+ {418, 25000},
+ {429, 30000},
+ {440, 35000},
+ {451, 40000},
+ {462, 45000},
+ {473, 50000},
+ {485, 55000},
+ {496, 60000},
+ {508, 65000},
+ {521, 70000},
+ {533, 75000},
+ {546, 80000},
+ {559, 85000},
+ {572, 90000},
+ {586, 95000},
+ {600, 100000},
+ {614, 105000},
+ {629, 110000},
+ {644, 115000},
+ {659, 120000},
+ {675, 125000},
+ {TSADCV2_DATA_MASK, 125000},
+};
+
static const struct tsadc_table rk3368_code_table[] = {
{0, -40000},
{106, -40000},
@@ -790,6 +828,29 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = {
},
};
+static const struct rockchip_tsadc_chip rk3328_tsadc_data = {
+ .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
+ .chn_num = 1, /* one channel for tsadc */
+
+ .tshut_mode = TSHUT_MODE_CRU, /* default TSHUT via CRU */
+ .tshut_temp = 95000,
+
+ .initialize = rk_tsadcv2_initialize,
+ .irq_ack = rk_tsadcv3_irq_ack,
+ .control = rk_tsadcv3_control,
+ .get_temp = rk_tsadcv2_get_temp,
+ .set_alarm_temp = rk_tsadcv2_alarm_temp,
+ .set_tshut_temp = rk_tsadcv2_tshut_temp,
+ .set_tshut_mode = rk_tsadcv2_tshut_mode,
+
+ .table = {
+ .id = rk3328_code_table,
+ .length = ARRAY_SIZE(rk3328_code_table),
+ .data_mask = TSADCV2_DATA_MASK,
+ .mode = ADC_INCREMENT,
+ },
+};
+
static const struct rockchip_tsadc_chip rk3366_tsadc_data = {
.chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */
.chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */
@@ -875,6 +936,10 @@ static const struct of_device_id of_rockchip_thermal_match[] = {
.data = (void *)&rk3288_tsadc_data,
},
{
+ .compatible = "rockchip,rk3328-tsadc",
+ .data = (void *)&rk3328_tsadc_data,
+ },
+ {
.compatible = "rockchip,rk3366-tsadc",
.data = (void *)&rk3366_tsadc_data,
},
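
The rk3328 table is consumed like the other Rockchip tables: the driver locates the bracketing {code, temperature} pair and interpolates linearly between the two entries (ADC_INCREMENT meaning codes rise with temperature). A hedged sketch of such a lookup, not the driver's exact routine:

	/* code -> m°C over an incrementing tsadc_table (hypothetical helper) */
	static int my_code_to_temp(const struct tsadc_table *t, int n, u32 code)
	{
		int i;

		for (i = 1; i < n; i++) {
			if (code <= t[i].code) {
				int span = t[i].code - t[i - 1].code;

				return t[i - 1].temp +
				       (t[i].temp - t[i - 1].temp) *
				       (int)(code - t[i - 1].code) / span;
			}
		}
		return t[n - 1].temp;	/* clamp past the table end */
	}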
diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
index 7b8ef09d2b3c..ed805c7c5ace 100644
--- a/drivers/thermal/samsung/exynos_tmu.c
+++ b/drivers/thermal/samsung/exynos_tmu.c
@@ -1286,7 +1286,7 @@ static int exynos_map_dt_data(struct platform_device *pdev)
return 0;
}
-static struct thermal_zone_of_device_ops exynos_sensor_ops = {
+static const struct thermal_zone_of_device_ops exynos_sensor_ops = {
.get_temp = exynos_get_temp,
.set_emul_temp = exynos_tmu_set_emulation,
};
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 5a51c740e372..2b1b0ba393a4 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -390,7 +390,7 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
if (trip_type == THERMAL_TRIP_CRITICAL) {
dev_emerg(&tz->device,
- "critical temperature reached(%d C),shutting down\n",
+ "critical temperature reached (%d C), shutting down\n",
tz->temperature / 1000);
mutex_lock(&poweroff_lock);
if (!power_off_triggered) {
@@ -836,11 +836,7 @@ static void thermal_release(struct device *dev)
if (!strncmp(dev_name(dev), "thermal_zone",
sizeof("thermal_zone") - 1)) {
tz = to_thermal_zone(dev);
- kfree(tz->trip_type_attrs);
- kfree(tz->trip_temp_attrs);
- kfree(tz->trip_hyst_attrs);
- kfree(tz->trips_attribute_group.attrs);
- kfree(tz->device.groups);
+ thermal_zone_destroy_device_groups(tz);
kfree(tz);
} else if (!strncmp(dev_name(dev), "cooling_device",
sizeof("cooling_device") - 1)) {
@@ -1213,10 +1209,8 @@ thermal_zone_device_register(const char *type, int trips, int mask,
ida_init(&tz->ida);
mutex_init(&tz->lock);
result = ida_simple_get(&thermal_tz_ida, 0, 0, GFP_KERNEL);
- if (result < 0) {
- kfree(tz);
- return ERR_PTR(result);
- }
+ if (result < 0)
+ goto free_tz;
tz->id = result;
strlcpy(tz->type, type, sizeof(tz->type));
@@ -1232,18 +1226,15 @@ thermal_zone_device_register(const char *type, int trips, int mask,
/* Add nodes that are always present via .groups */
result = thermal_zone_create_device_groups(tz, mask);
if (result)
- goto unregister;
+ goto remove_id;
/* A new thermal zone needs to be updated anyway. */
atomic_set(&tz->need_update, 1);
dev_set_name(&tz->device, "thermal_zone%d", tz->id);
result = device_register(&tz->device);
- if (result) {
- ida_simple_remove(&thermal_tz_ida, tz->id);
- kfree(tz);
- return ERR_PTR(result);
- }
+ if (result)
+ goto remove_device_groups;
for (count = 0; count < trips; count++) {
if (tz->ops->get_trip_type(tz, count, &trip_type))
@@ -1297,6 +1288,14 @@ unregister:
ida_simple_remove(&thermal_tz_ida, tz->id);
device_unregister(&tz->device);
return ERR_PTR(result);
+
+remove_device_groups:
+ thermal_zone_destroy_device_groups(tz);
+remove_id:
+ ida_simple_remove(&thermal_tz_ida, tz->id);
+free_tz:
+ kfree(tz);
+ return ERR_PTR(result);
}
EXPORT_SYMBOL_GPL(thermal_zone_device_register);
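
The reworked failure handling above is the canonical unwind ladder: one label per acquired resource, in reverse order of acquisition, so every failure site jumps to the label that releases exactly what exists so far. Compressed into a generic sketch (hypothetical my_* helpers, not the thermal code):

	static int my_register(struct my_zone *z)
	{
		int err;

		err = my_get_id(z);
		if (err)
			goto free_zone;
		err = my_create_groups(z);
		if (err)
			goto remove_id;
		err = my_device_register(z);
		if (err)
			goto remove_groups;
		return 0;

	remove_groups:
		my_destroy_groups(z);
	remove_id:
		my_put_id(z);
	free_zone:
		kfree(z);
		return err;
	}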
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 2412b3759e16..27e3b1df7360 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -71,6 +71,7 @@ int thermal_build_list_of_policies(char *buf);
/* sysfs I/F */
int thermal_zone_create_device_groups(struct thermal_zone_device *, int);
+void thermal_zone_destroy_device_groups(struct thermal_zone_device *);
void thermal_cooling_device_setup_sysfs(struct thermal_cooling_device *);
/* used only at binding time */
ssize_t
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index a694de907a26..fb80c96d8f73 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -605,6 +605,24 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask)
return 0;
}
+/**
+ * destroy_trip_attrs() - destroy attributes for trip points
+ * @tz: the thermal zone device
+ *
+ * helper function to free resources allocated by create_trip_attrs()
+ */
+static void destroy_trip_attrs(struct thermal_zone_device *tz)
+{
+ if (!tz)
+ return;
+
+ kfree(tz->trip_type_attrs);
+ kfree(tz->trip_temp_attrs);
+ if (tz->ops->get_trip_hyst)
+ kfree(tz->trip_hyst_attrs);
+ kfree(tz->trips_attribute_group.attrs);
+}
+
int thermal_zone_create_device_groups(struct thermal_zone_device *tz,
int mask)
{
@@ -637,6 +655,17 @@ int thermal_zone_create_device_groups(struct thermal_zone_device *tz,
return 0;
}
+void thermal_zone_destroy_device_groups(struct thermal_zone_device *tz)
+{
+ if (!tz)
+ return;
+
+ if (tz->trips)
+ destroy_trip_attrs(tz);
+
+ kfree(tz->device.groups);
+}
+
/* sys I/F for cooling device */
static ssize_t
thermal_cooling_device_type_show(struct device *dev,
diff --git a/drivers/thermal/uniphier_thermal.c b/drivers/thermal/uniphier_thermal.c
new file mode 100644
index 000000000000..95704732f760
--- /dev/null
+++ b/drivers/thermal/uniphier_thermal.c
@@ -0,0 +1,384 @@
+/*
+ * uniphier_thermal.c - Socionext UniPhier thermal driver
+ *
+ * Copyright 2014 Panasonic Corporation
+ * Copyright 2016-2017 Socionext Inc.
+ * All rights reserved.
+ *
+ * Author:
+ * Kunihiko Hayashi <hayashi.kunihiko@socionext.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 of
+ * the License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/syscon.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/thermal.h>
+
+#include "thermal_core.h"
+
+/*
+ * block registers
+ * addresses are the offset from .block_base
+ */
+#define PVTCTLEN 0x0000
+#define PVTCTLEN_EN BIT(0)
+
+#define PVTCTLMODE 0x0004
+#define PVTCTLMODE_MASK 0xf
+#define PVTCTLMODE_TEMPMON 0x5
+
+#define EMONREPEAT 0x0040
+#define EMONREPEAT_ENDLESS BIT(24)
+#define EMONREPEAT_PERIOD GENMASK(3, 0)
+#define EMONREPEAT_PERIOD_1000000 0x9
+
+/*
+ * common registers
+ * addresses are the offset from .map_base
+ */
+#define PVTCTLSEL 0x0900
+#define PVTCTLSEL_MASK GENMASK(2, 0)
+#define PVTCTLSEL_MONITOR 0
+
+#define SETALERT0 0x0910
+#define SETALERT1 0x0914
+#define SETALERT2 0x0918
+#define SETALERT_TEMP_OVF (GENMASK(7, 0) << 16)
+#define SETALERT_TEMP_OVF_VALUE(val) (((val) & GENMASK(7, 0)) << 16)
+#define SETALERT_EN BIT(0)
+
+#define PMALERTINTCTL 0x0920
+#define PMALERTINTCTL_CLR(ch) BIT(4 * (ch) + 2)
+#define PMALERTINTCTL_SET(ch) BIT(4 * (ch) + 1)
+#define PMALERTINTCTL_EN(ch) BIT(4 * (ch) + 0)
+#define PMALERTINTCTL_MASK (GENMASK(10, 8) | GENMASK(6, 4) | \
+ GENMASK(2, 0))
+
+#define TMOD 0x0928
+#define TMOD_WIDTH 9
+
+#define TMODCOEF 0x0e5c
+
+#define TMODSETUP0_EN BIT(30)
+#define TMODSETUP0_VAL(val) (((val) & GENMASK(13, 0)) << 16)
+#define TMODSETUP1_EN BIT(15)
+#define TMODSETUP1_VAL(val) ((val) & GENMASK(14, 0))
+
+/* SoC critical temperature */
+#define CRITICAL_TEMP_LIMIT (120 * 1000)
+
+/* Max # of alert channels */
+#define ALERT_CH_NUM 3
+
+/* SoC specific thermal sensor data */
+struct uniphier_tm_soc_data {
+ u32 map_base;
+ u32 block_base;
+ u32 tmod_setup_addr;
+};
+
+struct uniphier_tm_dev {
+ struct regmap *regmap;
+ struct device *dev;
+ bool alert_en[ALERT_CH_NUM];
+ struct thermal_zone_device *tz_dev;
+ const struct uniphier_tm_soc_data *data;
+};
+
+static int uniphier_tm_initialize_sensor(struct uniphier_tm_dev *tdev)
+{
+ struct regmap *map = tdev->regmap;
+ u32 val;
+ u32 tmod_calib[2];
+ int ret;
+
+ /* stop PVT */
+ regmap_write_bits(map, tdev->data->block_base + PVTCTLEN,
+ PVTCTLEN_EN, 0);
+
+ /*
+ * If the SoC has a calibrated value that was set in advance,
+ * TMODCOEF reads non-zero and the PVT block uses that value
+ * internally.
+ *
+ * If TMODCOEF reads zero, the board has no calibrated value,
+ * and the driver has to set default values from DT.
+ */
+ ret = regmap_read(map, tdev->data->map_base + TMODCOEF, &val);
+ if (ret)
+ return ret;
+ if (!val) {
+ /* look for the default values in DT */
+ ret = of_property_read_u32_array(tdev->dev->of_node,
+ "socionext,tmod-calibration",
+ tmod_calib,
+ ARRAY_SIZE(tmod_calib));
+ if (ret)
+ return ret;
+
+ regmap_write(map, tdev->data->tmod_setup_addr,
+ TMODSETUP0_EN | TMODSETUP0_VAL(tmod_calib[0]) |
+ TMODSETUP1_EN | TMODSETUP1_VAL(tmod_calib[1]));
+ }
+
+ /* select temperature mode */
+ regmap_write_bits(map, tdev->data->block_base + PVTCTLMODE,
+ PVTCTLMODE_MASK, PVTCTLMODE_TEMPMON);
+
+ /* set monitoring period */
+ regmap_write_bits(map, tdev->data->block_base + EMONREPEAT,
+ EMONREPEAT_ENDLESS | EMONREPEAT_PERIOD,
+ EMONREPEAT_ENDLESS | EMONREPEAT_PERIOD_1000000);
+
+ /* set monitor mode */
+ regmap_write_bits(map, tdev->data->map_base + PVTCTLSEL,
+ PVTCTLSEL_MASK, PVTCTLSEL_MONITOR);
+
+ return 0;
+}
+
+static void uniphier_tm_set_alert(struct uniphier_tm_dev *tdev, u32 ch,
+ u32 temp)
+{
+ struct regmap *map = tdev->regmap;
+
+ /* set alert temperature */
+ regmap_write_bits(map, tdev->data->map_base + SETALERT0 + (ch << 2),
+ SETALERT_EN | SETALERT_TEMP_OVF,
+ SETALERT_EN |
+ SETALERT_TEMP_OVF_VALUE(temp / 1000));
+}
+
+static void uniphier_tm_enable_sensor(struct uniphier_tm_dev *tdev)
+{
+ struct regmap *map = tdev->regmap;
+ int i;
+ u32 bits = 0;
+
+ for (i = 0; i < ALERT_CH_NUM; i++)
+ if (tdev->alert_en[i])
+ bits |= PMALERTINTCTL_EN(i);
+
+ /* enable alert interrupt */
+ regmap_write_bits(map, tdev->data->map_base + PMALERTINTCTL,
+ PMALERTINTCTL_MASK, bits);
+
+ /* start PVT */
+ regmap_write_bits(map, tdev->data->block_base + PVTCTLEN,
+ PVTCTLEN_EN, PVTCTLEN_EN);
+
+ usleep_range(700, 1500); /* The spec note says at least 700us */
+}
+
+static void uniphier_tm_disable_sensor(struct uniphier_tm_dev *tdev)
+{
+ struct regmap *map = tdev->regmap;
+
+ /* disable alert interrupt */
+ regmap_write_bits(map, tdev->data->map_base + PMALERTINTCTL,
+ PMALERTINTCTL_MASK, 0);
+
+ /* stop PVT */
+ regmap_write_bits(map, tdev->data->block_base + PVTCTLEN,
+ PVTCTLEN_EN, 0);
+
+ usleep_range(1000, 2000); /* The spec note says at least 1ms */
+}
+
+static int uniphier_tm_get_temp(void *data, int *out_temp)
+{
+ struct uniphier_tm_dev *tdev = data;
+ struct regmap *map = tdev->regmap;
+ int ret;
+ u32 temp;
+
+ ret = regmap_read(map, tdev->data->map_base + TMOD, &temp);
+ if (ret)
+ return ret;
+
+ /* MSB of the TMOD field is a sign bit */
+ *out_temp = sign_extend32(temp, TMOD_WIDTH - 1) * 1000;
+
+ return 0;
+}
+
+static const struct thermal_zone_of_device_ops uniphier_of_thermal_ops = {
+ .get_temp = uniphier_tm_get_temp,
+};
+
+static void uniphier_tm_irq_clear(struct uniphier_tm_dev *tdev)
+{
+ u32 mask = 0, bits = 0;
+ int i;
+
+ for (i = 0; i < ALERT_CH_NUM; i++) {
+ mask |= (PMALERTINTCTL_CLR(i) | PMALERTINTCTL_SET(i));
+ bits |= PMALERTINTCTL_CLR(i);
+ }
+
+ /* clear alert interrupt */
+ regmap_write_bits(tdev->regmap,
+ tdev->data->map_base + PMALERTINTCTL, mask, bits);
+}
+
+static irqreturn_t uniphier_tm_alarm_irq(int irq, void *_tdev)
+{
+ struct uniphier_tm_dev *tdev = _tdev;
+
+ disable_irq_nosync(irq);
+ uniphier_tm_irq_clear(tdev);
+
+ return IRQ_WAKE_THREAD;
+}
+
+static irqreturn_t uniphier_tm_alarm_irq_thread(int irq, void *_tdev)
+{
+ struct uniphier_tm_dev *tdev = _tdev;
+
+ thermal_zone_device_update(tdev->tz_dev, THERMAL_EVENT_UNSPECIFIED);
+
+ return IRQ_HANDLED;
+}
+
+static int uniphier_tm_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct regmap *regmap;
+ struct device_node *parent;
+ struct uniphier_tm_dev *tdev;
+ const struct thermal_trip *trips;
+ int i, ret, irq, ntrips, crit_temp = INT_MAX;
+
+ tdev = devm_kzalloc(dev, sizeof(*tdev), GFP_KERNEL);
+ if (!tdev)
+ return -ENOMEM;
+ tdev->dev = dev;
+
+ tdev->data = of_device_get_match_data(dev);
+ if (WARN_ON(!tdev->data))
+ return -EINVAL;
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0)
+ return irq;
+
+ /* get regmap from syscon node */
+ parent = of_get_parent(dev->of_node); /* parent should be syscon node */
+ regmap = syscon_node_to_regmap(parent);
+ of_node_put(parent);
+ if (IS_ERR(regmap)) {
+ dev_err(dev, "failed to get regmap (error %ld)\n",
+ PTR_ERR(regmap));
+ return PTR_ERR(regmap);
+ }
+ tdev->regmap = regmap;
+
+ ret = uniphier_tm_initialize_sensor(tdev);
+ if (ret) {
+ dev_err(dev, "failed to initialize sensor\n");
+ return ret;
+ }
+
+ ret = devm_request_threaded_irq(dev, irq, uniphier_tm_alarm_irq,
+ uniphier_tm_alarm_irq_thread,
+ 0, "thermal", tdev);
+ if (ret)
+ return ret;
+
+ platform_set_drvdata(pdev, tdev);
+
+ tdev->tz_dev = devm_thermal_zone_of_sensor_register(dev, 0, tdev,
+ &uniphier_of_thermal_ops);
+ if (IS_ERR(tdev->tz_dev)) {
+ dev_err(dev, "failed to register sensor device\n");
+ return PTR_ERR(tdev->tz_dev);
+ }
+
+ /* get trip points */
+ trips = of_thermal_get_trip_points(tdev->tz_dev);
+ ntrips = of_thermal_get_ntrips(tdev->tz_dev);
+ if (ntrips > ALERT_CH_NUM) {
+ dev_err(dev, "thermal zone has too many trips\n");
+ return -E2BIG;
+ }
+
+ /* set alert temperatures */
+ for (i = 0; i < ntrips; i++) {
+ if (trips[i].type == THERMAL_TRIP_CRITICAL &&
+ trips[i].temperature < crit_temp)
+ crit_temp = trips[i].temperature;
+ uniphier_tm_set_alert(tdev, i, trips[i].temperature);
+ tdev->alert_en[i] = true;
+ }
+ if (crit_temp > CRITICAL_TEMP_LIMIT) {
+ dev_err(dev, "critical trip is over limit(>%d), or not set\n",
+ CRITICAL_TEMP_LIMIT);
+ return -EINVAL;
+ }
+
+ uniphier_tm_enable_sensor(tdev);
+
+ return 0;
+}
+
+static int uniphier_tm_remove(struct platform_device *pdev)
+{
+ struct uniphier_tm_dev *tdev = platform_get_drvdata(pdev);
+
+ /* disable sensor */
+ uniphier_tm_disable_sensor(tdev);
+
+ return 0;
+}
+
+static const struct uniphier_tm_soc_data uniphier_pxs2_tm_data = {
+ .map_base = 0xe000,
+ .block_base = 0xe000,
+ .tmod_setup_addr = 0xe904,
+};
+
+static const struct uniphier_tm_soc_data uniphier_ld20_tm_data = {
+ .map_base = 0xe000,
+ .block_base = 0xe800,
+ .tmod_setup_addr = 0xe938,
+};
+
+static const struct of_device_id uniphier_tm_dt_ids[] = {
+ {
+ .compatible = "socionext,uniphier-pxs2-thermal",
+ .data = &uniphier_pxs2_tm_data,
+ },
+ {
+ .compatible = "socionext,uniphier-ld20-thermal",
+ .data = &uniphier_ld20_tm_data,
+ },
+ { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, uniphier_tm_dt_ids);
+
+static struct platform_driver uniphier_tm_driver = {
+ .probe = uniphier_tm_probe,
+ .remove = uniphier_tm_remove,
+ .driver = {
+ .name = "uniphier-thermal",
+ .of_match_table = uniphier_tm_dt_ids,
+ },
+};
+module_platform_driver(uniphier_tm_driver);
+
+MODULE_AUTHOR("Kunihiko Hayashi <hayashi.kunihiko@socionext.com>");
+MODULE_DESCRIPTION("UniPhier thermal driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/zx2967_thermal.c b/drivers/thermal/zx2967_thermal.c
index a5670ad2cfc8..6acce0bce7c0 100644
--- a/drivers/thermal/zx2967_thermal.c
+++ b/drivers/thermal/zx2967_thermal.c
@@ -111,7 +111,7 @@ unlock:
return ret;
}
-static struct thermal_zone_of_device_ops zx2967_of_thermal_ops = {
+static const struct thermal_zone_of_device_ops zx2967_of_thermal_ops = {
.get_temp = zx2967_thermal_get_temp,
};
diff --git a/drivers/usb/host/ohci-sm501.c b/drivers/usb/host/ohci-sm501.c
index a8b8d8b8d9f3..d4e0f7cd96fa 100644
--- a/drivers/usb/host/ohci-sm501.c
+++ b/drivers/usb/host/ohci-sm501.c
@@ -123,13 +123,12 @@ static int ohci_hcd_sm501_drv_probe(struct platform_device *pdev)
* regular memory. The HCD_LOCAL_MEM flag does just that.
*/
- if (!dma_declare_coherent_memory(dev, mem->start,
+ retval = dma_declare_coherent_memory(dev, mem->start,
mem->start - mem->parent->start,
resource_size(mem),
- DMA_MEMORY_MAP |
- DMA_MEMORY_EXCLUSIVE)) {
+ DMA_MEMORY_EXCLUSIVE);
+ if (retval) {
dev_err(dev, "cannot declare coherent memory\n");
- retval = -ENXIO;
goto err1;
}
diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c
index cfcfadfc94fc..16d081a093bb 100644
--- a/drivers/usb/host/ohci-tmio.c
+++ b/drivers/usb/host/ohci-tmio.c
@@ -227,13 +227,10 @@ static int ohci_hcd_tmio_drv_probe(struct platform_device *dev)
goto err_ioremap_regs;
}
- if (!dma_declare_coherent_memory(&dev->dev, sram->start,
- sram->start,
- resource_size(sram),
- DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE)) {
- ret = -EBUSY;
+ ret = dma_declare_coherent_memory(&dev->dev, sram->start, sram->start,
+ resource_size(sram), DMA_MEMORY_EXCLUSIVE);
+ if (ret)
goto err_dma_declare;
- }
if (cell->enable) {
ret = cell->enable(dev);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1bc709fe330a..b3e3edc09d80 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -152,17 +152,10 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
ceph_invalidate_fscache_page(inode, page);
+ WARN_ON(!PageLocked(page));
if (!PagePrivate(page))
return;
- /*
- * We can get non-dirty pages here due to races between
- * set_page_dirty and truncate_complete_page; just spit out a
- * warning, in case we end up with accounting problems later.
- */
- if (!PageDirty(page))
- pr_err("%p invalidatepage %p page not dirty\n", inode, page);
-
ClearPageChecked(page);
dout("%p invalidatepage %p idx %lu full dirty page\n",
@@ -455,13 +448,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
if (rc == 0)
goto out;
- if (fsc->mount_options->rsize >= PAGE_SIZE)
- max = (fsc->mount_options->rsize + PAGE_SIZE - 1)
- >> PAGE_SHIFT;
-
- dout("readpages %p file %p nr_pages %d max %d\n", inode,
- file, nr_pages,
- max);
+ max = fsc->mount_options->rsize >> PAGE_SHIFT;
+ dout("readpages %p file %p nr_pages %d max %d\n",
+ inode, file, nr_pages, max);
while (!list_empty(page_list)) {
rc = start_read(inode, page_list, max);
if (rc < 0)
@@ -474,14 +463,22 @@ out:
return rc;
}
+struct ceph_writeback_ctl
+{
+ loff_t i_size;
+ u64 truncate_size;
+ u32 truncate_seq;
+ bool size_stable;
+ bool head_snapc;
+};
+
/*
* Get ref for the oldest snapc for an inode with dirty data... that is, the
* only snap context we are allowed to write back.
*/
-static struct ceph_snap_context *get_oldest_context(struct inode *inode,
- loff_t *snap_size,
- u64 *truncate_size,
- u32 *truncate_seq)
+static struct ceph_snap_context *
+get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
+ struct ceph_snap_context *page_snapc)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc = NULL;
@@ -491,30 +488,78 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
capsnap->context, capsnap->dirty_pages);
- if (capsnap->dirty_pages) {
- snapc = ceph_get_snap_context(capsnap->context);
- if (snap_size)
- *snap_size = capsnap->size;
- if (truncate_size)
- *truncate_size = capsnap->truncate_size;
- if (truncate_seq)
- *truncate_seq = capsnap->truncate_seq;
- break;
+ if (!capsnap->dirty_pages)
+ continue;
+
+ /* get i_size, truncate_{seq,size} for page_snapc? */
+ if (snapc && capsnap->context != page_snapc)
+ continue;
+
+ if (ctl) {
+ if (capsnap->writing) {
+ ctl->i_size = i_size_read(inode);
+ ctl->size_stable = false;
+ } else {
+ ctl->i_size = capsnap->size;
+ ctl->size_stable = true;
+ }
+ ctl->truncate_size = capsnap->truncate_size;
+ ctl->truncate_seq = capsnap->truncate_seq;
+ ctl->head_snapc = false;
}
+
+ if (snapc)
+ break;
+
+ snapc = ceph_get_snap_context(capsnap->context);
+ if (!page_snapc ||
+ page_snapc == snapc ||
+ page_snapc->seq > snapc->seq)
+ break;
}
if (!snapc && ci->i_wrbuffer_ref_head) {
snapc = ceph_get_snap_context(ci->i_head_snapc);
dout(" head snapc %p has %d dirty pages\n",
snapc, ci->i_wrbuffer_ref_head);
- if (truncate_size)
- *truncate_size = ci->i_truncate_size;
- if (truncate_seq)
- *truncate_seq = ci->i_truncate_seq;
+ if (ctl) {
+ ctl->i_size = i_size_read(inode);
+ ctl->truncate_size = ci->i_truncate_size;
+ ctl->truncate_seq = ci->i_truncate_seq;
+ ctl->size_stable = false;
+ ctl->head_snapc = true;
+ }
}
spin_unlock(&ci->i_ceph_lock);
return snapc;
}
+static u64 get_writepages_data_length(struct inode *inode,
+ struct page *page, u64 start)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc = page_snap_context(page);
+ struct ceph_cap_snap *capsnap = NULL;
+ u64 end = i_size_read(inode);
+
+ if (snapc != ci->i_head_snapc) {
+ bool found = false;
+ spin_lock(&ci->i_ceph_lock);
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ if (capsnap->context == snapc) {
+ if (!capsnap->writing)
+ end = capsnap->size;
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ WARN_ON(!found);
+ }
+ if (end > page_offset(page) + PAGE_SIZE)
+ end = page_offset(page) + PAGE_SIZE;
+ return end > start ? end - start : 0;
+}
+
/*
* Write a single page, but leave the page locked.
*
@@ -526,30 +571,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
struct inode *inode;
struct ceph_inode_info *ci;
struct ceph_fs_client *fsc;
- struct ceph_osd_client *osdc;
struct ceph_snap_context *snapc, *oldest;
loff_t page_off = page_offset(page);
- loff_t snap_size = -1;
long writeback_stat;
- u64 truncate_size;
- u32 truncate_seq;
int err, len = PAGE_SIZE;
+ struct ceph_writeback_ctl ceph_wbc;
dout("writepage %p idx %lu\n", page, page->index);
inode = page->mapping->host;
ci = ceph_inode(inode);
fsc = ceph_inode_to_client(inode);
- osdc = &fsc->client->osdc;
/* verify this is a writeable snap context */
snapc = page_snap_context(page);
- if (snapc == NULL) {
+ if (!snapc) {
dout("writepage %p page %p not dirty?\n", inode, page);
return 0;
}
- oldest = get_oldest_context(inode, &snap_size,
- &truncate_size, &truncate_seq);
+ oldest = get_oldest_context(inode, &ceph_wbc, snapc);
if (snapc->seq > oldest->seq) {
dout("writepage %p page %p snapc %p not writeable - noop\n",
inode, page, snapc);
@@ -561,20 +601,18 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
}
ceph_put_snap_context(oldest);
- if (snap_size == -1)
- snap_size = i_size_read(inode);
-
/* is this a partial page at end of file? */
- if (page_off >= snap_size) {
- dout("%p page eof %llu\n", page, snap_size);
+ if (page_off >= ceph_wbc.i_size) {
+ dout("%p page eof %llu\n", page, ceph_wbc.i_size);
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
return 0;
}
- if (snap_size < page_off + len)
- len = snap_size - page_off;
+ if (ceph_wbc.i_size < page_off + len)
+ len = ceph_wbc.i_size - page_off;
- dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
- inode, page, page->index, page_off, len, snapc);
+ dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
+ inode, page, page->index, page_off, len, snapc, snapc->seq);
writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
if (writeback_stat >
@@ -582,10 +620,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
- err = ceph_osdc_writepages(osdc, ceph_vino(inode),
- &ci->i_layout, snapc,
- page_off, len,
- truncate_seq, truncate_size,
+ err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode),
+ &ci->i_layout, snapc, page_off, len,
+ ceph_wbc.truncate_seq,
+ ceph_wbc.truncate_size,
&inode->i_mtime, &page, 1);
if (err < 0) {
struct writeback_control tmp_wbc;
@@ -746,31 +784,17 @@ static int ceph_writepages_start(struct address_space *mapping,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_vino vino = ceph_vino(inode);
- pgoff_t index, start, end;
- int range_whole = 0;
- int should_loop = 1;
- pgoff_t max_pages = 0, max_pages_ever = 0;
+ pgoff_t index, start_index, end = -1;
struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
struct pagevec pvec;
- int done = 0;
int rc = 0;
unsigned int wsize = i_blocksize(inode);
struct ceph_osd_request *req = NULL;
- int do_sync = 0;
- loff_t snap_size, i_size;
- u64 truncate_size;
- u32 truncate_seq;
+ struct ceph_writeback_ctl ceph_wbc;
+ bool should_loop, range_whole = false;
+ bool stop, done = false;
- /*
- * Include a 'sync' in the OSD request if this is a data
- * integrity write (e.g., O_SYNC write or fsync()), or if our
- * cap is being revoked.
- */
- if ((wbc->sync_mode == WB_SYNC_ALL) ||
- ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
- do_sync = 1;
- dout("writepages_start %p dosync=%d (mode=%s)\n",
- inode, do_sync,
+ dout("writepages_start %p (mode=%s)\n", inode,
wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
@@ -783,35 +807,17 @@ static int ceph_writepages_start(struct address_space *mapping,
mapping_set_error(mapping, -EIO);
return -EIO; /* we're in a forced umount, don't write! */
}
- if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
+ if (fsc->mount_options->wsize < wsize)
wsize = fsc->mount_options->wsize;
- if (wsize < PAGE_SIZE)
- wsize = PAGE_SIZE;
- max_pages_ever = wsize >> PAGE_SHIFT;
pagevec_init(&pvec, 0);
- /* where to start/end? */
- if (wbc->range_cyclic) {
- start = mapping->writeback_index; /* Start from prev offset */
- end = -1;
- dout(" cyclic, start at %lu\n", start);
- } else {
- start = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
- if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = 1;
- should_loop = 0;
- dout(" not cyclic, %lu to %lu\n", start, end);
- }
- index = start;
+ start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
+ index = start_index;
retry:
/* find oldest snap context with dirty data */
- ceph_put_snap_context(snapc);
- snap_size = -1;
- snapc = get_oldest_context(inode, &snap_size,
- &truncate_size, &truncate_seq);
+ snapc = get_oldest_context(inode, &ceph_wbc, NULL);
if (!snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
@@ -821,40 +827,56 @@ retry:
dout(" oldest snapc is %p seq %lld (%d snaps)\n",
snapc, snapc->seq, snapc->num_snaps);
- i_size = i_size_read(inode);
-
- if (last_snapc && snapc != last_snapc) {
- /* if we switched to a newer snapc, restart our scan at the
- * start of the original file range. */
- dout(" snapc differs from last pass, restarting at %lu\n",
- index);
- index = start;
+ should_loop = false;
+ if (ceph_wbc.head_snapc && snapc != last_snapc) {
+ /* where to start/end? */
+ if (wbc->range_cyclic) {
+ index = start_index;
+ end = -1;
+ if (index > 0)
+ should_loop = true;
+ dout(" cyclic, start at %lu\n", index);
+ } else {
+ index = wbc->range_start >> PAGE_SHIFT;
+ end = wbc->range_end >> PAGE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = true;
+ dout(" not cyclic, %lu to %lu\n", index, end);
+ }
+ } else if (!ceph_wbc.head_snapc) {
+ /* Do not respect wbc->range_{start,end}. Dirty pages
+ * in that range can be associated with newer snapc.
+ * They are not writeable until all dirty pages
+ * associated with 'snapc' get written */
+ if (index > 0 || wbc->sync_mode != WB_SYNC_NONE)
+ should_loop = true;
+ dout(" non-head snapc, range whole\n");
}
+
+ ceph_put_snap_context(last_snapc);
last_snapc = snapc;
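
A hedged reading of the branch above: only a pass over the head snap context may honour wbc->range_{start,end}; an older snap context can own dirty pages anywhere in the file, so its pass always scans the whole file and may need to wrap back to index 0. A toy model of the should_loop decision (simplified; names hypothetical):

    /* head: writing the live (head) snap context this pass?
     * cyclic: wbc->range_cyclic; sync_all: integrity sync. */
    static int must_loop(int head, int cyclic, unsigned long start_index,
                         int sync_all)
    {
            if (head)
                    return cyclic && start_index > 0; /* finish the wrap */
            /* non-head snapc: whole-file pass; loop unless we already
             * started at 0 on a best-effort (non-integrity) writeback. */
            return start_index > 0 || sync_all;
    }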
- while (!done && index <= end) {
- unsigned i;
- int first;
- pgoff_t strip_unit_end = 0;
+ stop = false;
+ while (!stop && index <= end) {
int num_ops = 0, op_idx;
- int pvec_pages, locked_pages = 0;
+ unsigned i, pvec_pages, max_pages, locked_pages = 0;
struct page **pages = NULL, **data_pages;
mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page;
- int want;
+ pgoff_t strip_unit_end = 0;
u64 offset = 0, len = 0;
- max_pages = max_pages_ever;
+ max_pages = wsize >> PAGE_SHIFT;
get_more_pages:
- first = -1;
- want = min(end - index,
- min((pgoff_t)PAGEVEC_SIZE,
- max_pages - (pgoff_t)locked_pages) - 1)
- + 1;
+ pvec_pages = min_t(unsigned, PAGEVEC_SIZE,
+ max_pages - locked_pages);
+ if (end - index < (u64)(pvec_pages - 1))
+ pvec_pages = (unsigned)(end - index) + 1;
+
pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY,
- want);
+ pvec_pages);
dout("pagevec_lookup_tag got %d\n", pvec_pages);
if (!pvec_pages && !locked_pages)
break;
@@ -871,11 +893,15 @@ get_more_pages:
unlikely(page->mapping != mapping)) {
dout("!dirty or !mapping %p\n", page);
unlock_page(page);
- break;
+ continue;
}
- if (!wbc->range_cyclic && page->index > end) {
+ if (page->index > end) {
dout("end of range %p\n", page);
- done = 1;
+ /* can't be range_cyclic (1st pass) because
+ * end == -1 in that case. */
+ stop = true;
+ if (ceph_wbc.head_snapc)
+ done = true;
unlock_page(page);
break;
}
@@ -884,39 +910,37 @@ get_more_pages:
unlock_page(page);
break;
}
- if (wbc->sync_mode != WB_SYNC_NONE) {
- dout("waiting on writeback %p\n", page);
- wait_on_page_writeback(page);
- }
- if (page_offset(page) >=
- (snap_size == -1 ? i_size : snap_size)) {
- dout("%p page eof %llu\n", page,
- (snap_size == -1 ? i_size : snap_size));
- done = 1;
+ if (page_offset(page) >= ceph_wbc.i_size) {
+ dout("%p page eof %llu\n",
+ page, ceph_wbc.i_size);
+ /* not done if range_cyclic */
+ stop = true;
unlock_page(page);
break;
}
if (PageWriteback(page)) {
- dout("%p under writeback\n", page);
- unlock_page(page);
- break;
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ dout("%p under writeback\n", page);
+ unlock_page(page);
+ continue;
+ }
+ dout("waiting on writeback %p\n", page);
+ wait_on_page_writeback(page);
}
/* only if matching snap context */
pgsnapc = page_snap_context(page);
- if (pgsnapc->seq > snapc->seq) {
- dout("page snapc %p %lld > oldest %p %lld\n",
+ if (pgsnapc != snapc) {
+ dout("page snapc %p %lld != oldest %p %lld\n",
pgsnapc, pgsnapc->seq, snapc, snapc->seq);
unlock_page(page);
- if (!locked_pages)
- continue; /* keep looking for snap */
- break;
+ continue;
}
if (!clear_page_dirty_for_io(page)) {
dout("%p !clear_page_dirty_for_io\n", page);
unlock_page(page);
- break;
+ continue;
}
/*
@@ -942,7 +966,7 @@ get_more_pages:
break;
}
- num_ops = 1 + do_sync;
+ num_ops = 1;
strip_unit_end = page->index +
((len - 1) >> PAGE_SHIFT);
@@ -972,8 +996,6 @@ get_more_pages:
}
/* note position of first page in pvec */
- if (first < 0)
- first = i;
dout("%p will write page %p idx %lu\n",
inode, page, page->index);
@@ -984,8 +1006,10 @@ get_more_pages:
BLK_RW_ASYNC);
}
- pages[locked_pages] = page;
- locked_pages++;
+
+ pages[locked_pages++] = page;
+ pvec.pages[i] = NULL;
+
len += PAGE_SIZE;
}
@@ -993,23 +1017,23 @@ get_more_pages:
if (!locked_pages)
goto release_pvec_pages;
if (i) {
- int j;
- BUG_ON(!locked_pages || first < 0);
+ unsigned j, n = 0;
+ /* shift unused pages to the beginning of pvec */
+ for (j = 0; j < pvec_pages; j++) {
+ if (!pvec.pages[j])
+ continue;
+ if (n < j)
+ pvec.pages[n] = pvec.pages[j];
+ n++;
+ }
+ pvec.nr = n;
if (pvec_pages && i == pvec_pages &&
locked_pages < max_pages) {
dout("reached end pvec, trying for more\n");
- pagevec_reinit(&pvec);
+ pagevec_release(&pvec);
goto get_more_pages;
}
-
- /* shift unused pages over in the pvec... we
- * will need to release them below. */
- for (j = i; j < pvec_pages; j++) {
- dout(" pvec leftover page %p\n", pvec.pages[j]);
- pvec.pages[j-i+first] = pvec.pages[j];
- }
- pvec.nr -= i-first;
}
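
The new compaction above replaces the old 'first' bookkeeping: entries handed to the writer were NULLed out in pvec.pages[], and the survivors are slid forward without reordering. The same in-place stable compaction in isolation:

    /* Drop NULL slots, keep relative order, return the new count
     * (what becomes pvec.nr above). */
    static unsigned compact(void *slots[], unsigned n)
    {
            unsigned j, kept = 0;

            for (j = 0; j < n; j++) {
                    if (!slots[j])
                            continue;             /* consumed for writeback */
                    if (kept < j)
                            slots[kept] = slots[j];
                    kept++;
            }
            return kept;
    }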
new_request:
@@ -1019,10 +1043,9 @@ new_request:
req = ceph_osdc_new_request(&fsc->client->osdc,
&ci->i_layout, vino,
offset, &len, 0, num_ops,
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE,
- snapc, truncate_seq,
- truncate_size, false);
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ snapc, ceph_wbc.truncate_seq,
+ ceph_wbc.truncate_size, false);
if (IS_ERR(req)) {
req = ceph_osdc_new_request(&fsc->client->osdc,
&ci->i_layout, vino,
@@ -1031,8 +1054,8 @@ new_request:
CEPH_OSD_SLAB_OPS),
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE,
- snapc, truncate_seq,
- truncate_size, true);
+ snapc, ceph_wbc.truncate_seq,
+ ceph_wbc.truncate_size, true);
BUG_ON(IS_ERR(req));
}
BUG_ON(len < page_offset(pages[locked_pages - 1]) +
@@ -1048,7 +1071,7 @@ new_request:
for (i = 0; i < locked_pages; i++) {
u64 cur_offset = page_offset(pages[i]);
if (offset + len != cur_offset) {
- if (op_idx + do_sync + 1 == req->r_num_ops)
+ if (op_idx + 1 == req->r_num_ops)
break;
osd_req_op_extent_dup_last(req, op_idx,
cur_offset - offset);
@@ -1069,14 +1092,15 @@ new_request:
len += PAGE_SIZE;
}
- if (snap_size != -1) {
- len = min(len, snap_size - offset);
+ if (ceph_wbc.size_stable) {
+ len = min(len, ceph_wbc.i_size - offset);
} else if (i == locked_pages) {
/* writepages_finish() clears writeback pages
* according to the data length, so make sure
* data length covers all locked pages */
u64 min_len = len + 1 - PAGE_SIZE;
- len = min(len, (u64)i_size_read(inode) - offset);
+ len = get_writepages_data_length(inode, pages[i - 1],
+ offset);
len = max(len, min_len);
}
dout("writepages got pages at %llu~%llu\n", offset, len);
@@ -1085,17 +1109,12 @@ new_request:
0, !!pool, false);
osd_req_op_extent_update(req, op_idx, len);
- if (do_sync) {
- op_idx++;
- osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0);
- }
BUG_ON(op_idx + 1 != req->r_num_ops);
pool = NULL;
if (i < locked_pages) {
BUG_ON(num_ops <= req->r_num_ops);
num_ops -= req->r_num_ops;
- num_ops += do_sync;
locked_pages -= i;
/* allocate new pages array for next request */
@@ -1127,22 +1146,50 @@ new_request:
if (pages)
goto new_request;
- if (wbc->nr_to_write <= 0)
- done = 1;
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
+ done = stop = true;
release_pvec_pages:
dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
pvec.nr ? pvec.pages[0] : NULL);
pagevec_release(&pvec);
-
- if (locked_pages && !done)
- goto retry;
}
if (should_loop && !done) {
/* more to do; loop back to beginning of file */
dout("writepages looping back to beginning of file\n");
- should_loop = 0;
+ end = start_index - 1; /* OK even when start_index == 0 */
+
+ /* to write dirty pages associated with next snapc,
+ * we need to wait until current writes complete */
+ if (wbc->sync_mode != WB_SYNC_NONE &&
+ start_index == 0 && /* all dirty pages were checked */
+ !ceph_wbc.head_snapc) {
+ struct page *page;
+ unsigned i, nr;
+ index = 0;
+ while ((index <= end) &&
+ (nr = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_WRITEBACK,
+ PAGEVEC_SIZE))) {
+ for (i = 0; i < nr; i++) {
+ page = pvec.pages[i];
+ if (page_snap_context(page) != snapc)
+ continue;
+ wait_on_page_writeback(page);
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+ }
+
+ start_index = 0;
index = 0;
goto retry;
}
@@ -1152,8 +1199,8 @@ release_pvec_pages:
out:
ceph_osdc_put_request(req);
- ceph_put_snap_context(snapc);
- dout("writepages done, rc = %d\n", rc);
+ ceph_put_snap_context(last_snapc);
+ dout("writepages dend - startone, rc = %d\n", rc);
return rc;
}
@@ -1165,8 +1212,7 @@ out:
static int context_is_writeable_or_written(struct inode *inode,
struct ceph_snap_context *snapc)
{
- struct ceph_snap_context *oldest = get_oldest_context(inode, NULL,
- NULL, NULL);
+ struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL);
int ret = !oldest || snapc->seq <= oldest->seq;
ceph_put_snap_context(oldest);
@@ -1211,8 +1257,7 @@ retry_locked:
* this page is already dirty in another (older) snap
* context! is it writeable now?
*/
- oldest = get_oldest_context(inode, NULL, NULL, NULL);
-
+ oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
ceph_put_snap_context(oldest);
dout(" page %p snapc %p not current or oldest\n",
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 174d6e6569a8..a3ab265d3215 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -209,7 +209,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
/* No caching for filesystem */
- if (fsc->fscache == NULL)
+ if (!fsc->fscache)
return;
/* Only cache for regular files that are read only */
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7007ae2a5ad2..157fe59fbabe 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -490,13 +490,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
}
/*
- * if we are newly issued FILE_SHARED, mark dir not complete; we
- * don't know what happened to this directory while we didn't
- * have the cap.
+ * If FILE_SHARED is newly issued, mark dir not complete. We don't
+ * know what happened to this directory while we didn't have the cap.
+ * If FILE_SHARED is being revoked, also mark dir not complete. It
+ * stops on-going cached readdir.
*/
- if ((issued & CEPH_CAP_FILE_SHARED) &&
- (had & CEPH_CAP_FILE_SHARED) == 0) {
- ci->i_shared_gen++;
+ if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
+ if (issued & CEPH_CAP_FILE_SHARED)
+ ci->i_shared_gen++;
if (S_ISDIR(ci->vfs_inode.i_mode)) {
dout(" marking %p NOT complete\n", &ci->vfs_inode);
__ceph_dir_clear_complete(ci);
@@ -611,7 +612,7 @@ void ceph_add_cap(struct inode *inode,
}
if (flags & CEPH_CAP_FLAG_AUTH) {
- if (ci->i_auth_cap == NULL ||
+ if (!ci->i_auth_cap ||
ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
ci->i_auth_cap = cap;
cap->mds_wanted = wanted;
@@ -728,7 +729,7 @@ static void __touch_cap(struct ceph_cap *cap)
struct ceph_mds_session *s = cap->session;
spin_lock(&s->s_cap_lock);
- if (s->s_cap_iterator == NULL) {
+ if (!s->s_cap_iterator) {
dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
s->s_mds);
list_move_tail(&cap->session_caps, &s->s_caps);
@@ -1248,7 +1249,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
arg.mode = inode->i_mode;
arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
- arg.flags = 0;
+ if (list_empty(&ci->i_cap_snaps))
+ arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP;
+ else
+ arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
if (sync)
arg.flags |= CEPH_CLIENT_CAPS_SYNC;
@@ -1454,13 +1458,19 @@ retry:
goto retry;
}
+ // make sure flushsnap messages are sent in proper order.
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ __kick_flushing_caps(mdsc, session, ci, 0);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ }
+
__ceph_flush_snaps(ci, session);
out:
spin_unlock(&ci->i_ceph_lock);
if (psession) {
*psession = session;
- } else {
+ } else if (session) {
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
}
@@ -1901,11 +1911,7 @@ ack:
(ci->i_ceph_flags &
(CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
- spin_lock(&mdsc->cap_dirty_lock);
- oldest_flush_tid = __get_oldest_flush_tid(mdsc);
- spin_unlock(&mdsc->cap_dirty_lock);
- __kick_flushing_caps(mdsc, session, ci,
- oldest_flush_tid);
+ __kick_flushing_caps(mdsc, session, ci, 0);
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
}
if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
@@ -2110,7 +2116,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
- ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ ret = file_write_and_wait_range(file, start, end);
if (ret < 0)
goto out;
@@ -3422,7 +3428,7 @@ retry:
tcap = __get_cap_for_mds(ci, target);
if (tcap) {
/* already have caps from the target */
- if (tcap->cap_id != t_cap_id ||
+ if (tcap->cap_id == t_cap_id &&
ceph_seq_cmp(tcap->seq, t_seq) < 0) {
dout(" updating import cap %p mds%d\n", tcap, target);
tcap->cap_id = t_cap_id;
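
The __check_cap_issue() hunk above now reacts to FILE_SHARED transitions in both directions. A toy model of just that transition logic, with the S_ISDIR check omitted (helper and parameters hypothetical):

    static void shared_cap_transition(unsigned had, unsigned issued,
                                      unsigned shared_bit,
                                      unsigned long *shared_gen,
                                      int *dir_complete)
    {
            if ((issued & shared_bit) == (had & shared_bit))
                    return;                 /* no grant, no revoke */
            if (issued & shared_bit)
                    (*shared_gen)++;        /* newly granted */
            *dir_complete = 0;              /* stops cached readdir too */
    }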
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 4e2d112c982f..d635496ea189 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -24,7 +24,7 @@ static int mdsmap_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_mdsmap *mdsmap;
- if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
+ if (!fsc->mdsc || !fsc->mdsc->mdsmap)
return 0;
mdsmap = fsc->mdsc->mdsmap;
seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index ef7240ace576..019c2036d36f 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -377,8 +377,10 @@ more:
}
/* hints to request -> mds selection code */
req->r_direct_mode = USE_AUTH_MDS;
- req->r_direct_hash = ceph_frag_value(frag);
- __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ if (op == CEPH_MDS_OP_READDIR) {
+ req->r_direct_hash = ceph_frag_value(frag);
+ __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+ }
if (fi->last_name) {
req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL);
if (!req->r_path2) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3d48c415f3cb..65a6fa12c857 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -175,7 +175,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode);
cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
- if (cf == NULL) {
+ if (!cf) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM;
}
@@ -562,8 +562,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
ssize_t ret;
size_t len = iov_iter_count(to);
- dout("sync_read on file %p %llu~%u %s\n", file, off,
- (unsigned)len,
+ dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
if (!len)
@@ -788,7 +787,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
goto out;
}
- req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
+ req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc);
ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid);
@@ -800,7 +799,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
}
req->r_ops[0] = orig_req->r_ops[0];
- osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
req->r_mtime = aio_req->mtime;
req->r_data_offset = req->r_ops[0].extent.offset;
@@ -847,8 +845,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_direct_read_write (%s) on file %p %lld~%u\n",
- (write ? "write" : "read"), file, pos, (unsigned)count);
+ dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n",
+ (write ? "write" : "read"), file, pos, (unsigned)count,
+ snapc, snapc->seq);
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
if (ret < 0)
@@ -861,7 +860,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (ret2 < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret2);
- flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
+ flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
} else {
flags = CEPH_OSD_FLAG_READ;
}
@@ -874,8 +873,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
vino = ceph_vino(inode);
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
vino, pos, &size, 0,
- /*include a 'startsync' command*/
- write ? 2 : 1,
+ 1,
write ? CEPH_OSD_OP_WRITE :
CEPH_OSD_OP_READ,
flags, snapc,
@@ -887,6 +885,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
break;
}
+ if (write)
+ size = min_t(u64, size, fsc->mount_options->wsize);
+ else
+ size = min_t(u64, size, fsc->mount_options->rsize);
+
len = size;
pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
if (IS_ERR(pages)) {
@@ -922,7 +925,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
truncate_inode_pages_range(inode->i_mapping, pos,
(pos+len) | (PAGE_SIZE - 1));
- osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
req->r_mtime = mtime;
}
@@ -1048,7 +1050,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count);
+ dout("sync_write on file %p %lld~%u snapc %p seq %lld\n",
+ file, pos, (unsigned)count, snapc, snapc->seq);
ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
if (ret < 0)
@@ -1060,7 +1063,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
- flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE;
+ flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE;
while ((len = iov_iter_count(from)) > 0) {
size_t left;
@@ -1307,6 +1310,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (!prealloc_cf)
return -ENOMEM;
+retry_snap:
inode_lock(inode);
/* We can write back this queue in page reclaim */
@@ -1338,7 +1342,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
-retry_snap:
/* FIXME: not complete since it doesn't account for being at quota */
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
err = -ENOSPC;
@@ -1387,14 +1390,6 @@ retry_snap:
&prealloc_cf);
else
written = ceph_sync_write(iocb, &data, pos, snapc);
- if (written == -EOLDSNAPC) {
- dout("aio_write %p %llx.%llx %llu~%u"
- "got EOLDSNAPC, retrying\n",
- inode, ceph_vinop(inode),
- pos, (unsigned)count);
- inode_lock(inode);
- goto retry_snap;
- }
if (written > 0)
iov_iter_advance(from, written);
ceph_put_snap_context(snapc);
@@ -1428,10 +1423,15 @@ retry_snap:
ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
+ if (written == -EOLDSNAPC) {
+ dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n",
+ inode, ceph_vinop(inode), pos, (unsigned)count);
+ goto retry_snap;
+ }
+
if (written >= 0) {
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL))
iocb->ki_flags |= IOCB_DSYNC;
-
written = generic_write_sync(iocb, written);
}
@@ -1481,13 +1481,13 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
offset += file->f_pos;
break;
case SEEK_DATA:
- if (offset >= i_size) {
+ if (offset < 0 || offset >= i_size) {
ret = -ENXIO;
goto out;
}
break;
case SEEK_HOLE:
- if (offset >= i_size) {
+ if (offset < 0 || offset >= i_size) {
ret = -ENXIO;
goto out;
}
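
One detail in the file.c hunks above is easy to miss: with STARTSYNC gone, each direct-I/O OSD request is instead capped at the mount's wsize/rsize before pages are pinned. A sketch of the cap, assuming the limits are byte counts as in ceph_mount_options:

    /* Equivalent of the min_t(u64, ...) clamps added above. */
    static uint64_t cap_osd_request(uint64_t size, int write,
                                    uint64_t wsize, uint64_t rsize)
    {
            uint64_t limit = write ? wsize : rsize;

            return size < limit ? size : limit;
    }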
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 220dfd87cbfa..373dab5173ca 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -52,7 +52,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
ino_t t = ceph_vino_to_ino(vino);
inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
- if (inode == NULL)
+ if (!inode)
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
dout("get_inode created new inode %p %llx.%llx ino %llx\n",
@@ -133,12 +133,9 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
}
frag = kmalloc(sizeof(*frag), GFP_NOFS);
- if (!frag) {
- pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
- "frag %x\n", &ci->vfs_inode,
- ceph_vinop(&ci->vfs_inode), f);
+ if (!frag)
return ERR_PTR(-ENOMEM);
- }
+
frag->frag = f;
frag->split_by = 0;
frag->mds = -1;
@@ -1070,7 +1067,6 @@ out_unlock:
spin_unlock(&dentry->d_lock);
if (old_lease_session)
ceph_put_mds_session(old_lease_session);
- return;
}
/*
@@ -1177,7 +1173,7 @@ retry_lookup:
dn = d_alloc(parent, &dname);
dout("d_alloc %p '%.*s' = %p\n", parent,
dname.len, dname.name, dn);
- if (dn == NULL) {
+ if (!dn) {
dput(parent);
err = -ENOMEM;
goto done;
@@ -1477,7 +1473,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct dentry *dn;
struct inode *in;
int err = 0, skipped = 0, ret, i;
- struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
u32 frag = le32_to_cpu(rhead->args.readdir.frag);
u32 last_hash = 0;
@@ -1510,8 +1505,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
}
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
- snapdir = ceph_get_snapdir(d_inode(parent));
- parent = d_find_alias(snapdir);
dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
rinfo->dir_nr, parent);
} else {
@@ -1519,15 +1512,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
rinfo->dir_nr, parent);
if (rinfo->dir_dir)
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
- }
- if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 &&
- !(rinfo->hash_order && last_hash)) {
- /* note dir version at start of readdir so we can tell
- * if any dentries get dropped */
- req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
- req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
- req->r_readdir_cache_idx = 0;
+ if (ceph_frag_is_leftmost(frag) &&
+ req->r_readdir_offset == 2 &&
+ !(rinfo->hash_order && last_hash)) {
+ /* note dir version at start of readdir so we can
+ * tell if any dentries get dropped */
+ req->r_dir_release_cnt =
+ atomic64_read(&ci->i_release_count);
+ req->r_dir_ordered_cnt =
+ atomic64_read(&ci->i_ordered_count);
+ req->r_readdir_cache_idx = 0;
+ }
}
cache_ctl.index = req->r_readdir_cache_idx;
@@ -1566,7 +1562,7 @@ retry_lookup:
dn = d_alloc(parent, &dname);
dout("d_alloc %p '%.*s' = %p\n", parent,
dname.len, dname.name, dn);
- if (dn == NULL) {
+ if (!dn) {
dout("d_alloc badness\n");
err = -ENOMEM;
goto out;
@@ -1650,10 +1646,6 @@ out:
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
- if (snapdir) {
- iput(snapdir);
- dput(parent);
- }
dout("readdir_prepopulate done\n");
return err;
}
@@ -1841,9 +1833,20 @@ retry:
* possibly truncate them.. so write AND block!
*/
if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
+ struct ceph_cap_snap *capsnap;
+ to = ci->i_truncate_size;
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ // MDS should have revoked Frw caps
+ WARN_ON_ONCE(capsnap->writing);
+ if (capsnap->dirty_pages && capsnap->size > to)
+ to = capsnap->size;
+ }
+ spin_unlock(&ci->i_ceph_lock);
dout("__do_pending_vmtruncate %p flushing snaps first\n",
inode);
- spin_unlock(&ci->i_ceph_lock);
+
+ truncate_pagecache(inode, to);
+
filemap_write_and_wait_range(&inode->i_data, 0,
inode->i_sb->s_maxbytes);
goto retry;
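
The vmtruncate hunk above chooses its truncate target before dropping i_ceph_lock: the page cache must not be shrunk below any snapshotted size that still has dirty pages. The maximum it computes, as a standalone sketch:

    #include <stdint.h>

    /* Largest size that must survive truncation: i_truncate_size, or
     * any capsnap size that still has dirty pages attached. */
    static uint64_t truncate_target(uint64_t truncate_size,
                                    const uint64_t sizes[],
                                    const int dirty[], int n)
    {
            uint64_t to = truncate_size;
            int i;

            for (i = 0; i < n; i++)
                    if (dirty[i] && sizes[i] > to)
                            to = sizes[i];
            return to;
    }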
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 666a9f274832..9dd6b836ac9e 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -408,7 +408,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
{
struct ceph_mds_session *session;
- if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
+ if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
return NULL;
session = mdsc->sessions[mds];
dout("lookup_mds_session %p %d\n", session,
@@ -483,7 +483,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
dout("register_session realloc to %d\n", newmax);
sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
- if (sa == NULL)
+ if (!sa)
goto fail_realloc;
if (mdsc->sessions) {
memcpy(sa, mdsc->sessions,
@@ -731,9 +731,16 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
inode = NULL;
if (req->r_inode) {
- inode = req->r_inode;
- ihold(inode);
- } else if (req->r_dentry) {
+ if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) {
+ inode = req->r_inode;
+ ihold(inode);
+ } else {
+ /* req->r_dentry is non-null for LSSNAP request.
+ * fall-thru */
+ WARN_ON_ONCE(!req->r_dentry);
+ }
+ }
+ if (!inode && req->r_dentry) {
/* ignore race with rename; old or new d_parent is okay */
struct dentry *parent;
struct inode *dir;
@@ -886,7 +893,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
/* Calculate serialized length of metadata */
metadata_bytes = 4; /* map length */
- for (i = 0; metadata[i][0] != NULL; ++i) {
+ for (i = 0; metadata[i][0]; ++i) {
metadata_bytes += 8 + strlen(metadata[i][0]) +
strlen(metadata[i][1]);
metadata_key_count++;
@@ -919,7 +926,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
ceph_encode_32(&p, metadata_key_count);
/* Two length-prefixed strings for each entry in the map */
- for (i = 0; metadata[i][0] != NULL; ++i) {
+ for (i = 0; metadata[i][0]; ++i) {
size_t const key_len = strlen(metadata[i][0]);
size_t const val_len = strlen(metadata[i][1]);
@@ -1122,7 +1129,7 @@ static int iterate_session_caps(struct ceph_mds_session *session,
spin_lock(&session->s_cap_lock);
p = p->next;
- if (cap->ci == NULL) {
+ if (!cap->ci) {
dout("iterate_session_caps finishing cap %p removal\n",
cap);
BUG_ON(cap->session != session);
@@ -1748,7 +1755,7 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
int len, pos;
unsigned seq;
- if (dentry == NULL)
+ if (!dentry)
return ERR_PTR(-EINVAL);
retry:
@@ -1771,7 +1778,7 @@ retry:
len--; /* no leading '/' */
path = kmalloc(len+1, GFP_NOFS);
- if (path == NULL)
+ if (!path)
return ERR_PTR(-ENOMEM);
pos = len;
path[pos] = 0; /* trailing null */
@@ -2875,7 +2882,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
}
if (list_empty(&ci->i_cap_snaps)) {
- snap_follows = 0;
+ snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0;
} else {
struct ceph_cap_snap *capsnap =
list_first_entry(&ci->i_cap_snaps,
@@ -3133,7 +3140,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
newmap->m_epoch, oldmap->m_epoch);
for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
- if (mdsc->sessions[i] == NULL)
+ if (!mdsc->sessions[i])
continue;
s = mdsc->sessions[i];
oldstate = ceph_mdsmap_get_state(oldmap, i);
@@ -3280,7 +3287,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
mutex_lock(&session->s_mutex);
session->s_seq++;
- if (inode == NULL) {
+ if (!inode) {
dout("handle_lease no inode %llx\n", vino.ino);
goto release;
}
@@ -3438,7 +3445,7 @@ static void delayed_work(struct work_struct *work)
for (i = 0; i < mdsc->max_sessions; i++) {
struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (s == NULL)
+ if (!s)
continue;
if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
dout("resending session close request for mds%d\n",
@@ -3490,7 +3497,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
fsc->mdsc = mdsc;
mutex_init(&mdsc->mutex);
mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
- if (mdsc->mdsmap == NULL) {
+ if (!mdsc->mdsmap) {
kfree(mdsc);
return -ENOMEM;
}
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 1a748cf88535..33ced4c22732 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -112,7 +112,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
u16 mdsmap_ev;
m = kzalloc(sizeof(*m), GFP_NOFS);
- if (m == NULL)
+ if (!m)
return ERR_PTR(-ENOMEM);
ceph_decode_need(p, end, 1 + 1, bad);
@@ -138,7 +138,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
m->m_num_mds = m->m_max_mds;
m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
- if (m->m_info == NULL)
+ if (!m->m_info)
goto nomem;
/* pick out active nodes from mds_info (state > 0) */
@@ -232,7 +232,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
if (num_export_targets) {
info->export_targets = kcalloc(num_export_targets,
sizeof(u32), GFP_NOFS);
- if (info->export_targets == NULL)
+ if (!info->export_targets)
goto nomem;
for (j = 0; j < num_export_targets; j++)
info->export_targets[j] =
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index dab5d6732345..1ffc8b426c1c 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -299,7 +299,8 @@ static int cmpu64_rev(const void *a, const void *b)
/*
* build the snap context for a given realm.
*/
-static int build_snap_context(struct ceph_snap_realm *realm)
+static int build_snap_context(struct ceph_snap_realm *realm,
+ struct list_head *dirty_realms)
{
struct ceph_snap_realm *parent = realm->parent;
struct ceph_snap_context *snapc;
@@ -313,7 +314,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
*/
if (parent) {
if (!parent->cached_context) {
- err = build_snap_context(parent);
+ err = build_snap_context(parent, dirty_realms);
if (err)
goto fail;
}
@@ -332,7 +333,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
" (unchanged)\n",
realm->ino, realm, realm->cached_context,
realm->cached_context->seq,
- (unsigned int) realm->cached_context->num_snaps);
+ (unsigned int)realm->cached_context->num_snaps);
return 0;
}
@@ -373,7 +374,11 @@ static int build_snap_context(struct ceph_snap_realm *realm)
realm->ino, realm, snapc, snapc->seq,
(unsigned int) snapc->num_snaps);
- ceph_put_snap_context(realm->cached_context);
+ if (realm->cached_context) {
+ ceph_put_snap_context(realm->cached_context);
+ /* queue realm for cap_snap creation */
+ list_add_tail(&realm->dirty_item, dirty_realms);
+ }
realm->cached_context = snapc;
return 0;
@@ -394,15 +399,16 @@ fail:
/*
* rebuild snap context for the given realm and all of its children.
*/
-static void rebuild_snap_realms(struct ceph_snap_realm *realm)
+static void rebuild_snap_realms(struct ceph_snap_realm *realm,
+ struct list_head *dirty_realms)
{
struct ceph_snap_realm *child;
dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
- build_snap_context(realm);
+ build_snap_context(realm, dirty_realms);
list_for_each_entry(child, &realm->children, child_item)
- rebuild_snap_realms(child);
+ rebuild_snap_realms(child, dirty_realms);
}
@@ -624,13 +630,11 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
{
struct ceph_inode_info *ci;
struct inode *lastinode = NULL;
- struct ceph_snap_realm *child;
dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
spin_lock(&realm->inodes_with_caps_lock);
- list_for_each_entry(ci, &realm->inodes_with_caps,
- i_snap_realm_item) {
+ list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) {
struct inode *inode = igrab(&ci->vfs_inode);
if (!inode)
continue;
@@ -643,14 +647,6 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
spin_unlock(&realm->inodes_with_caps_lock);
iput(lastinode);
- list_for_each_entry(child, &realm->children, child_item) {
- dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
- realm, realm->ino, child, child->ino);
- list_del_init(&child->dirty_item);
- list_add(&child->dirty_item, &realm->dirty_item);
- }
-
- list_del_init(&realm->dirty_item);
dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
}
@@ -721,8 +717,6 @@ more:
if (err < 0)
goto fail;
- /* queue realm for cap_snap creation */
- list_add(&realm->dirty_item, &dirty_realms);
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
@@ -741,7 +735,7 @@ more:
/* invalidate when we reach the _end_ (root) of the trace */
if (invalidate && p >= e)
- rebuild_snap_realms(realm);
+ rebuild_snap_realms(realm, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -758,6 +752,7 @@ more:
while (!list_empty(&dirty_realms)) {
realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
dirty_item);
+ list_del_init(&realm->dirty_item);
queue_realm_cap_snaps(realm);
}
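
With queueing moved into build_snap_context(), the drain loop above detaches each realm before processing it, so queue_realm_cap_snaps() runs with the node off the list and the realm can be re-queued later. The detach-then-process shape in the abstract (struct and callback hypothetical, kernel list API assumed):

    #include <linux/list.h>

    struct realm { struct list_head dirty_item; /* ... */ };

    static void drain_dirty(struct list_head *dirty,
                            void (*process)(struct realm *))
    {
            struct realm *r;

            while (!list_empty(dirty)) {
                    r = list_first_entry(dirty, struct realm, dirty_item);
                    list_del_init(&r->dirty_item); /* safe to re-queue */
                    process(r);
            }
    }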
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index aa06a8c24792..e4082afedcb1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -49,9 +49,16 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
struct ceph_statfs st;
u64 fsid;
int err;
+ u64 data_pool;
+
+ if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) {
+ data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0];
+ } else {
+ data_pool = CEPH_NOPOOL;
+ }
dout("statfs\n");
- err = ceph_monc_do_statfs(&fsc->client->monc, &st);
+ err = ceph_monc_do_statfs(&fsc->client->monc, data_pool, &st);
if (err < 0)
return err;
@@ -113,7 +120,6 @@ enum {
Opt_rasize,
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
- Opt_cap_release_safety,
Opt_readdir_max_entries,
Opt_readdir_max_bytes,
Opt_congestion_kb,
@@ -152,7 +158,6 @@ static match_table_t fsopt_tokens = {
{Opt_rasize, "rasize=%d"},
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
- {Opt_cap_release_safety, "cap_release_safety=%d"},
{Opt_readdir_max_entries, "readdir_max_entries=%d"},
{Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
{Opt_congestion_kb, "write_congestion_kb=%d"},
@@ -235,27 +240,43 @@ static int parse_fsopt_token(char *c, void *private)
break;
/* misc */
case Opt_wsize:
- fsopt->wsize = intval;
+ if (intval < PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
+ return -EINVAL;
+ fsopt->wsize = ALIGN(intval, PAGE_SIZE);
break;
case Opt_rsize:
- fsopt->rsize = intval;
+ if (intval < PAGE_SIZE || intval > CEPH_MAX_READ_SIZE)
+ return -EINVAL;
+ fsopt->rsize = ALIGN(intval, PAGE_SIZE);
break;
case Opt_rasize:
- fsopt->rasize = intval;
+ if (intval < 0)
+ return -EINVAL;
+ fsopt->rasize = ALIGN(intval + PAGE_SIZE - 1, PAGE_SIZE);
break;
case Opt_caps_wanted_delay_min:
+ if (intval < 1)
+ return -EINVAL;
fsopt->caps_wanted_delay_min = intval;
break;
case Opt_caps_wanted_delay_max:
+ if (intval < 1)
+ return -EINVAL;
fsopt->caps_wanted_delay_max = intval;
break;
case Opt_readdir_max_entries:
+ if (intval < 1)
+ return -EINVAL;
fsopt->max_readdir = intval;
break;
case Opt_readdir_max_bytes:
+ if (intval < PAGE_SIZE && intval != 0)
+ return -EINVAL;
fsopt->max_readdir_bytes = intval;
break;
case Opt_congestion_kb:
+ if (intval < 1024) /* at least 1M */
+ return -EINVAL;
fsopt->congestion_kb = intval;
break;
case Opt_dirstat:
@@ -392,7 +413,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->sb_flags = flags;
fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
- fsopt->rsize = CEPH_RSIZE_DEFAULT;
+ fsopt->wsize = CEPH_MAX_WRITE_SIZE;
+ fsopt->rsize = CEPH_MAX_READ_SIZE;
fsopt->rasize = CEPH_RASIZE_DEFAULT;
fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
if (!fsopt->snapdir_name) {
@@ -402,7 +424,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
- fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
fsopt->congestion_kb = default_congestion_kb();
@@ -508,7 +529,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace);
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
- if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
+ if (fsopt->rsize != CEPH_MAX_READ_SIZE)
seq_printf(m, ",rsize=%d", fsopt->rsize);
if (fsopt->rasize != CEPH_RASIZE_DEFAULT)
seq_printf(m, ",rasize=%d", fsopt->rasize);
@@ -520,9 +541,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
seq_printf(m, ",caps_wanted_delay_max=%d",
fsopt->caps_wanted_delay_max);
- if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
- seq_printf(m, ",cap_release_safety=%d",
- fsopt->cap_release_safety);
if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
@@ -576,7 +594,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- if (fsopt->mds_namespace == NULL) {
+ if (!fsopt->mds_namespace) {
ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
0, true);
} else {
@@ -597,13 +615,13 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
* to be processed in parallel, limit concurrency.
*/
fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
- if (fsc->wb_wq == NULL)
+ if (!fsc->wb_wq)
goto fail_client;
fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
- if (fsc->pg_inv_wq == NULL)
+ if (!fsc->pg_inv_wq)
goto fail_wb_wq;
fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
- if (fsc->trunc_wq == NULL)
+ if (!fsc->trunc_wq)
goto fail_pg_inv_wq;
/* set up mempools */
@@ -674,26 +692,26 @@ static int __init init_caches(void)
__alignof__(struct ceph_inode_info),
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
SLAB_ACCOUNT, ceph_inode_init_once);
- if (ceph_inode_cachep == NULL)
+ if (!ceph_inode_cachep)
return -ENOMEM;
ceph_cap_cachep = KMEM_CACHE(ceph_cap,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_cap_cachep == NULL)
+ if (!ceph_cap_cachep)
goto bad_cap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_cap_flush_cachep == NULL)
+ if (!ceph_cap_flush_cachep)
goto bad_cap_flush;
ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
- if (ceph_dentry_cachep == NULL)
+ if (!ceph_dentry_cachep)
goto bad_dentry;
ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
- if (ceph_file_cachep == NULL)
+ if (!ceph_file_cachep)
goto bad_file;
if ((error = ceph_fscache_register()))
@@ -947,20 +965,10 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
return err;
/* set ra_pages based on rasize mount option? */
- if (fsc->mount_options->rasize >= PAGE_SIZE)
- sb->s_bdi->ra_pages =
- (fsc->mount_options->rasize + PAGE_SIZE - 1)
- >> PAGE_SHIFT;
- else
- sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
-
- if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
- fsc->mount_options->rsize >= PAGE_SIZE)
- sb->s_bdi->io_pages =
- (fsc->mount_options->rsize + PAGE_SIZE - 1)
- >> PAGE_SHIFT;
- else if (fsc->mount_options->rsize == 0)
- sb->s_bdi->io_pages = ULONG_MAX;
+ sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT;
+
+ /* set io_pages based on max osd read size */
+ sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT;
return 0;
}
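
The option parsing above leans on ALIGN(), which for a power-of-two boundary rounds up to the next multiple; note the rasize case feeds it intval + PAGE_SIZE - 1, so even already-aligned values land one page higher. The rounding itself, as a minimal sketch:

    #include <stdint.h>

    /* Round x up to a multiple of a; 'a' must be a power of two. */
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    /* ALIGN_UP(1, 4096)    == 4096
     * ALIGN_UP(4096, 4096) == 4096
     * ALIGN_UP(4097, 4096) == 8192 */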
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index f02a2225fe42..279a2f401cf5 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -46,12 +46,25 @@
#define ceph_test_mount_opt(fsc, opt) \
(!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
-#define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */
+/* max size of osd read request, limited by libceph */
+#define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN
+/* osd has a configurable limitation of max write size.
+ * CEPH_MSG_MAX_DATA_LEN should be small enough. */
+#define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN
#define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */
#define CEPH_MAX_READDIR_DEFAULT 1024
#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file. Delay a minimum amount of time, even if we send a cap
+ * message for some other reason. Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
+
struct ceph_mount_options {
int flags;
int sb_flags;
@@ -61,7 +74,6 @@ struct ceph_mount_options {
int rasize; /* max readahead */
int congestion_kb; /* max writeback in flight */
int caps_wanted_delay_min, caps_wanted_delay_max;
- int cap_release_safety;
int max_readdir; /* max readdir result (entries) */
int max_readdir_bytes; /* max readdir result (bytes) */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 11263f102e4c..3542b2c364cf 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -777,7 +777,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
spin_unlock(&ci->i_ceph_lock);
/* security module gets xattr while filling trace */
- if (current->journal_info != NULL) {
+ if (current->journal_info) {
pr_warn_ratelimited("sync getxattr %p "
"during filling trace\n", inode);
return -EBUSY;
@@ -809,7 +809,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
memcpy(value, xattr->val, xattr->val_len);
- if (current->journal_info != NULL &&
+ if (current->journal_info &&
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
ci->i_ceph_flags |= CEPH_I_SEC_INITED;
out:
@@ -1058,7 +1058,7 @@ do_sync_unlocked:
up_read(&mdsc->snap_rwsem);
/* security module set xattr while filling trace */
- if (current->journal_info != NULL) {
+ if (current->journal_info) {
pr_warn_ratelimited("sync setxattr %p "
"during filling trace\n", inode);
err = -EBUSY;
@@ -1108,7 +1108,7 @@ bool ceph_security_xattr_deadlock(struct inode *in)
{
struct ceph_inode_info *ci;
bool ret;
- if (in->i_security == NULL)
+ if (!in->i_security)
return false;
ci = ceph_inode(in);
spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 23ebb92484c6..28de3edd4f4d 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -114,6 +114,7 @@ struct ext2_sb_info {
*/
spinlock_t s_lock;
struct mb_cache *s_ea_block_cache;
+ struct dax_device *s_daxdev;
};
static inline spinlock_t *
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 30163d007b2f..4dca6f348714 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -800,10 +800,10 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
- struct block_device *bdev;
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+ struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb);
bool new = false, boundary = false;
u32 bno;
int ret;
@@ -814,13 +814,9 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return ret;
iomap->flags = 0;
- bdev = inode->i_sb->s_bdev;
- iomap->bdev = bdev;
+ iomap->bdev = inode->i_sb->s_bdev;
iomap->offset = (u64)first_block << blkbits;
- if (blk_queue_dax(bdev->bd_queue))
- iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
- else
- iomap->dax_dev = NULL;
+ iomap->dax_dev = sbi->s_daxdev;
if (ret == 0) {
iomap->type = IOMAP_HOLE;
@@ -842,7 +838,6 @@ static int
ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
- fs_put_dax(iomap->dax_dev);
if (iomap->type == IOMAP_MAPPED &&
written < length &&
(flags & IOMAP_WRITE))
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7b1bc9059863..fc18edd81815 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -171,6 +171,7 @@ static void ext2_put_super (struct super_block * sb)
brelse (sbi->s_sbh);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
+ fs_put_dax(sbi->s_daxdev);
kfree(sbi);
}
@@ -813,6 +814,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
static int ext2_fill_super(struct super_block *sb, void *data, int silent)
{
+ struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
struct buffer_head * bh;
struct ext2_sb_info * sbi;
struct ext2_super_block * es;
@@ -842,6 +844,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb_block = sb_block;
+ sbi->s_daxdev = dax_dev;
spin_lock_init(&sbi->s_lock);
@@ -1200,6 +1203,7 @@ failed_sbi:
kfree(sbi->s_blockgroup_lock);
kfree(sbi);
failed:
+ fs_put_dax(dax_dev);
return ret;
}
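
The ext2 hunks above (mirrored for ext4 below) replace a per-call dax_dev get/put in the iomap path with one reference held for the life of the superblock: taken in fill_super, copied by iomap_begin, dropped in put_super or on the failure path. A stripped-down sketch of the ownership, with stand-in helpers for fs_dax_get_by_bdev()/fs_put_dax():

    struct sb_info { void *daxdev; };

    static void *dax_get(void *bdev) { return bdev; }    /* stand-in */
    static void dax_put(void *daxdev) { (void)daxdev; }  /* stand-in */

    static void fill_super(struct sb_info *sbi, void *bdev)
    {
            sbi->daxdev = dax_get(bdev);    /* once, at mount */
    }

    static void put_super(struct sb_info *sbi)
    {
            dax_put(sbi->daxdev);           /* once, at unmount */
    }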
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 84b9da192238..e2abe01c8c6b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1526,6 +1526,7 @@ struct ext4_sb_info {
/* Barrier between changing inodes' journal flags and writepages ops. */
struct percpu_rw_semaphore s_journal_flag_rwsem;
+ struct dax_device *s_daxdev;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e963508ea35f..31db875bc7a1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3397,7 +3397,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
- struct block_device *bdev;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
unsigned long last_block = (offset + length - 1) >> blkbits;
@@ -3466,12 +3466,8 @@ retry:
}
iomap->flags = 0;
- bdev = inode->i_sb->s_bdev;
- iomap->bdev = bdev;
- if (blk_queue_dax(bdev->bd_queue))
- iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
- else
- iomap->dax_dev = NULL;
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->dax_dev = sbi->s_daxdev;
iomap->offset = first_block << blkbits;
if (ret == 0) {
@@ -3504,7 +3500,6 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
int blkbits = inode->i_blkbits;
bool truncate = false;
- fs_put_dax(iomap->dax_dev);
if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
return 0;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 93aece6891f2..71b9a667e1bc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -951,6 +951,7 @@ static void ext4_put_super(struct super_block *sb)
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
+ fs_put_dax(sbi->s_daxdev);
kfree(sbi);
}
@@ -3398,6 +3399,7 @@ static void ext4_set_resv_clusters(struct super_block *sb)
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
+ struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
char *orig_data = kstrdup(data, GFP_KERNEL);
struct buffer_head *bh;
struct ext4_super_block *es = NULL;
@@ -3423,6 +3425,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if ((data && !orig_data) || !sbi)
goto out_free_base;
+ sbi->s_daxdev = dax_dev;
sbi->s_blockgroup_lock =
kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
if (!sbi->s_blockgroup_lock)
@@ -4399,6 +4402,7 @@ out_fail:
out_free_base:
kfree(sbi);
kfree(orig_data);
+ fs_put_dax(dax_dev);
return err ? err : ret;
}
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index b4b8438c42ef..436b3a1464d9 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -207,15 +207,16 @@ static int __f2fs_set_acl(struct inode *inode, int type,
void *value = NULL;
size_t size = 0;
int error;
+ umode_t mode = inode->i_mode;
switch (type) {
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
if (acl && !ipage) {
- error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+ error = posix_acl_update_mode(inode, &mode, &acl);
if (error)
return error;
- set_acl_inode(inode, inode->i_mode);
+ set_acl_inode(inode, mode);
}
break;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 5b876f6d3f6b..04fe1df052b2 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -230,8 +230,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true);
}
-static int f2fs_write_meta_page(struct page *page,
- struct writeback_control *wbc)
+static int __f2fs_write_meta_page(struct page *page,
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
@@ -244,7 +245,7 @@ static int f2fs_write_meta_page(struct page *page,
if (unlikely(f2fs_cp_error(sbi)))
goto redirty_out;
- write_meta_page(sbi, page);
+ write_meta_page(sbi, page, io_type);
dec_page_count(sbi, F2FS_DIRTY_META);
if (wbc->for_reclaim)
@@ -263,6 +264,12 @@ redirty_out:
return AOP_WRITEPAGE_ACTIVATE;
}
+static int f2fs_write_meta_page(struct page *page,
+ struct writeback_control *wbc)
+{
+ return __f2fs_write_meta_page(page, wbc, FS_META_IO);
+}
+
static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -283,7 +290,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping,
trace_f2fs_writepages(mapping->host, wbc, META);
diff = nr_pages_to_write(sbi, META, wbc);
- written = sync_meta_pages(sbi, META, wbc->nr_to_write);
+ written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO);
mutex_unlock(&sbi->cp_mutex);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
return 0;
@@ -295,7 +302,7 @@ skip_write:
}
long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
- long nr_to_write)
+ long nr_to_write, enum iostat_type io_type)
{
struct address_space *mapping = META_MAPPING(sbi);
pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX;
@@ -346,7 +353,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- if (mapping->a_ops->writepage(page, &wbc)) {
+ if (__f2fs_write_meta_page(page, &wbc, io_type)) {
unlock_page(page);
break;
}
@@ -581,11 +588,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
int recover_orphan_inodes(struct f2fs_sb_info *sbi)
{
block_t start_blk, orphan_blocks, i, j;
- int err;
+ unsigned int s_flags = sbi->sb->s_flags;
+ int err = 0;
if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
return 0;
+ if (s_flags & MS_RDONLY) {
+ f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
+ sbi->sb->s_flags &= ~MS_RDONLY;
+ }
+
+#ifdef CONFIG_QUOTA
+ /* Needed for iput() to work correctly and not trash data */
+ sbi->sb->s_flags |= MS_ACTIVE;
+ /* Turn on quotas so that they are updated correctly */
+ f2fs_enable_quota_files(sbi);
+#endif
+
start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
@@ -601,14 +621,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
err = recover_orphan_inode(sbi, ino);
if (err) {
f2fs_put_page(page, 1);
- return err;
+ goto out;
}
}
f2fs_put_page(page, 1);
}
/* clear Orphan Flag */
clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
- return 0;
+out:
+#ifdef CONFIG_QUOTA
+ /* Turn quotas off */
+ f2fs_quota_off_umount(sbi->sb);
+#endif
+ sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+
+ return err;
}
static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
@@ -904,7 +931,14 @@ retry:
if (inode) {
unsigned long cur_ino = inode->i_ino;
+ if (is_dir)
+ F2FS_I(inode)->cp_task = current;
+
filemap_fdatawrite(inode->i_mapping);
+
+ if (is_dir)
+ F2FS_I(inode)->cp_task = NULL;
+
iput(inode);
/* We need to give cpu to another writers. */
if (ino == cur_ino) {
@@ -1017,7 +1051,7 @@ retry_flush_nodes:
if (get_pages(sbi, F2FS_DIRTY_NODES)) {
up_write(&sbi->node_write);
- err = sync_node_pages(sbi, &wbc);
+ err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
if (err) {
up_write(&sbi->node_change);
f2fs_unlock_all(sbi);
@@ -1115,7 +1149,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META)) {
- sync_meta_pages(sbi, META, LONG_MAX);
+ sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
}
@@ -1194,7 +1228,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Flush all the NAT BITS pages */
while (get_pages(sbi, F2FS_DIRTY_META)) {
- sync_meta_pages(sbi, META, LONG_MAX);
+ sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
}
@@ -1249,7 +1283,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
percpu_counter_set(&sbi->alloc_valid_block_count, 0);
/* Here, we only have one bio having CP pack */
- sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
+ sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO);
/* wait for previous submitted meta pages writeback */
wait_on_all_pages_writeback(sbi);
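
recover_orphan_inodes() above temporarily lifts MS_RDONLY (and, with quota enabled, sets MS_ACTIVE and turns quotas on) so iput() can do real work during orphan cleanup, then restores the saved flags word on every exit path. The save/restore skeleton in isolation:

    #define MS_RDONLY 0x1   /* stand-in for the kernel flag */

    static int cleanup_orphans(unsigned int *s_flags)
    {
            unsigned int saved = *s_flags;  /* remember everything */
            int err = 0;

            *s_flags &= ~MS_RDONLY;         /* writes needed for cleanup */
            /* ... the recover_orphan_inode() loop would run here ... */
            *s_flags = saved;               /* restore on all paths */
            return err;
    }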
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index fb96bb71da00..36b535207c88 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -457,14 +457,65 @@ out_fail:
return err;
}
+static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
+ unsigned nr_pages)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct fscrypt_ctx *ctx = NULL;
+ struct bio *bio;
+
+ if (f2fs_encrypted_file(inode)) {
+ ctx = fscrypt_get_ctx(inode, GFP_NOFS);
+ if (IS_ERR(ctx))
+ return ERR_CAST(ctx);
+
+ /* wait the page to be moved by cleaning */
+ f2fs_wait_on_block_writeback(sbi, blkaddr);
+ }
+
+ bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES));
+ if (!bio) {
+ if (ctx)
+ fscrypt_release_ctx(ctx);
+ return ERR_PTR(-ENOMEM);
+ }
+ f2fs_target_device(sbi, blkaddr, bio);
+ bio->bi_end_io = f2fs_read_end_io;
+ bio->bi_private = ctx;
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+ return bio;
+}
+
+/* This can handle encryption stuffs */
+static int f2fs_submit_page_read(struct inode *inode, struct page *page,
+ block_t blkaddr)
+{
+ struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1);
+
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ bio_put(bio);
+ return -EFAULT;
+ }
+ __submit_bio(F2FS_I_SB(inode), bio, DATA);
+ return 0;
+}
+
static void __set_data_blkaddr(struct dnode_of_data *dn)
{
struct f2fs_node *rn = F2FS_NODE(dn->node_page);
__le32 *addr_array;
+ int base = 0;
+
+ if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
+ base = get_extra_isize(dn->inode);
/* Get physical address of data block */
addr_array = blkaddr_in_node(rn);
- addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
+ addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
}
/*
@@ -508,8 +559,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
f2fs_wait_on_page_writeback(dn->node_page, NODE, true);
for (; count > 0; dn->ofs_in_node++) {
- block_t blkaddr =
- datablock_addr(dn->node_page, dn->ofs_in_node);
+ block_t blkaddr = datablock_addr(dn->inode,
+ dn->node_page, dn->ofs_in_node);
if (blkaddr == NULL_ADDR) {
dn->data_blkaddr = NEW_ADDR;
__set_data_blkaddr(dn);
@@ -570,16 +621,6 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index,
struct page *page;
struct extent_info ei = {0,0,0};
int err;
- struct f2fs_io_info fio = {
- .sbi = F2FS_I_SB(inode),
- .type = DATA,
- .op = REQ_OP_READ,
- .op_flags = op_flags,
- .encrypted_page = NULL,
- };
-
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- return read_mapping_page(mapping, index, NULL);
page = f2fs_grab_cache_page(mapping, index, for_write);
if (!page)
@@ -620,9 +661,7 @@ got_it:
return page;
}
- fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
- fio.page = page;
- err = f2fs_submit_page_bio(&fio);
+ err = f2fs_submit_page_read(inode, page, dn.data_blkaddr);
if (err)
goto put_err;
return page;
@@ -756,7 +795,8 @@ static int __allocate_data_block(struct dnode_of_data *dn)
if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return -EPERM;
- dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = datablock_addr(dn->inode,
+ dn->node_page, dn->ofs_in_node);
if (dn->data_blkaddr == NEW_ADDR)
goto alloc;
@@ -782,7 +822,7 @@ alloc:
static inline bool __force_buffered_io(struct inode *inode, int rw)
{
- return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) ||
+ return (f2fs_encrypted_file(inode) ||
(rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) ||
F2FS_I_SB(inode)->s_ndevs);
}
@@ -814,7 +854,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
F2FS_GET_BLOCK_PRE_AIO :
F2FS_GET_BLOCK_PRE_DIO);
}
- if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) {
+ if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) {
err = f2fs_convert_inline_inode(inode);
if (err)
return err;
@@ -903,7 +943,7 @@ next_dnode:
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
- blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
if (create) {
@@ -1040,7 +1080,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
return __get_data_block(inode, iblock, bh_result, create,
- F2FS_GET_BLOCK_DIO, NULL);
+ F2FS_GET_BLOCK_DEFAULT, NULL);
}
static int get_data_block_bmap(struct inode *inode, sector_t iblock,
@@ -1146,35 +1186,6 @@ out:
return ret;
}
-static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr,
- unsigned nr_pages)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct fscrypt_ctx *ctx = NULL;
- struct bio *bio;
-
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
- ctx = fscrypt_get_ctx(inode, GFP_NOFS);
- if (IS_ERR(ctx))
- return ERR_CAST(ctx);
-
- /* wait the page to be moved by cleaning */
- f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
- }
-
- bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES));
- if (!bio) {
- if (ctx)
- fscrypt_release_ctx(ctx);
- return ERR_PTR(-ENOMEM);
- }
- f2fs_target_device(sbi, blkaddr, bio);
- bio->bi_end_io = f2fs_read_end_io;
- bio->bi_private = ctx;
-
- return bio;
-}
-
/*
* This function was originally taken from fs/mpage.c, and customized for f2fs.
 * The major change is that block_size == page_size in f2fs by default.
@@ -1240,7 +1251,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_len = last_block - block_in_file;
if (f2fs_map_blocks(inode, &map, 0,
- F2FS_GET_BLOCK_READ))
+ F2FS_GET_BLOCK_DEFAULT))
goto set_error_page;
}
got_it:
@@ -1271,12 +1282,11 @@ submit_and_realloc:
bio = NULL;
}
if (bio == NULL) {
- bio = f2fs_grab_bio(inode, block_nr, nr_pages);
+ bio = f2fs_grab_read_bio(inode, block_nr, nr_pages);
if (IS_ERR(bio)) {
bio = NULL;
goto set_error_page;
}
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
}
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
@@ -1341,11 +1351,11 @@ static int encrypt_one_page(struct f2fs_io_info *fio)
struct inode *inode = fio->page->mapping->host;
gfp_t gfp_flags = GFP_NOFS;
- if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
+ if (!f2fs_encrypted_file(inode))
return 0;
/* wait for GCed encrypted page writeback */
- f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr);
+ f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr);
retry_encrypt:
fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page,
@@ -1471,7 +1481,8 @@ out:
}
static int __write_data_page(struct page *page, bool *submitted,
- struct writeback_control *wbc)
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -1492,6 +1503,7 @@ static int __write_data_page(struct page *page, bool *submitted,
.encrypted_page = NULL,
.submitted = false,
.need_lock = LOCK_RETRY,
+ .io_type = io_type,
};
trace_f2fs_writepage(page, DATA);
@@ -1598,7 +1610,7 @@ redirty_out:
static int f2fs_write_data_page(struct page *page,
struct writeback_control *wbc)
{
- return __write_data_page(page, NULL, wbc);
+ return __write_data_page(page, NULL, wbc, FS_DATA_IO);
}
/*
@@ -1607,7 +1619,8 @@ static int f2fs_write_data_page(struct page *page,
* warm/hot data page.
*/
static int f2fs_write_cache_pages(struct address_space *mapping,
- struct writeback_control *wbc)
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
{
int ret = 0;
int done = 0;
@@ -1697,7 +1710,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- ret = __write_data_page(page, &submitted, wbc);
+ ret = __write_data_page(page, &submitted, wbc, io_type);
if (unlikely(ret)) {
/*
* keep nr_to_write, since vfs uses this to
@@ -1752,8 +1765,9 @@ continue_unlock:
return ret;
}
-static int f2fs_write_data_pages(struct address_space *mapping,
- struct writeback_control *wbc)
+int __f2fs_write_data_pages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ enum iostat_type io_type)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -1790,7 +1804,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
goto skip_write;
blk_start_plug(&plug);
- ret = f2fs_write_cache_pages(mapping, wbc);
+ ret = f2fs_write_cache_pages(mapping, wbc, io_type);
blk_finish_plug(&plug);
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -1809,6 +1823,16 @@ skip_write:
return 0;
}
+static int f2fs_write_data_pages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+
+ return __f2fs_write_data_pages(mapping, wbc,
+ F2FS_I(inode)->cp_task == current ?
+ FS_CP_DATA_IO : FS_DATA_IO);
+}
+
static void f2fs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
@@ -1858,7 +1882,7 @@ restart:
set_new_dnode(&dn, inode, ipage, ipage, 0);
if (f2fs_has_inline_data(inode)) {
- if (pos + len <= MAX_INLINE_DATA) {
+ if (pos + len <= MAX_INLINE_DATA(inode)) {
read_inline_data(page, ipage);
set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
@@ -1956,8 +1980,8 @@ repeat:
f2fs_wait_on_page_writeback(page, DATA, false);
/* wait for GCed encrypted page writeback */
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
+ if (f2fs_encrypted_file(inode))
+ f2fs_wait_on_block_writeback(sbi, blkaddr);
if (len == PAGE_SIZE || PageUptodate(page))
return 0;
@@ -1971,21 +1995,9 @@ repeat:
zero_user_segment(page, 0, PAGE_SIZE);
SetPageUptodate(page);
} else {
- struct bio *bio;
-
- bio = f2fs_grab_bio(inode, blkaddr, 1);
- if (IS_ERR(bio)) {
- err = PTR_ERR(bio);
- goto fail;
- }
- bio->bi_opf = REQ_OP_READ;
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
- bio_put(bio);
- err = -EFAULT;
+ err = f2fs_submit_page_read(inode, page, blkaddr);
+ if (err)
goto fail;
- }
-
- __submit_bio(sbi, bio, DATA);
lock_page(page);
if (unlikely(page->mapping != mapping)) {
@@ -2075,10 +2087,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
up_read(&F2FS_I(inode)->dio_rwsem[rw]);
if (rw == WRITE) {
- if (err > 0)
+ if (err > 0) {
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
+ err);
set_inode_flag(inode, FI_UPDATE_WRITE);
- else if (err < 0)
+ } else if (err < 0) {
f2fs_write_failed(mapping, offset + count);
+ }
}
trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 37f9c7f55605..c0c933ad43c8 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -705,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
struct f2fs_dentry_block *dentry_blk;
unsigned int bit_pos;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
+ struct address_space *mapping = page_mapping(page);
+ unsigned long flags;
int i;
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
@@ -735,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!truncate_hole(dir, page->index, page->index + 1)) {
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
clear_page_dirty_for_io(page);
ClearPagePrivate(page);
ClearPageUptodate(page);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 94a88b233e98..9a7c90386947 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -91,6 +91,8 @@ extern char *fault_name[FAULT_MAX];
#define F2FS_MOUNT_LFS 0x00040000
#define F2FS_MOUNT_USRQUOTA 0x00080000
#define F2FS_MOUNT_GRPQUOTA 0x00100000
+#define F2FS_MOUNT_PRJQUOTA 0x00200000
+#define F2FS_MOUNT_QUOTA 0x00400000
#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -110,8 +112,12 @@ struct f2fs_mount_info {
unsigned int opt;
};
-#define F2FS_FEATURE_ENCRYPT 0x0001
-#define F2FS_FEATURE_BLKZONED 0x0002
+#define F2FS_FEATURE_ENCRYPT 0x0001
+#define F2FS_FEATURE_BLKZONED 0x0002
+#define F2FS_FEATURE_ATOMIC_WRITE 0x0004
+#define F2FS_FEATURE_EXTRA_ATTR 0x0008
+#define F2FS_FEATURE_PRJQUOTA 0x0010
+#define F2FS_FEATURE_INODE_CHKSUM 0x0020
#define F2FS_HAS_FEATURE(sb, mask) \
((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -142,6 +148,8 @@ enum {
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi)
#define DISCARD_ISSUE_RATE 8
+#define DEF_MIN_DISCARD_ISSUE_TIME	50	/* 50 ms, if candidates exist */
+#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */
#define DEF_CP_INTERVAL 60 /* 60 secs */
#define DEF_IDLE_INTERVAL 5 /* 5 secs */
@@ -190,11 +198,18 @@ struct discard_entry {
unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */
};
+/* default discard granularity of inner discard thread, unit: block count */
+#define DEFAULT_DISCARD_GRANULARITY 16
+
/* max discard pend list number */
#define MAX_PLIST_NUM 512
#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \
(MAX_PLIST_NUM - 1) : (blk_num - 1))
+#define P_ACTIVE 0x01
+#define P_TRIM 0x02
+#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM))
+
enum {
D_PREP,
D_SUBMIT,
@@ -230,11 +245,14 @@ struct discard_cmd_control {
struct task_struct *f2fs_issue_discard; /* discard thread */
struct list_head entry_list; /* 4KB discard entry list */
struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */
+ unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */
struct list_head wait_list; /* store on-flushing entries */
wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */
+ unsigned int discard_wake; /* to wake up discard thread */
struct mutex cmd_lock;
unsigned int nr_discards; /* # of discards in the list */
unsigned int max_discards; /* max. discards to be issued */
+ unsigned int discard_granularity; /* discard granularity */
unsigned int undiscard_blks; /* # of undiscard blocks */
atomic_t issued_discard; /* # of issued discard */
	atomic_t issing_discard;		/* # of issuing discards */
@@ -308,6 +326,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
struct f2fs_flush_device)
#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \
struct f2fs_gc_range)
+#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32)
#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
@@ -332,6 +351,9 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION
#endif
+#define F2FS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
+#define F2FS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
+
struct f2fs_gc_range {
u32 sync;
u64 start;
@@ -355,16 +377,36 @@ struct f2fs_flush_device {
u32 segments; /* # of segments to flush */
};
+/* for inline stuff */
+#define DEF_INLINE_RESERVED_SIZE 1
+static inline int get_extra_isize(struct inode *inode);
+#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \
+ (CUR_ADDRS_PER_INODE(inode) - \
+ DEF_INLINE_RESERVED_SIZE - \
+ F2FS_INLINE_XATTR_ADDRS))
+
+/* for inline dir */
+#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \
+ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
+ BITS_PER_BYTE + 1))
+#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \
+ BITS_PER_BYTE - 1) / BITS_PER_BYTE)
+#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \
+ ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
+ NR_INLINE_DENTRY(inode) + \
+ INLINE_DENTRY_BITMAP_SIZE(inode)))
+
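
A worked sizing may help here (illustrative only; the constants below are the stock f2fs.h defaults, assumed rather than defined in this hunk) for an inode without extra attributes, i.e. CUR_ADDRS_PER_INODE = 923, with F2FS_INLINE_XATTR_ADDRS = 50, SIZE_OF_DIR_ENTRY = 11 and F2FS_SLOT_LEN = 8:

	/*
	 *   MAX_INLINE_DATA           = 4 * (923 - 1 - 50)      = 3488 bytes
	 *   NR_INLINE_DENTRY          = 3488 * 8 / (19 * 8 + 1) = 182 entries
	 *   INLINE_DENTRY_BITMAP_SIZE = (182 + 7) / 8           = 23 bytes
	 *   INLINE_RESERVED_SIZE      = 3488 - (19 * 182 + 23)  = 7 bytes
	 */
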
/*
* For INODE and NODE manager
*/
/* for directory operations */
struct f2fs_dentry_ptr {
struct inode *inode;
- const void *bitmap;
+ void *bitmap;
struct f2fs_dir_entry *dentry;
__u8 (*filename)[F2FS_SLOT_LEN];
int max;
+ int nr_bitmap;
};
static inline void make_dentry_ptr_block(struct inode *inode,
@@ -372,19 +414,26 @@ static inline void make_dentry_ptr_block(struct inode *inode,
{
d->inode = inode;
d->max = NR_DENTRY_IN_BLOCK;
+ d->nr_bitmap = SIZE_OF_DENTRY_BITMAP;
d->bitmap = &t->dentry_bitmap;
d->dentry = t->dentry;
d->filename = t->filename;
}
static inline void make_dentry_ptr_inline(struct inode *inode,
- struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t)
+ struct f2fs_dentry_ptr *d, void *t)
{
+ int entry_cnt = NR_INLINE_DENTRY(inode);
+ int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode);
+ int reserved_size = INLINE_RESERVED_SIZE(inode);
+
d->inode = inode;
- d->max = NR_INLINE_DENTRY;
- d->bitmap = &t->dentry_bitmap;
- d->dentry = t->dentry;
- d->filename = t->filename;
+ d->max = entry_cnt;
+ d->nr_bitmap = bitmap_size;
+ d->bitmap = t;
+ d->dentry = t + bitmap_size + reserved_size;
+ d->filename = t + bitmap_size + reserved_size +
+ SIZE_OF_DIR_ENTRY * entry_cnt;
}
/*
@@ -473,12 +522,13 @@ struct f2fs_map_blocks {
};
/* for flag in get_data_block */
-#define F2FS_GET_BLOCK_READ 0
-#define F2FS_GET_BLOCK_DIO 1
-#define F2FS_GET_BLOCK_FIEMAP 2
-#define F2FS_GET_BLOCK_BMAP 3
-#define F2FS_GET_BLOCK_PRE_DIO 4
-#define F2FS_GET_BLOCK_PRE_AIO 5
+enum {
+ F2FS_GET_BLOCK_DEFAULT,
+ F2FS_GET_BLOCK_FIEMAP,
+ F2FS_GET_BLOCK_BMAP,
+ F2FS_GET_BLOCK_PRE_DIO,
+ F2FS_GET_BLOCK_PRE_AIO,
+};
/*
* i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
@@ -521,6 +571,7 @@ struct f2fs_inode_info {
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
struct task_struct *task; /* lookup and create consistency */
+	struct task_struct *cp_task;	/* separate cp/wb IO stats */
nid_t i_xattr_nid; /* node id that contains xattrs */
loff_t last_disk_size; /* lastly written file size */
@@ -533,10 +584,15 @@ struct f2fs_inode_info {
struct list_head dirty_list; /* dirty list for dirs and files */
struct list_head gdirty_list; /* linked in global dirty list */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
+ struct task_struct *inmem_task; /* store inmemory task */
struct mutex inmem_lock; /* lock for inmemory pages */
struct extent_tree *extent_tree; /* cached extent_tree entry */
struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */
struct rw_semaphore i_mmap_sem;
+ struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */
+
+ int i_extra_isize; /* size of extra space located in i_addr */
+ kprojid_t i_projid; /* id for project quota */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -823,6 +879,23 @@ enum need_lock_type {
LOCK_RETRY,
};
+enum iostat_type {
+ APP_DIRECT_IO, /* app direct IOs */
+ APP_BUFFERED_IO, /* app buffered IOs */
+ APP_WRITE_IO, /* app write IOs */
+ APP_MAPPED_IO, /* app mapped IOs */
+ FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */
+ FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */
+ FS_META_IO, /* meta IOs from kworker/reclaimer */
+	FS_GC_DATA_IO,			/* data IOs from foreground gc */
+	FS_GC_NODE_IO,			/* node IOs from foreground gc */
+ FS_CP_DATA_IO, /* data IOs from checkpoint */
+ FS_CP_NODE_IO, /* node IOs from checkpoint */
+ FS_CP_META_IO, /* meta IOs from checkpoint */
+ FS_DISCARD, /* discard */
+ NR_IO_TYPE,
+};
+
struct f2fs_io_info {
struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
@@ -837,6 +910,7 @@ struct f2fs_io_info {
bool submitted; /* indicate IO submission */
int need_lock; /* indicate we need to lock cp_rwsem */
bool in_list; /* indicate fio is in io_list */
+ enum iostat_type io_type; /* io type */
};
#define is_read_io(rw) ((rw) == READ)
@@ -1028,6 +1102,11 @@ struct f2fs_sb_info {
#endif
spinlock_t stat_lock; /* lock for stat operations */
+ /* For app/fs IO statistics */
+ spinlock_t iostat_lock;
+ unsigned long long write_iostat[NR_IO_TYPE];
+ bool iostat_enable;
+
	/* For sysfs support */
struct kobject s_kobj;
struct completion s_kobj_unregister;
@@ -1046,10 +1125,19 @@ struct f2fs_sb_info {
/* Reference to checksum algorithm driver via cryptoapi */
struct crypto_shash *s_chksum_driver;
+ /* Precomputed FS UUID checksum for seeding other checksums */
+ __u32 s_chksum_seed;
+
/* For fault injection */
#ifdef CONFIG_F2FS_FAULT_INJECTION
struct f2fs_fault_info fault_info;
#endif
+
+#ifdef CONFIG_QUOTA
+ /* Names of quota files with journalled quota */
+ char *s_qf_names[MAXQUOTAS];
+ int s_jquota_fmt; /* Format of quota to use */
+#endif
};
#ifdef CONFIG_F2FS_FAULT_INJECTION
@@ -1137,6 +1225,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
return f2fs_crc32(sbi, buf, buf_size) == blk_crc;
}
+static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc,
+ const void *address, unsigned int length)
+{
+ struct {
+ struct shash_desc shash;
+ char ctx[4];
+ } desc;
+ int err;
+
+ BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx));
+
+ desc.shash.tfm = sbi->s_chksum_driver;
+ desc.shash.flags = 0;
+ *(u32 *)desc.ctx = crc;
+
+ err = crypto_shash_update(&desc.shash, address, length);
+ BUG_ON(err);
+
+ return *(u32 *)desc.ctx;
+}
+
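
As a usage sketch (a hypothetical helper, not part of this patch; it assumes only the f2fs_chksum() signature above and the s_chksum_seed field added to f2fs_sb_info), a caller can chain the precomputed UUID seed over per-inode fields as one continuous crc32 stream:

	static inline __u32 example_inode_chksum_seed(struct f2fs_sb_info *sbi,
						      nid_t ino, __u32 gen)
	{
		__u32 crc;

		/* extend the crc32(UUID) seed over the inode number ... */
		crc = f2fs_chksum(sbi, sbi->s_chksum_seed, &ino, sizeof(ino));
		/* ... and then over the generation, continuing the stream */
		crc = f2fs_chksum(sbi, crc, &gen, sizeof(gen));
		return crc;
	}
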
static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
{
return container_of(inode, struct f2fs_inode_info, vfs_inode);
@@ -1760,20 +1869,38 @@ static inline bool IS_INODE(struct page *page)
return RAW_IS_INODE(p);
}
+static inline int offset_in_addr(struct f2fs_inode *i)
+{
+ return (i->i_inline & F2FS_EXTRA_ATTR) ?
+ (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0;
+}
+
static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
{
return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr;
}
-static inline block_t datablock_addr(struct page *node_page,
- unsigned int offset)
+static inline int f2fs_has_extra_attr(struct inode *inode);
+static inline block_t datablock_addr(struct inode *inode,
+ struct page *node_page, unsigned int offset)
{
struct f2fs_node *raw_node;
__le32 *addr_array;
+ int base = 0;
+ bool is_inode = IS_INODE(node_page);
raw_node = F2FS_NODE(node_page);
+
+ /* from GC path only */
+ if (!inode) {
+ if (is_inode)
+ base = offset_in_addr(&raw_node->i);
+ } else if (f2fs_has_extra_attr(inode) && is_inode) {
+ base = get_extra_isize(inode);
+ }
+
addr_array = blkaddr_in_node(raw_node);
- return le32_to_cpu(addr_array[offset]);
+ return le32_to_cpu(addr_array[base + offset]);
}
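
As a concrete illustration (numbers hypothetical): an inode whose i_extra_isize is 36 bytes gives base = 36 / sizeof(__le32) = 9, so the address of logical block 0 is read from i_addr[9] rather than i_addr[0]; on the GC path, where no in-memory inode is available, the same base is derived from the raw node's i_extra_isize via offset_in_addr().
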
static inline int f2fs_test_bit(unsigned int nr, char *addr)
@@ -1836,6 +1963,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
*addr ^= mask;
}
+#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
+#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
+#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL)
+
+static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & F2FS_REG_FLMASK;
+ else
+ return flags & F2FS_OTHER_FLMASK;
+}
+
/* used for f2fs_inode_info->flags */
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
@@ -1864,6 +2005,8 @@ enum {
FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
FI_HOT_DATA, /* indicate file is hot */
+ FI_EXTRA_ATTR, /* indicate file has extra attribute */
+ FI_PROJ_INHERIT, /* indicate file inherits projectid */
};
static inline void __mark_inode_dirty_flag(struct inode *inode,
@@ -1983,6 +2126,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
set_bit(FI_DATA_EXIST, &fi->flags);
if (ri->i_inline & F2FS_INLINE_DOTS)
set_bit(FI_INLINE_DOTS, &fi->flags);
+ if (ri->i_inline & F2FS_EXTRA_ATTR)
+ set_bit(FI_EXTRA_ATTR, &fi->flags);
}
static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
@@ -1999,6 +2144,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
ri->i_inline |= F2FS_DATA_EXIST;
if (is_inode_flag_set(inode, FI_INLINE_DOTS))
ri->i_inline |= F2FS_INLINE_DOTS;
+ if (is_inode_flag_set(inode, FI_EXTRA_ATTR))
+ ri->i_inline |= F2FS_EXTRA_ATTR;
+}
+
+static inline int f2fs_has_extra_attr(struct inode *inode)
+{
+ return is_inode_flag_set(inode, FI_EXTRA_ATTR);
}
static inline int f2fs_has_inline_xattr(struct inode *inode)
@@ -2009,8 +2161,8 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
static inline unsigned int addrs_per_inode(struct inode *inode)
{
if (f2fs_has_inline_xattr(inode))
- return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
- return DEF_ADDRS_PER_INODE;
+ return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS;
+ return CUR_ADDRS_PER_INODE(inode);
}
static inline void *inline_xattr_addr(struct page *page)
@@ -2069,11 +2221,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode)
return is_inode_flag_set(inode, FI_DROP_CACHE);
}
-static inline void *inline_data_addr(struct page *page)
+static inline void *inline_data_addr(struct inode *inode, struct page *page)
{
struct f2fs_inode *ri = F2FS_INODE(page);
+ int extra_size = get_extra_isize(inode);
- return (void *)&(ri->i_addr[1]);
+ return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]);
}
static inline int f2fs_has_inline_dentry(struct inode *inode)
@@ -2164,10 +2317,50 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
return kmalloc(size, flags);
}
+static inline int get_extra_isize(struct inode *inode)
+{
+ return F2FS_I(inode)->i_extra_isize / sizeof(__le32);
+}
+
#define get_inode_mode(i) \
((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
+#define F2FS_TOTAL_EXTRA_ATTR_SIZE \
+ (offsetof(struct f2fs_inode, i_extra_end) - \
+ offsetof(struct f2fs_inode, i_extra_isize)) \
+
+#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr))
+#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \
+ ((offsetof(typeof(*f2fs_inode), field) + \
+ sizeof((f2fs_inode)->field)) \
+ <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \
+
+static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi)
+{
+ int i;
+
+ spin_lock(&sbi->iostat_lock);
+ for (i = 0; i < NR_IO_TYPE; i++)
+ sbi->write_iostat[i] = 0;
+ spin_unlock(&sbi->iostat_lock);
+}
+
+static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
+ enum iostat_type type, unsigned long long io_bytes)
+{
+ if (!sbi->iostat_enable)
+ return;
+ spin_lock(&sbi->iostat_lock);
+ sbi->write_iostat[type] += io_bytes;
+
+ if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
+ sbi->write_iostat[APP_BUFFERED_IO] =
+ sbi->write_iostat[APP_WRITE_IO] -
+ sbi->write_iostat[APP_DIRECT_IO];
+ spin_unlock(&sbi->iostat_lock);
+}
+
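
A minimal read-side sketch (hypothetical; it assumes only the iostat_lock and write_iostat members declared above) shows how a reporting path such as a sysfs show handler would snapshot a counter; note that APP_BUFFERED_IO is maintained as the derived difference APP_WRITE_IO minus APP_DIRECT_IO rather than being accounted directly:

	static inline unsigned long long example_read_iostat(
				struct f2fs_sb_info *sbi, enum iostat_type type)
	{
		unsigned long long val;

		spin_lock(&sbi->iostat_lock);
		val = sbi->write_iostat[type];
		spin_unlock(&sbi->iostat_lock);
		return val;
	}
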
/*
* file.c
*/
@@ -2187,6 +2380,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
* inode.c
*/
void f2fs_set_inode_flags(struct inode *inode);
+bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page);
+void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page);
struct inode *f2fs_iget(struct super_block *sb, unsigned long ino);
struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino);
int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink);
@@ -2255,6 +2450,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
*/
int f2fs_inode_dirtied(struct inode *inode, bool sync);
void f2fs_inode_synced(struct inode *inode);
+void f2fs_enable_quota_files(struct f2fs_sb_info *sbi);
+void f2fs_quota_off_umount(struct super_block *sb);
int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
int f2fs_sync_fs(struct super_block *sb, int sync);
extern __printf(3, 4)
@@ -2285,15 +2482,15 @@ int truncate_xattr_node(struct inode *inode, struct page *page);
int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
int remove_inode_page(struct inode *inode);
struct page *new_inode_page(struct inode *inode);
-struct page *new_node_page(struct dnode_of_data *dn,
- unsigned int ofs, struct page *ipage);
+struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs);
void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
struct page *get_node_page_ra(struct page *parent, int start);
void move_node_page(struct page *node_page, int gc_type);
int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct writeback_control *wbc, bool atomic);
-int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc);
+int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc,
+ bool do_balance, enum iostat_type io_type);
void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount);
bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid);
void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid);
@@ -2314,6 +2511,7 @@ void destroy_node_manager_caches(void);
/*
* segment.c
*/
+bool need_SSR(struct f2fs_sb_info *sbi);
void register_inmem_page(struct inode *inode, struct page *page);
void drop_inmem_pages(struct inode *inode);
void drop_inmem_page(struct inode *inode, struct page *page);
@@ -2336,7 +2534,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc);
struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno);
void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr);
-void write_meta_page(struct f2fs_sb_info *sbi, struct page *page);
+void write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
+ enum iostat_type io_type);
void write_node_page(unsigned int nid, struct f2fs_io_info *fio);
void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio);
int rewrite_data_page(struct f2fs_io_info *fio);
@@ -2353,8 +2552,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
struct f2fs_io_info *fio, bool add_list);
void f2fs_wait_on_page_writeback(struct page *page,
enum page_type type, bool ordered);
-void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
- block_t blkaddr);
+void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr);
void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk);
int lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
@@ -2377,7 +2575,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync);
void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index);
long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
- long nr_to_write);
+ long nr_to_write, enum iostat_type io_type);
void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
void release_ino_entry(struct f2fs_sb_info *sbi, bool all);
@@ -2430,6 +2628,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
void f2fs_set_page_dirty_nobuffers(struct page *page);
+int __f2fs_write_data_pages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ enum iostat_type io_type);
void f2fs_invalidate_page(struct page *page, unsigned int offset,
unsigned int length);
int f2fs_release_page(struct page *page, gfp_t wait);
@@ -2726,10 +2927,10 @@ void destroy_extent_cache(void);
/*
* sysfs.c
*/
-int __init f2fs_register_sysfs(void);
-void f2fs_unregister_sysfs(void);
-int f2fs_init_sysfs(struct f2fs_sb_info *sbi);
-void f2fs_exit_sysfs(struct f2fs_sb_info *sbi);
+int __init f2fs_init_sysfs(void);
+void f2fs_exit_sysfs(void);
+int f2fs_register_sysfs(struct f2fs_sb_info *sbi);
+void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi);
/*
* crypto support
@@ -2739,6 +2940,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode)
return file_is_encrypt(inode);
}
+static inline bool f2fs_encrypted_file(struct inode *inode)
+{
+ return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode);
+}
+
static inline void f2fs_set_encrypted_inode(struct inode *inode)
{
#ifdef CONFIG_F2FS_FS_ENCRYPTION
@@ -2761,6 +2967,21 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb)
return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED);
}
+static inline int f2fs_sb_has_extra_attr(struct super_block *sb)
+{
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR);
+}
+
+static inline int f2fs_sb_has_project_quota(struct super_block *sb)
+{
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA);
+}
+
+static inline int f2fs_sb_has_inode_chksum(struct super_block *sb)
+{
+ return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM);
+}
+
#ifdef CONFIG_BLK_DEV_ZONED
static inline int get_blkz_type(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t blkaddr)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 843a0d99f7ea..517e112c8a9a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -98,14 +98,16 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (!PageUptodate(page))
SetPageUptodate(page);
+ f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE);
+
trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
/* fill the page */
f2fs_wait_on_page_writeback(page, DATA, false);
/* wait for GCed encrypted page writeback */
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+ if (f2fs_encrypted_file(inode))
+ f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr);
out_sem:
up_read(&F2FS_I(inode)->i_mmap_sem);
@@ -274,9 +276,19 @@ sync_nodes:
goto sync_nodes;
}
- ret = wait_on_node_pages_writeback(sbi, ino);
- if (ret)
- goto out;
+ /*
+	 * If it's atomic_write, it's fine to rely on write ordering alone, so
+	 * we don't need to wait for node write completion here: the node
+	 * chain serializes node blocks. If one of the node writes is
+	 * reordered, we simply see a broken chain and stop roll-forward
+	 * recovery, which means we recover either all or none of the node
+	 * blocks given the fsync mark.
+ */
+ if (!atomic) {
+ ret = wait_on_node_pages_writeback(sbi, ino);
+ if (ret)
+ goto out;
+ }
	/* once recovery info is written, we don't need to track this */
remove_ino_entry(sbi, ino, APPEND_INO);
@@ -382,7 +394,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
dn.ofs_in_node++, pgofs++,
data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ blkaddr = datablock_addr(dn.inode,
+ dn.node_page, dn.ofs_in_node);
if (__found_offset(blkaddr, dirty, pgofs, whence)) {
f2fs_put_dnode(&dn);
@@ -467,9 +480,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
struct f2fs_node *raw_node;
int nr_free = 0, ofs = dn->ofs_in_node, len = count;
__le32 *addr;
+ int base = 0;
+
+ if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
+ base = get_extra_isize(dn->inode);
raw_node = F2FS_NODE(dn->node_page);
- addr = blkaddr_in_node(raw_node) + ofs;
+ addr = blkaddr_in_node(raw_node) + base + ofs;
for (; count > 0; count--, addr++, dn->ofs_in_node++) {
block_t blkaddr = le32_to_cpu(*addr);
@@ -647,7 +664,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int flags;
- flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL);
if (flags & FS_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
if (flags & FS_COMPR_FL)
@@ -927,7 +944,8 @@ next_dnode:
done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
dn.ofs_in_node, len);
for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
- *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ *blkaddr = datablock_addr(dn.inode,
+ dn.node_page, dn.ofs_in_node);
if (!is_checkpointed_data(sbi, *blkaddr)) {
if (test_opt(sbi, LFS)) {
@@ -1003,8 +1021,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
ADDRS_PER_PAGE(dn.node_page, dst_inode) -
dn.ofs_in_node, len - i);
do {
- dn.data_blkaddr = datablock_addr(dn.node_page,
- dn.ofs_in_node);
+ dn.data_blkaddr = datablock_addr(dn.inode,
+ dn.node_page, dn.ofs_in_node);
truncate_data_blocks_range(&dn, 1);
if (do_replace[i]) {
@@ -1173,7 +1191,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
int ret;
for (; index < end; index++, dn->ofs_in_node++) {
- if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR)
+ if (datablock_addr(dn->inode, dn->node_page,
+ dn->ofs_in_node) == NULL_ADDR)
count++;
}
@@ -1184,8 +1203,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
dn->ofs_in_node = ofs_in_node;
for (index = start; index < end; index++, dn->ofs_in_node++) {
- dn->data_blkaddr =
- datablock_addr(dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = datablock_addr(dn->inode,
+ dn->node_page, dn->ofs_in_node);
/*
* reserve_new_blocks will not guarantee entire block
* allocation.
@@ -1495,33 +1514,67 @@ static int f2fs_release_file(struct inode *inode, struct file *filp)
return 0;
}
-#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
-#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
-
-static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
+static int f2fs_file_flush(struct file *file, fl_owner_t id)
{
- if (S_ISDIR(mode))
- return flags;
- else if (S_ISREG(mode))
- return flags & F2FS_REG_FLMASK;
- else
- return flags & F2FS_OTHER_FLMASK;
+ struct inode *inode = file_inode(file);
+
+ /*
+	 * If the process doing a transaction crashes, we should roll it
+	 * back. Otherwise, other readers/writers can see a corrupted database
+	 * until all the writers close the file. Since this should be done
+	 * before dropping the file lock, it needs to happen in ->flush.
+ */
+ if (f2fs_is_atomic_file(inode) &&
+ F2FS_I(inode)->inmem_task == current)
+ drop_inmem_pages(inode);
+ return 0;
}
static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
- unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
+ unsigned int flags = fi->i_flags &
+ (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL);
return put_user(flags, (int __user *)arg);
}
+static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags)
+{
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ unsigned int oldflags;
+
+ /* Is it quota file? Do not allow user to mess with it */
+ if (IS_NOQUOTA(inode))
+ return -EPERM;
+
+ flags = f2fs_mask_flags(inode->i_mode, flags);
+
+ oldflags = fi->i_flags;
+
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL))
+ if (!capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ flags = flags & (FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL);
+ flags |= oldflags & ~(FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL);
+ fi->i_flags = flags;
+
+ if (fi->i_flags & FS_PROJINHERIT_FL)
+ set_inode_flag(inode, FI_PROJ_INHERIT);
+ else
+ clear_inode_flag(inode, FI_PROJ_INHERIT);
+
+ inode->i_ctime = current_time(inode);
+ f2fs_set_inode_flags(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
+ return 0;
+}
+
static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- struct f2fs_inode_info *fi = F2FS_I(inode);
unsigned int flags;
- unsigned int oldflags;
int ret;
if (!inode_owner_or_capable(inode))
@@ -1536,31 +1589,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
inode_lock(inode);
- /* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode)) {
- ret = -EPERM;
- goto unlock_out;
- }
-
- flags = f2fs_mask_flags(inode->i_mode, flags);
-
- oldflags = fi->i_flags;
-
- if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- ret = -EPERM;
- goto unlock_out;
- }
- }
-
- flags = flags & FS_FL_USER_MODIFIABLE;
- flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
- fi->i_flags = flags;
+ ret = __f2fs_ioc_setflags(inode, flags);
- inode->i_ctime = current_time(inode);
- f2fs_set_inode_flags(inode);
- f2fs_mark_inode_dirty_sync(inode, false);
-unlock_out:
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
@@ -1610,10 +1640,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX);
if (ret) {
clear_inode_flag(inode, FI_ATOMIC_FILE);
+ clear_inode_flag(inode, FI_HOT_DATA);
goto out;
}
inc_stat:
+ F2FS_I(inode)->inmem_task = current;
stat_inc_atomic_write(inode);
stat_update_max_atomic_write(inode);
out:
@@ -1647,10 +1679,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
if (!ret) {
clear_inode_flag(inode, FI_ATOMIC_FILE);
+ clear_inode_flag(inode, FI_HOT_DATA);
stat_dec_atomic_write(inode);
}
} else {
- ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false);
}
err_out:
inode_unlock(inode);
@@ -1786,7 +1819,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
f2fs_stop_checkpoint(sbi, false);
break;
case F2FS_GOING_DOWN_METAFLUSH:
- sync_meta_pages(sbi, META, LONG_MAX);
+ sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO);
f2fs_stop_checkpoint(sbi, false);
break;
default:
@@ -2043,7 +2076,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
*/
while (map.m_lblk < pg_end) {
map.m_len = pg_end - map.m_lblk;
- err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
if (err)
goto out;
@@ -2085,7 +2118,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
do_map:
map.m_len = pg_end - map.m_lblk;
- err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+ err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
if (err)
goto clear_out;
@@ -2384,6 +2417,210 @@ out:
return ret;
}
+static int f2fs_ioc_get_features(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature);
+
+	/* Always report atomic write support; SQLite on Android relies on it. */
+ sb_feature |= F2FS_FEATURE_ATOMIC_WRITE;
+
+ return put_user(sb_feature, (u32 __user *)arg);
+}
+
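
From userspace, the new ioctl can be exercised with a short program (a sketch: the 0xf5 magic and the feature bit values are assumed to match the kernel definitions above, and the file path is a placeholder):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>

	#define F2FS_IOCTL_MAGIC		0xf5	/* assumed kernel value */
	#define F2FS_IOC_GET_FEATURES		_IOR(F2FS_IOCTL_MAGIC, 12, __u32)
	#define F2FS_FEATURE_ATOMIC_WRITE	0x0004

	int main(void)
	{
		__u32 feat = 0;
		int fd = open("/mnt/f2fs/somefile", O_RDONLY); /* placeholder */

		if (fd < 0 || ioctl(fd, F2FS_IOC_GET_FEATURES, &feat) < 0)
			return 1;
		printf("feature mask: 0x%x, atomic_write: %s\n", feat,
		       (feat & F2FS_FEATURE_ATOMIC_WRITE) ? "yes" : "no");
		close(fd);
		return 0;
	}
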
+#ifdef CONFIG_QUOTA
+static int f2fs_ioc_setproject(struct file *filp, __u32 projid)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct super_block *sb = sbi->sb;
+ struct dquot *transfer_to[MAXQUOTAS] = {};
+ struct page *ipage;
+ kprojid_t kprojid;
+ int err;
+
+ if (!f2fs_sb_has_project_quota(sb)) {
+ if (projid != F2FS_DEF_PROJID)
+ return -EOPNOTSUPP;
+ else
+ return 0;
+ }
+
+ if (!f2fs_has_extra_attr(inode))
+ return -EOPNOTSUPP;
+
+ kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
+
+ if (projid_eq(kprojid, F2FS_I(inode)->i_projid))
+ return 0;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ err = -EPERM;
+ inode_lock(inode);
+
+ /* Is it quota file? Do not allow user to mess with it */
+ if (IS_NOQUOTA(inode))
+ goto out_unlock;
+
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto out_unlock;
+ }
+
+ if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize,
+ i_projid)) {
+ err = -EOVERFLOW;
+ f2fs_put_page(ipage, 1);
+ goto out_unlock;
+ }
+ f2fs_put_page(ipage, 1);
+
+ dquot_initialize(inode);
+
+ transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
+ if (!IS_ERR(transfer_to[PRJQUOTA])) {
+ err = __dquot_transfer(inode, transfer_to);
+ dqput(transfer_to[PRJQUOTA]);
+ if (err)
+ goto out_dirty;
+ }
+
+ F2FS_I(inode)->i_projid = kprojid;
+ inode->i_ctime = current_time(inode);
+out_dirty:
+ f2fs_mark_inode_dirty_sync(inode, true);
+out_unlock:
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ return err;
+}
+#else
+static int f2fs_ioc_setproject(struct file *filp, __u32 projid)
+{
+ if (projid != F2FS_DEF_PROJID)
+ return -EOPNOTSUPP;
+ return 0;
+}
+#endif
+
+/* Transfer internal flags to xflags */
+static inline __u32 f2fs_iflags_to_xflags(unsigned long iflags)
+{
+ __u32 xflags = 0;
+
+ if (iflags & FS_SYNC_FL)
+ xflags |= FS_XFLAG_SYNC;
+ if (iflags & FS_IMMUTABLE_FL)
+ xflags |= FS_XFLAG_IMMUTABLE;
+ if (iflags & FS_APPEND_FL)
+ xflags |= FS_XFLAG_APPEND;
+ if (iflags & FS_NODUMP_FL)
+ xflags |= FS_XFLAG_NODUMP;
+ if (iflags & FS_NOATIME_FL)
+ xflags |= FS_XFLAG_NOATIME;
+ if (iflags & FS_PROJINHERIT_FL)
+ xflags |= FS_XFLAG_PROJINHERIT;
+ return xflags;
+}
+
+#define F2FS_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \
+ FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \
+ FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT)
+
+/* Flags we can manipulate through F2FS_IOC_FSSETXATTR */
+#define F2FS_FL_XFLAG_VISIBLE (FS_SYNC_FL | \
+ FS_IMMUTABLE_FL | \
+ FS_APPEND_FL | \
+ FS_NODUMP_FL | \
+ FS_NOATIME_FL | \
+ FS_PROJINHERIT_FL)
+
+/* Transfer xflags flags to internal */
+static inline unsigned long f2fs_xflags_to_iflags(__u32 xflags)
+{
+ unsigned long iflags = 0;
+
+ if (xflags & FS_XFLAG_SYNC)
+ iflags |= FS_SYNC_FL;
+ if (xflags & FS_XFLAG_IMMUTABLE)
+ iflags |= FS_IMMUTABLE_FL;
+ if (xflags & FS_XFLAG_APPEND)
+ iflags |= FS_APPEND_FL;
+ if (xflags & FS_XFLAG_NODUMP)
+ iflags |= FS_NODUMP_FL;
+ if (xflags & FS_XFLAG_NOATIME)
+ iflags |= FS_NOATIME_FL;
+ if (xflags & FS_XFLAG_PROJINHERIT)
+ iflags |= FS_PROJINHERIT_FL;
+
+ return iflags;
+}
+
+static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct fsxattr fa;
+
+ memset(&fa, 0, sizeof(struct fsxattr));
+ fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags &
+ (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL));
+
+ if (f2fs_sb_has_project_quota(inode->i_sb))
+ fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
+ fi->i_projid);
+
+ if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa)))
+ return -EFAULT;
+ return 0;
+}
+
+static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct fsxattr fa;
+ unsigned int flags;
+ int err;
+
+ if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa)))
+ return -EFAULT;
+
+ /* Make sure caller has proper permission */
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (fa.fsx_xflags & ~F2FS_SUPPORTED_FS_XFLAGS)
+ return -EOPNOTSUPP;
+
+ flags = f2fs_xflags_to_iflags(fa.fsx_xflags);
+ if (f2fs_mask_flags(inode->i_mode, flags) != flags)
+ return -EOPNOTSUPP;
+
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ inode_lock(inode);
+ flags = (fi->i_flags & ~F2FS_FL_XFLAG_VISIBLE) |
+ (flags & F2FS_FL_XFLAG_VISIBLE);
+ err = __f2fs_ioc_setflags(inode, flags);
+ inode_unlock(inode);
+ mnt_drop_write_file(filp);
+ if (err)
+ return err;
+
+ err = f2fs_ioc_setproject(filp, fa.fsx_projid);
+ if (err)
+ return err;
+
+ return 0;
+}
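
Because F2FS_IOC_FSGETXATTR/FSSETXATTR alias the generic FS_IOC_* commands, userspace needs nothing f2fs-specific. A hedged sketch (hypothetical helper; the caller supplies an open fd) of tagging a directory with a project ID plus inheritance:

	#include <sys/ioctl.h>
	#include <linux/fs.h>	/* struct fsxattr, FS_IOC_FSGETXATTR/FSSETXATTR */

	static int example_set_projid(int fd, __u32 projid)
	{
		struct fsxattr fa;

		if (ioctl(fd, FS_IOC_FSGETXATTR, &fa) < 0)
			return -1;
		fa.fsx_projid = projid;
		fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;	/* new files inherit it */
		return ioctl(fd, FS_IOC_FSSETXATTR, &fa);
	}
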
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
@@ -2426,6 +2663,12 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_move_range(filp, arg);
case F2FS_IOC_FLUSH_DEVICE:
return f2fs_ioc_flush_device(filp, arg);
+ case F2FS_IOC_GET_FEATURES:
+ return f2fs_ioc_get_features(filp, arg);
+ case F2FS_IOC_FSGETXATTR:
+ return f2fs_ioc_fsgetxattr(filp, arg);
+ case F2FS_IOC_FSSETXATTR:
+ return f2fs_ioc_fssetxattr(filp, arg);
default:
return -ENOTTY;
}
@@ -2455,6 +2698,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = __generic_file_write_iter(iocb, from);
blk_finish_plug(&plug);
clear_inode_flag(inode, FI_NO_PREALLOC);
+
+ if (ret > 0)
+ f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret);
}
inode_unlock(inode);
@@ -2491,6 +2737,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_DEFRAGMENT:
case F2FS_IOC_MOVE_RANGE:
case F2FS_IOC_FLUSH_DEVICE:
+ case F2FS_IOC_GET_FEATURES:
+ case F2FS_IOC_FSGETXATTR:
+ case F2FS_IOC_FSSETXATTR:
break;
default:
return -ENOIOCTLCMD;
@@ -2506,6 +2755,7 @@ const struct file_operations f2fs_file_operations = {
.open = f2fs_file_open,
.release = f2fs_release_file,
.mmap = f2fs_file_mmap,
+ .flush = f2fs_file_flush,
.fsync = f2fs_sync_file,
.fallocate = f2fs_fallocate,
.unlocked_ioctl = f2fs_ioctl,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index fa3d2e2df8e7..bfe6a8ccc3a0 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -28,16 +28,21 @@ static int gc_thread_func(void *data)
struct f2fs_sb_info *sbi = data;
struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
- long wait_ms;
+ unsigned int wait_ms;
wait_ms = gc_th->min_sleep_time;
set_freezable();
do {
wait_event_interruptible_timeout(*wq,
- kthread_should_stop() || freezing(current),
+ kthread_should_stop() || freezing(current) ||
+ gc_th->gc_wake,
msecs_to_jiffies(wait_ms));
+		/* give it one try */
+ if (gc_th->gc_wake)
+ gc_th->gc_wake = 0;
+
if (try_to_freeze())
continue;
if (kthread_should_stop())
@@ -55,6 +60,9 @@ static int gc_thread_func(void *data)
}
#endif
+ if (!sb_start_write_trylock(sbi->sb))
+ continue;
+
/*
* [GC triggering condition]
* 0. GC is not conducted currently.
@@ -69,19 +77,24 @@ static int gc_thread_func(void *data)
* So, I'd like to wait some time to collect dirty segments.
*/
if (!mutex_trylock(&sbi->gc_mutex))
- continue;
+ goto next;
+
+ if (gc_th->gc_urgent) {
+ wait_ms = gc_th->urgent_sleep_time;
+ goto do_gc;
+ }
if (!is_idle(sbi)) {
increase_sleep_time(gc_th, &wait_ms);
mutex_unlock(&sbi->gc_mutex);
- continue;
+ goto next;
}
if (has_enough_invalid_blocks(sbi))
decrease_sleep_time(gc_th, &wait_ms);
else
increase_sleep_time(gc_th, &wait_ms);
-
+do_gc:
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
@@ -93,6 +106,8 @@ static int gc_thread_func(void *data)
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi);
+next:
+ sb_end_write(sbi->sb);
} while (!kthread_should_stop());
return 0;
@@ -110,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
goto out;
}
+ gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
gc_th->gc_idle = 0;
+ gc_th->gc_urgent = 0;
+	gc_th->gc_wake = 0;
sbi->gc_thread = gc_th;
init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
@@ -259,20 +277,11 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi,
valid_blocks * 2 : valid_blocks;
}
-static unsigned int get_ssr_cost(struct f2fs_sb_info *sbi,
- unsigned int segno)
-{
- struct seg_entry *se = get_seg_entry(sbi, segno);
-
- return se->ckpt_valid_blocks > se->valid_blocks ?
- se->ckpt_valid_blocks : se->valid_blocks;
-}
-
static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
unsigned int segno, struct victim_sel_policy *p)
{
if (p->alloc_mode == SSR)
- return get_ssr_cost(sbi, segno);
+ return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
/* alloc_mode == LFS */
if (p->gc_mode == GC_GREEDY)
@@ -582,7 +591,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
}
*nofs = ofs_of_node(node_page);
- source_blkaddr = datablock_addr(node_page, ofs_in_node);
+ source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr)
@@ -590,8 +599,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return true;
}
-static void move_encrypted_block(struct inode *inode, block_t bidx,
- unsigned int segno, int off)
+/*
+ * Move a data block via META_MAPPING while keeping the data page locked.
+ * This can be used to move blocks, aka LBAs, directly on disk.
+ */
+static void move_data_block(struct inode *inode, block_t bidx,
+ unsigned int segno, int off)
{
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
@@ -684,6 +697,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx,
fio.new_blkaddr = newaddr;
f2fs_submit_page_write(&fio);
+ f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);
+
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
if (page->index == 0)
@@ -731,6 +746,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
.page = page,
.encrypted_page = NULL,
.need_lock = LOCK_REQ,
+ .io_type = FS_GC_DATA_IO,
};
bool is_dirty = PageDirty(page);
int err;
@@ -819,8 +835,7 @@ next_step:
continue;
/* if encrypted inode, let's go phase 3 */
- if (f2fs_encrypted_inode(inode) &&
- S_ISREG(inode->i_mode)) {
+ if (f2fs_encrypted_file(inode)) {
add_gc_inode(gc_list, inode);
continue;
}
@@ -854,14 +869,18 @@ next_step:
continue;
}
locked = true;
+
+ /* wait for all inflight aio data */
+ inode_dio_wait(inode);
}
start_bidx = start_bidx_of_node(nofs, inode)
+ ofs_in_node;
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
- move_encrypted_block(inode, start_bidx, segno, off);
+ if (f2fs_encrypted_file(inode))
+ move_data_block(inode, start_bidx, segno, off);
else
- move_data_page(inode, start_bidx, gc_type, segno, off);
+ move_data_page(inode, start_bidx, gc_type,
+ segno, off);
if (locked) {
up_write(&fi->dio_rwsem[WRITE]);
@@ -898,7 +917,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
struct blk_plug plug;
unsigned int segno = start_segno;
unsigned int end_segno = start_segno + sbi->segs_per_sec;
- int sec_freed = 0;
+ int seg_freed = 0;
unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
SUM_TYPE_DATA : SUM_TYPE_NODE;
@@ -944,6 +963,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
gc_type);
stat_inc_seg_count(sbi, type, gc_type);
+
+ if (gc_type == FG_GC &&
+ get_valid_blocks(sbi, segno, false) == 0)
+ seg_freed++;
next:
f2fs_put_page(sum_page, 0);
}
@@ -954,21 +977,17 @@ next:
blk_finish_plug(&plug);
- if (gc_type == FG_GC &&
- get_valid_blocks(sbi, start_segno, true) == 0)
- sec_freed = 1;
-
stat_inc_call_count(sbi->stat_info);
- return sec_freed;
+ return seg_freed;
}
int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
bool background, unsigned int segno)
{
int gc_type = sync ? FG_GC : BG_GC;
- int sec_freed = 0;
- int ret;
+ int sec_freed = 0, seg_freed = 0, total_freed = 0;
+ int ret = 0;
struct cp_control cpc;
unsigned int init_segno = segno;
struct gc_inode_list gc_list = {
@@ -976,6 +995,15 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
.iroot = RADIX_TREE_INIT(GFP_NOFS),
};
+ trace_f2fs_gc_begin(sbi->sb, sync, background,
+ get_pages(sbi, F2FS_DIRTY_NODES),
+ get_pages(sbi, F2FS_DIRTY_DENTS),
+ get_pages(sbi, F2FS_DIRTY_IMETA),
+ free_sections(sbi),
+ free_segments(sbi),
+ reserved_segments(sbi),
+ prefree_segments(sbi));
+
cpc.reason = __get_cp_reason(sbi);
gc_more:
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) {
@@ -1002,17 +1030,20 @@ gc_more:
gc_type = FG_GC;
}
- ret = -EINVAL;
/* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
- if (gc_type == BG_GC && !background)
+ if (gc_type == BG_GC && !background) {
+ ret = -EINVAL;
goto stop;
- if (!__get_victim(sbi, &segno, gc_type))
+ }
+ if (!__get_victim(sbi, &segno, gc_type)) {
+ ret = -ENODATA;
goto stop;
- ret = 0;
+ }
- if (do_garbage_collect(sbi, segno, &gc_list, gc_type) &&
- gc_type == FG_GC)
+ seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type);
+ if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec)
sec_freed++;
+ total_freed += seg_freed;
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
@@ -1029,6 +1060,16 @@ gc_more:
stop:
SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
+
+ trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
+ get_pages(sbi, F2FS_DIRTY_NODES),
+ get_pages(sbi, F2FS_DIRTY_DENTS),
+ get_pages(sbi, F2FS_DIRTY_IMETA),
+ free_sections(sbi),
+ free_segments(sbi),
+ reserved_segments(sbi),
+ prefree_segments(sbi));
+
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&gc_list);
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index a993967dcdb9..9325191fab2d 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -13,6 +13,7 @@
* whether IO subsystem is idle
* or not
*/
+#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */
#define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */
#define DEF_GC_THREAD_MAX_SLEEP_TIME 60000
#define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */
@@ -27,12 +28,15 @@ struct f2fs_gc_kthread {
wait_queue_head_t gc_wait_queue_head;
/* for gc sleep time */
+ unsigned int urgent_sleep_time;
unsigned int min_sleep_time;
unsigned int max_sleep_time;
unsigned int no_gc_sleep_time;
/* for changing gc mode */
unsigned int gc_idle;
+ unsigned int gc_urgent;
+ unsigned int gc_wake;
};
struct gc_inode_list {
@@ -65,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
}
static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th,
- long *wait)
+ unsigned int *wait)
{
+ unsigned int min_time = gc_th->min_sleep_time;
+ unsigned int max_time = gc_th->max_sleep_time;
+
if (*wait == gc_th->no_gc_sleep_time)
return;
- *wait += gc_th->min_sleep_time;
- if (*wait > gc_th->max_sleep_time)
- *wait = gc_th->max_sleep_time;
+ if ((long long)*wait + (long long)min_time > (long long)max_time)
+ *wait = max_time;
+ else
+ *wait += min_time;
}
static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
- long *wait)
+ unsigned int *wait)
{
+ unsigned int min_time = gc_th->min_sleep_time;
+
if (*wait == gc_th->no_gc_sleep_time)
*wait = gc_th->max_sleep_time;
- *wait -= gc_th->min_sleep_time;
- if (*wait <= gc_th->min_sleep_time)
- *wait = gc_th->min_sleep_time;
+ if ((long long)*wait - (long long)min_time < (long long)min_time)
+ *wait = min_time;
+ else
+ *wait -= min_time;
}
static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index e0fd4376e6fb..8322e4e7bb3f 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -22,10 +22,10 @@ bool f2fs_may_inline_data(struct inode *inode)
if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
return false;
- if (i_size_read(inode) > MAX_INLINE_DATA)
+ if (i_size_read(inode) > MAX_INLINE_DATA(inode))
return false;
- if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ if (f2fs_encrypted_file(inode))
return false;
return true;
@@ -44,6 +44,7 @@ bool f2fs_may_inline_dentry(struct inode *inode)
void read_inline_data(struct page *page, struct page *ipage)
{
+ struct inode *inode = page->mapping->host;
void *src_addr, *dst_addr;
if (PageUptodate(page))
@@ -51,12 +52,12 @@ void read_inline_data(struct page *page, struct page *ipage)
f2fs_bug_on(F2FS_P_SB(page), page->index);
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
+ zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE);
/* Copy the whole inline data block */
- src_addr = inline_data_addr(ipage);
+ src_addr = inline_data_addr(inode, ipage);
dst_addr = kmap_atomic(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
flush_dcache_page(page);
kunmap_atomic(dst_addr);
if (!PageUptodate(page))
@@ -67,13 +68,13 @@ void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from)
{
void *addr;
- if (from >= MAX_INLINE_DATA)
+ if (from >= MAX_INLINE_DATA(inode))
return;
- addr = inline_data_addr(ipage);
+ addr = inline_data_addr(inode, ipage);
f2fs_wait_on_page_writeback(ipage, NODE, true);
- memset(addr + from, 0, MAX_INLINE_DATA - from);
+ memset(addr + from, 0, MAX_INLINE_DATA(inode) - from);
set_page_dirty(ipage);
if (from == 0)
@@ -116,6 +117,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
.op_flags = REQ_SYNC | REQ_PRIO,
.page = page,
.encrypted_page = NULL,
+ .io_type = FS_DATA_IO,
};
int dirty, err;
@@ -200,6 +202,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
{
void *src_addr, *dst_addr;
struct dnode_of_data dn;
+ struct address_space *mapping = page_mapping(page);
+ unsigned long flags;
int err;
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -216,11 +220,16 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
f2fs_wait_on_page_writeback(dn.inode_page, NODE, true);
src_addr = kmap_atomic(page);
- dst_addr = inline_data_addr(dn.inode_page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ dst_addr = inline_data_addr(inode, dn.inode_page);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
kunmap_atomic(src_addr);
set_page_dirty(dn.inode_page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ radix_tree_tag_clear(&mapping->page_tree, page_index(page),
+ PAGECACHE_TAG_DIRTY);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
@@ -255,9 +264,9 @@ process_inline:
f2fs_wait_on_page_writeback(ipage, NODE, true);
- src_addr = inline_data_addr(npage);
- dst_addr = inline_data_addr(ipage);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ src_addr = inline_data_addr(inode, npage);
+ dst_addr = inline_data_addr(inode, ipage);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
set_inode_flag(inode, FI_INLINE_DATA);
set_inode_flag(inode, FI_DATA_EXIST);
@@ -285,11 +294,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
struct fscrypt_name *fname, struct page **res_page)
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
- struct f2fs_inline_dentry *inline_dentry;
struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
struct f2fs_dir_entry *de;
struct f2fs_dentry_ptr d;
struct page *ipage;
+ void *inline_dentry;
f2fs_hash_t namehash;
ipage = get_node_page(sbi, dir->i_ino);
@@ -300,9 +309,9 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
namehash = f2fs_dentry_hash(&name, fname);
- inline_dentry = inline_data_addr(ipage);
+ inline_dentry = inline_data_addr(dir, ipage);
- make_dentry_ptr_inline(NULL, &d, inline_dentry);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
de = find_target_dentry(fname, namehash, NULL, &d);
unlock_page(ipage);
if (de)
@@ -316,19 +325,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
int make_empty_inline_dir(struct inode *inode, struct inode *parent,
struct page *ipage)
{
- struct f2fs_inline_dentry *inline_dentry;
struct f2fs_dentry_ptr d;
+ void *inline_dentry;
- inline_dentry = inline_data_addr(ipage);
+ inline_dentry = inline_data_addr(inode, ipage);
- make_dentry_ptr_inline(NULL, &d, inline_dentry);
+ make_dentry_ptr_inline(inode, &d, inline_dentry);
do_make_empty_dir(inode, parent, &d);
set_page_dirty(ipage);
/* update i_size to MAX_INLINE_DATA */
- if (i_size_read(inode) < MAX_INLINE_DATA)
- f2fs_i_size_write(inode, MAX_INLINE_DATA);
+ if (i_size_read(inode) < MAX_INLINE_DATA(inode))
+ f2fs_i_size_write(inode, MAX_INLINE_DATA(inode));
return 0;
}
@@ -337,11 +346,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent,
* release ipage in this function.
*/
static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
- struct f2fs_inline_dentry *inline_dentry)
+ void *inline_dentry)
{
struct page *page;
struct dnode_of_data dn;
struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dentry_ptr src, dst;
int err;
page = f2fs_grab_cache_page(dir->i_mapping, 0, false);
@@ -356,25 +366,24 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
goto out;
f2fs_wait_on_page_writeback(page, DATA, true);
- zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
+ zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE);
dentry_blk = kmap_atomic(page);
+ make_dentry_ptr_inline(dir, &src, inline_dentry);
+ make_dentry_ptr_block(dir, &dst, dentry_blk);
+
/* copy data from inline dentry block to new dentry block */
- memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap,
- INLINE_DENTRY_BITMAP_SIZE);
- memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0,
- SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE);
+ memcpy(dst.bitmap, src.bitmap, src.nr_bitmap);
+ memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap);
/*
* we do not need to zero out the remainder of the dentry and filename
* fields, since the bitmap already marks which slots are in use; we can
* also skip copying/zeroing the reserved space of the dentry block,
* because it has not been used so far.
*/
- memcpy(dentry_blk->dentry, inline_dentry->dentry,
- sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY);
- memcpy(dentry_blk->filename, inline_dentry->filename,
- NR_INLINE_DENTRY * F2FS_SLOT_LEN);
+ memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max);
+ memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN);
kunmap_atomic(dentry_blk);
if (!PageUptodate(page))
@@ -395,14 +404,13 @@ out:
return err;
}
-static int f2fs_add_inline_entries(struct inode *dir,
- struct f2fs_inline_dentry *inline_dentry)
+static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry)
{
struct f2fs_dentry_ptr d;
unsigned long bit_pos = 0;
int err = 0;
- make_dentry_ptr_inline(NULL, &d, inline_dentry);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
while (bit_pos < d.max) {
struct f2fs_dir_entry *de;
@@ -444,19 +452,19 @@ punch_dentry_pages:
}
static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
- struct f2fs_inline_dentry *inline_dentry)
+ void *inline_dentry)
{
- struct f2fs_inline_dentry *backup_dentry;
+ void *backup_dentry;
int err;
backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir),
- sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO);
+ MAX_INLINE_DATA(dir), GFP_F2FS_ZERO);
if (!backup_dentry) {
f2fs_put_page(ipage, 1);
return -ENOMEM;
}
- memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA);
+ memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir));
truncate_inline_inode(dir, ipage, 0);
unlock_page(ipage);
@@ -473,9 +481,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
return 0;
recover:
lock_page(ipage);
- memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA);
+ memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir));
f2fs_i_depth_write(dir, 0);
- f2fs_i_size_write(dir, MAX_INLINE_DATA);
+ f2fs_i_size_write(dir, MAX_INLINE_DATA(dir));
set_page_dirty(ipage);
f2fs_put_page(ipage, 1);
@@ -484,7 +492,7 @@ recover:
}
static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
- struct f2fs_inline_dentry *inline_dentry)
+ void *inline_dentry)
{
if (!F2FS_I(dir)->i_dir_level)
return f2fs_move_inline_dirents(dir, ipage, inline_dentry);
@@ -500,7 +508,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
struct page *ipage;
unsigned int bit_pos;
f2fs_hash_t name_hash;
- struct f2fs_inline_dentry *inline_dentry = NULL;
+ void *inline_dentry = NULL;
struct f2fs_dentry_ptr d;
int slots = GET_DENTRY_SLOTS(new_name->len);
struct page *page = NULL;
@@ -510,10 +518,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
if (IS_ERR(ipage))
return PTR_ERR(ipage);
- inline_dentry = inline_data_addr(ipage);
- bit_pos = room_for_filename(&inline_dentry->dentry_bitmap,
- slots, NR_INLINE_DENTRY);
- if (bit_pos >= NR_INLINE_DENTRY) {
+ inline_dentry = inline_data_addr(dir, ipage);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
+
+ bit_pos = room_for_filename(d.bitmap, slots, d.max);
+ if (bit_pos >= d.max) {
err = f2fs_convert_inline_dir(dir, ipage, inline_dentry);
if (err)
return err;
@@ -534,7 +543,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
f2fs_wait_on_page_writeback(ipage, NODE, true);
name_hash = f2fs_dentry_hash(new_name, NULL);
- make_dentry_ptr_inline(NULL, &d, inline_dentry);
f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
set_page_dirty(ipage);
@@ -557,7 +565,8 @@ out:
void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
struct inode *dir, struct inode *inode)
{
- struct f2fs_inline_dentry *inline_dentry;
+ struct f2fs_dentry_ptr d;
+ void *inline_dentry;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
unsigned int bit_pos;
int i;
@@ -565,11 +574,12 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
lock_page(page);
f2fs_wait_on_page_writeback(page, NODE, true);
- inline_dentry = inline_data_addr(page);
- bit_pos = dentry - inline_dentry->dentry;
+ inline_dentry = inline_data_addr(dir, page);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
+
+ bit_pos = dentry - d.dentry;
for (i = 0; i < slots; i++)
- __clear_bit_le(bit_pos + i,
- &inline_dentry->dentry_bitmap);
+ __clear_bit_le(bit_pos + i, d.bitmap);
set_page_dirty(page);
f2fs_put_page(page, 1);
@@ -586,20 +596,21 @@ bool f2fs_empty_inline_dir(struct inode *dir)
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct page *ipage;
unsigned int bit_pos = 2;
- struct f2fs_inline_dentry *inline_dentry;
+ void *inline_dentry;
+ struct f2fs_dentry_ptr d;
ipage = get_node_page(sbi, dir->i_ino);
if (IS_ERR(ipage))
return false;
- inline_dentry = inline_data_addr(ipage);
- bit_pos = find_next_bit_le(&inline_dentry->dentry_bitmap,
- NR_INLINE_DENTRY,
- bit_pos);
+ inline_dentry = inline_data_addr(dir, ipage);
+ make_dentry_ptr_inline(dir, &d, inline_dentry);
+
+ bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos);
f2fs_put_page(ipage, 1);
- if (bit_pos < NR_INLINE_DENTRY)
+ if (bit_pos < d.max)
return false;
return true;
@@ -609,25 +620,27 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
struct fscrypt_str *fstr)
{
struct inode *inode = file_inode(file);
- struct f2fs_inline_dentry *inline_dentry = NULL;
struct page *ipage = NULL;
struct f2fs_dentry_ptr d;
+ void *inline_dentry = NULL;
int err;
- if (ctx->pos == NR_INLINE_DENTRY)
+ make_dentry_ptr_inline(inode, &d, inline_dentry);
+
+ if (ctx->pos == d.max)
return 0;
ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
if (IS_ERR(ipage))
return PTR_ERR(ipage);
- inline_dentry = inline_data_addr(ipage);
+ inline_dentry = inline_data_addr(inode, ipage);
make_dentry_ptr_inline(inode, &d, inline_dentry);
err = f2fs_fill_dentries(ctx, &d, 0, fstr);
if (!err)
- ctx->pos = NR_INLINE_DENTRY;
+ ctx->pos = d.max;
f2fs_put_page(ipage, 1);
return err < 0 ? err : 0;
@@ -652,7 +665,7 @@ int f2fs_inline_data_fiemap(struct inode *inode,
goto out;
}
- ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode));
+ ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode));
if (start >= ilen)
goto out;
if (start + len < ilen)
@@ -661,7 +674,8 @@ int f2fs_inline_data_fiemap(struct inode *inode,
get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
- byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage);
+ byteaddr += (char *)inline_data_addr(inode, ipage) -
+ (char *)F2FS_INODE(ipage);
err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
out:
f2fs_put_page(ipage, 1);
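
Throughout inline.c the concrete struct f2fs_inline_dentry gives way to an opaque void * plus make_dentry_ptr_inline(), so one code path serves inline areas whose size is now per-inode — MAX_INLINE_DATA(inode) shrinks once extra attribute space is enabled. The view the helper fills in looks roughly like this (field names as used in the hunks above; the exact layout is an assumption of this sketch):

	struct f2fs_dentry_ptr {
		struct inode *inode;
		void *bitmap;			/* slot-usage bitmap */
		struct f2fs_dir_entry *dentry;	/* fixed-size entry array */
		__u8 (*filename)[F2FS_SLOT_LEN];/* name slots */
		int max;			/* number of entries */
		int nr_bitmap;			/* bitmap size in bytes */
	};

Callers such as f2fs_move_inline_dirents() then copy src.bitmap, src.dentry and src.filename by the sizes recorded in the view rather than by compile-time constants.
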
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 6cd312a17c69..50c88e37ed66 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -49,20 +49,22 @@ void f2fs_set_inode_flags(struct inode *inode)
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
{
+ int extra_size = get_extra_isize(inode);
+
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
- if (ri->i_addr[0])
- inode->i_rdev =
- old_decode_dev(le32_to_cpu(ri->i_addr[0]));
+ if (ri->i_addr[extra_size])
+ inode->i_rdev = old_decode_dev(
+ le32_to_cpu(ri->i_addr[extra_size]));
else
- inode->i_rdev =
- new_decode_dev(le32_to_cpu(ri->i_addr[1]));
+ inode->i_rdev = new_decode_dev(
+ le32_to_cpu(ri->i_addr[extra_size + 1]));
}
}
static bool __written_first_block(struct f2fs_inode *ri)
{
- block_t addr = le32_to_cpu(ri->i_addr[0]);
+ block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]);
if (addr != NEW_ADDR && addr != NULL_ADDR)
return true;
@@ -71,25 +73,27 @@ static bool __written_first_block(struct f2fs_inode *ri)
static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
{
+ int extra_size = get_extra_isize(inode);
+
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
if (old_valid_dev(inode->i_rdev)) {
- ri->i_addr[0] =
+ ri->i_addr[extra_size] =
cpu_to_le32(old_encode_dev(inode->i_rdev));
- ri->i_addr[1] = 0;
+ ri->i_addr[extra_size + 1] = 0;
} else {
- ri->i_addr[0] = 0;
- ri->i_addr[1] =
+ ri->i_addr[extra_size] = 0;
+ ri->i_addr[extra_size + 1] =
cpu_to_le32(new_encode_dev(inode->i_rdev));
- ri->i_addr[2] = 0;
+ ri->i_addr[extra_size + 2] = 0;
}
}
}
static void __recover_inline_status(struct inode *inode, struct page *ipage)
{
- void *inline_data = inline_data_addr(ipage);
+ void *inline_data = inline_data_addr(inode, ipage);
__le32 *start = inline_data;
- __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32);
+ __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32);
while (start < end) {
if (*start++) {
@@ -104,12 +108,84 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage)
return;
}
+static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct f2fs_inode *ri = &F2FS_NODE(page)->i;
+ int extra_isize = le32_to_cpu(ri->i_extra_isize);
+
+ if (!f2fs_sb_has_inode_chksum(sbi->sb))
+ return false;
+
+ if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR))
+ return false;
+
+ if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum))
+ return false;
+
+ return true;
+}
+
+static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct f2fs_node *node = F2FS_NODE(page);
+ struct f2fs_inode *ri = &node->i;
+ __le32 ino = node->footer.ino;
+ __le32 gen = ri->i_generation;
+ __u32 chksum, chksum_seed;
+ __u32 dummy_cs = 0;
+ unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum);
+ unsigned int cs_size = sizeof(dummy_cs);
+
+ chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino,
+ sizeof(ino));
+ chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen));
+
+ chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset);
+ chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size);
+ offset += cs_size;
+ chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset,
+ F2FS_BLKSIZE - offset);
+ return chksum;
+}
+
+bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct f2fs_inode *ri;
+ __u32 provided, calculated;
+
+ if (!f2fs_enable_inode_chksum(sbi, page) ||
+ PageDirty(page) || PageWriteback(page))
+ return true;
+
+ ri = &F2FS_NODE(page)->i;
+ provided = le32_to_cpu(ri->i_inode_checksum);
+ calculated = f2fs_inode_chksum(sbi, page);
+
+ if (provided != calculated)
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "checksum invalid, ino = %x, %x vs. %x",
+ ino_of_node(page), provided, calculated);
+
+ return provided == calculated;
+}
+
+void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page)
+{
+ struct f2fs_inode *ri = &F2FS_NODE(page)->i;
+
+ if (!f2fs_enable_inode_chksum(sbi, page))
+ return;
+
+ ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page));
+}
+
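
The checksum deliberately covers the whole on-disk inode while treating the i_inode_checksum field itself as zero, and the seed mixes in the inode number and generation, so a node block that drifts to the wrong inode fails verification too. The same pattern as a self-contained sketch, using the generic kernel crc32() instead of f2fs_chksum() (record and csum_off are hypothetical names, illustration only):

	#include <linux/crc32.h>

	/* Checksum a record whose embedded checksum field must count as zero. */
	static u32 record_chksum(const u8 *record, size_t size, size_t csum_off)
	{
		const u32 zero = 0;
		u32 c;

		c = crc32(~0, record, csum_off);	/* bytes before the field */
		c = crc32(c, (const u8 *)&zero, sizeof(zero));
		return crc32(c, record + csum_off + sizeof(zero),
				size - csum_off - sizeof(zero));
	}

f2fs_inode_chksum_verify() recomputes the value the same way and compares it with the stored field; dirty or writeback pages are skipped because their checksum is legitimately stale.
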
static int do_read_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct page *node_page;
struct f2fs_inode *ri;
+ projid_t i_projid;
/* Check if ino is within scope */
if (check_nid_range(sbi, inode->i_ino)) {
@@ -153,6 +229,9 @@ static int do_read_inode(struct inode *inode)
get_inline_info(inode, ri);
+ fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
+ le16_to_cpu(ri->i_extra_isize) : 0;
+
/* check data exist */
if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
__recover_inline_status(inode, node_page);
@@ -166,6 +245,16 @@ static int do_read_inode(struct inode *inode)
if (!need_inode_block_update(sbi, inode->i_ino))
fi->last_disk_size = inode->i_size;
+ if (fi->i_flags & FS_PROJINHERIT_FL)
+ set_inode_flag(inode, FI_PROJ_INHERIT);
+
+ if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) &&
+ F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid))
+ i_projid = (projid_t)le32_to_cpu(ri->i_projid);
+ else
+ i_projid = F2FS_DEF_PROJID;
+ fi->i_projid = make_kprojid(&init_user_ns, i_projid);
+
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -292,6 +381,20 @@ int update_inode(struct inode *inode, struct page *node_page)
ri->i_generation = cpu_to_le32(inode->i_generation);
ri->i_dir_level = F2FS_I(inode)->i_dir_level;
+ if (f2fs_has_extra_attr(inode)) {
+ ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize);
+
+ if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) &&
+ F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
+ i_projid)) {
+ projid_t i_projid;
+
+ i_projid = from_kprojid(&init_user_ns,
+ F2FS_I(inode)->i_projid);
+ ri->i_projid = cpu_to_le32(i_projid);
+ }
+ }
+
__set_inode_rdev(inode, ri);
set_cold_node(inode, node_page);
@@ -416,6 +519,9 @@ no_delete:
stat_dec_inline_dir(inode);
stat_dec_inline_inode(inode);
+ if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))
+ f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE));
+
/* ino == 0, if f2fs_new_inode() failed */
if (inode->i_ino)
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 760d85223c81..a4dab98c4b7b 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -58,6 +58,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
goto fail;
}
+ if (f2fs_sb_has_project_quota(sbi->sb) &&
+ (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL))
+ F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
+ else
+ F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
+ F2FS_DEF_PROJID);
+
err = dquot_initialize(inode);
if (err)
goto fail_drop;
@@ -72,6 +79,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
set_inode_flag(inode, FI_NEW_INODE);
+ if (f2fs_sb_has_extra_attr(sbi->sb)) {
+ set_inode_flag(inode, FI_EXTRA_ATTR);
+ F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
+ }
+
if (test_opt(sbi, INLINE_XATTR))
set_inode_flag(inode, FI_INLINE_XATTR);
if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
@@ -85,6 +97,15 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
stat_inc_inline_inode(inode);
stat_inc_inline_dir(inode);
+ F2FS_I(inode)->i_flags =
+ f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
+
+ if (S_ISDIR(inode->i_mode))
+ F2FS_I(inode)->i_flags |= FS_INDEX_FL;
+
+ if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL)
+ set_inode_flag(inode, FI_PROJ_INHERIT);
+
trace_f2fs_new_inode(inode, 0);
return inode;
@@ -204,6 +225,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
!fscrypt_has_permitted_context(dir, inode))
return -EPERM;
+ if (is_inode_flag_set(dir, FI_PROJ_INHERIT) &&
+ (!projid_eq(F2FS_I(dir)->i_projid,
+ F2FS_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
err = dquot_initialize(dir);
if (err)
return err;
@@ -261,6 +287,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
return 0;
}
+ err = dquot_initialize(dir);
+ if (err)
+ return err;
+
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
@@ -724,6 +754,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out;
}
+ if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
+ (!projid_eq(F2FS_I(new_dir)->i_projid,
+ F2FS_I(old_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
err = dquot_initialize(old_dir);
if (err)
goto out;
@@ -912,6 +947,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
!fscrypt_has_permitted_context(old_dir, new_inode)))
return -EPERM;
+ if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
+ !projid_eq(F2FS_I(new_dir)->i_projid,
+ F2FS_I(old_dentry->d_inode)->i_projid)) ||
+		(is_inode_flag_set(old_dir, FI_PROJ_INHERIT) &&
+ !projid_eq(F2FS_I(old_dir)->i_projid,
+ F2FS_I(new_dentry->d_inode)->i_projid)))
+ return -EXDEV;
+
err = dquot_initialize(old_dir);
if (err)
goto out;
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d53fe620939e..fca87835a1da 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -19,6 +19,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "xattr.h"
#include "trace.h"
#include <trace/events/f2fs.h>
@@ -554,7 +555,7 @@ static int get_node_path(struct inode *inode, long block,
level = 3;
goto got;
} else {
- BUG();
+ return -E2BIG;
}
got:
return level;
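
Returning -E2BIG instead of crashing is right because the offset genuinely can be out of range: assuming the default geometry (923 direct address slots in the inode, 1018 addresses or node IDs per node block), the deepest node path tops out at

	923 + 2*1018 + 2*1018^2 + 1018^3 ≈ 1.06e9 blocks,

roughly 3.9 TiB of 4 KiB blocks. An index beyond that has no node path, and the callers patched below now propagate the error instead of tripping BUG().
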
@@ -578,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
int err = 0;
level = get_node_path(dn->inode, index, offset, noffset);
+ if (level < 0)
+ return level;
nids[0] = dn->inode->i_ino;
npage[0] = dn->inode_page;
@@ -613,7 +616,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
}
dn->nid = nids[i];
- npage[i] = new_node_page(dn, noffset[i], NULL);
+ npage[i] = new_node_page(dn, noffset[i]);
if (IS_ERR(npage[i])) {
alloc_nid_failed(sbi, nids[i]);
err = PTR_ERR(npage[i]);
@@ -654,7 +657,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
dn->nid = nids[level];
dn->ofs_in_node = offset[level];
dn->node_page = npage[level];
- dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = datablock_addr(dn->inode,
+ dn->node_page, dn->ofs_in_node);
return 0;
release_pages:
@@ -876,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from)
trace_f2fs_truncate_inode_blocks_enter(inode, from);
level = get_node_path(inode, from, offset, noffset);
+ if (level < 0)
+ return level;
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -1022,11 +1028,10 @@ struct page *new_inode_page(struct inode *inode)
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
/* caller should f2fs_put_page(page, 1); */
- return new_node_page(&dn, 0, NULL);
+ return new_node_page(&dn, 0);
}
-struct page *new_node_page(struct dnode_of_data *dn,
- unsigned int ofs, struct page *ipage)
+struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct node_info new_ni;
@@ -1170,6 +1175,11 @@ repeat:
err = -EIO;
goto out_err;
}
+
+ if (!f2fs_inode_chksum_verify(sbi, page)) {
+ err = -EBADMSG;
+ goto out_err;
+ }
page_hit:
if (unlikely(nid != nid_of_node(page))) {
f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, "
@@ -1177,9 +1187,9 @@ page_hit:
nid, nid_of_node(page), ino_of_node(page),
ofs_of_node(page), cpver_of_node(page),
next_blkaddr_of_node(page));
- ClearPageUptodate(page);
err = -EINVAL;
out_err:
+ ClearPageUptodate(page);
f2fs_put_page(page, 1);
return ERR_PTR(err);
}
@@ -1326,7 +1336,8 @@ continue_unlock:
}
static int __write_node_page(struct page *page, bool atomic, bool *submitted,
- struct writeback_control *wbc)
+ struct writeback_control *wbc, bool do_balance,
+ enum iostat_type io_type)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
nid_t nid;
@@ -1339,6 +1350,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
.page = page,
.encrypted_page = NULL,
.submitted = false,
+ .io_type = io_type,
};
trace_f2fs_writepage(page, NODE);
@@ -1395,6 +1407,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (submitted)
*submitted = fio.submitted;
+ if (do_balance)
+ f2fs_balance_fs(sbi, false);
return 0;
redirty_out:
@@ -1405,7 +1419,7 @@ redirty_out:
static int f2fs_write_node_page(struct page *page,
struct writeback_control *wbc)
{
- return __write_node_page(page, false, NULL, wbc);
+ return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
}
int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
@@ -1493,7 +1507,8 @@ continue_unlock:
ret = __write_node_page(page, atomic &&
page == last_page,
- &submitted, wbc);
+ &submitted, wbc, true,
+ FS_NODE_IO);
if (ret) {
unlock_page(page);
f2fs_put_page(last_page, 0);
@@ -1530,7 +1545,8 @@ out:
return ret ? -EIO : 0;
}
-int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc)
+int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc,
+ bool do_balance, enum iostat_type io_type)
{
pgoff_t index, end;
struct pagevec pvec;
@@ -1608,7 +1624,8 @@ continue_unlock:
set_fsync_mark(page, 0);
set_dentry_mark(page, 0);
- ret = __write_node_page(page, false, &submitted, wbc);
+ ret = __write_node_page(page, false, &submitted,
+ wbc, do_balance, io_type);
if (ret)
unlock_page(page);
else if (submitted)
@@ -1697,7 +1714,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
diff = nr_pages_to_write(sbi, NODE, wbc);
wbc->sync_mode = WB_SYNC_NONE;
blk_start_plug(&plug);
- sync_node_pages(sbi, wbc);
+ sync_node_pages(sbi, wbc, true, FS_NODE_IO);
blk_finish_plug(&plug);
wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return 0;
@@ -2191,7 +2208,8 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
- nid_t new_xnid = nid_of_node(page);
+ nid_t new_xnid;
+ struct dnode_of_data dn;
struct node_info ni;
struct page *xpage;
@@ -2207,22 +2225,22 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
recover_xnid:
/* 2: update xattr nid in inode */
- remove_free_nid(sbi, new_xnid);
- f2fs_i_xnid_write(inode, new_xnid);
- if (unlikely(inc_valid_node_count(sbi, inode, false)))
- f2fs_bug_on(sbi, 1);
+ if (!alloc_nid(sbi, &new_xnid))
+ return -ENOSPC;
+
+ set_new_dnode(&dn, inode, NULL, NULL, new_xnid);
+ xpage = new_node_page(&dn, XATTR_NODE_OFFSET);
+ if (IS_ERR(xpage)) {
+ alloc_nid_failed(sbi, new_xnid);
+ return PTR_ERR(xpage);
+ }
+
+ alloc_nid_done(sbi, new_xnid);
update_inode_page(inode);
/* 3: update and set xattr node page dirty */
- xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid);
- if (!xpage)
- return -ENOMEM;
-
- memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE);
+ memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
- get_node_info(sbi, new_xnid, &ni);
- ni.ino = inode->i_ino;
- set_node_addr(sbi, &ni, NEW_ADDR, false);
set_page_dirty(xpage);
f2fs_put_page(xpage, 1);
@@ -2262,7 +2280,14 @@ retry:
dst->i_blocks = cpu_to_le64(1);
dst->i_links = cpu_to_le32(1);
dst->i_xattr_nid = 0;
- dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
+ dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR);
+ if (dst->i_inline & F2FS_EXTRA_ATTR) {
+ dst->i_extra_isize = src->i_extra_isize;
+ if (f2fs_sb_has_project_quota(sbi->sb) &&
+ F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
+ i_projid))
+ dst->i_projid = src->i_projid;
+ }
new_ni = old_ni;
new_ni.ino = ino;
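
Two changes in recover_xattr_data() deserve a note. First, the replacement xattr node now comes from the regular allocator — reserve with alloc_nid(), then either commit (alloc_nid_done()) or roll back (alloc_nid_failed()) — instead of hijacking the nid recorded in the logged page, which keeps the free-nid accounting consistent. Second, the memcpy is bounded by VALID_XATTR_BLOCK_SIZE rather than PAGE_SIZE, so the destination page keeps its own node footer instead of inheriting the source's. The reserve/commit/rollback shape, condensed from the hunk:

	if (!alloc_nid(sbi, &new_xnid))
		return -ENOSPC;				/* reserve */
	xpage = new_node_page(&dn, XATTR_NODE_OFFSET);
	if (IS_ERR(xpage)) {
		alloc_nid_failed(sbi, new_xnid);	/* roll back */
		return PTR_ERR(xpage);
	}
	alloc_nid_done(sbi, new_xnid);			/* commit */
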
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 907d6b7dde6a..9626758bc762 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
}
static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
- struct list_head *head, nid_t ino)
+ struct list_head *head, nid_t ino, bool quota_inode)
{
struct inode *inode;
struct fsync_inode_entry *entry;
+ int err;
inode = f2fs_iget_retry(sbi->sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
+ err = dquot_initialize(inode);
+ if (err)
+ goto err_out;
+
+ if (quota_inode) {
+ err = dquot_alloc_inode(inode);
+ if (err)
+ goto err_out;
+ }
+
entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
entry->inode = inode;
list_add_tail(&entry->list, head);
return entry;
+err_out:
+ iput(inode);
+ return ERR_PTR(err);
}
static void del_fsync_inode(struct fsync_inode_entry *entry)
@@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage,
entry = get_fsync_inode(dir_list, pino);
if (!entry) {
- entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino);
+ entry = add_fsync_inode(F2FS_I_SB(inode), dir_list,
+ pino, false);
if (IS_ERR(entry)) {
dir = ERR_CAST(entry);
err = PTR_ERR(entry);
@@ -140,6 +155,13 @@ retry:
err = -EEXIST;
goto out_unmap_put;
}
+
+ err = dquot_initialize(einode);
+ if (err) {
+ iput(einode);
+ goto out_unmap_put;
+ }
+
err = acquire_orphan_inode(F2FS_I_SB(inode));
if (err) {
iput(einode);
@@ -226,18 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
entry = get_fsync_inode(head, ino_of_node(page));
if (!entry) {
+ bool quota_inode = false;
+
if (!check_only &&
IS_INODE(page) && is_dent_dnode(page)) {
err = recover_inode_page(sbi, page);
if (err)
break;
+ quota_inode = true;
}
/*
* CP | dnode(F) | inode(DF)
* For this case, we should not give up now.
*/
- entry = add_fsync_inode(sbi, head, ino_of_node(page));
+ entry = add_fsync_inode(sbi, head, ino_of_node(page),
+ quota_inode);
if (IS_ERR(entry)) {
err = PTR_ERR(entry);
if (err == -ENOENT) {
@@ -291,7 +317,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
return 0;
/* Get the previous summary */
- for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
struct curseg_info *curseg = CURSEG_I(sbi, i);
if (curseg->segno == segno) {
sum = curseg->sum_blk->entries[blkoff];
@@ -328,10 +354,18 @@ got_it:
f2fs_put_page(node_page, 1);
if (ino != dn->inode->i_ino) {
+ int ret;
+
/* Deallocate previous index in the node page */
inode = f2fs_iget_retry(sbi->sb, ino);
if (IS_ERR(inode))
return PTR_ERR(inode);
+
+ ret = dquot_initialize(inode);
+ if (ret) {
+ iput(inode);
+ return ret;
+ }
} else {
inode = dn->inode;
}
@@ -361,7 +395,8 @@ out:
return 0;
truncate_out:
- if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr)
+ if (datablock_addr(tdn.inode, tdn.node_page,
+ tdn.ofs_in_node) == blkaddr)
truncate_data_blocks_range(&tdn, 1);
if (dn->inode->i_ino == nid && !dn->inode_page_locked)
unlock_page(dn->inode_page);
@@ -414,8 +449,8 @@ retry_dn:
for (; start < end; start++, dn.ofs_in_node++) {
block_t src, dest;
- src = datablock_addr(dn.node_page, dn.ofs_in_node);
- dest = datablock_addr(page, dn.ofs_in_node);
+ src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
+ dest = datablock_addr(dn.inode, page, dn.ofs_in_node);
/* skip recovering if dest is the same as src */
if (src == dest)
@@ -557,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
struct list_head dir_list;
int err;
int ret = 0;
+ unsigned long s_flags = sbi->sb->s_flags;
bool need_writecp = false;
+ if (s_flags & MS_RDONLY) {
+		f2fs_msg(sbi->sb, KERN_INFO, "recover fsync data on readonly fs");
+ sbi->sb->s_flags &= ~MS_RDONLY;
+ }
+
+#ifdef CONFIG_QUOTA
+ /* Needed for iput() to work correctly and not trash data */
+ sbi->sb->s_flags |= MS_ACTIVE;
+ /* Turn on quotas so that they are updated correctly */
+ f2fs_enable_quota_files(sbi);
+#endif
+
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry));
- if (!fsync_entry_slab)
- return -ENOMEM;
+ if (!fsync_entry_slab) {
+ err = -ENOMEM;
+ goto out;
+ }
INIT_LIST_HEAD(&inode_list);
INIT_LIST_HEAD(&dir_list);
@@ -573,11 +623,11 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
/* step #1: find fsynced inode numbers */
err = find_fsync_dnodes(sbi, &inode_list, check_only);
if (err || list_empty(&inode_list))
- goto out;
+ goto skip;
if (check_only) {
ret = 1;
- goto out;
+ goto skip;
}
need_writecp = true;
@@ -586,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
err = recover_data(sbi, &inode_list, &dir_list);
if (!err)
f2fs_bug_on(sbi, !list_empty(&inode_list));
-out:
+skip:
destroy_fsync_dnodes(&inode_list);
/* truncate meta pages which were used by the recovery */
@@ -599,8 +649,6 @@ out:
}
clear_sbi_flag(sbi, SBI_POR_DOING);
- if (err)
- set_ckpt_flags(sbi, CP_ERROR_FLAG);
mutex_unlock(&sbi->cp_mutex);
/* let's drop all the directory inodes for clean checkpoint */
@@ -614,5 +662,12 @@ out:
}
kmem_cache_destroy(fsync_entry_slab);
+out:
+#ifdef CONFIG_QUOTA
+ /* Turn quotas off */
+ f2fs_quota_off_umount(sbi->sb);
+#endif
+ sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+
return ret ? ret : err;
}
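
The bracketing added around recovery follows a familiar shape: save s_flags, drop MS_RDONLY so replayed inodes and blocks can actually be written, set MS_ACTIVE so iput() finalizes recovered inodes rather than deferring them, and turn quotas on so every allocation made during replay is charged to its owner; the saved flags are restored unconditionally on exit. In miniature:

	unsigned long s_flags = sb->s_flags;

	if (s_flags & MS_RDONLY)
		sb->s_flags &= ~MS_RDONLY;	/* permit recovery writes */
	/* ... replay fsync'd data with quota accounting live ... */
	sb->s_flags = s_flags;			/* restore the caller's view */

This is also why the old out label becomes skip: a failure after the flag dance must still fall through to the new out, where quotas are turned off and s_flags is restored.
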
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 6f8fc4a6e701..621b9b3d320b 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -17,10 +17,12 @@
#include <linux/swap.h>
#include <linux/timer.h>
#include <linux/freezer.h>
+#include <linux/sched/signal.h>
#include "f2fs.h"
#include "segment.h"
#include "node.h"
+#include "gc.h"
#include "trace.h"
#include <trace/events/f2fs.h>
@@ -167,6 +169,21 @@ found:
return result - size + __reverse_ffz(tmp);
}
+bool need_SSR(struct f2fs_sb_info *sbi)
+{
+ int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
+ int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
+ int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+
+ if (test_opt(sbi, LFS))
+ return false;
+ if (sbi->gc_thread && sbi->gc_thread->gc_urgent)
+ return true;
+
+ return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
+ 2 * reserved_sections(sbi));
+}
+
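
need_SSR() moves out of the header so it can consult the GC thread: gc_urgent mode now forces SSR, letting urgent cleaning reuse dirty segments immediately rather than opening fresh ones. The heuristic itself is unchanged; as a worked instance with hypothetical numbers, 3 dirty node sections, 1 dirty dentry section, no dirty imeta and 10 reserved sections trip SSR once free sections fall to 3 + 2*1 + 0 + 2*10 = 25 or fewer.
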
void register_inmem_page(struct inode *inode, struct page *page)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -213,9 +230,15 @@ static int __revoke_inmem_pages(struct inode *inode,
struct node_info ni;
trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
-
+retry:
set_new_dnode(&dn, inode, NULL, NULL, 0);
- if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
+ err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
+ if (err) {
+ if (err == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ cond_resched();
+ goto retry;
+ }
err = -EAGAIN;
goto next;
}
@@ -248,6 +271,7 @@ void drop_inmem_pages(struct inode *inode)
mutex_unlock(&fi->inmem_lock);
clear_inode_flag(inode, FI_ATOMIC_FILE);
+ clear_inode_flag(inode, FI_HOT_DATA);
stat_dec_atomic_write(inode);
}
@@ -292,6 +316,7 @@ static int __commit_inmem_pages(struct inode *inode,
.type = DATA,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC | REQ_PRIO,
+ .io_type = FS_DATA_IO,
};
pgoff_t last_idx = ULONG_MAX;
int err = 0;
@@ -309,17 +334,21 @@ static int __commit_inmem_pages(struct inode *inode,
inode_dec_dirty_pages(inode);
remove_dirty_inode(inode);
}
-
+retry:
fio.page = page;
fio.old_blkaddr = NULL_ADDR;
fio.encrypted_page = NULL;
fio.need_lock = LOCK_DONE;
err = do_write_data_page(&fio);
if (err) {
+ if (err == -ENOMEM) {
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
+ cond_resched();
+ goto retry;
+ }
unlock_page(page);
break;
}
-
/* record old blkaddr for revoking */
cur->old_addr = fio.old_blkaddr;
last_idx = page->index;
@@ -481,6 +510,8 @@ repeat:
if (kthread_should_stop())
return 0;
+ sb_start_intwrite(sbi->sb);
+
if (!llist_empty(&fcc->issue_list)) {
struct flush_cmd *cmd, *next;
int ret;
@@ -499,6 +530,8 @@ repeat:
fcc->dispatch_list = NULL;
}
+ sb_end_intwrite(sbi->sb);
+
wait_event_interruptible(*q,
kthread_should_stop() || !llist_empty(&fcc->issue_list));
goto repeat;
@@ -519,8 +552,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
return ret;
}
- if (!atomic_read(&fcc->issing_flush)) {
- atomic_inc(&fcc->issing_flush);
+ if (atomic_inc_return(&fcc->issing_flush) == 1) {
ret = submit_flush_wait(sbi);
atomic_dec(&fcc->issing_flush);
@@ -530,18 +562,39 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
init_completion(&cmd.wait);
- atomic_inc(&fcc->issing_flush);
llist_add(&cmd.llnode, &fcc->issue_list);
- if (!fcc->dispatch_list)
+ /* update issue_list before we wake up issue_flush thread */
+ smp_mb();
+
+ if (waitqueue_active(&fcc->flush_wait_queue))
wake_up(&fcc->flush_wait_queue);
if (fcc->f2fs_issue_flush) {
wait_for_completion(&cmd.wait);
atomic_dec(&fcc->issing_flush);
} else {
- llist_del_all(&fcc->issue_list);
- atomic_set(&fcc->issing_flush, 0);
+ struct llist_node *list;
+
+ list = llist_del_all(&fcc->issue_list);
+ if (!list) {
+ wait_for_completion(&cmd.wait);
+ atomic_dec(&fcc->issing_flush);
+ } else {
+ struct flush_cmd *tmp, *next;
+
+ ret = submit_flush_wait(sbi);
+
+ llist_for_each_entry_safe(tmp, next, list, llnode) {
+ if (tmp == &cmd) {
+ cmd.ret = ret;
+ atomic_dec(&fcc->issing_flush);
+ continue;
+ }
+ tmp->ret = ret;
+ complete(&tmp->wait);
+ }
+ }
}
return cmd.ret;
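
The rewritten fallback closes a race in f2fs_issue_flush(): atomic_inc_return() decides who may flush alone, the smp_mb() orders the llist_add() against the waitqueue_active() test, and if the flush thread is gone, whichever waiter steals the whole issue_list becomes the leader — it submits a single flush and completes every command it drained (the code above additionally avoids completing its own entry). A waiter that finds the list already stolen just sleeps on its own completion. The combining idea in miniature, illustration only:

	/* Leader/follower combining: one caller pays for the flush on
	 * behalf of every queued waiter.
	 */
	list = llist_del_all(&fcc->issue_list);
	if (list) {
		ret = submit_flush_wait(sbi);		/* one flush */
		llist_for_each_entry_safe(tmp, next, list, llnode) {
			tmp->ret = ret;
			complete(&tmp->wait);		/* wake each waiter */
		}
	}
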
@@ -778,11 +831,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi,
sentry = get_seg_entry(sbi, segno);
offset = GET_BLKOFF_FROM_SEG0(sbi, blk);
- size = min((unsigned long)(end - blk), max_blocks);
+ if (end < START_BLOCK(sbi, segno + 1))
+ size = GET_BLKOFF_FROM_SEG0(sbi, end);
+ else
+ size = max_blocks;
map = (unsigned long *)(sentry->cur_valid_map);
offset = __find_rev_next_bit(map, size, offset);
f2fs_bug_on(sbi, offset != size);
- blk += size;
+ blk = START_BLOCK(sbi, segno + 1);
}
#endif
}
@@ -815,6 +871,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi,
submit_bio(bio);
list_move_tail(&dc->list, &dcc->wait_list);
__check_sit_bitmap(sbi, dc->start, dc->start + dc->len);
+
+ f2fs_update_iostat(sbi, FS_DISCARD, 1);
}
} else {
__remove_discard_cmd(sbi, dc);
@@ -996,32 +1054,81 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
return 0;
}
-static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond)
+static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond)
{
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
struct list_head *pend_list;
struct discard_cmd *dc, *tmp;
struct blk_plug plug;
- int i, iter = 0;
+ int iter = 0, issued = 0;
+ int i;
+ bool io_interrupted = false;
mutex_lock(&dcc->cmd_lock);
f2fs_bug_on(sbi,
!__check_rb_tree_consistence(sbi, &dcc->root));
blk_start_plug(&plug);
- for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ for (i = MAX_PLIST_NUM - 1;
+ i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) {
pend_list = &dcc->pend_list[i];
list_for_each_entry_safe(dc, tmp, pend_list, list) {
f2fs_bug_on(sbi, dc->state != D_PREP);
- if (!issue_cond || is_idle(sbi))
+ /* Hurry up to finish fstrim */
+ if (dcc->pend_list_tag[i] & P_TRIM) {
+ __submit_discard_cmd(sbi, dc);
+ issued++;
+
+ if (fatal_signal_pending(current))
+ break;
+ continue;
+ }
+
+ if (!issue_cond) {
__submit_discard_cmd(sbi, dc);
- if (issue_cond && iter++ > DISCARD_ISSUE_RATE)
+ issued++;
+ continue;
+ }
+
+ if (is_idle(sbi)) {
+ __submit_discard_cmd(sbi, dc);
+ issued++;
+ } else {
+ io_interrupted = true;
+ }
+
+ if (++iter >= DISCARD_ISSUE_RATE)
goto out;
}
+ if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM)
+ dcc->pend_list_tag[i] &= (~P_TRIM);
}
out:
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
+
+ if (!issued && io_interrupted)
+ issued = -1;
+
+ return issued;
+}
+
+static void __drop_discard_cmd(struct f2fs_sb_info *sbi)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *pend_list;
+ struct discard_cmd *dc, *tmp;
+ int i;
+
+ mutex_lock(&dcc->cmd_lock);
+ for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+ pend_list = &dcc->pend_list[i];
+ list_for_each_entry_safe(dc, tmp, pend_list, list) {
+ f2fs_bug_on(sbi, dc->state != D_PREP);
+ __remove_discard_cmd(sbi, dc);
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
}
static void __wait_one_discard_bio(struct f2fs_sb_info *sbi,
@@ -1102,34 +1209,63 @@ void stop_discard_thread(struct f2fs_sb_info *sbi)
}
}
-/* This comes from f2fs_put_super */
+/* This comes from f2fs_put_super and f2fs_trim_fs */
void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
{
__issue_discard_cmd(sbi, false);
+ __drop_discard_cmd(sbi);
__wait_discard_cmd(sbi, false);
}
+static void mark_discard_range_all(struct f2fs_sb_info *sbi)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ int i;
+
+ mutex_lock(&dcc->cmd_lock);
+ for (i = 0; i < MAX_PLIST_NUM; i++)
+ dcc->pend_list_tag[i] |= P_TRIM;
+ mutex_unlock(&dcc->cmd_lock);
+}
+
static int issue_discard_thread(void *data)
{
struct f2fs_sb_info *sbi = data;
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
wait_queue_head_t *q = &dcc->discard_wait_queue;
+ unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
+ int issued;
set_freezable();
do {
- wait_event_interruptible(*q, kthread_should_stop() ||
- freezing(current) ||
- atomic_read(&dcc->discard_cmd_cnt));
+ wait_event_interruptible_timeout(*q,
+ kthread_should_stop() || freezing(current) ||
+ dcc->discard_wake,
+ msecs_to_jiffies(wait_ms));
if (try_to_freeze())
continue;
if (kthread_should_stop())
return 0;
- __issue_discard_cmd(sbi, true);
- __wait_discard_cmd(sbi, true);
+ if (dcc->discard_wake) {
+ dcc->discard_wake = 0;
+ if (sbi->gc_thread && sbi->gc_thread->gc_urgent)
+ mark_discard_range_all(sbi);
+ }
+
+ sb_start_intwrite(sbi->sb);
+
+ issued = __issue_discard_cmd(sbi, true);
+ if (issued) {
+ __wait_discard_cmd(sbi, true);
+ wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
+ } else {
+ wait_ms = DEF_MAX_DISCARD_ISSUE_TIME;
+ }
+
+ sb_end_intwrite(sbi->sb);
- congestion_wait(BLK_RW_SYNC, HZ/50);
} while (!kthread_should_stop());
return 0;
}
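
issue_discard_thread() thus turns from purely event-driven into an adaptive poller: a short interval while commands keep being issued (or when discard_wake is set, e.g. by fstrim or urgent GC), a long one once __issue_discard_cmd() reports nothing went out, with sb_start_intwrite()/sb_end_intwrite() keeping the work inside the freeze protocol. Reduced to a skeleton, reusing the names from the hunk:

	do {
		wait_event_interruptible_timeout(*q,
			kthread_should_stop() || freezing(current) ||
			dcc->discard_wake,
			msecs_to_jiffies(wait_ms));

		issued = __issue_discard_cmd(sbi, true);
		wait_ms = issued ? DEF_MIN_DISCARD_ISSUE_TIME	/* busy: poll soon */
				 : DEF_MAX_DISCARD_ISSUE_TIME;	/* idle: back off */
	} while (!kthread_should_stop());
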
@@ -1320,7 +1456,8 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
- struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list);
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ struct list_head *head = &dcc->entry_list;
struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
@@ -1402,11 +1539,11 @@ skip:
goto find_next;
list_del(&entry->list);
- SM_I(sbi)->dcc_info->nr_discards -= total_len;
+ dcc->nr_discards -= total_len;
kmem_cache_free(discard_entry_slab, entry);
}
- wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue);
+ wake_up_discard_thread(sbi, false);
}
static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
@@ -1424,9 +1561,13 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
if (!dcc)
return -ENOMEM;
+ dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
INIT_LIST_HEAD(&dcc->entry_list);
- for (i = 0; i < MAX_PLIST_NUM; i++)
+ for (i = 0; i < MAX_PLIST_NUM; i++) {
INIT_LIST_HEAD(&dcc->pend_list[i]);
+ if (i >= dcc->discard_granularity - 1)
+ dcc->pend_list_tag[i] |= P_ACTIVE;
+ }
INIT_LIST_HEAD(&dcc->wait_list);
mutex_init(&dcc->cmd_lock);
atomic_set(&dcc->issued_discard, 0);
@@ -1491,6 +1632,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
struct seg_entry *se;
unsigned int segno, offset;
long int new_vblocks;
+ bool exist;
+#ifdef CONFIG_F2FS_CHECK_FS
+ bool mir_exist;
+#endif
segno = GET_SEGNO(sbi, blkaddr);
@@ -1507,17 +1652,25 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
/* Update valid block bitmap */
if (del > 0) {
- if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) {
+ exist = f2fs_test_and_set_bit(offset, se->cur_valid_map);
#ifdef CONFIG_F2FS_CHECK_FS
- if (f2fs_test_and_set_bit(offset,
- se->cur_valid_map_mir))
- f2fs_bug_on(sbi, 1);
- else
- WARN_ON(1);
-#else
+ mir_exist = f2fs_test_and_set_bit(offset,
+ se->cur_valid_map_mir);
+ if (unlikely(exist != mir_exist)) {
+			f2fs_msg(sbi->sb, KERN_ERR, "Inconsistency found "
+				"when setting bitmap, blk:%u, old bit:%d",
+ blkaddr, exist);
f2fs_bug_on(sbi, 1);
+ }
#endif
+ if (unlikely(exist)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Bitmap was wrongly set, blk:%u", blkaddr);
+ f2fs_bug_on(sbi, 1);
+ se->valid_blocks--;
+ del = 0;
}
+
if (f2fs_discard_en(sbi) &&
!f2fs_test_and_set_bit(offset, se->discard_map))
sbi->discard_blks--;
@@ -1528,17 +1681,25 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
se->ckpt_valid_blocks++;
}
} else {
- if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) {
+ exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map);
#ifdef CONFIG_F2FS_CHECK_FS
- if (!f2fs_test_and_clear_bit(offset,
- se->cur_valid_map_mir))
- f2fs_bug_on(sbi, 1);
- else
- WARN_ON(1);
-#else
+ mir_exist = f2fs_test_and_clear_bit(offset,
+ se->cur_valid_map_mir);
+ if (unlikely(exist != mir_exist)) {
+			f2fs_msg(sbi->sb, KERN_ERR, "Inconsistency found "
+				"when clearing bitmap, blk:%u, old bit:%d",
+ blkaddr, exist);
f2fs_bug_on(sbi, 1);
+ }
#endif
+ if (unlikely(!exist)) {
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Bitmap was wrongly cleared, blk:%u", blkaddr);
+ f2fs_bug_on(sbi, 1);
+ se->valid_blocks++;
+ del = 0;
}
+
if (f2fs_discard_en(sbi) &&
f2fs_test_and_clear_bit(offset, se->discard_map))
sbi->discard_blks++;
@@ -1900,7 +2061,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
* This function always allocates a used segment (from the dirty seglist) in
* SSR manner, so it should recover the existing segment information of valid blocks
*/
-static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
+static void change_curseg(struct f2fs_sb_info *sbi, int type)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -1921,12 +2082,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
curseg->alloc_type = SSR;
__next_free_blkoff(sbi, curseg, 0);
- if (reuse) {
- sum_page = get_sum_page(sbi, new_segno);
- sum_node = (struct f2fs_summary_block *)page_address(sum_page);
- memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
- f2fs_put_page(sum_page, 1);
- }
+ sum_page = get_sum_page(sbi, new_segno);
+ sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+ memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
+ f2fs_put_page(sum_page, 1);
}
static int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
@@ -1990,7 +2149,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type))
new_curseg(sbi, type, false);
else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
else
new_curseg(sbi, type, false);
@@ -2083,6 +2242,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
schedule();
}
+	/* It's time to issue all the queued discards */
+ mark_discard_range_all(sbi);
+ f2fs_wait_discard_bios(sbi);
out:
range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
return err;
@@ -2202,9 +2364,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
mutex_unlock(&sit_i->sentry_lock);
- if (page && IS_NODESEG(type))
+ if (page && IS_NODESEG(type)) {
fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
+ f2fs_inode_chksum_set(sbi, page);
+ }
+
if (add_list) {
struct f2fs_bio_info *io;
@@ -2236,7 +2401,8 @@ reallocate:
}
}
-void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
+void write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
+ enum iostat_type io_type)
{
struct f2fs_io_info fio = {
.sbi = sbi,
@@ -2255,6 +2421,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
set_page_writeback(page);
f2fs_submit_page_write(&fio);
+
+ f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE);
}
void write_node_page(unsigned int nid, struct f2fs_io_info *fio)
@@ -2263,6 +2431,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio)
set_summary(&sum, nid, 0, 0);
do_write_page(&sum, fio);
+
+ f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
}
void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
@@ -2276,13 +2446,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio)
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
do_write_page(&sum, fio);
f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
+
+ f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE);
}
int rewrite_data_page(struct f2fs_io_info *fio)
{
+ int err;
+
fio->new_blkaddr = fio->old_blkaddr;
stat_inc_inplace_blocks(fio->sbi);
- return f2fs_submit_page_bio(fio);
+
+ err = f2fs_submit_page_bio(fio);
+
+ f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
+
+ return err;
}
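
Every write-out path now reports its byte count and an iostat_type to f2fs_update_iostat(), so write volume can later be broken down by origin (app, GC, checkpoint) and by target (data, node, meta, discard). The accumulator is presumably little more than a locked per-type counter, along these lines (a sketch; the real helper is expected to live in f2fs.h):

	static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi,
				enum iostat_type type, unsigned long long bytes)
	{
		spin_lock(&sbi->iostat_lock);
		sbi->write_iostat[type] += bytes;
		spin_unlock(&sbi->iostat_lock);
	}
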
void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
@@ -2324,7 +2503,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
/* change the current segment */
if (segno != curseg->segno) {
curseg->next_segno = segno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
@@ -2343,7 +2522,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (recover_curseg) {
if (old_cursegno != curseg->segno) {
curseg->next_segno = old_cursegno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = old_blkoff;
}
@@ -2382,8 +2561,7 @@ void f2fs_wait_on_page_writeback(struct page *page,
}
}
-void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
- block_t blkaddr)
+void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr)
{
struct page *cpage;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 6b871b492fd5..e0a6cc23ace3 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -492,29 +492,11 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi)
return SM_I(sbi)->ovp_segments;
}
-static inline int overprovision_sections(struct f2fs_sb_info *sbi)
-{
- return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi));
-}
-
static inline int reserved_sections(struct f2fs_sb_info *sbi)
{
return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
}
-static inline bool need_SSR(struct f2fs_sb_info *sbi)
-{
- int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
- int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
- int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
-
- if (test_opt(sbi, LFS))
- return false;
-
- return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
- 2 * reserved_sections(sbi));
-}
-
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
int freed, int needed)
{
@@ -577,6 +559,10 @@ static inline bool need_inplace_update_policy(struct inode *inode,
if (test_opt(sbi, LFS))
return false;
+	/* if this is a cold file, we should overwrite it in place to avoid fragmentation */
+ if (file_is_cold(inode))
+ return true;
+
if (policy & (0x1 << F2FS_IPU_FORCE))
return true;
if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
@@ -799,3 +785,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
wbc->nr_to_write = desired;
return desired - nr_to_write;
}
+
+static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force)
+{
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ bool wakeup = false;
+ int i;
+
+ if (force)
+ goto wake_up;
+
+ mutex_lock(&dcc->cmd_lock);
+ for (i = MAX_PLIST_NUM - 1;
+ i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) {
+ if (!list_empty(&dcc->pend_list[i])) {
+ wakeup = true;
+ break;
+ }
+ }
+ mutex_unlock(&dcc->cmd_lock);
+ if (!wakeup)
+ return;
+wake_up:
+ dcc->discard_wake = 1;
+ wake_up_interruptible_all(&dcc->discard_wait_queue);
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 32e4c025e97e..89f61eb3d167 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -25,6 +25,7 @@
#include <linux/quotaops.h>
#include <linux/f2fs_fs.h>
#include <linux/sysfs.h>
+#include <linux/quota.h>
#include "f2fs.h"
#include "node.h"
@@ -107,8 +108,20 @@ enum {
Opt_fault_injection,
Opt_lazytime,
Opt_nolazytime,
+ Opt_quota,
+ Opt_noquota,
Opt_usrquota,
Opt_grpquota,
+ Opt_prjquota,
+ Opt_usrjquota,
+ Opt_grpjquota,
+ Opt_prjjquota,
+ Opt_offusrjquota,
+ Opt_offgrpjquota,
+ Opt_offprjjquota,
+ Opt_jqfmt_vfsold,
+ Opt_jqfmt_vfsv0,
+ Opt_jqfmt_vfsv1,
Opt_err,
};
@@ -144,8 +157,20 @@ static match_table_t f2fs_tokens = {
{Opt_fault_injection, "fault_injection=%u"},
{Opt_lazytime, "lazytime"},
{Opt_nolazytime, "nolazytime"},
+ {Opt_quota, "quota"},
+ {Opt_noquota, "noquota"},
{Opt_usrquota, "usrquota"},
{Opt_grpquota, "grpquota"},
+ {Opt_prjquota, "prjquota"},
+ {Opt_usrjquota, "usrjquota=%s"},
+ {Opt_grpjquota, "grpjquota=%s"},
+ {Opt_prjjquota, "prjjquota=%s"},
+ {Opt_offusrjquota, "usrjquota="},
+ {Opt_offgrpjquota, "grpjquota="},
+ {Opt_offprjjquota, "prjjquota="},
+ {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
+ {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
+ {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
{Opt_err, NULL},
};
@@ -157,7 +182,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf);
+ printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf);
va_end(args);
}
@@ -168,6 +193,104 @@ static void init_once(void *foo)
inode_init_once(&fi->vfs_inode);
}
+#ifdef CONFIG_QUOTA
+static const char * const quotatypes[] = INITQFNAMES;
+#define QTYPE2NAME(t) (quotatypes[t])
+static int f2fs_set_qf_name(struct super_block *sb, int qtype,
+ substring_t *args)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ char *qname;
+ int ret = -EINVAL;
+
+ if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) {
+ f2fs_msg(sb, KERN_ERR,
+ "Cannot change journaled "
+ "quota options when quota turned on");
+ return -EINVAL;
+ }
+ qname = match_strdup(args);
+ if (!qname) {
+ f2fs_msg(sb, KERN_ERR,
+ "Not enough memory for storing quotafile name");
+ return -EINVAL;
+ }
+ if (sbi->s_qf_names[qtype]) {
+ if (strcmp(sbi->s_qf_names[qtype], qname) == 0)
+ ret = 0;
+ else
+ f2fs_msg(sb, KERN_ERR,
+ "%s quota file already specified",
+ QTYPE2NAME(qtype));
+ goto errout;
+ }
+ if (strchr(qname, '/')) {
+ f2fs_msg(sb, KERN_ERR,
+ "quotafile must be on filesystem root");
+ goto errout;
+ }
+ sbi->s_qf_names[qtype] = qname;
+ set_opt(sbi, QUOTA);
+ return 0;
+errout:
+ kfree(qname);
+ return ret;
+}
+
+static int f2fs_clear_qf_name(struct super_block *sb, int qtype)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) {
+ f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options"
+ " when quota turned on");
+ return -EINVAL;
+ }
+ kfree(sbi->s_qf_names[qtype]);
+ sbi->s_qf_names[qtype] = NULL;
+ return 0;
+}
+
+static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
+{
+ /*
+ * We do the test below only for project quotas. 'usrquota' and
+ * 'grpquota' mount options are allowed even without quota feature
+ * to support legacy quotas in quota files.
+ */
+ if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) {
+ f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. "
+ "Cannot enable project quota enforcement.");
+ return -1;
+ }
+ if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] ||
+ sbi->s_qf_names[PRJQUOTA]) {
+ if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA])
+ clear_opt(sbi, USRQUOTA);
+
+ if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA])
+ clear_opt(sbi, GRPQUOTA);
+
+ if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA])
+ clear_opt(sbi, PRJQUOTA);
+
+ if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) ||
+ test_opt(sbi, PRJQUOTA)) {
+ f2fs_msg(sbi->sb, KERN_ERR, "old and new quota "
+ "format mixing");
+ return -1;
+ }
+
+ if (!sbi->s_jquota_fmt) {
+ f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format "
+ "not specified");
+ return -1;
+ }
+ }
+ return 0;
+}
+#endif
+
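
Taken together the new tokens give f2fs the ext4-style quota vocabulary: plain enforcement switches (quota, usrquota, grpquota, prjquota, noquota), journaled quota files (usrjquota=, grpjquota=, prjjquota=, where the empty form clears a previously set name, as remount needs), and the journal format (jqfmt=vfsold|vfsv0|vfsv1). A plausible invocation, with device and file names purely illustrative:

	mount -t f2fs -o prjquota,usrjquota=aquota.user,jqfmt=vfsv1 /dev/vdb /mnt

f2fs_check_quota_options() below then validates the combination: when journaled names are present, plain enforcement flags for those types are dropped and any remaining old-style enforcement is rejected as format mixing, any journaled name requires a jqfmt=, and prjquota is refused unless the on-disk project-quota feature is present.
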
static int parse_options(struct super_block *sb, char *options)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -175,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options)
substring_t args[MAX_OPT_ARGS];
char *p, *name;
int arg = 0;
+#ifdef CONFIG_QUOTA
+ int ret;
+#endif
if (!options)
return 0;
@@ -386,15 +512,76 @@ static int parse_options(struct super_block *sb, char *options)
sb->s_flags &= ~MS_LAZYTIME;
break;
#ifdef CONFIG_QUOTA
+ case Opt_quota:
case Opt_usrquota:
set_opt(sbi, USRQUOTA);
break;
case Opt_grpquota:
set_opt(sbi, GRPQUOTA);
break;
+ case Opt_prjquota:
+ set_opt(sbi, PRJQUOTA);
+ break;
+ case Opt_usrjquota:
+ ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]);
+ if (ret)
+ return ret;
+ break;
+ case Opt_grpjquota:
+ ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]);
+ if (ret)
+ return ret;
+ break;
+ case Opt_prjjquota:
+ ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]);
+ if (ret)
+ return ret;
+ break;
+ case Opt_offusrjquota:
+ ret = f2fs_clear_qf_name(sb, USRQUOTA);
+ if (ret)
+ return ret;
+ break;
+ case Opt_offgrpjquota:
+ ret = f2fs_clear_qf_name(sb, GRPQUOTA);
+ if (ret)
+ return ret;
+ break;
+ case Opt_offprjjquota:
+ ret = f2fs_clear_qf_name(sb, PRJQUOTA);
+ if (ret)
+ return ret;
+ break;
+ case Opt_jqfmt_vfsold:
+ sbi->s_jquota_fmt = QFMT_VFS_OLD;
+ break;
+ case Opt_jqfmt_vfsv0:
+ sbi->s_jquota_fmt = QFMT_VFS_V0;
+ break;
+ case Opt_jqfmt_vfsv1:
+ sbi->s_jquota_fmt = QFMT_VFS_V1;
+ break;
+ case Opt_noquota:
+ clear_opt(sbi, QUOTA);
+ clear_opt(sbi, USRQUOTA);
+ clear_opt(sbi, GRPQUOTA);
+ clear_opt(sbi, PRJQUOTA);
+ break;
#else
+ case Opt_quota:
case Opt_usrquota:
case Opt_grpquota:
+ case Opt_prjquota:
+ case Opt_usrjquota:
+ case Opt_grpjquota:
+ case Opt_prjjquota:
+ case Opt_offusrjquota:
+ case Opt_offgrpjquota:
+ case Opt_offprjjquota:
+ case Opt_jqfmt_vfsold:
+ case Opt_jqfmt_vfsv0:
+ case Opt_jqfmt_vfsv1:
+ case Opt_noquota:
f2fs_msg(sb, KERN_INFO,
"quota operations not supported");
break;
@@ -406,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options)
return -EINVAL;
}
}
+#ifdef CONFIG_QUOTA
+ if (f2fs_check_quota_options(sbi))
+ return -EINVAL;
+#endif
if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) {
f2fs_msg(sb, KERN_ERR,
@@ -439,6 +630,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
init_rwsem(&fi->dio_rwsem[READ]);
init_rwsem(&fi->dio_rwsem[WRITE]);
init_rwsem(&fi->i_mmap_sem);
+ init_rwsem(&fi->i_xattr_sem);
#ifdef CONFIG_QUOTA
memset(&fi->i_dquot, 0, sizeof(fi->i_dquot));
@@ -446,6 +638,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
#endif
/* Will be used by directory only */
fi->i_dir_level = F2FS_SB(sb)->dir_level;
+
return &fi->vfs_inode;
}
@@ -584,7 +777,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
kfree(sbi->devs);
}
-static void f2fs_quota_off_umount(struct super_block *sb);
static void f2fs_put_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -642,7 +834,7 @@ static void f2fs_put_super(struct super_block *sb)
kfree(sbi->ckpt);
- f2fs_exit_sysfs(sbi);
+ f2fs_unregister_sysfs(sbi);
sb->s_fs_info = NULL;
if (sbi->s_chksum_driver)
@@ -651,6 +843,10 @@ static void f2fs_put_super(struct super_block *sb)
destroy_device_list(sbi);
mempool_destroy(sbi->write_io_dummy);
+#ifdef CONFIG_QUOTA
+ for (i = 0; i < MAXQUOTAS; i++)
+ kfree(sbi->s_qf_names[i]);
+#endif
destroy_percpu_info(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
kfree(sbi->write_io[i]);
@@ -664,6 +860,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
trace_f2fs_sync_fs(sb, sync);
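+	/* do not sync while power-on recovery is still in progress */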
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ return -EAGAIN;
+
if (sync) {
struct cp_control cpc;
@@ -698,6 +897,48 @@ static int f2fs_unfreeze(struct super_block *sb)
return 0;
}
+#ifdef CONFIG_QUOTA
+static int f2fs_statfs_project(struct super_block *sb,
+ kprojid_t projid, struct kstatfs *buf)
+{
+ struct kqid qid;
+ struct dquot *dquot;
+ u64 limit;
+ u64 curblock;
+
+ qid = make_kqid_projid(projid);
+ dquot = dqget(sb, qid);
+ if (IS_ERR(dquot))
+ return PTR_ERR(dquot);
+ spin_lock(&dq_data_lock);
+
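+	/* prefer the block soft limit; fall back to the hard limit when unset */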
+ limit = (dquot->dq_dqb.dqb_bsoftlimit ?
+ dquot->dq_dqb.dqb_bsoftlimit :
+ dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+ if (limit && buf->f_blocks > limit) {
+ curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+ buf->f_blocks = limit;
+ buf->f_bfree = buf->f_bavail =
+ (buf->f_blocks > curblock) ?
+ (buf->f_blocks - curblock) : 0;
+ }
+
+ limit = dquot->dq_dqb.dqb_isoftlimit ?
+ dquot->dq_dqb.dqb_isoftlimit :
+ dquot->dq_dqb.dqb_ihardlimit;
+ if (limit && buf->f_files > limit) {
+ buf->f_files = limit;
+ buf->f_ffree =
+ (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ }
+
+ spin_unlock(&dq_data_lock);
+ dqput(dquot);
+ return 0;
+}
+#endif
+
static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -733,9 +974,49 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
+#ifdef CONFIG_QUOTA
+ if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) &&
+ sb_has_quota_limits_enabled(sb, PRJQUOTA)) {
+ f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf);
+ }
+#endif
return 0;
}
+static inline void f2fs_show_quota_options(struct seq_file *seq,
+ struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ if (sbi->s_jquota_fmt) {
+ char *fmtname = "";
+
+ switch (sbi->s_jquota_fmt) {
+ case QFMT_VFS_OLD:
+ fmtname = "vfsold";
+ break;
+ case QFMT_VFS_V0:
+ fmtname = "vfsv0";
+ break;
+ case QFMT_VFS_V1:
+ fmtname = "vfsv1";
+ break;
+ }
+ seq_printf(seq, ",jqfmt=%s", fmtname);
+ }
+
+ if (sbi->s_qf_names[USRQUOTA])
+ seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]);
+
+ if (sbi->s_qf_names[GRPQUOTA])
+ seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]);
+
+ if (sbi->s_qf_names[PRJQUOTA])
+ seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]);
+#endif
+}
+
static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
@@ -809,11 +1090,16 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
sbi->fault_info.inject_rate);
#endif
#ifdef CONFIG_QUOTA
+ if (test_opt(sbi, QUOTA))
+ seq_puts(seq, ",quota");
if (test_opt(sbi, USRQUOTA))
seq_puts(seq, ",usrquota");
if (test_opt(sbi, GRPQUOTA))
seq_puts(seq, ",grpquota");
+ if (test_opt(sbi, PRJQUOTA))
+ seq_puts(seq, ",prjquota");
#endif
+ f2fs_show_quota_options(seq, sbi->sb);
return 0;
}
@@ -862,6 +1148,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
#ifdef CONFIG_F2FS_FAULT_INJECTION
struct f2fs_fault_info ffi = sbi->fault_info;
#endif
+#ifdef CONFIG_QUOTA
+ int s_jquota_fmt;
+ char *s_qf_names[MAXQUOTAS];
+ int i, j;
+#endif
/*
* Save the old mount options in case we
@@ -871,6 +1162,23 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
old_sb_flags = sb->s_flags;
active_logs = sbi->active_logs;
+#ifdef CONFIG_QUOTA
+ s_jquota_fmt = sbi->s_jquota_fmt;
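+	/* duplicate the journaled quota file names so they can be restored if remount fails */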
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (sbi->s_qf_names[i]) {
+ s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
+ GFP_KERNEL);
+ if (!s_qf_names[i]) {
+ for (j = 0; j < i; j++)
+ kfree(s_qf_names[j]);
+ return -ENOMEM;
+ }
+ } else {
+ s_qf_names[i] = NULL;
+ }
+ }
+#endif
+
/* recover superblocks we couldn't write due to previous RO mount */
if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
err = f2fs_commit_super(sbi, false);
@@ -952,6 +1260,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_gc;
}
skip:
+#ifdef CONFIG_QUOTA
+ /* Release old quota file names */
+ for (i = 0; i < MAXQUOTAS; i++)
+ kfree(s_qf_names[i]);
+#endif
/* Update the POSIXACL Flag */
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -966,6 +1279,13 @@ restore_gc:
stop_gc_thread(sbi);
}
restore_opts:
+#ifdef CONFIG_QUOTA
+ sbi->s_jquota_fmt = s_jquota_fmt;
+ for (i = 0; i < MAXQUOTAS; i++) {
+ kfree(sbi->s_qf_names[i]);
+ sbi->s_qf_names[i] = s_qf_names[i];
+ }
+#endif
sbi->mount_opt = org_mount_opt;
sbi->active_logs = active_logs;
sb->s_flags = old_sb_flags;
@@ -1065,7 +1385,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
}
if (len == towrite)
- return err;
+ return 0;
inode->i_version++;
inode->i_mtime = inode->i_ctime = current_time(inode);
f2fs_mark_inode_dirty_sync(inode, false);
@@ -1082,6 +1402,27 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode)
return &F2FS_I(inode)->i_reserved_quota;
}
+static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type)
+{
+ return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type],
+ sbi->s_jquota_fmt, type);
+}
+
+void f2fs_enable_quota_files(struct f2fs_sb_info *sbi)
+{
+ int i, ret;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (sbi->s_qf_names[i]) {
+ ret = f2fs_quota_on_mount(sbi, i);
+ if (ret < 0)
+ f2fs_msg(sbi->sb, KERN_ERR,
+ "Cannot turn on journaled "
+ "quota: error %d", ret);
+ }
+ }
+}
+
static int f2fs_quota_sync(struct super_block *sb, int type)
{
struct quota_info *dqopt = sb_dqopt(sb);
@@ -1119,7 +1460,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id,
struct inode *inode;
int err;
- err = f2fs_quota_sync(sb, -1);
+ err = f2fs_quota_sync(sb, type);
if (err)
return err;
@@ -1147,7 +1488,7 @@ static int f2fs_quota_off(struct super_block *sb, int type)
if (!inode || !igrab(inode))
return dquot_quota_off(sb, type);
- f2fs_quota_sync(sb, -1);
+ f2fs_quota_sync(sb, type);
err = dquot_quota_off(sb, type);
if (err)
@@ -1163,7 +1504,7 @@ out_put:
return err;
}
-static void f2fs_quota_off_umount(struct super_block *sb)
+void f2fs_quota_off_umount(struct super_block *sb)
{
int type;
@@ -1171,6 +1512,12 @@ static void f2fs_quota_off_umount(struct super_block *sb)
f2fs_quota_off(sb, type);
}
+int f2fs_get_projid(struct inode *inode, kprojid_t *projid)
+{
+ *projid = F2FS_I(inode)->i_projid;
+ return 0;
+}
+
static const struct dquot_operations f2fs_quota_operations = {
.get_reserved_space = f2fs_get_reserved_space,
.write_dquot = dquot_commit,
@@ -1180,6 +1527,7 @@ static const struct dquot_operations f2fs_quota_operations = {
.write_info = dquot_commit_info,
.alloc_dquot = dquot_alloc,
.destroy_dquot = dquot_destroy,
+ .get_projid = f2fs_get_projid,
.get_next_id = dquot_get_next_id,
};
@@ -1194,12 +1542,12 @@ static const struct quotactl_ops f2fs_quotactl_ops = {
.get_nextdqblk = dquot_get_next_dqblk,
};
#else
-static inline void f2fs_quota_off_umount(struct super_block *sb)
+void f2fs_quota_off_umount(struct super_block *sb)
{
}
#endif
-static struct super_operations f2fs_sops = {
+static const struct super_operations f2fs_sops = {
.alloc_inode = f2fs_alloc_inode,
.drop_inode = f2fs_drop_inode,
.destroy_inode = f2fs_destroy_inode,
@@ -1303,9 +1651,16 @@ static const struct export_operations f2fs_export_ops = {
static loff_t max_file_blocks(void)
{
- loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
+ loff_t result = 0;
loff_t leaf_count = ADDRS_PER_BLOCK;
+ /*
+ * note: previously, result was equal to (DEF_ADDRS_PER_INODE -
+ * F2FS_INLINE_XATTR_ADDRS), but now f2fs tries to reserve more
+ * space in inode.i_addr, so it is safer to reassign result as
+ * zero.
+ */
+
/* two direct node blocks */
result += (leaf_count * 2);
@@ -1922,6 +2277,11 @@ try_onemore:
sb->s_fs_info = sbi;
sbi->raw_super = raw_super;
+ /* precompute checksum seed for metadata */
+ if (f2fs_sb_has_inode_chksum(sb))
+ sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid,
+ sizeof(raw_super->uuid));
+
/*
* The BLKZONED feature indicates that the drive was formatted with
* zone alignment optimization. This is optional for host-aware
@@ -1956,7 +2316,7 @@ try_onemore:
#ifdef CONFIG_QUOTA
sb->dq_op = &f2fs_quota_operations;
sb->s_qcop = &f2fs_quotactl_ops;
- sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+ sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
#endif
sb->s_op = &f2fs_sops;
@@ -1980,6 +2340,10 @@ try_onemore:
set_sbi_flag(sbi, SBI_POR_DOING);
spin_lock_init(&sbi->stat_lock);
+ /* init iostat info */
+ spin_lock_init(&sbi->iostat_lock);
+ sbi->iostat_enable = false;
+
for (i = 0; i < NR_PAGE_TYPE; i++) {
int n = (i == META) ? 1: NR_TEMP_TYPE;
int j;
@@ -2098,11 +2462,6 @@ try_onemore:
if (err)
goto free_nm;
- /* if there are nt orphan nodes free them */
- err = recover_orphan_inodes(sbi);
- if (err)
- goto free_node_inode;
-
/* read root inode and dentry */
root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
if (IS_ERR(root)) {
@@ -2122,10 +2481,15 @@ try_onemore:
goto free_root_inode;
}
- err = f2fs_init_sysfs(sbi);
+ err = f2fs_register_sysfs(sbi);
if (err)
goto free_root_inode;
+	/* if there are any orphan inodes, free them */
+ err = recover_orphan_inodes(sbi);
+ if (err)
+ goto free_sysfs;
+
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
/*
@@ -2135,7 +2499,7 @@ try_onemore:
if (bdev_read_only(sb->s_bdev) &&
!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
err = -EROFS;
- goto free_sysfs;
+ goto free_meta;
}
if (need_fsck)
@@ -2149,7 +2513,7 @@ try_onemore:
need_fsck = true;
f2fs_msg(sb, KERN_ERR,
"Cannot recover all fsync data errno=%d", err);
- goto free_sysfs;
+ goto free_meta;
}
} else {
err = recover_fsync_data(sbi, true);
@@ -2173,7 +2537,7 @@ skip_recovery:
/* After POR, we can run background GC thread.*/
err = start_gc_thread(sbi);
if (err)
- goto free_sysfs;
+ goto free_meta;
}
kfree(options);
@@ -2191,9 +2555,17 @@ skip_recovery:
f2fs_update_time(sbi, REQ_TIME);
return 0;
-free_sysfs:
+free_meta:
f2fs_sync_inode_meta(sbi);
- f2fs_exit_sysfs(sbi);
+ /*
+	 * Some dirty meta pages can be produced by recover_orphan_inodes()
+	 * when it fails with EIO. Then, iput(node_inode) can trigger balance_fs_bg()
+ * followed by write_checkpoint() through f2fs_write_node_pages(), which
+ * falls into an infinite loop in sync_meta_pages().
+ */
+ truncate_inode_pages_final(META_MAPPING(sbi));
+free_sysfs:
+ f2fs_unregister_sysfs(sbi);
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
@@ -2202,13 +2574,6 @@ free_node_inode:
mutex_lock(&sbi->umount_mutex);
release_ino_entry(sbi, true);
f2fs_leave_shrinker(sbi);
- /*
- * Some dirty meta pages can be produced by recover_orphan_inodes()
- * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg()
- * followed by write_checkpoint() through f2fs_write_node_pages(), which
- * falls into an infinite loop in sync_meta_pages().
- */
- truncate_inode_pages_final(META_MAPPING(sbi));
iput(sbi->node_inode);
mutex_unlock(&sbi->umount_mutex);
f2fs_destroy_stats(sbi);
@@ -2228,6 +2593,10 @@ free_options:
for (i = 0; i < NR_PAGE_TYPE; i++)
kfree(sbi->write_io[i]);
destroy_percpu_info(sbi);
+#ifdef CONFIG_QUOTA
+ for (i = 0; i < MAXQUOTAS; i++)
+ kfree(sbi->s_qf_names[i]);
+#endif
kfree(options);
free_sb_buf:
kfree(raw_super);
@@ -2311,7 +2680,7 @@ static int __init init_f2fs_fs(void)
err = create_extent_cache();
if (err)
goto free_checkpoint_caches;
- err = f2fs_register_sysfs();
+ err = f2fs_init_sysfs();
if (err)
goto free_extent_cache;
err = register_shrinker(&f2fs_shrinker_info);
@@ -2330,7 +2699,7 @@ free_filesystem:
free_shrinker:
unregister_shrinker(&f2fs_shrinker_info);
free_sysfs:
- f2fs_unregister_sysfs();
+ f2fs_exit_sysfs();
free_extent_cache:
destroy_extent_cache();
free_checkpoint_caches:
@@ -2350,7 +2719,7 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
unregister_shrinker(&f2fs_shrinker_info);
- f2fs_unregister_sysfs();
+ f2fs_exit_sysfs();
destroy_extent_cache();
destroy_checkpoint_caches();
destroy_segment_manager_caches();
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 71191d89917d..e2c258f717cd 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -18,7 +18,6 @@
#include "gc.h"
static struct proc_dir_entry *f2fs_proc_root;
-static struct kset *f2fs_kset;
/* Sysfs support for f2fs */
enum {
@@ -41,6 +40,7 @@ struct f2fs_attr {
const char *, size_t);
int struct_type;
int offset;
+ int id;
};
static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
@@ -76,6 +76,34 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
BD_PART_WRITTEN(sbi)));
}
+static ssize_t features_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->sb;
+ int len = 0;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+
+ if (f2fs_sb_has_crypto(sb))
+ len += snprintf(buf, PAGE_SIZE - len, "%s",
+ "encryption");
+ if (f2fs_sb_mounted_blkzoned(sb))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "blkzoned");
+ if (f2fs_sb_has_extra_attr(sb))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "extra_attr");
+ if (f2fs_sb_has_project_quota(sb))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "projquota");
+ if (f2fs_sb_has_inode_chksum(sb))
+ len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len ? ", " : "", "inode_checksum");
+ len += snprintf(buf + len, PAGE_SIZE - len, "\n");
+ return len;
+}
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -124,7 +152,39 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
spin_unlock(&sbi->stat_lock);
return count;
}
+
+ if (!strcmp(a->attr.name, "discard_granularity")) {
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ int i;
+
+ if (t == 0 || t > MAX_PLIST_NUM)
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+
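+		/* activate only the pending lists whose discard length is at least the new granularity */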
+ mutex_lock(&dcc->cmd_lock);
+ for (i = 0; i < MAX_PLIST_NUM; i++) {
+ if (i >= t - 1)
+ dcc->pend_list_tag[i] |= P_ACTIVE;
+ else
+ dcc->pend_list_tag[i] &= (~P_ACTIVE);
+ }
+ mutex_unlock(&dcc->cmd_lock);
+
+ *ui = t;
+ return count;
+ }
+
*ui = t;
+
+ if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0)
+ f2fs_reset_iostat(sbi);
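+	/* switching to urgent GC: wake up the GC and discard threads right away */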
+ if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) {
+ sbi->gc_thread->gc_wake = 1;
+ wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head);
+ wake_up_discard_thread(sbi, true);
+ }
+
return count;
}
@@ -155,6 +215,30 @@ static void f2fs_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
}
+enum feat_id {
+ FEAT_CRYPTO = 0,
+ FEAT_BLKZONED,
+ FEAT_ATOMIC_WRITE,
+ FEAT_EXTRA_ATTR,
+ FEAT_PROJECT_QUOTA,
+ FEAT_INODE_CHECKSUM,
+};
+
+static ssize_t f2fs_feature_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ switch (a->id) {
+ case FEAT_CRYPTO:
+ case FEAT_BLKZONED:
+ case FEAT_ATOMIC_WRITE:
+ case FEAT_EXTRA_ATTR:
+ case FEAT_PROJECT_QUOTA:
+ case FEAT_INODE_CHECKSUM:
+ return snprintf(buf, PAGE_SIZE, "supported\n");
+ }
+ return 0;
+}
+
#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
static struct f2fs_attr f2fs_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
@@ -172,12 +256,23 @@ static struct f2fs_attr f2fs_attr_##_name = { \
#define F2FS_GENERAL_RO_ATTR(name) \
static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
+#define F2FS_FEATURE_RO_ATTR(_name, _id) \
+static struct f2fs_attr f2fs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0444 }, \
+ .show = f2fs_feature_show, \
+ .id = _id, \
+}
+
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time,
+ urgent_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
@@ -191,20 +286,36 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
#ifdef CONFIG_F2FS_FAULT_INJECTION
F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
+F2FS_GENERAL_RO_ATTR(features);
+
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
+#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED);
+#endif
+F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE);
+F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR);
+F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA);
+F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
+ ATTR_LIST(gc_urgent_sleep_time),
ATTR_LIST(gc_min_sleep_time),
ATTR_LIST(gc_max_sleep_time),
ATTR_LIST(gc_no_gc_sleep_time),
ATTR_LIST(gc_idle),
+ ATTR_LIST(gc_urgent),
ATTR_LIST(reclaim_segments),
ATTR_LIST(max_small_discards),
+ ATTR_LIST(discard_granularity),
ATTR_LIST(batched_trim_sections),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
@@ -217,26 +328,59 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(dirty_nats_ratio),
ATTR_LIST(cp_interval),
ATTR_LIST(idle_interval),
+ ATTR_LIST(iostat_enable),
#ifdef CONFIG_F2FS_FAULT_INJECTION
ATTR_LIST(inject_rate),
ATTR_LIST(inject_type),
#endif
ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(features),
ATTR_LIST(reserved_blocks),
NULL,
};
+static struct attribute *f2fs_feat_attrs[] = {
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+ ATTR_LIST(encryption),
+#endif
+#ifdef CONFIG_BLK_DEV_ZONED
+ ATTR_LIST(block_zoned),
+#endif
+ ATTR_LIST(atomic_write),
+ ATTR_LIST(extra_attr),
+ ATTR_LIST(project_quota),
+ ATTR_LIST(inode_checksum),
+ NULL,
+};
+
static const struct sysfs_ops f2fs_attr_ops = {
.show = f2fs_attr_show,
.store = f2fs_attr_store,
};
-static struct kobj_type f2fs_ktype = {
+static struct kobj_type f2fs_sb_ktype = {
.default_attrs = f2fs_attrs,
.sysfs_ops = &f2fs_attr_ops,
.release = f2fs_sb_release,
};
+static struct kobj_type f2fs_ktype = {
+ .sysfs_ops = &f2fs_attr_ops,
+};
+
+static struct kset f2fs_kset = {
+ .kobj = {.ktype = &f2fs_ktype},
+};
+
+static struct kobj_type f2fs_feat_ktype = {
+ .default_attrs = f2fs_feat_attrs,
+ .sysfs_ops = &f2fs_attr_ops,
+};
+
+static struct kobject f2fs_feat = {
+ .kset = &f2fs_kset,
+};
+
static int segment_info_seq_show(struct seq_file *seq, void *offset)
{
struct super_block *sb = seq->private;
@@ -288,6 +432,48 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset)
return 0;
}
+static int iostat_info_seq_show(struct seq_file *seq, void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ time64_t now = ktime_get_real_seconds();
+
+ if (!sbi->iostat_enable)
+ return 0;
+
+ seq_printf(seq, "time: %-16llu\n", now);
+
+ /* print app IOs */
+ seq_printf(seq, "app buffered: %-16llu\n",
+ sbi->write_iostat[APP_BUFFERED_IO]);
+ seq_printf(seq, "app direct: %-16llu\n",
+ sbi->write_iostat[APP_DIRECT_IO]);
+ seq_printf(seq, "app mapped: %-16llu\n",
+ sbi->write_iostat[APP_MAPPED_IO]);
+
+ /* print fs IOs */
+ seq_printf(seq, "fs data: %-16llu\n",
+ sbi->write_iostat[FS_DATA_IO]);
+ seq_printf(seq, "fs node: %-16llu\n",
+ sbi->write_iostat[FS_NODE_IO]);
+ seq_printf(seq, "fs meta: %-16llu\n",
+ sbi->write_iostat[FS_META_IO]);
+ seq_printf(seq, "fs gc data: %-16llu\n",
+ sbi->write_iostat[FS_GC_DATA_IO]);
+ seq_printf(seq, "fs gc node: %-16llu\n",
+ sbi->write_iostat[FS_GC_NODE_IO]);
+ seq_printf(seq, "fs cp data: %-16llu\n",
+ sbi->write_iostat[FS_CP_DATA_IO]);
+ seq_printf(seq, "fs cp node: %-16llu\n",
+ sbi->write_iostat[FS_CP_NODE_IO]);
+ seq_printf(seq, "fs cp meta: %-16llu\n",
+ sbi->write_iostat[FS_CP_META_IO]);
+ seq_printf(seq, "fs discard: %-16llu\n",
+ sbi->write_iostat[FS_DISCARD]);
+
+ return 0;
+}
+
#define F2FS_PROC_FILE_DEF(_name) \
static int _name##_open_fs(struct inode *inode, struct file *file) \
{ \
@@ -303,28 +489,47 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \
F2FS_PROC_FILE_DEF(segment_info);
F2FS_PROC_FILE_DEF(segment_bits);
+F2FS_PROC_FILE_DEF(iostat_info);
-int __init f2fs_register_sysfs(void)
+int __init f2fs_init_sysfs(void)
{
- f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+ int ret;
- f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
- if (!f2fs_kset)
- return -ENOMEM;
- return 0;
+ kobject_set_name(&f2fs_kset.kobj, "f2fs");
+ f2fs_kset.kobj.parent = fs_kobj;
+ ret = kset_register(&f2fs_kset);
+ if (ret)
+ return ret;
+
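+	/* expose the global feature list at /sys/fs/f2fs/features */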
+ ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype,
+ NULL, "features");
+ if (ret)
+ kset_unregister(&f2fs_kset);
+ else
+ f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+ return ret;
}
-void f2fs_unregister_sysfs(void)
+void f2fs_exit_sysfs(void)
{
- kset_unregister(f2fs_kset);
+ kobject_put(&f2fs_feat);
+ kset_unregister(&f2fs_kset);
remove_proc_entry("fs/f2fs", NULL);
+ f2fs_proc_root = NULL;
}
-int f2fs_init_sysfs(struct f2fs_sb_info *sbi)
+int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
{
struct super_block *sb = sbi->sb;
int err;
+ sbi->s_kobj.kset = &f2fs_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ return err;
+
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -333,33 +538,19 @@ int f2fs_init_sysfs(struct f2fs_sb_info *sbi)
&f2fs_seq_segment_info_fops, sb);
proc_create_data("segment_bits", S_IRUGO, sbi->s_proc,
&f2fs_seq_segment_bits_fops, sb);
+ proc_create_data("iostat_info", S_IRUGO, sbi->s_proc,
+ &f2fs_seq_iostat_info_fops, sb);
}
-
- sbi->s_kobj.kset = f2fs_kset;
- init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
- "%s", sb->s_id);
- if (err)
- goto err_out;
return 0;
-err_out:
- if (sbi->s_proc) {
- remove_proc_entry("segment_info", sbi->s_proc);
- remove_proc_entry("segment_bits", sbi->s_proc);
- remove_proc_entry(sb->s_id, f2fs_proc_root);
- }
- return err;
}
-void f2fs_exit_sysfs(struct f2fs_sb_info *sbi)
+void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
{
- kobject_del(&sbi->s_kobj);
- kobject_put(&sbi->s_kobj);
- wait_for_completion(&sbi->s_kobj_unregister);
-
if (sbi->s_proc) {
+ remove_proc_entry("iostat_info", sbi->s_proc);
remove_proc_entry("segment_info", sbi->s_proc);
remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
}
+ kobject_del(&sbi->s_kobj);
}
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 832c5110abab..7c65540148f8 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -442,7 +442,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
} else {
struct dnode_of_data dn;
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
- xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage);
+ xpage = new_node_page(&dn, XATTR_NODE_OFFSET);
if (IS_ERR(xpage)) {
alloc_nid_failed(sbi, new_nid);
return PTR_ERR(xpage);
@@ -473,8 +473,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
+ down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
&entry, &base_addr);
+ up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -503,7 +505,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
int error = 0;
size_t rest = buffer_size;
+ down_read(&F2FS_I(inode)->i_xattr_sem);
error = read_all_xattrs(inode, NULL, &base_addr);
+ up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -686,7 +690,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_lock_op(sbi);
/* protect xattr_ver */
down_write(&F2FS_I(inode)->i_sem);
+ down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
+ up_write(&F2FS_I(inode)->i_xattr_sem);
up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3b01b646e528..0491da3b28c3 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -741,10 +741,21 @@ static void send_sigio_to_task(struct task_struct *p,
si.si_signo = signum;
si.si_errno = 0;
si.si_code = reason;
+ /*
+	 * POSIX defines POLL_IN and friends to be signal
+	 * specific si_codes for SIGPOLL. Linux extended
+ * these si_codes to other signals in a way that is
+ * ambiguous if other signals also have signal
+ * specific si_codes. In that case use SI_SIGIO instead
+ * to remove the ambiguity.
+ */
+ if (sig_specific_sicodes(signum))
+ si.si_code = SI_SIGIO;
+
/* Make sure we are called with one of the POLL_*
reasons, otherwise we could leak kernel stack into
userspace. */
- BUG_ON((reason & __SI_MASK) != __SI_POLL);
+ BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL));
if (reason - POLL_IN >= NSIGPOLL)
si.si_band = ~0L;
else
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index c5b6b7165489..e9e97803442a 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt)
static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
{
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb);
loff_t pos = 0;
return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
@@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
{
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb);
loff_t pos = 0;
/*
* No locking or generic_write_checks(), the server is
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c16d00e53264..13c65dd2d37d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1222,9 +1222,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
struct fuse_in *in;
unsigned reqsize;
- if (task_active_pid_ns(current) != fc->pid_ns)
- return -EIO;
-
restart:
spin_lock(&fiq->waitq.lock);
err = -EAGAIN;
@@ -1262,6 +1259,13 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
in = &req->in;
reqsize = in->h.len;
+
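+	/* translate the request's pid into the pid namespace of the reading daemon */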
+ if (task_active_pid_ns(current) != fc->pid_ns) {
+ rcu_read_lock();
+ in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
+ rcu_read_unlock();
+ }
+
/* If request is too large, reply with an error and restart the read */
if (nbytes < reqsize) {
req->out.h.error = -EIO;
@@ -1823,9 +1827,6 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
struct fuse_req *req;
struct fuse_out_header oh;
- if (task_active_pid_ns(current) != fc->pid_ns)
- return -EIO;
-
if (nbytes < sizeof(struct fuse_out_header))
return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 00800c07ba1c..622081b97426 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -923,33 +923,29 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
return err;
}
-int fuse_update_attributes(struct inode *inode, struct kstat *stat,
- struct file *file, bool *refreshed)
+static int fuse_update_get_attr(struct inode *inode, struct file *file,
+ struct kstat *stat)
{
struct fuse_inode *fi = get_fuse_inode(inode);
- int err;
- bool r;
+ int err = 0;
if (time_before64(fi->i_time, get_jiffies_64())) {
- r = true;
forget_all_cached_acls(inode);
err = fuse_do_getattr(inode, stat, file);
- } else {
- r = false;
- err = 0;
- if (stat) {
- generic_fillattr(inode, stat);
- stat->mode = fi->orig_i_mode;
- stat->ino = fi->orig_ino;
- }
+ } else if (stat) {
+ generic_fillattr(inode, stat);
+ stat->mode = fi->orig_i_mode;
+ stat->ino = fi->orig_ino;
}
- if (refreshed != NULL)
- *refreshed = r;
-
return err;
}
+int fuse_update_attributes(struct inode *inode, struct file *file)
+{
+ return fuse_update_get_attr(inode, file, NULL);
+}
+
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
u64 child_nodeid, struct qstr *name)
{
@@ -1786,7 +1782,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat,
if (!fuse_allow_current_process(fc))
return -EACCES;
- return fuse_update_attributes(inode, stat, NULL, NULL);
+ return fuse_update_get_attr(inode, NULL, stat);
}
static const struct inode_operations fuse_dir_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d66789804287..cb7dff5c45d7 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -645,7 +645,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
loff_t pos, size_t count, fl_owner_t owner)
{
- struct file *file = io->file;
+ struct file *file = io->iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
@@ -707,7 +707,8 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
static int fuse_do_readpage(struct file *file, struct page *page)
{
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
+ struct kiocb iocb;
+ struct fuse_io_priv io;
struct inode *inode = page->mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_req *req;
@@ -735,6 +736,8 @@ static int fuse_do_readpage(struct file *file, struct page *page)
req->num_pages = 1;
req->pages[0] = page;
req->page_descs[0].length = count;
+ init_sync_kiocb(&iocb, file);
+ io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb);
num_read = fuse_send_read(req, &io, pos, count, NULL);
err = req->out.h.error;
@@ -923,7 +926,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (fc->auto_inval_data ||
(iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
int err;
- err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
+ err = fuse_update_attributes(inode, iocb->ki_filp);
if (err)
return err;
}
@@ -957,13 +960,18 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
loff_t pos, size_t count, fl_owner_t owner)
{
- struct file *file = io->file;
+ struct kiocb *iocb = io->iocb;
+ struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
struct fuse_write_in *inarg = &req->misc.write.in;
fuse_write_fill(req, ff, pos, count);
inarg->flags = file->f_flags;
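+	/* pass kiocb-level sync requirements to the server as O_DSYNC/O_SYNC open flags */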
+ if (iocb->ki_flags & IOCB_DSYNC)
+ inarg->flags |= O_DSYNC;
+ if (iocb->ki_flags & IOCB_SYNC)
+ inarg->flags |= O_SYNC;
if (owner != NULL) {
inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
inarg->lock_owner = fuse_lock_owner_id(fc, owner);
@@ -993,14 +1001,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos)
return ret;
}
-static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
+static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb,
struct inode *inode, loff_t pos,
size_t count)
{
size_t res;
unsigned offset;
unsigned i;
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
for (i = 0; i < req->num_pages; i++)
fuse_wait_on_page_writeback(inode, req->pages[i]->index);
@@ -1100,7 +1108,7 @@ static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
FUSE_MAX_PAGES_PER_REQ);
}
-static ssize_t fuse_perform_write(struct file *file,
+static ssize_t fuse_perform_write(struct kiocb *iocb,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos)
{
@@ -1133,7 +1141,7 @@ static ssize_t fuse_perform_write(struct file *file,
} else {
size_t num_written;
- num_written = fuse_send_write_pages(req, file, inode,
+ num_written = fuse_send_write_pages(req, iocb, inode,
pos, count);
err = req->out.h.error;
if (!err) {
@@ -1169,7 +1177,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (get_fuse_conn(inode)->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
- err = fuse_update_attributes(mapping->host, NULL, file, NULL);
+ err = fuse_update_attributes(mapping->host, file);
if (err)
return err;
@@ -1201,7 +1209,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
pos += written;
- written_buffered = fuse_perform_write(file, mapping, from, pos);
+ written_buffered = fuse_perform_write(iocb, mapping, from, pos);
if (written_buffered < 0) {
err = written_buffered;
goto out;
@@ -1220,13 +1228,15 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
} else {
- written = fuse_perform_write(file, mapping, from, iocb->ki_pos);
+ written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
if (written >= 0)
iocb->ki_pos += written;
}
out:
current->backing_dev_info = NULL;
inode_unlock(inode);
+ if (written > 0)
+ written = generic_write_sync(iocb, written);
return written ? written : err;
}
@@ -1317,7 +1327,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
{
int write = flags & FUSE_DIO_WRITE;
int cuse = flags & FUSE_DIO_CUSE;
- struct file *file = io->file;
+ struct file *file = io->iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
@@ -1399,8 +1409,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
loff_t *ppos)
{
ssize_t res;
- struct file *file = io->file;
- struct inode *inode = file_inode(file);
+ struct inode *inode = file_inode(io->iocb->ki_filp);
if (is_bad_inode(inode))
return -EIO;
@@ -1414,15 +1423,14 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
return __fuse_direct_read(&io, to, &iocb->ki_pos);
}
static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file);
- struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
ssize_t res;
if (is_bad_inode(inode))
@@ -2181,9 +2189,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
return 0;
- if (pid && pid_nr == 0)
- return -EOVERFLOW;
-
fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
err = fuse_simple_request(fc, &args);
@@ -2303,7 +2308,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
fallback:
- err = fuse_update_attributes(inode, NULL, file, NULL);
+ err = fuse_update_attributes(inode, file);
if (!err)
return generic_file_llseek(file, offset, whence);
else
@@ -2323,7 +2328,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
break;
case SEEK_END:
inode_lock(inode);
- retval = fuse_update_attributes(inode, NULL, file, NULL);
+ retval = fuse_update_attributes(inode, file);
if (!retval)
retval = generic_file_llseek(file, offset, whence);
inode_unlock(inode);
@@ -2874,7 +2879,6 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
io->offset = offset;
io->write = (iov_iter_rw(iter) == WRITE);
io->err = 0;
- io->file = file;
/*
* By default, we want to optimize all I/Os with async request
* submission to the client filesystem if supported.
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bd4d2a3e1ec1..d5773ca67ad2 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -252,16 +252,15 @@ struct fuse_io_priv {
bool should_dirty;
int err;
struct kiocb *iocb;
- struct file *file;
struct completion *done;
bool blocking;
};
-#define FUSE_IO_PRIV_SYNC(f) \
+#define FUSE_IO_PRIV_SYNC(i) \
{ \
.refcnt = KREF_INIT(1), \
.async = 0, \
- .file = f, \
+ .iocb = i, \
}
/**
@@ -905,8 +904,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
void fuse_update_ctime(struct inode *inode);
-int fuse_update_attributes(struct inode *inode, struct kstat *stat,
- struct file *file, bool *refreshed);
+int fuse_update_attributes(struct inode *inode, struct file *file);
void fuse_flush_writepages(struct inode *inode);
diff --git a/fs/inode.c b/fs/inode.c
index 210054157a49..d1e35b53bb23 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1570,11 +1570,24 @@ EXPORT_SYMBOL(bmap);
static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode,
bool rcu)
{
- if (!rcu) {
- struct inode *realinode = d_real_inode(dentry);
+ struct dentry *upperdentry;
- if (unlikely(inode != realinode) &&
- (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) ||
+ /*
+	 * Nothing to do during RCU-walk, or if the dentry is not on overlayfs
+ */
+ if (rcu || likely(!(dentry->d_flags & DCACHE_OP_REAL)))
+ return;
+
+ upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER);
+
+ /*
+	 * If the file is on a lower layer then we can't update atime, so there is
+	 * no worry about stale mtime/ctime.
+ */
+ if (upperdentry) {
+ struct inode *realinode = d_inode(upperdentry);
+
+ if ((!timespec_equal(&inode->i_mtime, &realinode->i_mtime) ||
!timespec_equal(&inode->i_ctime, &realinode->i_ctime))) {
inode->i_mtime = realinode->i_mtime;
inode->i_ctime = realinode->i_ctime;
diff --git a/fs/internal.h b/fs/internal.h
index fedfe94d84ba..48cee21b4f14 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -71,8 +71,10 @@ extern void __init mnt_init(void);
extern int __mnt_want_write(struct vfsmount *);
extern int __mnt_want_write_file(struct file *);
+extern int mnt_want_write_file_path(struct file *);
extern void __mnt_drop_write(struct vfsmount *);
extern void __mnt_drop_write_file(struct file *);
+extern void mnt_drop_write_file_path(struct file *);
/*
* fs_struct.c
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 27d577dbe51a..96c1d14c18f1 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -235,12 +235,8 @@ reclaimer(void *ptr)
struct net *net = host->net;
req = kmalloc(sizeof(*req), GFP_KERNEL);
- if (!req) {
- printk(KERN_ERR "lockd: reclaimer unable to alloc memory."
- " Locks for %s won't be reclaimed!\n",
- host->h_name);
+ if (!req)
return 0;
- }
allow_signal(SIGKILL);
diff --git a/fs/namespace.c b/fs/namespace.c
index f8893dc6a989..df0f7521979a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -431,13 +431,18 @@ int __mnt_want_write_file(struct file *file)
}
/**
- * mnt_want_write_file - get write access to a file's mount
+ * mnt_want_write_file_path - get write access to a file's mount
* @file: the file who's mount on which to take a write
*
* This is like mnt_want_write, but it takes a file and can
* do some optimisations if the file is open for write already
+ *
+ * Called by the vfs for cases when we have an open file at hand, but will do an
+ * inode operation on it (important distinction for files opened on overlayfs,
+ * since the file operations will come from the real underlying file, while
+ * inode operations come from the overlay).
*/
-int mnt_want_write_file(struct file *file)
+int mnt_want_write_file_path(struct file *file)
{
int ret;
@@ -447,6 +452,53 @@ int mnt_want_write_file(struct file *file)
sb_end_write(file->f_path.mnt->mnt_sb);
return ret;
}
+
+static inline int may_write_real(struct file *file)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct dentry *upperdentry;
+
+ /* Writable file? */
+ if (file->f_mode & FMODE_WRITER)
+ return 0;
+
+ /* Not overlayfs? */
+ if (likely(!(dentry->d_flags & DCACHE_OP_REAL)))
+ return 0;
+
+ /* File refers to upper, writable layer? */
+ upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER);
+ if (upperdentry && file_inode(file) == d_inode(upperdentry))
+ return 0;
+
+ /* Lower layer: can't write to real file, sorry... */
+ return -EPERM;
+}
+
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file whose mount to take a write on
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ *
+ * Mostly called by filesystems from their ioctl operation before performing
+ * modification. On overlayfs this needs to check if the file is on a read-only
+ * lower layer and deny access in that case.
+ */
+int mnt_want_write_file(struct file *file)
+{
+ int ret;
+
+ ret = may_write_real(file);
+ if (!ret) {
+ sb_start_write(file_inode(file)->i_sb);
+ ret = __mnt_want_write_file(file);
+ if (ret)
+ sb_end_write(file_inode(file)->i_sb);
+ }
+ return ret;
+}
EXPORT_SYMBOL_GPL(mnt_want_write_file);
/**
@@ -484,10 +536,16 @@ void __mnt_drop_write_file(struct file *file)
__mnt_drop_write(file->f_path.mnt);
}
-void mnt_drop_write_file(struct file *file)
+void mnt_drop_write_file_path(struct file *file)
{
mnt_drop_write(file->f_path.mnt);
}
+
+void mnt_drop_write_file(struct file *file)
+{
+ __mnt_drop_write(file->f_path.mnt);
+ sb_end_write(file_inode(file)->i_sb);
+}
EXPORT_SYMBOL(mnt_drop_write_file);
static int mnt_make_readonly(struct mount *mnt)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 5427cdf04c5a..14358de173fb 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -51,7 +51,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp,
goto out_iput;
res->size = i_size_read(inode);
res->change_attr = delegation->change_attr;
- if (nfsi->nrequests != 0)
+ if (nfs_have_writebacks(inode))
res->change_attr++;
res->ctime = inode->i_ctime;
res->mtime = inode->i_mtime;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d7df5e67b0c1..606dd3871f66 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -1089,7 +1089,7 @@ bool nfs4_delegation_flush_on_close(const struct inode *inode)
delegation = rcu_dereference(nfsi->delegation);
if (delegation == NULL || !(delegation->type & FMODE_WRITE))
goto out;
- if (nfsi->nrequests < delegation->pagemod_limit)
+ if (atomic_long_read(&nfsi->nrequests) < delegation->pagemod_limit)
ret = false;
out:
rcu_read_unlock();
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3522b1249019..5ceaeb1f6fb6 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2260,7 +2260,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
spin_lock(&inode->i_lock);
retry = false;
}
- res->jiffies = cache->jiffies;
res->cred = cache->cred;
res->mask = cache->mask;
list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru);
@@ -2296,7 +2295,6 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
goto out;
if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
goto out;
- res->jiffies = cache->jiffies;
res->cred = cache->cred;
res->mask = cache->mask;
err = 0;
@@ -2344,7 +2342,6 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
if (cache == NULL)
return;
RB_CLEAR_NODE(&cache->rb_node);
- cache->jiffies = set->jiffies;
cache->cred = get_rpccred(set->cred);
cache->mask = set->mask;
@@ -2432,7 +2429,6 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE
| NFS_MAY_WRITE | NFS_MAY_READ;
cache.cred = cred;
- cache.jiffies = jiffies;
status = NFS_PROTO(inode)->access(inode, &cache);
if (status != 0) {
if (status == -ESTALE) {
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 6fb9fad2d1e6..d2972d537469 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -616,13 +616,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode,
struct list_head *list,
struct nfs_commit_info *cinfo)
{
- spin_lock(&cinfo->inode->i_lock);
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
#ifdef CONFIG_NFS_V4_1
if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
#endif
nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
- spin_unlock(&cinfo->inode->i_lock);
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index af330c31f627..a385d1c3f146 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -631,11 +631,11 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (result <= 0)
goto out;
- result = generic_write_sync(iocb, result);
- if (result < 0)
- goto out;
written = result;
iocb->ki_pos += written;
+ result = generic_write_sync(iocb, written);
+ if (result < 0)
+ goto out;
/* Return error values */
if (nfs_need_check_write(file, inode)) {
@@ -744,15 +744,18 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
goto out;
/*
- * Revalidate the cache if the server has time stamps granular
- * enough to detect subsecond changes. Otherwise, clear the
- * cache to prevent missing any changes.
+ * Invalidate cache to prevent missing any changes. If
+ * the file is mapped, clear the page cache as well so
+	 * those mappings will be reloaded.
*
* This makes locking act as a cache coherency point.
*/
nfs_sync_mapping(filp->f_mapping);
- if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
nfs_zap_caches(inode);
+ if (mapping_mapped(filp->f_mapping))
+ nfs_revalidate_mapping(inode, filp->f_mapping);
+ }
out:
return status;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 109279d6d91b..134d9f560240 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1285,7 +1285,6 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
{
- struct nfs_inode *nfsi = NFS_I(inode);
unsigned long ret = 0;
if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
@@ -1315,7 +1314,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
&& (fattr->valid & NFS_ATTR_FATTR_SIZE)
&& i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
- && nfsi->nrequests == 0) {
+ && !nfs_have_writebacks(inode)) {
i_size_write(inode, nfs_size_to_loff_t(fattr->size));
ret |= NFS_INO_INVALID_ATTR;
}
@@ -1823,7 +1822,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (new_isize != cur_isize) {
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
- if (nfsi->nrequests == 0 || new_isize > cur_isize) {
+ if (!nfs_have_writebacks(inode) || new_isize > cur_isize) {
i_size_write(inode, new_isize);
if (!have_writers)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
@@ -2012,10 +2011,11 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
INIT_LIST_HEAD(&nfsi->commit_info.list);
- nfsi->nrequests = 0;
- nfsi->commit_info.ncommit = 0;
+ atomic_long_set(&nfsi->nrequests, 0);
+ atomic_long_set(&nfsi->commit_info.ncommit, 0);
atomic_set(&nfsi->commit_info.rpcs_out, 0);
init_rwsem(&nfsi->rmdir_sem);
+ mutex_init(&nfsi->commit_mutex);
nfs4_init_once(nfsi);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index dc456416d2be..68cc22083639 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -251,7 +251,6 @@ int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
void nfs_pgio_header_free(struct nfs_pgio_header *);
-void nfs_pgio_data_destroy(struct nfs_pgio_header *);
int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 40bd05f05e74..ac4f10b7f6c1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -303,6 +303,17 @@ _nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
struct rpc_cred *newcred = NULL;
rpc_authflavor_t flavor;
+ if (sp4_mode == NFS_SP4_MACH_CRED_CLEANUP ||
+ sp4_mode == NFS_SP4_MACH_CRED_PNFS_CLEANUP) {
+ /* Using machine creds for cleanup operations
+ * is only relevent if the client credentials
+ * might expire. So don't bother for
+ * RPC_AUTH_UNIX. If file was only exported to
+ * sec=sys, the PUTFH would fail anyway.
+ */
+ if ((*clntp)->cl_auth->au_flavor == RPC_AUTH_UNIX)
+ return false;
+ }
if (test_bit(sp4_mode, &clp->cl_sp4_flags)) {
spin_lock(&clp->cl_lock);
if (clp->cl_machine_cred != NULL)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d90132642340..6c61e2b99635 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1659,12 +1659,52 @@ update:
return state;
}
+static struct inode *
+nfs4_opendata_get_inode(struct nfs4_opendata *data)
+{
+ struct inode *inode;
+
+ switch (data->o_arg.claim) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ if (!(data->f_attr.valid & NFS_ATTR_FATTR))
+ return ERR_PTR(-EAGAIN);
+ inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh,
+ &data->f_attr, data->f_label);
+ break;
+ default:
+ inode = d_inode(data->dentry);
+ ihold(inode);
+ nfs_refresh_inode(inode, &data->f_attr);
+ }
+ return inode;
+}
+
static struct nfs4_state *
-_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data)
{
+ struct nfs4_state *state;
struct inode *inode;
- struct nfs4_state *state = NULL;
- int ret;
+
+ inode = nfs4_opendata_get_inode(data);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ if (data->state != NULL && data->state->inode == inode) {
+ state = data->state;
+ atomic_inc(&state->count);
+ } else
+ state = nfs4_get_open_state(inode, data->owner);
+ iput(inode);
+ if (state == NULL)
+ state = ERR_PTR(-ENOMEM);
+ return state;
+}
+
+static struct nfs4_state *
+_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
+{
+ struct nfs4_state *state;
if (!data->rpc_done) {
state = nfs4_try_open_cached(data);
@@ -1672,29 +1712,17 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
goto out;
}
- ret = -EAGAIN;
- if (!(data->f_attr.valid & NFS_ATTR_FATTR))
- goto err;
- inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label);
- ret = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto err;
- ret = -ENOMEM;
- state = nfs4_get_open_state(inode, data->owner);
- if (state == NULL)
- goto err_put_inode;
+ state = nfs4_opendata_find_nfs4_state(data);
+ if (IS_ERR(state))
+ goto out;
+
if (data->o_res.delegation_type != 0)
nfs4_opendata_check_deleg(data, state);
update_open_stateid(state, &data->o_res.stateid, NULL,
data->o_arg.fmode);
- iput(inode);
out:
nfs_release_seqid(data->o_arg.seqid);
return state;
-err_put_inode:
- iput(inode);
-err:
- return ERR_PTR(ret);
}
static struct nfs4_state *
@@ -2071,7 +2099,6 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0];
case NFS4_OPEN_CLAIM_FH:
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
- nfs_copy_fh(&data->o_res.fh, data->o_arg.fh);
}
data->timestamp = jiffies;
if (nfs4_setup_sequence(data->o_arg.server->nfs_client,
@@ -2258,7 +2285,6 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
mask = NFS4_ACCESS_READ;
cache.cred = cred;
- cache.jiffies = jiffies;
nfs_access_set_mask(&cache, opendata->o_res.access_result);
nfs_access_add_cache(state->inode, &cache);
@@ -7318,7 +7344,9 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
1 << (OP_DESTROY_SESSION - 32) |
1 << (OP_DESTROY_CLIENTID - 32)
};
+ unsigned long flags = 0;
unsigned int i;
+ int ret = 0;
if (sp->how == SP4_MACH_CRED) {
/* Print state protect result */
@@ -7334,7 +7362,8 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) {
if (sp->enforce.u.words[i] & ~supported_enforce[i]) {
dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
}
@@ -7353,10 +7382,11 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) {
dfprintk(MOUNT, "sp4_mach_cred:\n");
dfprintk(MOUNT, " minimal mode enabled\n");
- set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags);
+ __set_bit(NFS_SP4_MACH_CRED_MINIMAL, &flags);
} else {
dfprintk(MOUNT, "sp4_mach_cred: disabled\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
@@ -7364,110 +7394,46 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
test_bit(OP_LOCKU, sp->allow.u.longs)) {
dfprintk(MOUNT, " cleanup mode enabled\n");
- set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
+ __set_bit(NFS_SP4_MACH_CRED_CLEANUP, &flags);
}
if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
dfprintk(MOUNT, " pnfs cleanup mode enabled\n");
- set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
- &clp->cl_sp4_flags);
+ __set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, &flags);
}
if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
dfprintk(MOUNT, " secinfo mode enabled\n");
- set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags);
+ __set_bit(NFS_SP4_MACH_CRED_SECINFO, &flags);
}
if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) &&
test_bit(OP_FREE_STATEID, sp->allow.u.longs)) {
dfprintk(MOUNT, " stateid mode enabled\n");
- set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags);
+ __set_bit(NFS_SP4_MACH_CRED_STATEID, &flags);
}
if (test_bit(OP_WRITE, sp->allow.u.longs)) {
dfprintk(MOUNT, " write mode enabled\n");
- set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags);
+ __set_bit(NFS_SP4_MACH_CRED_WRITE, &flags);
}
if (test_bit(OP_COMMIT, sp->allow.u.longs)) {
dfprintk(MOUNT, " commit mode enabled\n");
- set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags);
+ __set_bit(NFS_SP4_MACH_CRED_COMMIT, &flags);
}
}
-
-	return 0;
+out:
+	clp->cl_sp4_flags = flags;
+	return ret;
}
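
The hunk above swaps the atomic set_bit() calls on clp->cl_sp4_flags for non-atomic __set_bit() on a local bitmask that is published once at the out: label, so a failed negotiation leaves the client's flags in a consistent state. A minimal user-space sketch of the accumulate-then-publish pattern (all names hypothetical):

    #include <stdio.h>

    /* hypothetical flag bits in the style of NFS_SP4_MACH_CRED_* */
    enum { MODE_MINIMAL = 1 << 0, MODE_CLEANUP = 1 << 1 };

    struct client { unsigned long sp4_flags; };

    /* accumulate into a local mask; publish to the client exactly once */
    static int select_mode(struct client *clp, int server_allows_cleanup)
    {
        unsigned long flags = 0;
        int ret = 0;

        flags |= MODE_MINIMAL;
        if (server_allows_cleanup)
            flags |= MODE_CLEANUP;

        clp->sp4_flags = flags;   /* single visible update, even when ret != 0 */
        return ret;
    }

    int main(void)
    {
        struct client c = { 0 };

        select_mode(&c, 1);
        printf("flags=%#lx\n", c.sp4_flags);
        return 0;
    }
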
struct nfs41_exchange_id_data {
struct nfs41_exchange_id_res res;
struct nfs41_exchange_id_args args;
- struct rpc_xprt *xprt;
- int rpc_status;
};
-static void nfs4_exchange_id_done(struct rpc_task *task, void *data)
-{
- struct nfs41_exchange_id_data *cdata =
- (struct nfs41_exchange_id_data *)data;
- struct nfs_client *clp = cdata->args.client;
- int status = task->tk_status;
-
- trace_nfs4_exchange_id(clp, status);
-
- if (status == 0)
- status = nfs4_check_cl_exchange_flags(cdata->res.flags);
-
- if (cdata->xprt && status == 0) {
- status = nfs4_detect_session_trunking(clp, &cdata->res,
- cdata->xprt);
- goto out;
- }
-
- if (status == 0)
- status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect);
-
- if (status == 0) {
- clp->cl_clientid = cdata->res.clientid;
- clp->cl_exchange_flags = cdata->res.flags;
- clp->cl_seqid = cdata->res.seqid;
- /* Client ID is not confirmed */
- if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R))
- clear_bit(NFS4_SESSION_ESTABLISHED,
- &clp->cl_session->session_state);
-
- kfree(clp->cl_serverowner);
- clp->cl_serverowner = cdata->res.server_owner;
- cdata->res.server_owner = NULL;
-
- /* use the most recent implementation id */
- kfree(clp->cl_implid);
- clp->cl_implid = cdata->res.impl_id;
- cdata->res.impl_id = NULL;
-
- if (clp->cl_serverscope != NULL &&
- !nfs41_same_server_scope(clp->cl_serverscope,
- cdata->res.server_scope)) {
- dprintk("%s: server_scope mismatch detected\n",
- __func__);
- set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
- kfree(clp->cl_serverscope);
- clp->cl_serverscope = NULL;
- }
-
- if (clp->cl_serverscope == NULL) {
- clp->cl_serverscope = cdata->res.server_scope;
- cdata->res.server_scope = NULL;
- }
- /* Save the EXCHANGE_ID verifier session trunk tests */
- memcpy(clp->cl_confirm.data, cdata->args.verifier.data,
- sizeof(clp->cl_confirm.data));
- }
-out:
- cdata->rpc_status = status;
- return;
-}
-
static void nfs4_exchange_id_release(void *data)
{
struct nfs41_exchange_id_data *cdata =
@@ -7481,7 +7447,6 @@ static void nfs4_exchange_id_release(void *data)
}
static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
- .rpc_call_done = nfs4_exchange_id_done,
.rpc_release = nfs4_exchange_id_release,
};
@@ -7490,7 +7455,8 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
*
* Wrapper for EXCHANGE_ID operation.
*/
-static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+static struct rpc_task *
+nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
u32 sp4_how, struct rpc_xprt *xprt)
{
struct rpc_message msg = {
@@ -7504,17 +7470,15 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
.flags = RPC_TASK_TIMEOUT,
};
struct nfs41_exchange_id_data *calldata;
- struct rpc_task *task;
int status;
if (!atomic_inc_not_zero(&clp->cl_count))
- return -EIO;
+ return ERR_PTR(-EIO);
+ status = -ENOMEM;
calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
- if (!calldata) {
- nfs_put_client(clp);
- return -ENOMEM;
- }
+ if (!calldata)
+ goto out;
nfs4_init_boot_verifier(clp, &calldata->args.verifier);
@@ -7553,34 +7517,22 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
goto out_impl_id;
}
if (xprt) {
- calldata->xprt = xprt;
task_setup_data.rpc_xprt = xprt;
task_setup_data.flags |= RPC_TASK_SOFTCONN;
memcpy(calldata->args.verifier.data, clp->cl_confirm.data,
sizeof(calldata->args.verifier.data));
}
calldata->args.client = clp;
-#ifdef CONFIG_NFS_V4_1_MIGRATION
calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
- EXCHGID4_FLAG_BIND_PRINC_STATEID |
- EXCHGID4_FLAG_SUPP_MOVED_MIGR,
-#else
- calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
- EXCHGID4_FLAG_BIND_PRINC_STATEID,
+ EXCHGID4_FLAG_BIND_PRINC_STATEID;
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+ calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR;
#endif
msg.rpc_argp = &calldata->args;
msg.rpc_resp = &calldata->res;
task_setup_data.callback_data = calldata;
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- return PTR_ERR(task);
-
- status = calldata->rpc_status;
-
- rpc_put_task(task);
-out:
- return status;
+ return rpc_run_task(&task_setup_data);
out_impl_id:
kfree(calldata->res.impl_id);
@@ -7590,8 +7542,69 @@ out_server_owner:
kfree(calldata->res.server_owner);
out_calldata:
kfree(calldata);
+out:
nfs_put_client(clp);
- goto out;
+ return ERR_PTR(status);
+}
+
+/*
+ * _nfs4_proc_exchange_id()
+ *
+ * Wrapper for EXCHANGE_ID operation.
+ */
+static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
+ u32 sp4_how)
+{
+ struct rpc_task *task;
+ struct nfs41_exchange_id_args *argp;
+ struct nfs41_exchange_id_res *resp;
+ int status;
+
+ task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ argp = task->tk_msg.rpc_argp;
+ resp = task->tk_msg.rpc_resp;
+ status = task->tk_status;
+ if (status != 0)
+ goto out;
+
+ status = nfs4_check_cl_exchange_flags(resp->flags);
+ if (status != 0)
+ goto out;
+
+ status = nfs4_sp4_select_mode(clp, &resp->state_protect);
+ if (status != 0)
+ goto out;
+
+ clp->cl_clientid = resp->clientid;
+ clp->cl_exchange_flags = resp->flags;
+ clp->cl_seqid = resp->seqid;
+ /* Client ID is not confirmed */
+ if (!(resp->flags & EXCHGID4_FLAG_CONFIRMED_R))
+ clear_bit(NFS4_SESSION_ESTABLISHED,
+ &clp->cl_session->session_state);
+
+ if (clp->cl_serverscope != NULL &&
+ !nfs41_same_server_scope(clp->cl_serverscope,
+ resp->server_scope)) {
+ dprintk("%s: server_scope mismatch detected\n",
+ __func__);
+ set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+ }
+
+ swap(clp->cl_serverowner, resp->server_owner);
+ swap(clp->cl_serverscope, resp->server_scope);
+ swap(clp->cl_implid, resp->impl_id);
+
+ /* Save the EXCHANGE_ID verifier session trunk tests */
+ memcpy(clp->cl_confirm.data, argp->verifier.data,
+ sizeof(clp->cl_confirm.data));
+out:
+ trace_nfs4_exchange_id(clp, status);
+ rpc_put_task(task);
+ return status;
}
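
_nfs4_proc_exchange_id() is now a synchronous consumer of nfs4_run_exchange_id(): it waits for the task, reads tk_status and the argument/result structs off the task, applies the results to the client, and drops the task reference, which is what makes the old rpc_call_done callback removable. A rough user-space analogue of the run-then-harvest split, using a joined thread to stand in for the RPC task (names hypothetical):

    #include <pthread.h>
    #include <stdio.h>

    struct task { pthread_t tid; int status; int result; };

    static void *worker(void *arg)
    {
        struct task *t = arg;

        t->result = 42;             /* stand-in for the decoded RPC reply */
        t->status = 0;              /* stand-in for task->tk_status */
        return NULL;
    }

    /* analogue of nfs4_run_exchange_id(): start the work, return a handle */
    static int run_task(struct task *t)
    {
        return pthread_create(&t->tid, NULL, worker, t) ? -1 : 0;
    }

    /* analogue of the synchronous wrapper: wait, then harvest status/results */
    static int run_sync(void)
    {
        struct task t = { .status = -1 };

        if (run_task(&t))
            return -1;
        pthread_join(t.tid, NULL);              /* the "wait" step */
        if (t.status == 0)
            printf("result=%d\n", t.result);    /* apply results here */
        return t.status;                        /* then drop the handle */
    }

    int main(void) { return run_sync(); }
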
/*
@@ -7614,13 +7627,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
/* try SP4_MACH_CRED if krb5i/p */
if (authflavor == RPC_AUTH_GSS_KRB5I ||
authflavor == RPC_AUTH_GSS_KRB5P) {
- status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL);
+ status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
if (!status)
return 0;
}
/* try SP4_NONE */
- return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL);
+ return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
}
/**
@@ -7642,6 +7655,9 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
void *data)
{
struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+ struct rpc_task *task;
+ int status;
+
u32 sp4_how;
dprintk("--> %s try %s\n", __func__,
@@ -7650,7 +7666,17 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
/* Test connection for session trunking. Async exchange_id call */
- return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
+ task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+
+ status = task->tk_status;
+ if (status == 0)
+ status = nfs4_detect_session_trunking(adata->clp,
+ task->tk_msg.rpc_resp, xprt);
+
+ rpc_put_task(task);
+ return status;
}
EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index de9066a92c0d..bec120ec1967 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -134,19 +134,14 @@ EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
/*
* nfs_page_group_lock - lock the head of the page group
* @req - request in group that is to be locked
- * @nonblock - if true don't block waiting for lock
*
- * this lock must be held if modifying the page group list
+ * this lock must be held when traversing or modifying the page
+ * group list
*
- * return 0 on success, < 0 on error: -EDELAY if nonblocking or the
- * result from wait_on_bit_lock
- *
- * NOTE: calling with nonblock=false should always have set the
- * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock
- * with TASK_UNINTERRUPTIBLE), so there is no need to check the result.
+ * return 0 on success, < 0 on error
*/
int
-nfs_page_group_lock(struct nfs_page *req, bool nonblock)
+nfs_page_group_lock(struct nfs_page *req)
{
struct nfs_page *head = req->wb_head;
@@ -155,35 +150,10 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock)
if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
return 0;
- if (!nonblock) {
- set_bit(PG_CONTENDED1, &head->wb_flags);
- smp_mb__after_atomic();
- return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
- TASK_UNINTERRUPTIBLE);
- }
-
- return -EAGAIN;
-}
-
-/*
- * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it
- * @req - a request in the group
- *
- * This is a blocking call to wait for the group lock to be cleared.
- */
-void
-nfs_page_group_lock_wait(struct nfs_page *req)
-{
- struct nfs_page *head = req->wb_head;
-
- WARN_ON_ONCE(head != head->wb_head);
-
- if (!test_bit(PG_HEADLOCK, &head->wb_flags))
- return;
set_bit(PG_CONTENDED1, &head->wb_flags);
smp_mb__after_atomic();
- wait_on_bit(&head->wb_flags, PG_HEADLOCK,
- TASK_UNINTERRUPTIBLE);
+ return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+ TASK_UNINTERRUPTIBLE);
}
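
nfs_page_group_lock() collapses to a single blocking form: test_and_set_bit() as the uncontended fast path, then set the contended marker and sleep in wait_on_bit_lock() until PG_HEADLOCK clears. A compact C11-atomics model of the try-then-wait bit lock (a yielding loop stands in for the kernel's sleeping wait):

    #include <sched.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define HEADLOCK 0x1UL

    /* try-then-wait bit lock: test_and_set_bit + wait_on_bit_lock in spirit */
    static void bit_lock(atomic_ulong *flags)
    {
        /* fast path: grab the bit if it is clear */
        if (!(atomic_fetch_or(flags, HEADLOCK) & HEADLOCK))
            return;
        /* slow path: retry until the holder clears the bit */
        while (atomic_fetch_or(flags, HEADLOCK) & HEADLOCK)
            sched_yield();
    }

    static void bit_unlock(atomic_ulong *flags)
    {
        atomic_fetch_and(flags, ~HEADLOCK);
    }

    int main(void)
    {
        atomic_ulong wb_flags = 0;

        bit_lock(&wb_flags);
        puts("locked");
        bit_unlock(&wb_flags);
        return 0;
    }
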
/*
@@ -246,7 +216,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
{
bool ret;
- nfs_page_group_lock(req, false);
+ nfs_page_group_lock(req);
ret = nfs_page_group_sync_on_bit_locked(req, bit);
nfs_page_group_unlock(req);
@@ -288,9 +258,7 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
inode = page_file_mapping(req->wb_page)->host;
set_bit(PG_INODE_REF, &req->wb_flags);
kref_get(&req->wb_kref);
- spin_lock(&inode->i_lock);
- NFS_I(inode)->nrequests++;
- spin_unlock(&inode->i_lock);
+ atomic_long_inc(&NFS_I(inode)->nrequests);
}
}
}
@@ -306,14 +274,11 @@ static void
nfs_page_group_destroy(struct kref *kref)
{
struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+ struct nfs_page *head = req->wb_head;
struct nfs_page *tmp, *next;
- /* subrequests must release the ref on the head request */
- if (req->wb_head != req)
- nfs_release_request(req->wb_head);
-
if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
- return;
+ goto out;
tmp = req;
do {
@@ -324,6 +289,10 @@ nfs_page_group_destroy(struct kref *kref)
nfs_free_request(tmp);
tmp = next;
} while (tmp != req);
+out:
+ /* subrequests must release the ref on the head request */
+ if (head != req)
+ nfs_release_request(head);
}
/**
@@ -465,6 +434,7 @@ void nfs_release_request(struct nfs_page *req)
{
kref_put(&req->wb_kref, nfs_page_group_destroy);
}
+EXPORT_SYMBOL_GPL(nfs_release_request);
/**
* nfs_wait_on_request - Wait for a request to complete.
@@ -483,6 +453,7 @@ nfs_wait_on_request(struct nfs_page *req)
return wait_on_bit_io(&req->wb_flags, PG_BUSY,
TASK_UNINTERRUPTIBLE);
}
+EXPORT_SYMBOL_GPL(nfs_wait_on_request);
/*
* nfs_generic_pg_test - determine if requests can be coalesced
@@ -530,16 +501,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
}
EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
-/*
- * nfs_pgio_header_free - Free a read or write header
- * @hdr: The header to free
- */
-void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
-{
- hdr->rw_ops->rw_free_header(hdr);
-}
-EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
-
/**
* nfs_pgio_data_destroy - make @hdr suitable for reuse
*
@@ -548,14 +509,24 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
*
* @hdr: A header that has had nfs_generic_pgio called
*/
-void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
+static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
{
if (hdr->args.context)
put_nfs_open_context(hdr->args.context);
if (hdr->page_array.pagevec != hdr->page_array.page_array)
kfree(hdr->page_array.pagevec);
}
-EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
+
+/*
+ * nfs_pgio_header_free - Free a read or write header
+ * @hdr: The header to free
+ */
+void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
+{
+ nfs_pgio_data_destroy(hdr);
+ hdr->rw_ops->rw_free_header(hdr);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
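
nfs_pgio_data_destroy() becomes static and is called from nfs_pgio_header_free(), giving the header a single teardown entry point; the call sites removed below (nfs_pgio_error, nfs_pgio_release, and the pNFS MDS fallbacks) no longer have to remember it. The ownership shape in plain C (names hypothetical):

    #include <stdio.h>
    #include <stdlib.h>

    struct header { void *pagevec; };

    /* private helper: release per-I/O resources */
    static void data_destroy(struct header *h)
    {
        free(h->pagevec);
        h->pagevec = NULL;
    }

    /* the one public teardown: always destroys the data, then the header */
    static void header_free(struct header *h)
    {
        data_destroy(h);
        free(h);
    }

    int main(void)
    {
        struct header *h = calloc(1, sizeof(*h));

        if (!h)
            return 1;
        h->pagevec = malloc(16);
        header_free(h);     /* callers never call data_destroy() directly */
        puts("freed");
        return 0;
    }
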
/**
* nfs_pgio_rpcsetup - Set up arguments for a pageio call
@@ -669,7 +640,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
static void nfs_pgio_error(struct nfs_pgio_header *hdr)
{
set_bit(NFS_IOHDR_REDO, &hdr->flags);
- nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
}
@@ -680,7 +650,6 @@ static void nfs_pgio_error(struct nfs_pgio_header *hdr)
static void nfs_pgio_release(void *calldata)
{
struct nfs_pgio_header *hdr = calldata;
- nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
}
@@ -711,12 +680,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
const struct nfs_pgio_completion_ops *compl_ops,
const struct nfs_rw_ops *rw_ops,
size_t bsize,
- int io_flags,
- gfp_t gfp_flags)
+ int io_flags)
{
- struct nfs_pgio_mirror *new;
- int i;
-
desc->pg_moreio = 0;
desc->pg_inode = inode;
desc->pg_ops = pg_ops;
@@ -732,23 +697,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_mirror_count = 1;
desc->pg_mirror_idx = 0;
- if (pg_ops->pg_get_mirror_count) {
- /* until we have a request, we don't have an lseg and no
- * idea how many mirrors there will be */
- new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
- sizeof(struct nfs_pgio_mirror), gfp_flags);
- desc->pg_mirrors_dynamic = new;
- desc->pg_mirrors = new;
-
- for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++)
- nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize);
- } else {
- desc->pg_mirrors_dynamic = NULL;
- desc->pg_mirrors = desc->pg_mirrors_static;
- nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
- }
+ desc->pg_mirrors_dynamic = NULL;
+ desc->pg_mirrors = desc->pg_mirrors_static;
+ nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
}
-EXPORT_SYMBOL_GPL(nfs_pageio_init);
/**
* nfs_pgio_result - Basic pageio error handling
@@ -865,32 +817,52 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
return ret;
}
+static struct nfs_pgio_mirror *
+nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc,
+ unsigned int mirror_count)
+{
+ struct nfs_pgio_mirror *ret;
+ unsigned int i;
+
+ kfree(desc->pg_mirrors_dynamic);
+ desc->pg_mirrors_dynamic = NULL;
+ if (mirror_count == 1)
+ return desc->pg_mirrors_static;
+ ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_NOFS);
+ if (ret != NULL) {
+ for (i = 0; i < mirror_count; i++)
+ nfs_pageio_mirror_init(&ret[i], desc->pg_bsize);
+ desc->pg_mirrors_dynamic = ret;
+ }
+ return ret;
+}
+
/*
* nfs_pageio_setup_mirroring - determine if mirroring is to be used
* by calling the pg_get_mirror_count op
*/
-static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
+static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- int mirror_count = 1;
-
- if (!pgio->pg_ops->pg_get_mirror_count)
- return 0;
+ unsigned int mirror_count = 1;
- mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
-
- if (pgio->pg_error < 0)
- return pgio->pg_error;
-
- if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
- return -EINVAL;
+ if (pgio->pg_ops->pg_get_mirror_count)
+ mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+ if (mirror_count == pgio->pg_mirror_count || pgio->pg_error < 0)
+ return;
- if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic))
- return -EINVAL;
+ if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) {
+ pgio->pg_error = -EINVAL;
+ return;
+ }
+ pgio->pg_mirrors = nfs_pageio_alloc_mirrors(pgio, mirror_count);
+ if (pgio->pg_mirrors == NULL) {
+ pgio->pg_error = -ENOMEM;
+ pgio->pg_mirrors = pgio->pg_mirrors_static;
+ mirror_count = 1;
+ }
pgio->pg_mirror_count = mirror_count;
-
- return 0;
}
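
nfs_pageio_setup_mirroring() now returns void: mirror arrays are allocated lazily per request by nfs_pageio_alloc_mirrors(), and failure is reported through pgio->pg_error while the descriptor falls back to the static single mirror. The allocate-or-fall-back idiom, sketched (hypothetical names):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct desc {
        int pg_error;
        unsigned int mirror_count;
        int *mirrors;           /* points at static_mirror or a heap array */
        int static_mirror[1];
    };

    static void setup_mirroring(struct desc *d, unsigned int want)
    {
        if (want == 1) {
            d->mirrors = d->static_mirror;
            d->mirror_count = 1;
            return;
        }
        d->mirrors = calloc(want, sizeof(*d->mirrors));
        if (!d->mirrors) {
            d->pg_error = -ENOMEM;          /* record, don't return, the error */
            d->mirrors = d->static_mirror;  /* keep the descriptor usable */
            d->mirror_count = 1;
            return;
        }
        d->mirror_count = want;
    }

    int main(void)
    {
        struct desc d = { 0 };

        setup_mirroring(&d, 4);
        printf("count=%u err=%d\n", d.mirror_count, d.pg_error);
        if (d.mirrors != d.static_mirror)
            free(d.mirrors);
        return 0;
    }
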
/*
@@ -1036,7 +1008,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
unsigned int bytes_left = 0;
unsigned int offset, pgbase;
- nfs_page_group_lock(req, false);
+ nfs_page_group_lock(req);
subreq = req;
bytes_left = subreq->wb_bytes;
@@ -1058,7 +1030,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (mirror->pg_recoalesce)
return 0;
/* retry add_request for this subreq */
- nfs_page_group_lock(req, false);
+ nfs_page_group_lock(req);
continue;
}
@@ -1155,7 +1127,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
for (midx = 0; midx < desc->pg_mirror_count; midx++) {
if (midx) {
- nfs_page_group_lock(req, false);
+ nfs_page_group_lock(req);
/* find the last request */
for (lastreq = req->wb_head;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c383d0913b54..7879ed8ceb76 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -529,47 +529,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);
-static void pnfs_free_lseg_async_work(struct work_struct *work)
-{
- struct pnfs_layout_segment *lseg;
- struct pnfs_layout_hdr *lo;
-
- lseg = container_of(work, struct pnfs_layout_segment, pls_work);
- lo = lseg->pls_layout;
-
- pnfs_free_lseg(lseg);
- pnfs_put_layout_hdr(lo);
-}
-
-static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
-{
- INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
- schedule_work(&lseg->pls_work);
-}
-
-void
-pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
-{
- if (!lseg)
- return;
-
- assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);
-
- dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
- atomic_read(&lseg->pls_refcount),
- test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
- if (atomic_dec_and_test(&lseg->pls_refcount)) {
- struct pnfs_layout_hdr *lo = lseg->pls_layout;
- if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
- return;
- pnfs_layout_remove_lseg(lo, lseg);
- if (!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) {
- pnfs_get_layout_hdr(lo);
- pnfs_free_lseg_async(lseg);
- }
- }
-}
-
/*
* is l2 fully contained in l1?
* start1 end1
@@ -2274,7 +2233,6 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
nfs_pageio_reset_write_mds(desc);
mirror->pg_recoalesce = 1;
}
- nfs_pgio_data_destroy(hdr);
hdr->release(hdr);
}
@@ -2398,7 +2356,6 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
nfs_pageio_reset_read_mds(desc);
mirror->pg_recoalesce = 1;
}
- nfs_pgio_data_destroy(hdr);
hdr->release(hdr);
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 99731e3e332f..87f144f14d1e 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -67,7 +67,6 @@ struct pnfs_layout_segment {
u32 pls_seq;
unsigned long pls_flags;
struct pnfs_layout_hdr *pls_layout;
- struct work_struct pls_work;
};
enum pnfs_try_status {
@@ -230,7 +229,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
/* pnfs.c */
void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
-void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
void unset_pnfs_layoutdriver(struct nfs_server *);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 25f28fa64c57..60da59be83b6 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -83,34 +83,11 @@ pnfs_generic_clear_request_commit(struct nfs_page *req,
}
out:
nfs_request_remove_commit_list(req, cinfo);
- pnfs_put_lseg_locked(freeme);
+ pnfs_put_lseg(freeme);
}
EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
static int
-pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
- struct nfs_commit_info *cinfo, int max)
-{
- struct nfs_page *req, *tmp;
- int ret = 0;
-
- list_for_each_entry_safe(req, tmp, src, wb_list) {
- if (!nfs_lock_request(req))
- continue;
- kref_get(&req->wb_kref);
- if (cond_resched_lock(&cinfo->inode->i_lock))
- list_safe_reset_next(req, tmp, wb_list);
- nfs_request_remove_commit_list(req, cinfo);
- clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
- nfs_list_add_request(req, dst);
- ret++;
- if ((ret == max) && !cinfo->dreq)
- break;
- }
- return ret;
-}
-
-static int
pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
struct nfs_commit_info *cinfo,
int max)
@@ -119,15 +96,15 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
struct list_head *dst = &bucket->committing;
int ret;
- lockdep_assert_held(&cinfo->inode->i_lock);
- ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
+ ret = nfs_scan_commit_list(src, dst, cinfo, max);
if (ret) {
cinfo->ds->nwritten -= ret;
cinfo->ds->ncommitting += ret;
if (bucket->clseg == NULL)
bucket->clseg = pnfs_get_lseg(bucket->wlseg);
if (list_empty(src)) {
- pnfs_put_lseg_locked(bucket->wlseg);
+ pnfs_put_lseg(bucket->wlseg);
bucket->wlseg = NULL;
}
}
@@ -142,7 +119,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
{
int i, rv = 0, cnt;
- lockdep_assert_held(&cinfo->inode->i_lock);
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
cinfo, max);
@@ -162,11 +139,10 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst,
int nwritten;
int i;
- lockdep_assert_held(&cinfo->inode->i_lock);
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
restart:
for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
- nwritten = pnfs_generic_transfer_commit_list(&b->written,
- dst, cinfo, 0);
+ nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0);
if (!nwritten)
continue;
cinfo->ds->nwritten -= nwritten;
@@ -953,12 +929,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
struct list_head *list;
struct pnfs_commit_bucket *buckets;
- spin_lock(&cinfo->inode->i_lock);
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
buckets = cinfo->ds->buckets;
list = &buckets[ds_commit_idx].written;
if (list_empty(list)) {
if (!pnfs_is_valid_lseg(lseg)) {
- spin_unlock(&cinfo->inode->i_lock);
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
cinfo->completion_ops->resched_write(cinfo, req);
return;
}
@@ -975,7 +951,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
cinfo->ds->nwritten++;
nfs_request_add_commit_list_locked(req, list, cinfo);
- spin_unlock(&cinfo->inode->i_lock);
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
nfs_mark_page_unstable(req->wb_page, cinfo);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
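
The pnfs_nfs.c hunks swap the commit-list protection from the inode spinlock to NFS_I(inode)->commit_mutex, which is what allows nfs_scan_commit_list() to sleep while scanning (see the fs/nfs/write.c changes below). The lock-type change in miniature, with pthreads (illustrative only):

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t commit_mutex = PTHREAD_MUTEX_INITIALIZER;
    static long ncommit;

    /* unlike a spinlock, a mutex-protected section may sleep */
    static void mark_request_commit(void)
    {
        pthread_mutex_lock(&commit_mutex);
        ncommit++;
        usleep(1000);   /* sleeping here would be illegal under a spinlock */
        pthread_mutex_unlock(&commit_mutex);
    }

    int main(void)
    {
        mark_request_commit();
        printf("ncommit=%ld\n", ncommit);
        return 0;
    }
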
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a8421d9dab6a..0d42573d423d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -68,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
pg_ops = server->pnfs_curr_ld->pg_read_ops;
#endif
nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops,
- server->rsize, 0, GFP_KERNEL);
+ server->rsize, 0);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d828ef88e7db..6b179af59b92 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1691,8 +1691,8 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
rpc_authflavor_t *server_authlist, unsigned int count)
{
rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
+ bool found_auth_null = false;
unsigned int i;
- int use_auth_null = false;
/*
* If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1701,6 +1701,10 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
* AUTH_NULL has a special meaning when it's in the server list - it
* means that the server will ignore the rpc creds, so any flavor
* can be used but still use the sec= that was specified.
+ *
+ * Note also that the MNT procedure in MNTv1 does not return a list
+ * of supported security flavors. In this case, nfs_mount() fabricates
+ * a security flavor list containing just AUTH_NULL.
*/
for (i = 0; i < count; i++) {
flavor = server_authlist[i];
@@ -1709,11 +1713,11 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
goto out;
if (flavor == RPC_AUTH_NULL)
- use_auth_null = true;
+ found_auth_null = true;
}
- if (use_auth_null) {
- flavor = RPC_AUTH_NULL;
+ if (found_auth_null) {
+ flavor = args->auth_info.flavors[0];
goto out;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b1af5dee5e0a..f68083db63c8 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -102,10 +102,8 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void)
{
struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
- if (p) {
- memset(p, 0, sizeof(*p));
- p->rw_mode = FMODE_WRITE;
- }
+ memset(p, 0, sizeof(*p));
+ p->rw_mode = FMODE_WRITE;
return p;
}
@@ -154,6 +152,14 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
}
+static struct nfs_page *
+nfs_page_private_request(struct page *page)
+{
+ if (!PagePrivate(page))
+ return NULL;
+ return (struct nfs_page *)page_private(page);
+}
+
/*
* nfs_page_find_head_request_locked - find head request associated with @page
*
@@ -162,21 +168,41 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
* returns matching head request with reference held, or NULL if not found.
*/
static struct nfs_page *
-nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
+nfs_page_find_private_request(struct page *page)
{
- struct nfs_page *req = NULL;
-
- if (PagePrivate(page))
- req = (struct nfs_page *)page_private(page);
- else if (unlikely(PageSwapCache(page)))
- req = nfs_page_search_commits_for_head_request_locked(nfsi,
- page);
+ struct address_space *mapping = page_file_mapping(page);
+ struct nfs_page *req;
+ if (!PagePrivate(page))
+ return NULL;
+ spin_lock(&mapping->private_lock);
+ req = nfs_page_private_request(page);
if (req) {
WARN_ON_ONCE(req->wb_head != req);
kref_get(&req->wb_kref);
}
+ spin_unlock(&mapping->private_lock);
+ return req;
+}
+static struct nfs_page *
+nfs_page_find_swap_request(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_page *req = NULL;
+ if (!PageSwapCache(page))
+ return NULL;
+ mutex_lock(&nfsi->commit_mutex);
+ if (PageSwapCache(page)) {
+ req = nfs_page_search_commits_for_head_request_locked(nfsi,
+ page);
+ if (req) {
+ WARN_ON_ONCE(req->wb_head != req);
+ kref_get(&req->wb_kref);
+ }
+ }
+ mutex_unlock(&nfsi->commit_mutex);
return req;
}
@@ -187,12 +213,11 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
*/
static struct nfs_page *nfs_page_find_head_request(struct page *page)
{
- struct inode *inode = page_file_mapping(page)->host;
- struct nfs_page *req = NULL;
+ struct nfs_page *req;
- spin_lock(&inode->i_lock);
- req = nfs_page_find_head_request_locked(NFS_I(inode), page);
- spin_unlock(&inode->i_lock);
+ req = nfs_page_find_private_request(page);
+ if (!req)
+ req = nfs_page_find_swap_request(page);
return req;
}
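
nfs_page_find_head_request() is now a two-step lookup: the page-private pointer under mapping->private_lock first, then, only for swapcache pages, a search of the commit lists under commit_mutex. The try-cheap-then-fallback shape (locking and refcounts elided; names hypothetical):

    #include <stddef.h>
    #include <stdio.h>

    struct req { int id; };
    struct page { struct req *private; int swapcache; };

    static struct req *find_private(struct page *p)
    {
        return p->private;      /* in the kernel: under mapping->private_lock */
    }

    static struct req *find_swap(struct page *p)
    {
        static struct req swap_req = { .id = 2 };

        return p->swapcache ? &swap_req : NULL;   /* under commit_mutex */
    }

    /* cheap private-pointer path first, swapcache commit-list path second */
    static struct req *find_head(struct page *p)
    {
        struct req *r = find_private(p);

        if (!r)
            r = find_swap(p);
        return r;
    }

    int main(void)
    {
        struct page p = { .private = NULL, .swapcache = 1 };
        struct req *r = find_head(&p);

        printf("id=%d\n", r ? r->id : -1);
        return 0;
    }
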
@@ -241,9 +266,6 @@ nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
{
struct nfs_page *req;
- WARN_ON_ONCE(head != head->wb_head);
- WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags));
-
req = head;
do {
if (page_offset >= req->wb_pgbase &&
@@ -269,20 +291,17 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
unsigned int pos = 0;
unsigned int len = nfs_page_length(req->wb_page);
- nfs_page_group_lock(req, false);
+ nfs_page_group_lock(req);
- do {
+ for (;;) {
tmp = nfs_page_group_search_locked(req->wb_head, pos);
- if (tmp) {
- /* no way this should happen */
- WARN_ON_ONCE(tmp->wb_pgbase != pos);
- pos += tmp->wb_bytes - (pos - tmp->wb_pgbase);
- }
- } while (tmp && pos < len);
+ if (!tmp)
+ break;
+ pos = tmp->wb_pgbase + tmp->wb_bytes;
+ }
nfs_page_group_unlock(req);
- WARN_ON_ONCE(pos > len);
- return pos == len;
+ return pos >= len;
}
/* We can set the PG_uptodate flag if we see that a write request
@@ -333,8 +352,11 @@ static void nfs_end_page_writeback(struct nfs_page *req)
{
struct inode *inode = page_file_mapping(req->wb_page)->host;
struct nfs_server *nfss = NFS_SERVER(inode);
+ bool is_done;
- if (!nfs_page_group_sync_on_bit(req, PG_WB_END))
+ is_done = nfs_page_group_sync_on_bit(req, PG_WB_END);
+ nfs_unlock_request(req);
+ if (!is_done)
return;
end_page_writeback(req->wb_page);
@@ -342,22 +364,6 @@ static void nfs_end_page_writeback(struct nfs_page *req)
clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
-
-/* nfs_page_group_clear_bits
- * @req - an nfs request
- * clears all page group related bits from @req
- */
-static void
-nfs_page_group_clear_bits(struct nfs_page *req)
-{
- clear_bit(PG_TEARDOWN, &req->wb_flags);
- clear_bit(PG_UNLOCKPAGE, &req->wb_flags);
- clear_bit(PG_UPTODATE, &req->wb_flags);
- clear_bit(PG_WB_END, &req->wb_flags);
- clear_bit(PG_REMOVE, &req->wb_flags);
-}
-
-
/*
* nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req
*
@@ -366,43 +372,24 @@ nfs_page_group_clear_bits(struct nfs_page *req)
* @inode - inode associated with request page group, must be holding inode lock
* @head - head request of page group, must be holding head lock
* @req - request that couldn't lock and needs to wait on the req bit lock
- * @nonblock - if true, don't actually wait
*
- * NOTE: this must be called holding page_group bit lock and inode spin lock
- * and BOTH will be released before returning.
+ * NOTE: this must be called holding page_group bit lock
+ * which will be released before returning.
*
* returns 0 on success, < 0 on error.
*/
-static int
-nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
- struct nfs_page *req, bool nonblock)
- __releases(&inode->i_lock)
+static void
+nfs_unroll_locks(struct inode *inode, struct nfs_page *head,
+ struct nfs_page *req)
{
struct nfs_page *tmp;
- int ret;
/* relinquish all the locks successfully grabbed this run */
- for (tmp = head ; tmp != req; tmp = tmp->wb_this_page)
- nfs_unlock_request(tmp);
-
- WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
-
- /* grab a ref on the request that will be waited on */
- kref_get(&req->wb_kref);
-
- nfs_page_group_unlock(head);
- spin_unlock(&inode->i_lock);
-
- /* release ref from nfs_page_find_head_request_locked */
- nfs_release_request(head);
-
- if (!nonblock)
- ret = nfs_wait_on_request(req);
- else
- ret = -EAGAIN;
- nfs_release_request(req);
-
- return ret;
+ for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
+ if (!kref_read(&tmp->wb_kref))
+ continue;
+ nfs_unlock_and_release_request(tmp);
+ }
}
/*
@@ -417,7 +404,8 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
*/
static void
nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
- struct nfs_page *old_head)
+ struct nfs_page *old_head,
+ struct inode *inode)
{
while (destroy_list) {
struct nfs_page *subreq = destroy_list;
@@ -428,33 +416,28 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
WARN_ON_ONCE(old_head != subreq->wb_head);
/* make sure old group is not used */
- subreq->wb_head = subreq;
subreq->wb_this_page = subreq;
- /* subreq is now totally disconnected from page group or any
- * write / commit lists. last chance to wake any waiters */
- nfs_unlock_request(subreq);
+ clear_bit(PG_REMOVE, &subreq->wb_flags);
- if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) {
- /* release ref on old head request */
- nfs_release_request(old_head);
+ /* Note: races with nfs_page_group_destroy() */
+ if (!kref_read(&subreq->wb_kref)) {
+ /* Check if we raced with nfs_page_group_destroy() */
+ if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags))
+ nfs_free_request(subreq);
+ continue;
+ }
- nfs_page_group_clear_bits(subreq);
+ subreq->wb_head = subreq;
- /* release the PG_INODE_REF reference */
- if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags))
- nfs_release_request(subreq);
- else
- WARN_ON_ONCE(1);
- } else {
- WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags));
- /* zombie requests have already released the last
- * reference and were waiting on the rest of the
- * group to complete. Since it's no longer part of a
- * group, simply free the request */
- nfs_page_group_clear_bits(subreq);
- nfs_free_request(subreq);
+ if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
+ nfs_release_request(subreq);
+ atomic_long_dec(&NFS_I(inode)->nrequests);
}
+
+ /* subreq is now totally disconnected from page group or any
+ * write / commit lists. last chance to wake any waiters */
+ nfs_unlock_and_release_request(subreq);
}
}
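
The rewritten teardown distinguishes live subrequests (nonzero refcount: disconnect, then unlock and release) from zombies that already dropped their last reference while waiting on the rest of the group (refcount zero: free directly, but only if PG_TEARDOWN was set by nfs_page_group_destroy()). That zombie check in miniature (a loose illustrative model, not the kernel's exact race rules):

    #include <stdbool.h>
    #include <stdio.h>

    struct sub { int refs; bool teardown; };

    static void free_sub(struct sub *s)
    {
        printf("freed %p\n", (void *)s);
    }

    static void unlock_and_release(struct sub *s)
    {
        if (--s->refs == 0)
            free_sub(s);
    }

    static void destroy_unlinked(struct sub *s)
    {
        if (s->refs == 0) {         /* zombie: raced with group destroy */
            if (s->teardown)        /* PG_TEARDOWN set: we own the free */
                free_sub(s);
            return;
        }
        /* live: disconnect from the group, then unlock + drop our ref */
        unlock_and_release(s);
    }

    int main(void)
    {
        struct sub live = { .refs = 1 };
        struct sub zombie = { .refs = 0, .teardown = true };

        destroy_unlinked(&live);
        destroy_unlinked(&zombie);
        return 0;
    }
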
@@ -464,7 +447,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
* operations for this page.
*
* @page - the page used to lookup the "page group" of nfs_page structures
- * @nonblock - if true, don't block waiting for request locks
*
* This function joins all sub requests to the head request by first
* locking all requests in the group, cancelling any pending operations
@@ -478,7 +460,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
* error was encountered.
*/
static struct nfs_page *
-nfs_lock_and_join_requests(struct page *page, bool nonblock)
+nfs_lock_and_join_requests(struct page *page)
{
struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *head, *subreq;
@@ -487,43 +469,63 @@ nfs_lock_and_join_requests(struct page *page, bool nonblock)
int ret;
try_again:
- total_bytes = 0;
-
- WARN_ON_ONCE(destroy_list);
-
- spin_lock(&inode->i_lock);
-
/*
* A reference is taken only on the head request which acts as a
* reference to the whole page group - the group will not be destroyed
* until the head reference is released.
*/
- head = nfs_page_find_head_request_locked(NFS_I(inode), page);
-
- if (!head) {
- spin_unlock(&inode->i_lock);
+ head = nfs_page_find_head_request(page);
+ if (!head)
return NULL;
- }
- /* holding inode lock, so always make a non-blocking call to try the
- * page group lock */
- ret = nfs_page_group_lock(head, true);
- if (ret < 0) {
- spin_unlock(&inode->i_lock);
+ /* lock the page head first in order to avoid an ABBA inefficiency */
+ if (!nfs_lock_request(head)) {
+ ret = nfs_wait_on_request(head);
+ nfs_release_request(head);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ goto try_again;
+ }
- if (!nonblock && ret == -EAGAIN) {
- nfs_page_group_lock_wait(head);
- nfs_release_request(head);
- goto try_again;
- }
+ /* Ensure that nobody removed the request before we locked it */
+ if (head != nfs_page_private_request(page) && !PageSwapCache(page)) {
+ nfs_unlock_and_release_request(head);
+ goto try_again;
+ }
- nfs_release_request(head);
+ ret = nfs_page_group_lock(head);
+ if (ret < 0) {
+ nfs_unlock_and_release_request(head);
return ERR_PTR(ret);
}
/* lock each request in the page group */
- subreq = head;
- do {
+ total_bytes = head->wb_bytes;
+ for (subreq = head->wb_this_page; subreq != head;
+ subreq = subreq->wb_this_page) {
+
+ if (!kref_get_unless_zero(&subreq->wb_kref)) {
+ if (subreq->wb_offset == head->wb_offset + total_bytes)
+ total_bytes += subreq->wb_bytes;
+ continue;
+ }
+
+ while (!nfs_lock_request(subreq)) {
+ /*
+ * Unlock page to allow nfs_page_group_sync_on_bit()
+ * to succeed
+ */
+ nfs_page_group_unlock(head);
+ ret = nfs_wait_on_request(subreq);
+ if (!ret)
+ ret = nfs_page_group_lock(head);
+ if (ret < 0) {
+ nfs_unroll_locks(inode, head, subreq);
+ nfs_release_request(subreq);
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
+ }
+ }
/*
* Subrequests are always contiguous, non overlapping
* and in order - but may be repeated (mirrored writes).
@@ -535,24 +537,12 @@ try_again:
((subreq->wb_offset + subreq->wb_bytes) >
(head->wb_offset + total_bytes)))) {
nfs_page_group_unlock(head);
- spin_unlock(&inode->i_lock);
+ nfs_unroll_locks(inode, head, subreq);
+ nfs_unlock_and_release_request(subreq);
+ nfs_unlock_and_release_request(head);
return ERR_PTR(-EIO);
}
-
- if (!nfs_lock_request(subreq)) {
- /* releases page group bit lock and
- * inode spin lock and all references */
- ret = nfs_unroll_locks_and_wait(inode, head,
- subreq, nonblock);
-
- if (ret == 0)
- goto try_again;
-
- return ERR_PTR(ret);
- }
-
- subreq = subreq->wb_this_page;
- } while (subreq != head);
+ }
/* Now that all requests are locked, make sure they aren't on any list.
* Commit list removal accounting is done after locks are dropped */
@@ -573,34 +563,30 @@ try_again:
head->wb_bytes = total_bytes;
}
- /*
- * prepare head request to be added to new pgio descriptor
- */
- nfs_page_group_clear_bits(head);
-
- /*
- * some part of the group was still on the inode list - otherwise
- * the group wouldn't be involved in async write.
- * grab a reference for the head request, iff it needs one.
- */
- if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags))
+ /* Postpone destruction of this request */
+ if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
+ set_bit(PG_INODE_REF, &head->wb_flags);
kref_get(&head->wb_kref);
+ atomic_long_inc(&NFS_I(inode)->nrequests);
+ }
nfs_page_group_unlock(head);
-	/* drop lock to clean up requests on destroy list */
- spin_unlock(&inode->i_lock);
+ nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
- nfs_destroy_unlinked_subrequests(destroy_list, head);
+ /* Did we lose a race with nfs_inode_remove_request()? */
+ if (!(PagePrivate(page) || PageSwapCache(page))) {
+ nfs_unlock_and_release_request(head);
+ return NULL;
+ }
- /* still holds ref on head from nfs_page_find_head_request_locked
+ /* still holds ref on head from nfs_page_find_head_request
* and still has lock on head from lock loop */
return head;
}
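
nfs_lock_and_join_requests() now takes the head request lock before the group lock and re-validates that the page still points at the same head, retrying the whole lookup if it raced with a removal; that is what replaces the old inode-spinlock/nonblock dance. The lock-validate-retry skeleton, reduced to a single lock (hypothetical names):

    #include <stdbool.h>
    #include <stdio.h>

    struct req { bool locked; };
    struct page { struct req *head; };

    static bool try_lock(struct req *r)
    {
        if (r->locked)
            return false;
        r->locked = true;
        return true;
    }

    /* lock the candidate head, then confirm it is still the page's head */
    static struct req *lock_and_join(struct page *pg)
    {
        struct req *head;

    retry:
        head = pg->head;
        if (!head)
            return NULL;
        if (!try_lock(head))
            goto retry;             /* the kernel sleeps in nfs_wait_on_request() */
        if (pg->head != head) {     /* raced with a removal: drop and retry */
            head->locked = false;
            goto retry;
        }
        return head;                /* returned locked, as in the real code */
    }

    int main(void)
    {
        struct req r = { 0 };
        struct page pg = { .head = &r };

        printf("%s\n", lock_and_join(&pg) ? "joined" : "gone");
        return 0;
    }
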
static void nfs_write_error_remove_page(struct nfs_page *req)
{
- nfs_unlock_request(req);
nfs_end_page_writeback(req);
generic_error_remove_page(page_file_mapping(req->wb_page),
req->wb_page);
@@ -624,12 +610,12 @@ nfs_error_is_fatal_on_server(int err)
* May return an error if the user signalled nfs_wait_on_request().
*/
static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
- struct page *page, bool nonblock)
+ struct page *page)
{
struct nfs_page *req;
int ret = 0;
- req = nfs_lock_and_join_requests(page, nonblock);
+ req = nfs_lock_and_join_requests(page);
if (!req)
goto out;
ret = PTR_ERR(req);
@@ -672,7 +658,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
int ret;
nfs_pageio_cond_complete(pgio, page_index(page));
- ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+ ret = nfs_page_async_flush(pgio, page);
if (ret == -EAGAIN) {
redirty_page_for_writepage(wbc, page);
ret = 0;
@@ -759,6 +745,7 @@ out_err:
*/
static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
{
+ struct address_space *mapping = page_file_mapping(req->wb_page);
struct nfs_inode *nfsi = NFS_I(inode);
WARN_ON_ONCE(req->wb_this_page != req);
@@ -766,27 +753,30 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
/* Lock the request! */
nfs_lock_request(req);
- spin_lock(&inode->i_lock);
- if (!nfsi->nrequests &&
- NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
- inode->i_version++;
/*
* Swap-space should not get truncated. Hence no need to plug the race
* with invalidate/truncate.
*/
+ spin_lock(&mapping->private_lock);
+ if (!nfs_have_writebacks(inode) &&
+ NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) {
+ spin_lock(&inode->i_lock);
+ inode->i_version++;
+ spin_unlock(&inode->i_lock);
+ }
if (likely(!PageSwapCache(req->wb_page))) {
set_bit(PG_MAPPED, &req->wb_flags);
SetPagePrivate(req->wb_page);
set_page_private(req->wb_page, (unsigned long)req);
}
- nfsi->nrequests++;
+ spin_unlock(&mapping->private_lock);
+ atomic_long_inc(&nfsi->nrequests);
/* this a head request for a page group - mark it as having an
* extra reference so sub groups can follow suit.
* This flag also informs pgio layer when to bump nrequests when
* adding subrequests. */
WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));
kref_get(&req->wb_kref);
- spin_unlock(&inode->i_lock);
}
/*
@@ -794,25 +784,22 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
*/
static void nfs_inode_remove_request(struct nfs_page *req)
{
- struct inode *inode = d_inode(req->wb_context->dentry);
+ struct address_space *mapping = page_file_mapping(req->wb_page);
+ struct inode *inode = mapping->host;
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_page *head;
+ atomic_long_dec(&nfsi->nrequests);
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
head = req->wb_head;
- spin_lock(&inode->i_lock);
+ spin_lock(&mapping->private_lock);
if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
set_page_private(head->wb_page, 0);
ClearPagePrivate(head->wb_page);
clear_bit(PG_MAPPED, &head->wb_flags);
}
- nfsi->nrequests--;
- spin_unlock(&inode->i_lock);
- } else {
- spin_lock(&inode->i_lock);
- nfsi->nrequests--;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&mapping->private_lock);
}
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
@@ -868,7 +855,8 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
* number of outstanding requests requiring a commit as well as
* the MM page stats.
*
- * The caller must hold cinfo->inode->i_lock, and the nfs_page lock.
+ * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the
+ * nfs_page lock.
*/
void
nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
@@ -876,7 +864,7 @@ nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst,
{
set_bit(PG_CLEAN, &req->wb_flags);
nfs_list_add_request(req, dst);
- cinfo->mds->ncommit++;
+ atomic_long_inc(&cinfo->mds->ncommit);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
@@ -896,9 +884,9 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
void
nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
{
- spin_lock(&cinfo->inode->i_lock);
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
- spin_unlock(&cinfo->inode->i_lock);
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
if (req->wb_page)
nfs_mark_page_unstable(req->wb_page, cinfo);
}
@@ -922,7 +910,7 @@ nfs_request_remove_commit_list(struct nfs_page *req,
if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags))
return;
nfs_list_remove_request(req);
- cinfo->mds->ncommit--;
+ atomic_long_dec(&cinfo->mds->ncommit);
}
EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list);
@@ -967,7 +955,7 @@ nfs_clear_page_commit(struct page *page)
WB_RECLAIMABLE);
}
-/* Called holding inode (/cinfo) lock */
+/* Called holding the request lock on @req */
static void
nfs_clear_request_commit(struct nfs_page *req)
{
@@ -976,9 +964,11 @@ nfs_clear_request_commit(struct nfs_page *req)
struct nfs_commit_info cinfo;
nfs_init_cinfo_from_inode(&cinfo, inode);
+ mutex_lock(&NFS_I(inode)->commit_mutex);
if (!pnfs_clear_request_commit(req, &cinfo)) {
nfs_request_remove_commit_list(req, &cinfo);
}
+ mutex_unlock(&NFS_I(inode)->commit_mutex);
nfs_clear_page_commit(req->wb_page);
}
}
@@ -1023,7 +1013,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
remove_req:
nfs_inode_remove_request(req);
next:
- nfs_unlock_request(req);
nfs_end_page_writeback(req);
nfs_release_request(req);
}
@@ -1035,10 +1024,10 @@ out:
unsigned long
nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
{
- return cinfo->mds->ncommit;
+ return atomic_long_read(&cinfo->mds->ncommit);
}
-/* cinfo->inode->i_lock held by caller */
+/* NFS_I(cinfo->inode)->commit_mutex held by caller */
int
nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
struct nfs_commit_info *cinfo, int max)
@@ -1046,20 +1035,37 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
struct nfs_page *req, *tmp;
int ret = 0;
+restart:
list_for_each_entry_safe(req, tmp, src, wb_list) {
- if (!nfs_lock_request(req))
- continue;
kref_get(&req->wb_kref);
- if (cond_resched_lock(&cinfo->inode->i_lock))
- list_safe_reset_next(req, tmp, wb_list);
+ if (!nfs_lock_request(req)) {
+ int status;
+
+ /* Prevent deadlock with nfs_lock_and_join_requests */
+ if (!list_empty(dst)) {
+ nfs_release_request(req);
+ continue;
+ }
+ /* Ensure we make progress to prevent livelock */
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ status = nfs_wait_on_request(req);
+ nfs_release_request(req);
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ if (status < 0)
+ break;
+ goto restart;
+ }
nfs_request_remove_commit_list(req, cinfo);
+ clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
nfs_list_add_request(req, dst);
ret++;
if ((ret == max) && !cinfo->dreq)
break;
+ cond_resched();
}
return ret;
}
+EXPORT_SYMBOL_GPL(nfs_scan_commit_list);
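
nfs_scan_commit_list() now grabs a reference before trying the request lock; on contention it either bails with what it has already moved (to avoid deadlocking against nfs_lock_and_join_requests()) or drops commit_mutex, waits, relocks, and restarts the scan. The drop-wait-restart loop in outline (hypothetical names; the wait step is a placeholder comment):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t commit_mutex = PTHREAD_MUTEX_INITIALIZER;

    struct req { int locked; };

    /* move unlocked requests "to the destination"; restart after any sleep */
    static int scan(struct req *reqs, int n)
    {
        int moved = 0, i;

        pthread_mutex_lock(&commit_mutex);
    restart:
        for (i = 0; i < n; i++) {
            if (reqs[i].locked) {
                if (moved)          /* progress already made: skip this one */
                    continue;
                pthread_mutex_unlock(&commit_mutex);
                /* ... wait for the request to be unlocked here ... */
                pthread_mutex_lock(&commit_mutex);
                goto restart;       /* list may have changed while unlocked */
            }
            reqs[i].locked = 1;     /* stands in for the move to dst */
            moved++;
        }
        pthread_mutex_unlock(&commit_mutex);
        return moved;
    }

    int main(void)
    {
        struct req reqs[3] = { { 0 }, { 0 }, { 0 } };

        printf("moved=%d\n", scan(reqs, 3));
        return 0;
    }
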
/*
* nfs_scan_commit - Scan an inode for commit requests
@@ -1076,15 +1082,17 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
{
int ret = 0;
- spin_lock(&cinfo->inode->i_lock);
- if (cinfo->mds->ncommit > 0) {
+ if (!atomic_long_read(&cinfo->mds->ncommit))
+ return 0;
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ if (atomic_long_read(&cinfo->mds->ncommit) > 0) {
const int max = INT_MAX;
ret = nfs_scan_commit_list(&cinfo->mds->list, dst,
cinfo, max);
ret += pnfs_scan_commit_lists(inode, cinfo, max - ret);
}
- spin_unlock(&cinfo->inode->i_lock);
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
return ret;
}
@@ -1105,43 +1113,21 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
unsigned int end;
int error;
- if (!PagePrivate(page))
- return NULL;
-
end = offset + bytes;
- spin_lock(&inode->i_lock);
-
- for (;;) {
- req = nfs_page_find_head_request_locked(NFS_I(inode), page);
- if (req == NULL)
- goto out_unlock;
- /* should be handled by nfs_flush_incompatible */
- WARN_ON_ONCE(req->wb_head != req);
- WARN_ON_ONCE(req->wb_this_page != req);
-
- rqend = req->wb_offset + req->wb_bytes;
- /*
- * Tell the caller to flush out the request if
- * the offsets are non-contiguous.
- * Note: nfs_flush_incompatible() will already
- * have flushed out requests having wrong owners.
- */
- if (offset > rqend
- || end < req->wb_offset)
- goto out_flushme;
-
- if (nfs_lock_request(req))
- break;
+ req = nfs_lock_and_join_requests(page);
+ if (IS_ERR_OR_NULL(req))
+ return req;
- /* The request is locked, so wait and then retry */
- spin_unlock(&inode->i_lock);
- error = nfs_wait_on_request(req);
- nfs_release_request(req);
- if (error != 0)
- goto out_err;
- spin_lock(&inode->i_lock);
- }
+ rqend = req->wb_offset + req->wb_bytes;
+ /*
+ * Tell the caller to flush out the request if
+ * the offsets are non-contiguous.
+ * Note: nfs_flush_incompatible() will already
+ * have flushed out requests having wrong owners.
+ */
+ if (offset > rqend || end < req->wb_offset)
+ goto out_flushme;
/* Okay, the request matches. Update the region */
if (offset < req->wb_offset) {
@@ -1152,17 +1138,17 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
req->wb_bytes = end - req->wb_offset;
else
req->wb_bytes = rqend - req->wb_offset;
-out_unlock:
- if (req)
- nfs_clear_request_commit(req);
- spin_unlock(&inode->i_lock);
return req;
out_flushme:
- spin_unlock(&inode->i_lock);
- nfs_release_request(req);
+ /*
+ * Note: we mark the request dirty here because
+ * nfs_lock_and_join_requests() cannot preserve
+ * commit flags, so we have to replay the write.
+ */
+ nfs_mark_request_dirty(req);
+ nfs_unlock_and_release_request(req);
error = nfs_wb_page(inode, page);
-out_err:
- return ERR_PTR(error);
+ return (error < 0) ? ERR_PTR(error) : NULL;
}
/*
@@ -1227,8 +1213,6 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
l_ctx = req->wb_lock_context;
do_flush = req->wb_page != page ||
!nfs_match_open_context(req->wb_context, ctx);
- /* for now, flush if more than 1 request in page_group */
- do_flush |= req->wb_this_page != req;
if (l_ctx && flctx &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock))) {
@@ -1412,7 +1396,6 @@ static void nfs_redirty_request(struct nfs_page *req)
{
nfs_mark_request_dirty(req);
set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags);
- nfs_unlock_request(req);
nfs_end_page_writeback(req);
nfs_release_request(req);
}
@@ -1452,7 +1435,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
pg_ops = server->pnfs_curr_ld->pg_write_ops;
#endif
nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
- server->wsize, ioflags, GFP_NOIO);
+ server->wsize, ioflags);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
@@ -1934,7 +1917,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
int ret = 0;
/* no commits means nothing needs to be done */
- if (!nfsi->commit_info.ncommit)
+ if (!atomic_long_read(&nfsi->commit_info.ncommit))
return ret;
if (wbc->sync_mode == WB_SYNC_NONE) {
@@ -2015,7 +1998,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
/* blocking call to cancel all requests and join to a single (head)
* request */
- req = nfs_lock_and_join_requests(page, false);
+ req = nfs_lock_and_join_requests(page);
if (IS_ERR(req)) {
ret = PTR_ERR(req);
diff --git a/fs/open.c b/fs/open.c
index 35bb784763a4..7ea118471dce 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -96,7 +96,7 @@ long vfs_truncate(const struct path *path, loff_t length)
* write access on the upper inode, not on the overlay inode. For
* non-overlay filesystems d_real() is an identity function.
*/
- upperdentry = d_real(path->dentry, NULL, O_WRONLY);
+ upperdentry = d_real(path->dentry, NULL, O_WRONLY, 0);
error = PTR_ERR(upperdentry);
if (IS_ERR(upperdentry))
goto mnt_drop_write_and_out;
@@ -670,12 +670,12 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
if (!f.file)
goto out;
- error = mnt_want_write_file(f.file);
+ error = mnt_want_write_file_path(f.file);
if (error)
goto out_fput;
audit_file(f.file);
error = chown_common(&f.file->f_path, user, group);
- mnt_drop_write_file(f.file);
+ mnt_drop_write_file_path(f.file);
out_fput:
fdput(f);
out:
@@ -857,7 +857,7 @@ EXPORT_SYMBOL(file_path);
int vfs_open(const struct path *path, struct file *file,
const struct cred *cred)
{
- struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags);
+ struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 48b70e6490f3..9cb0c80e5967 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -155,7 +155,7 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry, bool hardlink)
{
- ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_version_inc(dentry->d_parent, false);
ovl_dentry_set_upper_alias(dentry);
if (!hardlink) {
ovl_inode_update(inode, newdentry);
@@ -692,7 +692,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
if (flags)
ovl_cleanup(wdir, upper);
- ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_version_inc(dentry->d_parent, true);
out_d_drop:
d_drop(dentry);
dput(whiteout);
@@ -742,7 +742,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
- ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_version_inc(dentry->d_parent, ovl_type_origin(dentry));
/*
* Keeping this dentry hashed would mean having to release
@@ -1089,8 +1089,9 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
drop_nlink(d_inode(new));
}
- ovl_dentry_version_inc(old->d_parent);
- ovl_dentry_version_inc(new->d_parent);
+ ovl_dentry_version_inc(old->d_parent,
+ !overwrite && ovl_type_origin(new));
+ ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old));
out_dput:
dput(newdentry);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 5bc71642b226..a619addecafc 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -498,6 +498,9 @@ static int ovl_set_nlink_common(struct dentry *dentry,
len = snprintf(buf, sizeof(buf), format,
(int) (inode->i_nlink - realinode->i_nlink));
+ if (WARN_ON(len >= sizeof(buf)))
+ return -EIO;
+
return ovl_do_setxattr(ovl_dentry_upper(dentry),
OVL_XATTR_NLINK, buf, len, 0);
}
@@ -576,10 +579,13 @@ static int ovl_inode_set(struct inode *inode, void *data)
static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
struct dentry *upperdentry)
{
- struct inode *lowerinode = lowerdentry ? d_inode(lowerdentry) : NULL;
-
- /* Lower (origin) inode must match, even if NULL */
- if (ovl_inode_lower(inode) != lowerinode)
+ /*
+ * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
+ * This happens when finding a copied up overlay inode for a renamed
+ * or hardlinked overlay dentry and lower dentry cannot be followed
+ * by origin because lower fs does not support file handles.
+ */
+ if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
return false;
/*
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e927a62c97ae..d4e8c1a08fb0 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -204,8 +204,8 @@ struct dentry *ovl_i_dentry_upper(struct inode *inode);
struct inode *ovl_inode_upper(struct inode *inode);
struct inode *ovl_inode_lower(struct inode *inode);
struct inode *ovl_inode_real(struct inode *inode);
-struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
-void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
+struct ovl_dir_cache *ovl_dir_cache(struct inode *inode);
+void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache);
bool ovl_dentry_is_opaque(struct dentry *dentry);
bool ovl_dentry_is_whiteout(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry);
@@ -217,7 +217,7 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
struct dentry *lowerdentry);
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
-void ovl_dentry_version_inc(struct dentry *dentry);
+void ovl_dentry_version_inc(struct dentry *dentry, bool impurity);
u64 ovl_dentry_version_get(struct dentry *dentry);
bool ovl_is_whiteout(struct dentry *dentry);
struct file *ovl_path_open(struct path *path, int flags);
@@ -229,6 +229,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
int xerr);
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry);
void ovl_set_flag(unsigned long flag, struct inode *inode);
+void ovl_clear_flag(unsigned long flag, struct inode *inode);
bool ovl_test_flag(unsigned long flag, struct inode *inode);
bool ovl_inuse_trylock(struct dentry *dentry);
void ovl_inuse_unlock(struct dentry *dentry);
@@ -256,6 +257,7 @@ extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
+void ovl_dir_cache_free(struct inode *inode);
int ovl_check_d_type_supported(struct path *realpath);
void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
struct dentry *dentry, int level);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index f0fd3adb1693..62e9b22a2077 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -15,11 +15,13 @@
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
+#include <linux/ratelimit.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
+ u64 real_ino;
u64 ino;
struct list_head l_node;
struct rb_node node;
@@ -32,18 +34,20 @@ struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
+ struct rb_root root;
};
struct ovl_readdir_data {
struct dir_context ctx;
struct dentry *dentry;
bool is_lowest;
- struct rb_root root;
+ struct rb_root *root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
+ bool is_upper;
bool d_type_supported;
};
@@ -58,7 +62,33 @@ struct ovl_dir_file {
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
- return container_of(n, struct ovl_cache_entry, node);
+ return rb_entry(n, struct ovl_cache_entry, node);
+}
+
+static bool ovl_cache_entry_find_link(const char *name, int len,
+ struct rb_node ***link,
+ struct rb_node **parent)
+{
+ bool found = false;
+ struct rb_node **newp = *link;
+
+ while (!found && *newp) {
+ int cmp;
+ struct ovl_cache_entry *tmp;
+
+ *parent = *newp;
+ tmp = ovl_cache_entry_from_node(*newp);
+ cmp = strncmp(name, tmp->name, len);
+ if (cmp > 0)
+ newp = &tmp->node.rb_right;
+ else if (cmp < 0 || len < tmp->len)
+ newp = &tmp->node.rb_left;
+ else
+ found = true;
+ }
+ *link = newp;
+
+ return found;
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
@@ -82,6 +112,32 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
return NULL;
}
+static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd,
+ struct ovl_cache_entry *p)
+{
+	/* Don't care if not doing ovl_iterate() */
+ if (!rdd->dentry)
+ return false;
+
+ /* Always recalc d_ino for parent */
+ if (strcmp(p->name, "..") == 0)
+ return true;
+
+ /* If this is lower, then native d_ino will do */
+ if (!rdd->is_upper)
+ return false;
+
+ /*
+ * Recalc d_ino for '.' and for all entries if dir is impure (contains
+ * copied up entries)
+ */
+ if ((p->name[0] == '.' && p->len == 1) ||
+ ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry)))
+ return true;
+
+ return false;
+}
+
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
@@ -97,7 +153,11 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
p->name[len] = '\0';
p->len = len;
p->type = d_type;
+ p->real_ino = ino;
p->ino = ino;
+ /* Defer setting d_ino for upper entry to ovl_iterate() */
+ if (ovl_calc_d_ino(rdd, p))
+ p->ino = 0;
p->is_whiteout = false;
if (d_type == DT_CHR) {
@@ -111,32 +171,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
- struct rb_node **newp = &rdd->root.rb_node;
+ struct rb_node **newp = &rdd->root->rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
- while (*newp) {
- int cmp;
- struct ovl_cache_entry *tmp;
-
- parent = *newp;
- tmp = ovl_cache_entry_from_node(*newp);
- cmp = strncmp(name, tmp->name, len);
- if (cmp > 0)
- newp = &tmp->node.rb_right;
- else if (cmp < 0 || len < tmp->len)
- newp = &tmp->node.rb_left;
- else
- return 0;
- }
+ if (ovl_cache_entry_find_link(name, len, &newp, &parent))
+ return 0;
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
- if (p == NULL)
+ if (p == NULL) {
+ rdd->err = -ENOMEM;
return -ENOMEM;
+ }
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
- rb_insert_color(&p->node, &rdd->root);
+ rb_insert_color(&p->node, rdd->root);
return 0;
}
@@ -147,7 +197,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
{
struct ovl_cache_entry *p;
- p = ovl_cache_entry_find(&rdd->root, name, namelen);
+ p = ovl_cache_entry_find(rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
@@ -172,6 +222,16 @@ void ovl_cache_free(struct list_head *list)
INIT_LIST_HEAD(list);
}
+void ovl_dir_cache_free(struct inode *inode)
+{
+ struct ovl_dir_cache *cache = ovl_dir_cache(inode);
+
+ if (cache) {
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ }
+}
+
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
@@ -179,8 +239,8 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
- if (ovl_dir_cache(dentry) == cache)
- ovl_set_dir_cache(dentry, NULL);
+ if (ovl_dir_cache(d_inode(dentry)) == cache)
+ ovl_set_dir_cache(d_inode(dentry), NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
@@ -273,7 +333,8 @@ static void ovl_dir_reset(struct file *file)
od->is_real = false;
}
-static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
+static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
+ struct rb_root *root)
{
int err;
struct path realpath;
@@ -281,13 +342,14 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
.ctx.actor = ovl_fill_merge,
.dentry = dentry,
.list = list,
- .root = RB_ROOT,
+ .root = root,
.is_lowest = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
+ rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
@@ -326,12 +388,13 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
int res;
struct ovl_dir_cache *cache;
- cache = ovl_dir_cache(dentry);
+ cache = ovl_dir_cache(d_inode(dentry));
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
+ WARN_ON(!cache->refcount);
cache->refcount++;
return cache;
}
- ovl_set_dir_cache(dentry, NULL);
+ ovl_set_dir_cache(d_inode(dentry), NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
@@ -339,8 +402,9 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
+ cache->root = RB_ROOT;
- res = ovl_dir_read_merged(dentry, &cache->entries);
+ res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
@@ -348,22 +412,266 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
}
cache->version = ovl_dentry_version_get(dentry);
- ovl_set_dir_cache(dentry, cache);
+ ovl_set_dir_cache(d_inode(dentry), cache);
return cache;
}
+/*
+ * Set d_ino for upper entries. Non-upper entries should always report
+ * the uppermost real inode ino and should not call this function.
+ *
+ * When not all layers are on the same fs, report the real ino also for upper.
+ *
+ * When all layers are on the same fs, and upper has a reference to
+ * copy up origin, call vfs_getattr() on the overlay entry to make
+ * sure that d_ino will be consistent with st_ino from stat(2).
+ */
+static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p)
+{
+ struct dentry *dir = path->dentry;
+ struct dentry *this = NULL;
+ enum ovl_path_type type;
+ u64 ino = p->real_ino;
+ int err = 0;
+
+ if (!ovl_same_sb(dir->d_sb))
+ goto out;
+
+ if (p->name[0] == '.') {
+ if (p->len == 1) {
+ this = dget(dir);
+ goto get;
+ }
+ if (p->len == 2 && p->name[1] == '.') {
+ /* we shall not be moved */
+ this = dget(dir->d_parent);
+ goto get;
+ }
+ }
+ this = lookup_one_len(p->name, dir, p->len);
+ if (IS_ERR_OR_NULL(this) || !this->d_inode) {
+ if (IS_ERR(this)) {
+ err = PTR_ERR(this);
+ this = NULL;
+ goto fail;
+ }
+ goto out;
+ }
+
+get:
+ type = ovl_path_type(this);
+ if (OVL_TYPE_ORIGIN(type)) {
+ struct kstat stat;
+ struct path statpath = *path;
+
+ statpath.dentry = this;
+ err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
+ if (err)
+ goto fail;
+
+ WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
+ ino = stat.ino;
+ }
+
+out:
+ p->ino = ino;
+ dput(this);
+ return err;
+
+fail:
+ pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n",
+ p->name, err);
+ goto out;
+}
+
+static int ovl_fill_plain(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ovl_cache_entry *p;
+ struct ovl_readdir_data *rdd =
+ container_of(ctx, struct ovl_readdir_data, ctx);
+
+ rdd->count++;
+ p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
+ if (p == NULL) {
+ rdd->err = -ENOMEM;
+ return -ENOMEM;
+ }
+ list_add_tail(&p->l_node, rdd->list);
+
+ return 0;
+}
+
+static int ovl_dir_read_impure(struct path *path, struct list_head *list,
+ struct rb_root *root)
+{
+ int err;
+ struct path realpath;
+ struct ovl_cache_entry *p, *n;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_fill_plain,
+ .list = list,
+ .root = root,
+ };
+
+ INIT_LIST_HEAD(list);
+ *root = RB_ROOT;
+ ovl_path_upper(path->dentry, &realpath);
+
+ err = ovl_dir_read(&realpath, &rdd);
+ if (err)
+ return err;
+
+ list_for_each_entry_safe(p, n, list, l_node) {
+ if (strcmp(p->name, ".") != 0 &&
+ strcmp(p->name, "..") != 0) {
+ err = ovl_cache_update_ino(path, p);
+ if (err)
+ return err;
+ }
+ if (p->ino == p->real_ino) {
+ list_del(&p->l_node);
+ kfree(p);
+ } else {
+ struct rb_node **newp = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len,
+ &newp, &parent)))
+ return -EIO;
+
+ rb_link_node(&p->node, parent, newp);
+ rb_insert_color(&p->node, root);
+ }
+ }
+ return 0;
+}
+
+static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path)
+{
+ int res;
+ struct dentry *dentry = path->dentry;
+ struct ovl_dir_cache *cache;
+
+ cache = ovl_dir_cache(d_inode(dentry));
+ if (cache && ovl_dentry_version_get(dentry) == cache->version)
+ return cache;
+
+ /* Impure cache is not refcounted, free it here */
+ ovl_dir_cache_free(d_inode(dentry));
+ ovl_set_dir_cache(d_inode(dentry), NULL);
+
+ cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
+ if (!cache)
+ return ERR_PTR(-ENOMEM);
+
+ res = ovl_dir_read_impure(path, &cache->entries, &cache->root);
+ if (res) {
+ ovl_cache_free(&cache->entries);
+ kfree(cache);
+ return ERR_PTR(res);
+ }
+ if (list_empty(&cache->entries)) {
+		/* Good opportunity to get rid of an unnecessary "impure" flag */
+ ovl_do_removexattr(ovl_dentry_upper(dentry), OVL_XATTR_IMPURE);
+ ovl_clear_flag(OVL_IMPURE, d_inode(dentry));
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->version = ovl_dentry_version_get(dentry);
+ ovl_set_dir_cache(d_inode(dentry), cache);
+
+ return cache;
+}
+
+struct ovl_readdir_translate {
+ struct dir_context *orig_ctx;
+ struct ovl_dir_cache *cache;
+ struct dir_context ctx;
+ u64 parent_ino;
+};
+
+static int ovl_fill_real(struct dir_context *ctx, const char *name,
+ int namelen, loff_t offset, u64 ino,
+ unsigned int d_type)
+{
+ struct ovl_readdir_translate *rdt =
+ container_of(ctx, struct ovl_readdir_translate, ctx);
+ struct dir_context *orig_ctx = rdt->orig_ctx;
+
+ if (rdt->parent_ino && strcmp(name, "..") == 0)
+ ino = rdt->parent_ino;
+ else if (rdt->cache) {
+ struct ovl_cache_entry *p;
+
+ p = ovl_cache_entry_find(&rdt->cache->root, name, namelen);
+ if (p)
+ ino = p->ino;
+ }
+
+ return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+}
+
+static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
+{
+ int err;
+ struct ovl_dir_file *od = file->private_data;
+ struct dentry *dir = file->f_path.dentry;
+ struct ovl_readdir_translate rdt = {
+ .ctx.actor = ovl_fill_real,
+ .orig_ctx = ctx,
+ };
+
+ if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) {
+ struct kstat stat;
+ struct path statpath = file->f_path;
+
+ statpath.dentry = dir->d_parent;
+ err = vfs_getattr(&statpath, &stat, STATX_INO, 0);
+ if (err)
+ return err;
+
+ WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
+ rdt.parent_ino = stat.ino;
+ }
+
+ if (ovl_test_flag(OVL_IMPURE, d_inode(dir))) {
+ rdt.cache = ovl_cache_get_impure(&file->f_path);
+ if (IS_ERR(rdt.cache))
+ return PTR_ERR(rdt.cache);
+ }
+
+ return iterate_dir(od->realfile, &rdt.ctx);
+}
+
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
+ int err;
if (!ctx->pos)
ovl_dir_reset(file);
- if (od->is_real)
+ if (od->is_real) {
+ /*
+		 * If the parent is a merge dir, d_ino of '..' needs adjusting;
+		 * if this dir is impure, d_ino of copied-up entries needs
+		 * adjusting.
+ */
+ if (ovl_same_sb(dentry->d_sb) &&
+ (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) ||
+ OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))) {
+ return ovl_iterate_real(file, ctx);
+ }
return iterate_dir(od->realfile, ctx);
+ }
if (!od->cache) {
struct ovl_dir_cache *cache;
@@ -378,9 +686,15 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx)
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
- if (!p->is_whiteout)
+ if (!p->is_whiteout) {
+ if (!p->ino) {
+ err = ovl_cache_update_ino(&file->f_path, p);
+ if (err)
+ return err;
+ }
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
+ }
od->cursor = p->l_node.next;
ctx->pos++;
}
@@ -522,8 +836,9 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
+ struct rb_root root = RB_ROOT;
- err = ovl_dir_read_merged(dentry, list);
+ err = ovl_dir_read_merged(dentry, list, &root);
if (err)
return err;
@@ -612,12 +927,13 @@ static void ovl_workdir_cleanup_recurse(struct path *path, int level)
int err;
struct inode *dir = path->dentry->d_inode;
LIST_HEAD(list);
+ struct rb_root root = RB_ROOT;
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.dentry = NULL,
.list = &list,
- .root = RB_ROOT,
+ .root = &root,
.is_lowest = false,
};
@@ -675,12 +991,13 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt,
struct inode *dir = dentry->d_inode;
struct path path = { .mnt = mnt, .dentry = dentry };
LIST_HEAD(list);
+ struct rb_root root = RB_ROOT;
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.dentry = NULL,
.list = &list,
- .root = RB_ROOT,
+ .root = &root,
.is_lowest = false,
};
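The net effect of the readdir changes above is the invariant that d_ino reported by readdir(3) matches st_ino reported by stat(2), even for copied-up entries in impure directories. As an illustration only (not part of the patch), a userspace check of that invariant might look like:

    #include <dirent.h>
    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/stat.h>

    /* Report any entry whose readdir d_ino differs from its stat st_ino. */
    int check_dir_inos(const char *path)
    {
            DIR *d = opendir(path);
            struct dirent *de;
            struct stat st;

            if (!d)
                    return -1;
            while ((de = readdir(d)) != NULL) {
                    if (fstatat(dirfd(d), de->d_name, &st,
                                AT_SYMLINK_NOFOLLOW) == 0 &&
                        st.st_ino != de->d_ino)
                            printf("%s: d_ino %ju != st_ino %ju\n",
                                   de->d_name, (uintmax_t)de->d_ino,
                                   (uintmax_t)st.st_ino);
            }
            closedir(d);
            return 0;
    }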
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index d86e89f97201..cd49c0298ddf 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -70,20 +70,20 @@ static int ovl_check_append_only(struct inode *inode, int flag)
static struct dentry *ovl_d_real(struct dentry *dentry,
const struct inode *inode,
- unsigned int open_flags)
+ unsigned int open_flags, unsigned int flags)
{
struct dentry *real;
int err;
+ if (flags & D_REAL_UPPER)
+ return ovl_dentry_upper(dentry);
+
if (!d_is_reg(dentry)) {
if (!inode || inode == d_inode(dentry))
return dentry;
goto bug;
}
- if (d_is_negative(dentry))
- return dentry;
-
if (open_flags) {
err = ovl_open_maybe_copy_up(dentry, open_flags);
if (err)
@@ -105,7 +105,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
goto bug;
/* Handle recursion */
- real = d_real(real, inode, open_flags);
+ real = d_real(real, inode, open_flags, 0);
if (!inode || inode == d_inode(real))
return real;
@@ -198,6 +198,7 @@ static void ovl_destroy_inode(struct inode *inode)
dput(oi->__upperdentry);
kfree(oi->redirect);
+ ovl_dir_cache_free(inode);
mutex_destroy(&oi->lock);
call_rcu(&inode->i_rcu, ovl_i_callback);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index f46ad75dc96a..117794582f9f 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -180,14 +180,14 @@ struct inode *ovl_inode_real(struct inode *inode)
}
-struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
+struct ovl_dir_cache *ovl_dir_cache(struct inode *inode)
{
- return OVL_I(d_inode(dentry))->cache;
+ return OVL_I(inode)->cache;
}
-void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
+void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache)
{
- OVL_I(d_inode(dentry))->cache = cache;
+ OVL_I(inode)->cache = cache;
}
bool ovl_dentry_is_opaque(struct dentry *dentry)
@@ -275,12 +275,19 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry)
}
}
-void ovl_dentry_version_inc(struct dentry *dentry)
+void ovl_dentry_version_inc(struct dentry *dentry, bool impurity)
{
struct inode *inode = d_inode(dentry);
WARN_ON(!inode_is_locked(inode));
- OVL_I(inode)->version++;
+ /*
+	 * The version is used by the readdir code to keep the cache
+	 * consistent. For merge dirs, all changes need to be noted. For
+	 * non-merge dirs, the cache only contains impure entries (ones
+	 * that have been copied up and have origins), so only changes to
+	 * impure entries need to be noted.
+ */
+ if (OVL_TYPE_MERGE(ovl_path_type(dentry)) || impurity)
+ OVL_I(inode)->version++;
}
u64 ovl_dentry_version_get(struct dentry *dentry)
@@ -382,6 +389,11 @@ void ovl_set_flag(unsigned long flag, struct inode *inode)
set_bit(flag, &OVL_I(inode)->flags);
}
+void ovl_clear_flag(unsigned long flag, struct inode *inode)
+{
+ clear_bit(flag, &OVL_I(inode)->flags);
+}
+
bool ovl_test_flag(unsigned long flag, struct inode *inode)
{
return test_bit(flag, &OVL_I(inode)->flags);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 593b022ac11b..d2c434112f42 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -95,23 +95,23 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
*/
err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo);
err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno);
- err |= __put_user((short) kinfo->si_code, &uinfo->ssi_code);
- switch (kinfo->si_code & __SI_MASK) {
- case __SI_KILL:
+ err |= __put_user(kinfo->si_code, &uinfo->ssi_code);
+ switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) {
+ case SIL_KILL:
err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
break;
- case __SI_TIMER:
+ case SIL_TIMER:
err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
break;
- case __SI_POLL:
+ case SIL_POLL:
err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd);
break;
- case __SI_FAULT:
+ case SIL_FAULT:
err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
@@ -128,20 +128,14 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
&uinfo->ssi_addr_lsb);
#endif
break;
- case __SI_CHLD:
+ case SIL_CHLD:
err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
err |= __put_user(kinfo->si_status, &uinfo->ssi_status);
err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime);
err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime);
break;
- case __SI_RT: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ: /* But this is */
- err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
- err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
- err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
- err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
- break;
+ case SIL_RT:
default:
/*
* This case catches also the signals queued by sigqueue().
diff --git a/fs/xattr.c b/fs/xattr.c
index 464c94bf65f9..4424f7fecf14 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -23,6 +23,7 @@
#include <linux/posix_acl_xattr.h>
#include <linux/uaccess.h>
+#include "internal.h"
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
@@ -441,6 +442,12 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) ||
(strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0))
posix_acl_fix_xattr_from_user(kvalue, size);
+ else if (strcmp(kname, XATTR_NAME_CAPS) == 0) {
+ error = cap_convert_nscap(d, &kvalue, size);
+ if (error < 0)
+ goto out;
+ size = error;
+ }
}
error = vfs_setxattr(d, kname, kvalue, size, flags);
@@ -496,10 +503,10 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
if (!f.file)
return error;
audit_file(f.file);
- error = mnt_want_write_file(f.file);
+ error = mnt_want_write_file_path(f.file);
if (!error) {
error = setxattr(f.file->f_path.dentry, name, value, size, flags);
- mnt_drop_write_file(f.file);
+ mnt_drop_write_file_path(f.file);
}
fdput(f);
return error;
@@ -728,10 +735,10 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
if (!f.file)
return error;
audit_file(f.file);
- error = mnt_want_write_file(f.file);
+ error = mnt_want_write_file_path(f.file);
if (!error) {
error = removexattr(f.file->f_path.dentry, name);
- mnt_drop_write_file(f.file);
+ mnt_drop_write_file_path(f.file);
}
fdput(f);
return error;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index fffae1390d7f..29172609f2a3 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -80,6 +80,19 @@ xfs_find_bdev_for_inode(
return mp->m_ddev_targp->bt_bdev;
}
+struct dax_device *
+xfs_find_daxdev_for_inode(
+ struct inode *inode)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (XFS_IS_REALTIME_INODE(ip))
+ return mp->m_rtdev_targp->bt_daxdev;
+ else
+ return mp->m_ddev_targp->bt_daxdev;
+}
+
/*
* We're now finished for good with this page. Update the page state via the
* associated buffer_heads, paying attention to the start and end offsets that
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index cc174ec6c2fd..88c85ea63da0 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -59,5 +59,6 @@ int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
extern void xfs_count_page_state(struct page *, int *, int *);
extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
+extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b1c9711e79a4..da14658da310 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1802,7 +1802,8 @@ xfs_setsize_buftarg_early(
xfs_buftarg_t *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev)
+ struct block_device *bdev,
+ struct dax_device *dax_dev)
{
xfs_buftarg_t *btp;
@@ -1811,6 +1812,7 @@ xfs_alloc_buftarg(
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
+ btp->bt_daxdev = dax_dev;
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 20721261dae5..bf71507ddb16 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -108,6 +108,7 @@ typedef unsigned int xfs_buf_flags_t;
typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
+ struct dax_device *bt_daxdev;
struct xfs_mount *bt_mount;
unsigned int bt_meta_sectorsize;
size_t bt_meta_sectormask;
@@ -385,7 +386,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
* Handling of buftargs.
*/
extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
- struct block_device *);
+ struct block_device *, struct dax_device *);
extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 79cb5b3d140c..a1909bc064e9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -69,6 +69,7 @@ xfs_bmbt_to_iomap(
iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+ iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
}
xfs_extlen_t
@@ -975,7 +976,6 @@ xfs_file_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false, trimmed = false;
unsigned lockmode;
- struct block_device *bdev;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
@@ -1085,13 +1085,6 @@ xfs_file_iomap_begin(
xfs_bmbt_to_iomap(ip, iomap, &imap);
- /* optionally associate a dax device with the iomap bdev */
- bdev = iomap->bdev;
- if (blk_queue_dax(bdev->bd_queue))
- iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name);
- else
- iomap->dax_dev = NULL;
-
if (shared)
iomap->flags |= IOMAP_F_SHARED;
return 0;
@@ -1169,7 +1162,6 @@ xfs_file_iomap_end(
unsigned flags,
struct iomap *iomap)
{
- fs_put_dax(iomap->dax_dev);
if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
length, written, iomap);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 9301c5a6060b..dcd1292664b3 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -270,7 +270,14 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
#endif /* DEBUG */
#ifdef CONFIG_XFS_RT
-#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME)
+
+/*
+ * make sure we ignore the inode flag if the filesystem doesn't have a
+ * configured realtime device.
+ */
+#define XFS_IS_REALTIME_INODE(ip) \
+ (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \
+ (ip)->i_mount->m_rtdev_targp)
#else
#define XFS_IS_REALTIME_INODE(ip) (0)
#endif
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index c1c4c2ea1014..3008f31753df 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -714,17 +714,26 @@ STATIC void
xfs_close_devices(
struct xfs_mount *mp)
{
+ struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev;
+
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
+ struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev;
+
xfs_free_buftarg(mp, mp->m_logdev_targp);
xfs_blkdev_put(logdev);
+ fs_put_dax(dax_logdev);
}
if (mp->m_rtdev_targp) {
struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
+ struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev;
+
xfs_free_buftarg(mp, mp->m_rtdev_targp);
xfs_blkdev_put(rtdev);
+ fs_put_dax(dax_rtdev);
}
xfs_free_buftarg(mp, mp->m_ddev_targp);
+ fs_put_dax(dax_ddev);
}
/*
@@ -742,6 +751,8 @@ xfs_open_devices(
struct xfs_mount *mp)
{
struct block_device *ddev = mp->m_super->s_bdev;
+ struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev);
+ struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL;
struct block_device *logdev = NULL, *rtdev = NULL;
int error;
@@ -752,6 +763,7 @@ xfs_open_devices(
error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
if (error)
goto out;
+ dax_logdev = fs_dax_get_by_bdev(logdev);
}
if (mp->m_rtname) {
@@ -765,24 +777,25 @@ xfs_open_devices(
error = -EINVAL;
goto out_close_rtdev;
}
+ dax_rtdev = fs_dax_get_by_bdev(rtdev);
}
/*
* Setup xfs_mount buffer target pointers
*/
error = -ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
@@ -798,10 +811,14 @@ xfs_open_devices(
xfs_free_buftarg(mp, mp->m_ddev_targp);
out_close_rtdev:
xfs_blkdev_put(rtdev);
+ fs_put_dax(dax_rtdev);
out_close_logdev:
- if (logdev && logdev != ddev)
+ if (logdev && logdev != ddev) {
xfs_blkdev_put(logdev);
+ fs_put_dax(dax_logdev);
+ }
out:
+ fs_put_dax(dax_ddev);
return error;
}
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 6ffb67e10c06..b52e278e4744 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -248,4 +248,6 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
/* audit system wants to get cap info from files as well */
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
+extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size);
+
#endif /* !_LINUX_CAPABILITY_H */
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index edf5b04b918a..b422170b791a 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -167,6 +167,8 @@ struct ceph_mon_request_header {
struct ceph_mon_statfs {
struct ceph_mon_request_header monhdr;
struct ceph_fsid fsid;
+ __u8 contains_data_pool;
+ __le64 data_pool;
} __attribute__ ((packed));
struct ceph_statfs {
@@ -669,7 +671,9 @@ enum {
extern const char *ceph_cap_op_name(int op);
/* flags field in client cap messages (version >= 10) */
-#define CEPH_CLIENT_CAPS_SYNC (0x1)
+#define CEPH_CLIENT_CAPS_SYNC (1<<0)
+#define CEPH_CLIENT_CAPS_NO_CAPSNAP (1<<1)
+#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP	(1<<2)
/*
* caps message, used for capability callbacks, acks, requests, etc.
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 8a79587e1317..4c846aabd9f6 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -84,17 +84,6 @@ struct ceph_options {
#define CEPH_AUTH_NAME_DEFAULT "guest"
-/*
- * Delay telling the MDS we no longer want caps, in case we reopen
- * the file. Delay a minimum amount of time, even if we send a cap
- * message for some other reason. Otherwise, take the oppotunity to
- * update the mds to avoid sending another message later.
- */
-#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
-#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
-
-#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
-
/* mount state */
enum {
CEPH_MOUNT_MOUNTING,
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index d5a3ecea578d..0fa990bf867a 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -133,8 +133,8 @@ void ceph_monc_renew_subs(struct ceph_mon_client *monc);
extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
unsigned long timeout);
-extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
- struct ceph_statfs *buf);
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool,
+ struct ceph_statfs *buf);
int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what,
u64 *newest);
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h
index b8281feda9c7..01408841c9c4 100644
--- a/include/linux/ceph/rados.h
+++ b/include/linux/ceph/rados.h
@@ -230,7 +230,6 @@ extern const char *ceph_osd_state_name(int s);
\
/* fancy write */ \
f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \
- f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \
f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \
f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \
\
diff --git a/include/linux/cper.h b/include/linux/cper.h
index 4c671fc2081e..723e952fde0d 100644
--- a/include/linux/cper.h
+++ b/include/linux/cper.h
@@ -74,36 +74,36 @@ enum {
* Corrected Machine Check
*/
#define CPER_NOTIFY_CMC \
- UUID_LE(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \
- 0xEB, 0xD4, 0xF8, 0x90)
+ GUID_INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \
+ 0xEB, 0xD4, 0xF8, 0x90)
/* Corrected Platform Error */
#define CPER_NOTIFY_CPE \
- UUID_LE(0x4E292F96, 0xD843, 0x4a55, 0xA8, 0xC2, 0xD4, 0x81, \
- 0xF2, 0x7E, 0xBE, 0xEE)
+ GUID_INIT(0x4E292F96, 0xD843, 0x4a55, 0xA8, 0xC2, 0xD4, 0x81, \
+ 0xF2, 0x7E, 0xBE, 0xEE)
/* Machine Check Exception */
#define CPER_NOTIFY_MCE \
- UUID_LE(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \
- 0xE1, 0x49, 0x13, 0xBB)
+ GUID_INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \
+ 0xE1, 0x49, 0x13, 0xBB)
/* PCI Express Error */
#define CPER_NOTIFY_PCIE \
- UUID_LE(0xCF93C01F, 0x1A16, 0x4dfc, 0xB8, 0xBC, 0x9C, 0x4D, \
- 0xAF, 0x67, 0xC1, 0x04)
+ GUID_INIT(0xCF93C01F, 0x1A16, 0x4dfc, 0xB8, 0xBC, 0x9C, 0x4D, \
+ 0xAF, 0x67, 0xC1, 0x04)
/* INIT Record (for IPF) */
#define CPER_NOTIFY_INIT \
- UUID_LE(0xCC5263E8, 0x9308, 0x454a, 0x89, 0xD0, 0x34, 0x0B, \
- 0xD3, 0x9B, 0xC9, 0x8E)
+ GUID_INIT(0xCC5263E8, 0x9308, 0x454a, 0x89, 0xD0, 0x34, 0x0B, \
+ 0xD3, 0x9B, 0xC9, 0x8E)
/* Non-Maskable Interrupt */
#define CPER_NOTIFY_NMI \
- UUID_LE(0x5BAD89FF, 0xB7E6, 0x42c9, 0x81, 0x4A, 0xCF, 0x24, \
- 0x85, 0xD6, 0xE9, 0x8A)
+ GUID_INIT(0x5BAD89FF, 0xB7E6, 0x42c9, 0x81, 0x4A, 0xCF, 0x24, \
+ 0x85, 0xD6, 0xE9, 0x8A)
/* BOOT Error Record */
#define CPER_NOTIFY_BOOT \
- UUID_LE(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \
- 0xD4, 0x64, 0xB3, 0x8F)
+ GUID_INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \
+ 0xD4, 0x64, 0xB3, 0x8F)
/* DMA Remapping Error */
#define CPER_NOTIFY_DMAR \
- UUID_LE(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
- 0x72, 0x2D, 0xEB, 0x41)
+ GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \
+ 0x72, 0x2D, 0xEB, 0x41)
/*
* Flags bits definitions for flags in struct cper_record_header
@@ -170,50 +170,50 @@ enum {
* Processor Generic
*/
#define CPER_SEC_PROC_GENERIC \
- UUID_LE(0x9876CCAD, 0x47B4, 0x4bdb, 0xB6, 0x5E, 0x16, 0xF1, \
- 0x93, 0xC4, 0xF3, 0xDB)
+ GUID_INIT(0x9876CCAD, 0x47B4, 0x4bdb, 0xB6, 0x5E, 0x16, 0xF1, \
+ 0x93, 0xC4, 0xF3, 0xDB)
/* Processor Specific: X86/X86_64 */
#define CPER_SEC_PROC_IA \
- UUID_LE(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \
- 0x24, 0x2B, 0x6E, 0x1D)
+ GUID_INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \
+ 0x24, 0x2B, 0x6E, 0x1D)
/* Processor Specific: IA64 */
#define CPER_SEC_PROC_IPF \
- UUID_LE(0xE429FAF1, 0x3CB7, 0x11D4, 0x0B, 0xCA, 0x07, 0x00, \
- 0x80, 0xC7, 0x3C, 0x88, 0x81)
+ GUID_INIT(0xE429FAF1, 0x3CB7, 0x11D4, 0x0B, 0xCA, 0x07, 0x00, \
+ 0x80, 0xC7, 0x3C, 0x88, 0x81)
/* Processor Specific: ARM */
#define CPER_SEC_PROC_ARM \
- UUID_LE(0xE19E3D16, 0xBC11, 0x11E4, 0x9C, 0xAA, 0xC2, 0x05, \
- 0x1D, 0x5D, 0x46, 0xB0)
+ GUID_INIT(0xE19E3D16, 0xBC11, 0x11E4, 0x9C, 0xAA, 0xC2, 0x05, \
+ 0x1D, 0x5D, 0x46, 0xB0)
/* Platform Memory */
#define CPER_SEC_PLATFORM_MEM \
- UUID_LE(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \
- 0xED, 0x7C, 0x83, 0xB1)
+ GUID_INIT(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \
+ 0xED, 0x7C, 0x83, 0xB1)
#define CPER_SEC_PCIE \
- UUID_LE(0xD995E954, 0xBBC1, 0x430F, 0xAD, 0x91, 0xB4, 0x4D, \
- 0xCB, 0x3C, 0x6F, 0x35)
+ GUID_INIT(0xD995E954, 0xBBC1, 0x430F, 0xAD, 0x91, 0xB4, 0x4D, \
+ 0xCB, 0x3C, 0x6F, 0x35)
/* Firmware Error Record Reference */
#define CPER_SEC_FW_ERR_REC_REF \
- UUID_LE(0x81212A96, 0x09ED, 0x4996, 0x94, 0x71, 0x8D, 0x72, \
- 0x9C, 0x8E, 0x69, 0xED)
+ GUID_INIT(0x81212A96, 0x09ED, 0x4996, 0x94, 0x71, 0x8D, 0x72, \
+ 0x9C, 0x8E, 0x69, 0xED)
/* PCI/PCI-X Bus */
#define CPER_SEC_PCI_X_BUS \
- UUID_LE(0xC5753963, 0x3B84, 0x4095, 0xBF, 0x78, 0xED, 0xDA, \
- 0xD3, 0xF9, 0xC9, 0xDD)
+ GUID_INIT(0xC5753963, 0x3B84, 0x4095, 0xBF, 0x78, 0xED, 0xDA, \
+ 0xD3, 0xF9, 0xC9, 0xDD)
/* PCI Component/Device */
#define CPER_SEC_PCI_DEV \
- UUID_LE(0xEB5E4685, 0xCA66, 0x4769, 0xB6, 0xA2, 0x26, 0x06, \
- 0x8B, 0x00, 0x13, 0x26)
+ GUID_INIT(0xEB5E4685, 0xCA66, 0x4769, 0xB6, 0xA2, 0x26, 0x06, \
+ 0x8B, 0x00, 0x13, 0x26)
#define CPER_SEC_DMAR_GENERIC \
- UUID_LE(0x5B51FEF7, 0xC79D, 0x4434, 0x8F, 0x1B, 0xAA, 0x62, \
- 0xDE, 0x3E, 0x2C, 0x64)
+ GUID_INIT(0x5B51FEF7, 0xC79D, 0x4434, 0x8F, 0x1B, 0xAA, 0x62, \
+ 0xDE, 0x3E, 0x2C, 0x64)
/* Intel VT for Directed I/O specific DMAr */
#define CPER_SEC_DMAR_VT \
- UUID_LE(0x71761D37, 0x32B2, 0x45cd, 0xA7, 0xD0, 0xB0, 0xFE, \
- 0xDD, 0x93, 0xE8, 0xCF)
+ GUID_INIT(0x71761D37, 0x32B2, 0x45cd, 0xA7, 0xD0, 0xB0, 0xFE, \
+ 0xDD, 0x93, 0xE8, 0xCF)
/* IOMMU specific DMAr */
#define CPER_SEC_DMAR_IOMMU \
- UUID_LE(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \
- 0xDF, 0xAA, 0x84, 0xEC)
+ GUID_INIT(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \
+ 0xDF, 0xAA, 0x84, 0xEC)
#define CPER_PROC_VALID_TYPE 0x0001
#define CPER_PROC_VALID_ISA 0x0002
@@ -290,10 +290,10 @@ struct cper_record_header {
__u32 validation_bits;
__u32 record_length;
__u64 timestamp;
- uuid_le platform_id;
- uuid_le partition_id;
- uuid_le creator_id;
- uuid_le notification_type;
+ guid_t platform_id;
+ guid_t partition_id;
+ guid_t creator_id;
+ guid_t notification_type;
__u64 record_id;
__u32 flags;
__u64 persistence_information;
@@ -309,8 +309,8 @@ struct cper_section_descriptor {
__u8 validation_bits;
__u8 reserved; /* must be zero */
__u32 flags;
- uuid_le section_type;
- uuid_le fru_id;
+ guid_t section_type;
+ guid_t fru_id;
__u32 section_severity;
__u8 fru_text[20];
};
@@ -343,7 +343,7 @@ struct cper_sec_proc_ia {
/* IA32/X64 Processor Error Information Structure */
struct cper_ia_err_info {
- uuid_le err_type;
+ guid_t err_type;
__u64 validation_bits;
__u64 check_info;
__u64 target_id;
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index e74655d941b7..a1e6a33a4b03 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -51,7 +51,9 @@ static inline void cpuset_dec(void)
extern int cpuset_init(void);
extern void cpuset_init_smp(void);
+extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
+extern void cpuset_wait_for_hotplug(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -164,11 +166,15 @@ static inline bool cpusets_enabled(void) { return false; }
static inline int cpuset_init(void) { return 0; }
static inline void cpuset_init_smp(void) {}
+static inline void cpuset_force_rebuild(void) { }
+
static inline void cpuset_update_active_cpus(void)
{
partition_sched_domains(1, NULL, NULL);
}
+static inline void cpuset_wait_for_hotplug(void) { }
+
static inline void cpuset_cpus_allowed(struct task_struct *p,
struct cpumask *mask)
{
diff --git a/include/linux/dax.h b/include/linux/dax.h
index eb0bff6f1eab..46cad1d0f129 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -57,6 +57,7 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
put_dax(dax_dev);
}
+struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
#else
static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
{
@@ -71,6 +72,11 @@ static inline struct dax_device *fs_dax_get_by_host(const char *host)
static inline void fs_put_dax(struct dax_device *dax_dev)
{
}
+
+static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
+{
+ return NULL;
+}
#endif
int dax_read_lock(void);
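fs_dax_get_by_bdev() hands back a referenced dax_device (NULL when DAX is unavailable, and always NULL in the !CONFIG_FS_DAX stub above), so each lookup must be paired with fs_put_dax(), exactly as the xfs_open_devices()/xfs_close_devices() hunks do. A minimal sketch of the pairing:

    /* Sketch only: hold the reference for the lifetime of the user. */
    struct dax_device *dax_dev = fs_dax_get_by_bdev(bdev);

    /* ... use dax_dev for DAX operations; NULL means no DAX support ... */

    fs_put_dax(dax_dev);    /* put_dax() tolerates NULL */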
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index aae1cdb76851..ed1a7cf6923a 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -147,7 +147,7 @@ struct dentry_operations {
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(const struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
- unsigned int);
+ unsigned int, unsigned int);
} ____cacheline_aligned;
/*
@@ -562,11 +562,15 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper)
return upper;
}
+/* d_real() flags */
+#define D_REAL_UPPER 0x2 /* return upper dentry or NULL if non-upper */
+
/**
* d_real - Return the real dentry
* @dentry: the dentry to query
* @inode: inode to select the dentry from multiple layers (can be NULL)
- * @flags: open flags to control copy-up behavior
+ * @open_flags: open flags to control copy-up behavior
+ * @flags: flags to control what is returned by this function
*
* If dentry is on a union/overlay, then return the underlying, real dentry.
* Otherwise return the dentry itself.
@@ -575,10 +579,10 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper)
*/
static inline struct dentry *d_real(struct dentry *dentry,
const struct inode *inode,
- unsigned int flags)
+ unsigned int open_flags, unsigned int flags)
{
if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
- return dentry->d_op->d_real(dentry, inode, flags);
+ return dentry->d_op->d_real(dentry, inode, open_flags, flags);
else
return dentry;
}
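With the split into open_flags and flags, D_REAL_UPPER turns d_real() into a query for the upper-layer dentry; as ovl_d_real() above shows, overlayfs returns NULL for objects that have no upper copy. A hedged caller sketch:

    /* Hypothetical caller: act only on objects with an upper-layer copy. */
    struct dentry *upper = d_real(dentry, NULL, 0, D_REAL_UPPER);

    if (!upper)
            return 0;       /* lower-only object, nothing to do */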
@@ -593,7 +597,7 @@ static inline struct dentry *d_real(struct dentry *dentry,
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
/* This usage of d_real() results in const dentry */
- return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0));
+ return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0, 0));
}
struct name_snapshot {
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2189c79cde5d..29ce9815da87 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -550,26 +550,13 @@ static inline void dma_free_coherent(struct device *dev, size_t size,
return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0);
}
-static inline void *dma_alloc_noncoherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp)
-{
- return dma_alloc_attrs(dev, size, dma_handle, gfp,
- DMA_ATTR_NON_CONSISTENT);
-}
-
-static inline void dma_free_noncoherent(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_handle)
-{
- dma_free_attrs(dev, size, cpu_addr, dma_handle,
- DMA_ATTR_NON_CONSISTENT);
-}
-
static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
- debug_dma_mapping_error(dev, dma_addr);
+ const struct dma_map_ops *ops = get_dma_ops(dev);
- if (get_dma_ops(dev)->mapping_error)
- return get_dma_ops(dev)->mapping_error(dev, dma_addr);
+ debug_dma_mapping_error(dev, dma_addr);
+ if (ops->mapping_error)
+ return ops->mapping_error(dev, dma_addr);
return 0;
}
@@ -720,10 +707,7 @@ static inline int dma_get_cache_alignment(void)
#endif
/* flags for the coherent memory api */
-#define DMA_MEMORY_MAP 0x01
-#define DMA_MEMORY_IO 0x02
-#define DMA_MEMORY_INCLUDES_CHILDREN 0x04
-#define DMA_MEMORY_EXCLUSIVE 0x08
+#define DMA_MEMORY_EXCLUSIVE 0x01
#ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT
int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
@@ -736,7 +720,7 @@ static inline int
dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
dma_addr_t device_addr, size_t size, int flags)
{
- return 0;
+ return -ENOSYS;
}
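Note that the !CONFIG_HAVE_GENERIC_DMA_COHERENT stub now fails with -ENOSYS instead of silently returning 0, so callers can simply propagate the error. A hedged sketch of a caller (the address and size variables are illustrative):

    int rc = dma_declare_coherent_memory(dev, phys_addr, device_addr,
                                         size, DMA_MEMORY_EXCLUSIVE);
    if (rc)
            return rc;      /* -ENOSYS when the generic API is not built in */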
static inline void
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 65905c3cb655..66f4a4e79f4b 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -47,10 +47,10 @@ typedef u16 efi_char16_t; /* UNICODE character */
typedef u64 efi_physical_addr_t;
typedef void *efi_handle_t;
-typedef uuid_le efi_guid_t;
+typedef guid_t efi_guid_t;
#define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \
- UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)
+ GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)
/*
* Generic EFI table header
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index b6feed6547ce..2a0c453d7235 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -186,6 +186,8 @@ struct f2fs_extent {
#define F2FS_NAME_LEN 255
#define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */
#define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */
+#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \
+ get_extra_isize(inode))
#define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */
#define ADDRS_PER_INODE(inode) addrs_per_inode(inode)
#define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */
@@ -205,9 +207,7 @@ struct f2fs_extent {
#define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */
#define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */
#define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */
-
-#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \
- F2FS_INLINE_XATTR_ADDRS - 1))
+#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */
struct f2fs_inode {
__le16 i_mode; /* file mode */
@@ -235,8 +235,16 @@ struct f2fs_inode {
struct f2fs_extent i_ext; /* caching a largest extent */
- __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */
-
+ union {
+ struct {
+ __le16 i_extra_isize; /* extra inode attribute size */
+ __le16 i_padding; /* padding */
+ __le32 i_projid; /* project id */
+ __le32 i_inode_checksum;/* inode meta checksum */
+ __le32 i_extra_end[0]; /* for attribute size calculation */
+ };
+ __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */
+ };
__le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2),
double_indirect(1) node id */
} __packed;
@@ -465,7 +473,7 @@ typedef __le32 f2fs_hash_t;
#define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1))
/*
- * space utilization of regular dentry and inline dentry
+ * space utilization of regular dentry and inline dentry (w/o extra reservation)
* regular dentry inline dentry
* bitmap 1 * 27 = 27 1 * 23 = 23
* reserved 1 * 3 = 3 1 * 7 = 7
@@ -501,24 +509,6 @@ struct f2fs_dentry_block {
__u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN];
} __packed;
-/* for inline dir */
-#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \
- ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
- BITS_PER_BYTE + 1))
-#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \
- BITS_PER_BYTE - 1) / BITS_PER_BYTE)
-#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \
- ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \
- NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE))
-
-/* inline directory entry structure */
-struct f2fs_inline_dentry {
- __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE];
- __u8 reserved[INLINE_RESERVED_SIZE];
- struct f2fs_dir_entry dentry[NR_INLINE_DENTRY];
- __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN];
-} __packed;
-
/* file types used in inode_info->flags */
enum {
F2FS_FT_UNKNOWN,
@@ -534,4 +524,6 @@ enum {
#define S_SHIFT 12
+#define F2FS_DEF_PROJID 0 /* default project ID */
+
#endif /* _LINUX_F2FS_FS_H */
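The union above overlays the new extra attributes on the head of i_addr[], so the usable block pointers shrink by i_extra_isize bytes. Worked numbers, assuming get_extra_isize() converts i_extra_isize from bytes to __le32 slots (the value 16 is hypothetical):

    /*
     * Hypothetical i_extra_isize = 16 bytes -> 16 / sizeof(__le32) = 4 slots,
     * so CUR_ADDRS_PER_INODE(inode) = 923 - 4 = 919 block pointers remain.
     */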
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2d0e6748e46e..33d8e45cd874 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1235,7 +1235,7 @@ static inline struct inode *file_inode(const struct file *f)
static inline struct dentry *file_dentry(const struct file *file)
{
- return d_real(file->f_path.dentry, file_inode(file), 0);
+ return d_real(file->f_path.dentry, file_inode(file), 0, 0);
}
static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
index f3d3e6af8838..3eaad2fbf284 100644
--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -87,6 +87,7 @@ struct nd_mapping_desc {
struct nvdimm *nvdimm;
u64 start;
u64 size;
+ int position;
};
struct nd_region_desc {
@@ -173,4 +174,19 @@ u64 nd_fletcher64(void *addr, size_t len, bool le);
void nvdimm_flush(struct nd_region *nd_region);
int nvdimm_has_flush(struct nd_region *nd_region);
int nvdimm_has_cache(struct nd_region *nd_region);
+
+#ifdef CONFIG_ARCH_HAS_PMEM_API
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WB
+void arch_wb_cache_pmem(void *addr, size_t size);
+void arch_invalidate_pmem(void *addr, size_t size);
+#else
+#define ARCH_MEMREMAP_PMEM MEMREMAP_WT
+static inline void arch_wb_cache_pmem(void *addr, size_t size)
+{
+}
+static inline void arch_invalidate_pmem(void *addr, size_t size)
+{
+}
+#endif
+
#endif /* __LIBNVDIMM_H__ */
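The arch hooks give pmem users a portable way to write back or invalidate CPU caches over a persistent-memory range; without CONFIG_ARCH_HAS_PMEM_API they compile to no-ops. A hedged usage sketch (the buffer names are illustrative):

    /* Persist a just-written buffer: flush it from the CPU caches. */
    memcpy(pmem_dst, src, len);
    arch_wb_cache_pmem(pmem_dst, len);      /* no-op without a pmem API */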
diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
index 22b5d4e687ce..d1c2901f1542 100644
--- a/include/linux/lsm_audit.h
+++ b/include/linux/lsm_audit.h
@@ -4,7 +4,7 @@
*
* Author : Etienne BASSET <etienne.basset@ensta.org>
*
- * All credits to : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * All credits to : Stephen Smalley, <sds@tycho.nsa.gov>
* All BUGS to : Etienne BASSET <etienne.basset@ensta.org>
*/
#ifndef _LSM_COMMON_LOGGING_
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index d1c7bef25691..c9258124e417 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -528,11 +528,6 @@
*
* Security hooks for task operations.
*
- * @task_create:
- * Check permission before creating a child process. See the clone(2)
- * manual page for definitions of the @clone_flags.
- * @clone_flags contains the flags indicating what should be shared.
- * Return 0 if permission is granted.
* @task_alloc:
* @task task being allocated.
* @clone_flags contains the flags indicating what should be shared.
@@ -1505,7 +1500,6 @@ union security_list_options {
int (*file_receive)(struct file *file);
int (*file_open)(struct file *file, const struct cred *cred);
- int (*task_create)(unsigned long clone_flags);
int (*task_alloc)(struct task_struct *task, unsigned long clone_flags);
void (*task_free)(struct task_struct *task);
int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp);
@@ -1779,7 +1773,6 @@ struct security_hook_heads {
struct list_head file_send_sigiotask;
struct list_head file_receive;
struct list_head file_open;
- struct list_head task_create;
struct list_head task_alloc;
struct list_head task_free;
struct list_head cred_alloc_blank;
diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h
index 1255f09f5e42..265a9cd21cb4 100644
--- a/include/linux/mem_encrypt.h
+++ b/include/linux/mem_encrypt.h
@@ -21,7 +21,7 @@
#else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */
-#define sme_me_mask 0UL
+#define sme_me_mask 0ULL
#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */
@@ -30,18 +30,23 @@ static inline bool sme_active(void)
return !!sme_me_mask;
}
-static inline unsigned long sme_get_me_mask(void)
+static inline u64 sme_get_me_mask(void)
{
return sme_me_mask;
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
* The __sme_set() and __sme_clr() macros are useful for adding or removing
* the encryption mask from a value (e.g. when dealing with pagetable
* entries).
*/
-#define __sme_set(x) ((unsigned long)(x) | sme_me_mask)
-#define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask)
+#define __sme_set(x) ((x) | sme_me_mask)
+#define __sme_clr(x) ((x) & ~sme_me_mask)
+#else
+#define __sme_set(x) (x)
+#define __sme_clr(x) (x)
+#endif
#endif /* __ASSEMBLY__ */
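__sme_set()/__sme_clr() apply or strip the (now 64-bit) SME encryption mask; with CONFIG_AMD_MEM_ENCRYPT disabled they pass the value through unchanged. A minimal sketch (paddr is illustrative):

    /* Tag a physical address as encrypted, then recover the plain address. */
    u64 enc = __sme_set(paddr);
    u64 pa  = __sme_clr(enc);       /* == paddr again */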
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 5cc91d6381a3..a0282ceaa48b 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -49,7 +49,6 @@
struct nfs_access_entry {
struct rb_node rb_node;
struct list_head lru;
- unsigned long jiffies;
struct rpc_cred * cred;
__u32 mask;
struct rcu_head rcu_head;
@@ -154,7 +153,7 @@ struct nfs_inode {
*/
__be32 cookieverf[2];
- unsigned long nrequests;
+ atomic_long_t nrequests;
struct nfs_mds_commit_info commit_info;
/* Open contexts for shared mmap writes */
@@ -163,6 +162,7 @@ struct nfs_inode {
/* Readers: in-flight sillydelete RPC calls */
/* Writers: rmdir */
struct rw_semaphore rmdir_sem;
+ struct mutex commit_mutex;
#if IS_ENABLED(CONFIG_NFS_V4)
struct nfs4_cached_acl *nfs4_acl;
@@ -510,7 +510,7 @@ extern void nfs_commit_free(struct nfs_commit_data *data);
static inline int
nfs_have_writebacks(struct inode *inode)
{
- return NFS_I(inode)->nrequests != 0;
+ return atomic_long_read(&NFS_I(inode)->nrequests) != 0;
}
/*
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index d67b67ae6c8b..d117120c9b6e 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -125,8 +125,7 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
const struct nfs_pgio_completion_ops *compl_ops,
const struct nfs_rw_ops *rw_ops,
size_t bsize,
- int how,
- gfp_t gfp_flags);
+ int how);
extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *,
struct nfs_page *);
extern int nfs_pageio_resend(struct nfs_pageio_descriptor *,
@@ -139,8 +138,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
extern int nfs_wait_on_request(struct nfs_page *);
extern void nfs_unlock_request(struct nfs_page *req);
extern void nfs_unlock_and_release_request(struct nfs_page *);
-extern int nfs_page_group_lock(struct nfs_page *, bool);
-extern void nfs_page_group_lock_wait(struct nfs_page *);
+extern int nfs_page_group_lock(struct nfs_page *);
extern void nfs_page_group_unlock(struct nfs_page *);
extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 62cbcb842f99..164d5359d4ab 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1476,7 +1476,7 @@ struct nfs_pgio_header {
struct nfs_mds_commit_info {
atomic_t rpcs_out;
- unsigned long ncommit;
+ atomic_long_t ncommit;
struct list_head list;
};
diff --git a/include/linux/security.h b/include/linux/security.h
index 974bb9b0996c..ce6265960d6c 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -90,6 +90,8 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
extern int cap_inode_need_killpriv(struct dentry *dentry);
extern int cap_inode_killpriv(struct dentry *dentry);
+extern int cap_inode_getsecurity(struct inode *inode, const char *name,
+ void **buffer, bool alloc);
extern int cap_mmap_addr(unsigned long addr);
extern int cap_mmap_file(struct file *file, unsigned long reqprot,
unsigned long prot, unsigned long flags);
@@ -316,7 +318,6 @@ int security_file_send_sigiotask(struct task_struct *tsk,
struct fown_struct *fown, int sig);
int security_file_receive(struct file *file);
int security_file_open(struct file *file, const struct cred *cred);
-int security_task_create(unsigned long clone_flags);
int security_task_alloc(struct task_struct *task, unsigned long clone_flags);
void security_task_free(struct task_struct *task);
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp);
@@ -878,11 +879,6 @@ static inline int security_file_open(struct file *file,
return 0;
}
-static inline int security_task_create(unsigned long clone_flags)
-{
- return 0;
-}
-
static inline int security_task_alloc(struct task_struct *task,
unsigned long clone_flags)
{
diff --git a/include/linux/signal.h b/include/linux/signal.h
index e2678b5dbb21..38564e3e54c7 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -21,6 +21,20 @@ static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from);
+enum siginfo_layout {
+ SIL_KILL,
+ SIL_TIMER,
+ SIL_POLL,
+ SIL_FAULT,
+ SIL_CHLD,
+ SIL_RT,
+#ifdef __ARCH_SIGSYS
+ SIL_SYS,
+#endif
+};
+
+enum siginfo_layout siginfo_layout(int sig, int si_code);
+
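A simplified sketch of what siginfo_layout() computes, using the sig_specific_sicodes() helper added further down; this illustrates the idea only and is not the kernel's exact implementation (which also handles SI_MESGQ, SIGSYS, and more):

    enum siginfo_layout siginfo_layout(int sig, int si_code)
    {
            /* Kernel-generated, signal-specific si_codes pick a layout. */
            if (si_code > SI_USER && si_code < SI_KERNEL &&
                sig_specific_sicodes(sig)) {
                    switch (sig) {
                    case SIGCHLD: return SIL_CHLD;
                    case SIGPOLL: return SIL_POLL;
                    case SIGILL: case SIGFPE: case SIGSEGV:
                    case SIGBUS: case SIGTRAP: return SIL_FAULT;
                    }
            }
            if (si_code == SI_TIMER)
                    return SIL_TIMER;
            return SIL_KILL;        /* kill(2), sigqueue(2), etc. */
    }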
/*
* Define some primitives to manipulate sigset_t.
*/
@@ -380,10 +394,18 @@ int unhandled_signal(struct task_struct *tsk, int sig);
rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \
rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) )
+#define SIG_SPECIFIC_SICODES_MASK (\
+ rt_sigmask(SIGILL) | rt_sigmask(SIGFPE) | \
+ rt_sigmask(SIGSEGV) | rt_sigmask(SIGBUS) | \
+ rt_sigmask(SIGTRAP) | rt_sigmask(SIGCHLD) | \
+ rt_sigmask(SIGPOLL) | rt_sigmask(SIGSYS) | \
+ SIGEMT_MASK )
+
#define sig_kernel_only(sig) siginmask(sig, SIG_KERNEL_ONLY_MASK)
#define sig_kernel_coredump(sig) siginmask(sig, SIG_KERNEL_COREDUMP_MASK)
#define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK)
#define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK)
+#define sig_specific_sicodes(sig) siginmask(sig, SIG_SPECIFIC_SICODES_MASK)
#define sig_fatal(t, signr) \
(!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 50a99a117da7..c1768f9d993b 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -139,6 +139,8 @@ struct rpc_task_setup {
#define RPC_TASK_RUNNING 0
#define RPC_TASK_QUEUED 1
#define RPC_TASK_ACTIVE 2
+#define RPC_TASK_MSG_RECV 3
+#define RPC_TASK_MSG_RECV_WAIT 4
#define RPC_IS_RUNNING(t) test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
#define rpc_set_running(t) set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate)
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 261b48a2701d..86b59e3525a5 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -239,6 +239,19 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
+/**
+ * xdr_stream_remaining - Return the number of bytes remaining in the stream
+ * @xdr: pointer to struct xdr_stream
+ *
+ * Return value:
+ * Number of bytes remaining in @xdr before xdr->end
+ */
+static inline size_t
+xdr_stream_remaining(const struct xdr_stream *xdr)
+{
+ return xdr->nwords << 2;
+}
+
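xdr->nwords counts the 32-bit XDR words left before xdr->end, so the shift by two converts words to bytes. A hedged caller sketch:

    /* Hypothetical decoder: refuse to read past the end of the stream. */
    if (xdr_stream_remaining(xdr) < len)
            return -EMSGSIZE;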
ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
size_t maxlen, gfp_t gfp_flags);
/**
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index eab1c749e192..5a7bff41f6b7 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -174,7 +174,7 @@ enum xprt_transports {
struct rpc_xprt {
struct kref kref; /* Reference count */
- struct rpc_xprt_ops * ops; /* transport methods */
+ const struct rpc_xprt_ops *ops; /* transport methods */
const struct rpc_timeout *timeout; /* timeout parms */
struct sockaddr_storage addr; /* server address */
@@ -232,6 +232,7 @@ struct rpc_xprt {
*/
spinlock_t transport_lock; /* lock transport info */
spinlock_t reserve_lock; /* lock slot table */
+ spinlock_t recv_lock; /* lock receive list */
u32 xid; /* Next XID value to use */
struct rpc_task * snd_task; /* Task blocked in send */
struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */
@@ -372,6 +373,8 @@ void xprt_write_space(struct rpc_xprt *xprt);
void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result);
struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid);
void xprt_complete_rqst(struct rpc_task *task, int copied);
+void xprt_pin_rqst(struct rpc_rqst *req);
+void xprt_unpin_rqst(struct rpc_rqst *req);
void xprt_release_rqst_cong(struct rpc_task *task);
void xprt_disconnect_done(struct rpc_xprt *xprt);
void xprt_force_disconnect(struct rpc_xprt *xprt);
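xprt_pin_rqst()/xprt_unpin_rqst(), together with the new recv_lock, let a transport drop the lock while copying reply data without the request vanishing underneath it. A hedged sketch of the intended pattern (receive-path details elided):

    spin_lock(&xprt->recv_lock);
    req = xprt_lookup_rqst(xprt, xid);
    if (req)
            xprt_pin_rqst(req);     /* keep req alive after the lock drops */
    spin_unlock(&xprt->recv_lock);

    /* ... copy the reply into req's buffers without holding recv_lock ... */

    spin_lock(&xprt->recv_lock);
    if (req) {
            xprt_complete_rqst(req->rq_task, copied);
            xprt_unpin_rqst(req);
    }
    spin_unlock(&xprt->recv_lock);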
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index dab11f97e1c6..fd5b959c753c 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -102,6 +102,7 @@ enum thermal_notify_event {
THERMAL_DEVICE_DOWN, /* Thermal device is down */
THERMAL_DEVICE_UP, /* Thermal device is up after a down event */
THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */
+ THERMAL_TABLE_CHANGED, /* Thermal table(s) changed */
};
struct thermal_zone_device_ops {
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index b3575ce29148..c18e01252346 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -112,8 +112,9 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t,
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
+extern bool in_userns(const struct user_namespace *ancestor,
+ const struct user_namespace *child);
extern bool current_in_userns(const struct user_namespace *target_ns);
-
struct ns_common *ns_get_owner(struct ns_common *ns);
#else
@@ -144,6 +145,12 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns)
return true;
}
+static inline bool in_userns(const struct user_namespace *ancestor,
+ const struct user_namespace *child)
+{
+ return true;
+}
+
static inline bool current_in_userns(const struct user_namespace *target_ns)
{
return true;
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index bc4dd7837e4c..5d216f7fb05a 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -543,14 +543,14 @@ TRACE_EVENT(f2fs_map_blocks,
TRACE_EVENT(f2fs_background_gc,
- TP_PROTO(struct super_block *sb, long wait_ms,
+ TP_PROTO(struct super_block *sb, unsigned int wait_ms,
unsigned int prefree, unsigned int free),
TP_ARGS(sb, wait_ms, prefree, free),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(long, wait_ms)
+ __field(unsigned int, wait_ms)
__field(unsigned int, prefree)
__field(unsigned int, free)
),
@@ -562,13 +562,120 @@ TRACE_EVENT(f2fs_background_gc,
__entry->free = free;
),
- TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u",
+ TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u",
show_dev(__entry->dev),
__entry->wait_ms,
__entry->prefree,
__entry->free)
);
+TRACE_EVENT(f2fs_gc_begin,
+
+ TP_PROTO(struct super_block *sb, bool sync, bool background,
+ long long dirty_nodes, long long dirty_dents,
+ long long dirty_imeta, unsigned int free_sec,
+ unsigned int free_seg, int reserved_seg,
+ unsigned int prefree_seg),
+
+ TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta,
+ free_sec, free_seg, reserved_seg, prefree_seg),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(bool, sync)
+ __field(bool, background)
+ __field(long long, dirty_nodes)
+ __field(long long, dirty_dents)
+ __field(long long, dirty_imeta)
+ __field(unsigned int, free_sec)
+ __field(unsigned int, free_seg)
+ __field(int, reserved_seg)
+ __field(unsigned int, prefree_seg)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->sync = sync;
+ __entry->background = background;
+ __entry->dirty_nodes = dirty_nodes;
+ __entry->dirty_dents = dirty_dents;
+ __entry->dirty_imeta = dirty_imeta;
+ __entry->free_sec = free_sec;
+ __entry->free_seg = free_seg;
+ __entry->reserved_seg = reserved_seg;
+ __entry->prefree_seg = prefree_seg;
+ ),
+
+ TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, "
+ "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, "
+ "rsv_seg:%d, prefree_seg:%u",
+ show_dev(__entry->dev),
+ __entry->sync,
+ __entry->background,
+ __entry->dirty_nodes,
+ __entry->dirty_dents,
+ __entry->dirty_imeta,
+ __entry->free_sec,
+ __entry->free_seg,
+ __entry->reserved_seg,
+ __entry->prefree_seg)
+);
+
+TRACE_EVENT(f2fs_gc_end,
+
+ TP_PROTO(struct super_block *sb, int ret, int seg_freed,
+ int sec_freed, long long dirty_nodes,
+ long long dirty_dents, long long dirty_imeta,
+ unsigned int free_sec, unsigned int free_seg,
+ int reserved_seg, unsigned int prefree_seg),
+
+ TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents,
+ dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, ret)
+ __field(int, seg_freed)
+ __field(int, sec_freed)
+ __field(long long, dirty_nodes)
+ __field(long long, dirty_dents)
+ __field(long long, dirty_imeta)
+ __field(unsigned int, free_sec)
+ __field(unsigned int, free_seg)
+ __field(int, reserved_seg)
+ __field(unsigned int, prefree_seg)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sb->s_dev;
+ __entry->ret = ret;
+ __entry->seg_freed = seg_freed;
+ __entry->sec_freed = sec_freed;
+ __entry->dirty_nodes = dirty_nodes;
+ __entry->dirty_dents = dirty_dents;
+ __entry->dirty_imeta = dirty_imeta;
+ __entry->free_sec = free_sec;
+ __entry->free_seg = free_seg;
+ __entry->reserved_seg = reserved_seg;
+ __entry->prefree_seg = prefree_seg;
+ ),
+
+ TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, "
+ "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, "
+ "free_seg:%u, rsv_seg:%d, prefree_seg:%u",
+ show_dev(__entry->dev),
+ __entry->ret,
+ __entry->seg_freed,
+ __entry->sec_freed,
+ __entry->dirty_nodes,
+ __entry->dirty_dents,
+ __entry->dirty_imeta,
+ __entry->free_sec,
+ __entry->free_seg,
+ __entry->reserved_seg,
+ __entry->prefree_seg)
+);
+
TRACE_EVENT(f2fs_get_victim,
TP_PROTO(struct super_block *sb, int type, int gc_type,
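
Once f2fs is loaded, the two new events appear under tracefs and can be switched on per event. A small sketch of enabling f2fs_gc_begin from user space; the mount point is an assumption, commonly /sys/kernel/debug/tracing or /sys/kernel/tracing depending on the distribution:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path =
			"/sys/kernel/debug/tracing/events/f2fs/f2fs_gc_begin/enable";
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");	/* tracefs not mounted here? */
			return 1;
		}
		if (write(fd, "1", 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}

The begin/end pair brackets each GC pass, so comparing free_seg across the two events gives the per-pass reclaim.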
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index 9c4eca6b374a..e5aa6794cea4 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -151,29 +151,6 @@ typedef struct siginfo {
#define si_arch _sifields._sigsys._arch
#endif
-#ifdef __KERNEL__
-#define __SI_MASK 0xffff0000u
-#define __SI_KILL (0 << 16)
-#define __SI_TIMER (1 << 16)
-#define __SI_POLL (2 << 16)
-#define __SI_FAULT (3 << 16)
-#define __SI_CHLD (4 << 16)
-#define __SI_RT (5 << 16)
-#define __SI_MESGQ (6 << 16)
-#define __SI_SYS (7 << 16)
-#define __SI_CODE(T,N) ((T) | ((N) & 0xffff))
-#else /* __KERNEL__ */
-#define __SI_KILL 0
-#define __SI_TIMER 0
-#define __SI_POLL 0
-#define __SI_FAULT 0
-#define __SI_CHLD 0
-#define __SI_RT 0
-#define __SI_MESGQ 0
-#define __SI_SYS 0
-#define __SI_CODE(T,N) (N)
-#endif /* __KERNEL__ */
-
/*
* si_code values
* Digital reserves positive values for kernel-generated signals.
@@ -181,8 +158,8 @@ typedef struct siginfo {
#define SI_USER 0 /* sent by kill, sigsend, raise */
#define SI_KERNEL 0x80 /* sent by the kernel from somewhere */
#define SI_QUEUE -1 /* sent by sigqueue */
-#define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */
-#define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */
+#define SI_TIMER -2 /* sent by timer expiration */
+#define SI_MESGQ -3 /* sent by real time mesq state change */
#define SI_ASYNCIO -4 /* sent by AIO completion */
#define SI_SIGIO -5 /* sent by queued SIGIO */
#define SI_TKILL -6 /* sent by tkill system call */
@@ -194,86 +171,86 @@ typedef struct siginfo {
/*
* SIGILL si_codes
*/
-#define ILL_ILLOPC (__SI_FAULT|1) /* illegal opcode */
-#define ILL_ILLOPN (__SI_FAULT|2) /* illegal operand */
-#define ILL_ILLADR (__SI_FAULT|3) /* illegal addressing mode */
-#define ILL_ILLTRP (__SI_FAULT|4) /* illegal trap */
-#define ILL_PRVOPC (__SI_FAULT|5) /* privileged opcode */
-#define ILL_PRVREG (__SI_FAULT|6) /* privileged register */
-#define ILL_COPROC (__SI_FAULT|7) /* coprocessor error */
-#define ILL_BADSTK (__SI_FAULT|8) /* internal stack error */
+#define ILL_ILLOPC 1 /* illegal opcode */
+#define ILL_ILLOPN 2 /* illegal operand */
+#define ILL_ILLADR 3 /* illegal addressing mode */
+#define ILL_ILLTRP 4 /* illegal trap */
+#define ILL_PRVOPC 5 /* privileged opcode */
+#define ILL_PRVREG 6 /* privileged register */
+#define ILL_COPROC 7 /* coprocessor error */
+#define ILL_BADSTK 8 /* internal stack error */
#define NSIGILL 8
/*
* SIGFPE si_codes
*/
-#define FPE_INTDIV (__SI_FAULT|1) /* integer divide by zero */
-#define FPE_INTOVF (__SI_FAULT|2) /* integer overflow */
-#define FPE_FLTDIV (__SI_FAULT|3) /* floating point divide by zero */
-#define FPE_FLTOVF (__SI_FAULT|4) /* floating point overflow */
-#define FPE_FLTUND (__SI_FAULT|5) /* floating point underflow */
-#define FPE_FLTRES (__SI_FAULT|6) /* floating point inexact result */
-#define FPE_FLTINV (__SI_FAULT|7) /* floating point invalid operation */
-#define FPE_FLTSUB (__SI_FAULT|8) /* subscript out of range */
+#define FPE_INTDIV 1 /* integer divide by zero */
+#define FPE_INTOVF 2 /* integer overflow */
+#define FPE_FLTDIV 3 /* floating point divide by zero */
+#define FPE_FLTOVF 4 /* floating point overflow */
+#define FPE_FLTUND 5 /* floating point underflow */
+#define FPE_FLTRES 6 /* floating point inexact result */
+#define FPE_FLTINV 7 /* floating point invalid operation */
+#define FPE_FLTSUB 8 /* subscript out of range */
#define NSIGFPE 8
/*
* SIGSEGV si_codes
*/
-#define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */
-#define SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */
-#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */
-#define SEGV_PKUERR (__SI_FAULT|4) /* failed protection key checks */
+#define SEGV_MAPERR 1 /* address not mapped to object */
+#define SEGV_ACCERR 2 /* invalid permissions for mapped object */
+#define SEGV_BNDERR 3 /* failed address bound checks */
+#define SEGV_PKUERR 4 /* failed protection key checks */
#define NSIGSEGV 4
/*
* SIGBUS si_codes
*/
-#define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */
-#define BUS_ADRERR (__SI_FAULT|2) /* non-existent physical address */
-#define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */
+#define BUS_ADRALN 1 /* invalid address alignment */
+#define BUS_ADRERR 2 /* non-existent physical address */
+#define BUS_OBJERR 3 /* object specific hardware error */
/* hardware memory error consumed on a machine check: action required */
-#define BUS_MCEERR_AR (__SI_FAULT|4)
+#define BUS_MCEERR_AR 4
/* hardware memory error detected in process but not consumed: action optional*/
-#define BUS_MCEERR_AO (__SI_FAULT|5)
+#define BUS_MCEERR_AO 5
#define NSIGBUS 5
/*
* SIGTRAP si_codes
*/
-#define TRAP_BRKPT (__SI_FAULT|1) /* process breakpoint */
-#define TRAP_TRACE (__SI_FAULT|2) /* process trace trap */
-#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */
-#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint/watchpoint */
+#define TRAP_BRKPT 1 /* process breakpoint */
+#define TRAP_TRACE 2 /* process trace trap */
+#define TRAP_BRANCH 3 /* process taken branch trap */
+#define TRAP_HWBKPT 4 /* hardware breakpoint/watchpoint */
#define NSIGTRAP 4
/*
* SIGCHLD si_codes
*/
-#define CLD_EXITED (__SI_CHLD|1) /* child has exited */
-#define CLD_KILLED (__SI_CHLD|2) /* child was killed */
-#define CLD_DUMPED (__SI_CHLD|3) /* child terminated abnormally */
-#define CLD_TRAPPED (__SI_CHLD|4) /* traced child has trapped */
-#define CLD_STOPPED (__SI_CHLD|5) /* child has stopped */
-#define CLD_CONTINUED (__SI_CHLD|6) /* stopped child has continued */
+#define CLD_EXITED 1 /* child has exited */
+#define CLD_KILLED 2 /* child was killed */
+#define CLD_DUMPED 3 /* child terminated abnormally */
+#define CLD_TRAPPED 4 /* traced child has trapped */
+#define CLD_STOPPED 5 /* child has stopped */
+#define CLD_CONTINUED 6 /* stopped child has continued */
#define NSIGCHLD 6
/*
- * SIGPOLL si_codes
+ * SIGPOLL si_codes (also used by any signal without signal-specific si_codes)
*/
-#define POLL_IN (__SI_POLL|1) /* data input available */
-#define POLL_OUT (__SI_POLL|2) /* output buffers available */
-#define POLL_MSG (__SI_POLL|3) /* input message available */
-#define POLL_ERR (__SI_POLL|4) /* i/o error */
-#define POLL_PRI (__SI_POLL|5) /* high priority input available */
-#define POLL_HUP (__SI_POLL|6) /* device disconnected */
+#define POLL_IN 1 /* data input available */
+#define POLL_OUT 2 /* output buffers available */
+#define POLL_MSG 3 /* input message available */
+#define POLL_ERR 4 /* i/o error */
+#define POLL_PRI 5 /* high priority input available */
+#define POLL_HUP 6 /* device disconnected */
#define NSIGPOLL 6
/*
* SIGSYS si_codes
*/
-#define SYS_SECCOMP (__SI_SYS|1) /* seccomp triggered */
-#define NSIGSYS 1
+#define SYS_SECCOMP 1 /* seccomp triggered */
+#define NSIGSYS 1
/*
* sigevent definitions
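
With the __SI_* tags gone, kernel and user space now share the same small si_code integers, and the question of which siginfo union members are valid moves into the kernel-only siginfo_layout() helper added in kernel/signal.c below. The user-visible side is unchanged in spirit: a handler still compares si_code against the plain constants, as in this sketch (the wild store exists only to provoke SEGV_MAPERR):

	#include <signal.h>
	#include <string.h>
	#include <unistd.h>

	static void segv_handler(int sig, siginfo_t *info, void *uctx)
	{
		(void)sig; (void)uctx;
		if (info->si_code == SEGV_MAPERR)	/* plain value 1 */
			write(STDOUT_FILENO, "SEGV_MAPERR\n", 12);
		_exit(0);
	}

	int main(void)
	{
		struct sigaction sa;

		memset(&sa, 0, sizeof(sa));
		sa.sa_sigaction = segv_handler;
		sa.sa_flags = SA_SIGINFO;
		sigaction(SIGSEGV, &sa, NULL);

		*(volatile int *)8 = 0;	/* unmapped address -> SEGV_MAPERR */
		return 1;
	}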
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 6fe14d001f68..230e05d35191 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -60,9 +60,13 @@ typedef struct __user_cap_data_struct {
#define VFS_CAP_U32_2 2
#define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
-#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2
-#define VFS_CAP_U32 VFS_CAP_U32_2
-#define VFS_CAP_REVISION VFS_CAP_REVISION_2
+#define VFS_CAP_REVISION_3 0x03000000
+#define VFS_CAP_U32_3 2
+#define XATTR_CAPS_SZ_3 (sizeof(__le32)*(2 + 2*VFS_CAP_U32_3))
+
+#define XATTR_CAPS_SZ XATTR_CAPS_SZ_3
+#define VFS_CAP_U32 VFS_CAP_U32_3
+#define VFS_CAP_REVISION VFS_CAP_REVISION_3
struct vfs_cap_data {
__le32 magic_etc; /* Little endian */
@@ -72,6 +76,18 @@ struct vfs_cap_data {
} data[VFS_CAP_U32];
};
+/*
+ * same as vfs_cap_data but with a rootid at the end
+ */
+struct vfs_ns_cap_data {
+ __le32 magic_etc;
+ struct {
+ __le32 permitted; /* Little endian */
+ __le32 inheritable; /* Little endian */
+ } data[VFS_CAP_U32];
+ __le32 rootid;
+};
+
#ifndef __KERNEL__
/*
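
A worked check of the new revision-3 size: a 4-byte magic_etc, two 8-byte permitted/inheritable pairs, and the trailing 4-byte rootid come to 24 bytes, exactly what XATTR_CAPS_SZ_3 expands to. The sketch below confirms the arithmetic in user space; the __le32 typedef is a stand-in for the kernel type, used here for layout only:

	#include <stdint.h>
	#include <stdio.h>

	typedef uint32_t __le32;	/* stand-in; layout only */

	#define VFS_CAP_U32_3	2
	#define XATTR_CAPS_SZ_3	(sizeof(__le32) * (2 + 2 * VFS_CAP_U32_3))

	struct vfs_ns_cap_data {
		__le32 magic_etc;
		struct {
			__le32 permitted;
			__le32 inheritable;
		} data[VFS_CAP_U32_3];
		__le32 rootid;
	};

	int main(void)
	{
		/* both lines print 24 */
		printf("%zu\n", XATTR_CAPS_SZ_3);
		printf("%zu\n", sizeof(struct vfs_ns_cap_data));
		return 0;
	}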
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 67230ecf2ce1..4657e2924ecb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2275,6 +2275,13 @@ retry:
mutex_unlock(&cpuset_mutex);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2349,8 +2356,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
+ if (cpus_updated || force_rebuild) {
+ force_rebuild = false;
rebuild_sched_domains();
+ }
}
void cpuset_update_active_cpus(void)
@@ -2363,6 +2372,11 @@ void cpuset_update_active_cpus(void)
schedule_work(&cpuset_hotplug_work);
}
+void cpuset_wait_for_hotplug(void)
+{
+ flush_work(&cpuset_hotplug_work);
+}
+
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
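
The shape being built here is a deferred-work handshake: resume marks the cpuset state dirty with cpuset_force_rebuild(), the hotplug work consumes the flag, and cpuset_wait_for_hotplug() lets a caller (thaw_processes(), patched below) block until that work has run. A reduced kernel-style sketch of the pattern, with illustrative names in place of the cpuset ones:

	#include <linux/workqueue.h>

	static bool force_rebuild;

	static void rebuild_workfn(struct work_struct *work);
	static DECLARE_WORK(rebuild_work, rebuild_workfn);

	static void rebuild_workfn(struct work_struct *work)
	{
		if (force_rebuild) {
			force_rebuild = false;
			/* ... expensive rebuild happens here ... */
		}
	}

	void request_rebuild(void)
	{
		force_rebuild = true;	/* ordered by the queue/flush pair */
		schedule_work(&rebuild_work);
	}

	void wait_for_rebuild(void)
	{
		flush_work(&rebuild_work);	/* returns after workfn finishes */
	}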
diff --git a/kernel/exit.c b/kernel/exit.c
index a35d8a17e01f..3481ababd06a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1615,7 +1615,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
- unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
@@ -1741,7 +1741,7 @@ COMPAT_SYSCALL_DEFINE5(waitid,
user_access_begin();
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
- unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
diff --git a/kernel/fork.c b/kernel/fork.c
index 6f1b0af00bda..10646182440f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1569,10 +1569,6 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
- retval = security_task_create(clone_flags);
- if (retval)
- goto fork_out;
-
retval = -ENOMEM;
p = dup_task_struct(current, node);
if (!p)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 73be2b3909bd..82afb7ed369f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -421,10 +421,8 @@ static void free_desc(unsigned int irq)
* The sysfs entry must be serialized against a concurrent
* irq_sysfs_init() as well.
*/
- mutex_lock(&sparse_irq_lock);
kobject_del(&desc->kobj);
delete_irq_desc(irq);
- mutex_unlock(&sparse_irq_lock);
/*
* We free the descriptor, masks and stat fields via RCU. That
@@ -462,20 +460,15 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
- mutex_lock(&sparse_irq_lock);
irq_insert_desc(start + i, desc);
irq_sysfs_add(start + i, desc);
- mutex_unlock(&sparse_irq_lock);
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
err:
for (i--; i >= 0; i--)
free_desc(start + i);
-
- mutex_lock(&sparse_irq_lock);
- bitmap_clear(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
return -ENOMEM;
}
@@ -575,6 +568,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc->owner = owner;
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
}
@@ -670,10 +664,10 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
+ mutex_lock(&sparse_irq_lock);
for (i = 0; i < cnt; i++)
free_desc(from + i);
- mutex_lock(&sparse_irq_lock);
bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
@@ -720,19 +714,15 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
from, cnt, 0);
ret = -EEXIST;
if (irq >=0 && start != irq)
- goto err;
+ goto unlock;
if (start + cnt > nr_irqs) {
ret = irq_expand_nr_irqs(start + cnt);
if (ret)
- goto err;
+ goto unlock;
}
-
- bitmap_set(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
- return alloc_descs(start, cnt, node, affinity, owner);
-
-err:
+ ret = alloc_descs(start, cnt, node, affinity, owner);
+unlock:
mutex_unlock(&sparse_irq_lock);
return ret;
}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 48eadf416c24..3fa4bd59f569 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -315,11 +315,12 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
ops->set_desc(arg, desc);
/* Assumes the domain mutex is held! */
- ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
+ ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
+ arg);
if (ret)
break;
- irq_set_msi_desc_off(virq, 0, desc);
+ irq_set_msi_desc_off(desc->irq, 0, desc);
}
if (ret) {
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 74a5a7255b4d..4918314893bc 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
int i;
int err;
+ err = -EINVAL;
+ if (!in_userns(parent_pid_ns->user_ns, user_ns))
+ goto out;
+
err = -ENOSPC;
if (level > MAX_PID_NS_LEVEL)
goto out;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78672d324a6e..50f25cb370c6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -20,8 +20,9 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/cpuset.h>
-/*
+/*
* Timeout for stopping processes
*/
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
@@ -202,6 +203,8 @@ void thaw_processes(void)
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
+ cpuset_wait_for_hotplug();
+
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 60f356d91060..84b1367935e4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -728,8 +728,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
if (unlikely(in_compat_syscall())) {
compat_siginfo_t __user *uinfo = compat_ptr(data);
- if (copy_siginfo_to_user32(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user32(uinfo, &info)) {
ret = -EFAULT;
break;
}
@@ -739,8 +738,7 @@ static int ptrace_peek_siginfo(struct task_struct *child,
{
siginfo_t __user *uinfo = (siginfo_t __user *) data;
- if (copy_siginfo_to_user(uinfo, &info) ||
- __put_user(info.si_code, &uinfo->si_code)) {
+ if (copy_siginfo_to_user(uinfo, &info)) {
ret = -EFAULT;
break;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6d2c7ff9ba98..136a76d80dbf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5556,16 +5556,15 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
return;
- }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
+ cpuset_force_rebuild();
}
cpuset_update_active_cpus();
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a5d83ed8dd82..0a85641e62ce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5424,7 +5424,7 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
return false;
/* if this cache has capacity, come here */
- if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+ if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running)
return true;
/*
@@ -7708,7 +7708,7 @@ next_group:
* number.
*
* Return: 1 when packing is required and a task should be moved to
- * this CPU. The amount of the imbalance is returned in *imbalance.
+ * this CPU. The amount of the imbalance is returned in env->imbalance.
*
* @env: The load balancing environment.
* @sds: Statistics of the sched_domain which is to be packed
diff --git a/kernel/signal.c b/kernel/signal.c
index ed804a470dcd..800a18f77732 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2686,6 +2686,51 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
}
#endif
+enum siginfo_layout siginfo_layout(int sig, int si_code)
+{
+ enum siginfo_layout layout = SIL_KILL;
+ if ((si_code > SI_USER) && (si_code < SI_KERNEL)) {
+ static const struct {
+ unsigned char limit, layout;
+ } filter[] = {
+ [SIGILL] = { NSIGILL, SIL_FAULT },
+ [SIGFPE] = { NSIGFPE, SIL_FAULT },
+ [SIGSEGV] = { NSIGSEGV, SIL_FAULT },
+ [SIGBUS] = { NSIGBUS, SIL_FAULT },
+ [SIGTRAP] = { NSIGTRAP, SIL_FAULT },
+#if defined(SIGEMT) && defined(NSIGEMT)
+ [SIGEMT] = { NSIGEMT, SIL_FAULT },
+#endif
+ [SIGCHLD] = { NSIGCHLD, SIL_CHLD },
+ [SIGPOLL] = { NSIGPOLL, SIL_POLL },
+#ifdef __ARCH_SIGSYS
+ [SIGSYS] = { NSIGSYS, SIL_SYS },
+#endif
+ };
+ if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit))
+ layout = filter[sig].layout;
+ else if (si_code <= NSIGPOLL)
+ layout = SIL_POLL;
+ } else {
+ if (si_code == SI_TIMER)
+ layout = SIL_TIMER;
+ else if (si_code == SI_SIGIO)
+ layout = SIL_POLL;
+ else if (si_code < 0)
+ layout = SIL_RT;
+ /* Tests to support buggy kernel ABIs */
+#ifdef TRAP_FIXME
+ if ((sig == SIGTRAP) && (si_code == TRAP_FIXME))
+ layout = SIL_FAULT;
+#endif
+#ifdef FPE_FIXME
+ if ((sig == SIGFPE) && (si_code == FPE_FIXME))
+ layout = SIL_FAULT;
+#endif
+ }
+ return layout;
+}
+
#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER
int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
@@ -2708,22 +2753,20 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
*/
err = __put_user(from->si_signo, &to->si_signo);
err |= __put_user(from->si_errno, &to->si_errno);
- err |= __put_user((short)from->si_code, &to->si_code);
- switch (from->si_code & __SI_MASK) {
- case __SI_KILL:
+ err |= __put_user(from->si_code, &to->si_code);
+ switch (siginfo_layout(from->si_signo, from->si_code)) {
+ case SIL_KILL:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
break;
- case __SI_TIMER:
- err |= __put_user(from->si_tid, &to->si_tid);
- err |= __put_user(from->si_overrun, &to->si_overrun);
- err |= __put_user(from->si_ptr, &to->si_ptr);
+ case SIL_TIMER:
+ /* Unreached: SI_TIMER is negative */
break;
- case __SI_POLL:
+ case SIL_POLL:
err |= __put_user(from->si_band, &to->si_band);
err |= __put_user(from->si_fd, &to->si_fd);
break;
- case __SI_FAULT:
+ case SIL_FAULT:
err |= __put_user(from->si_addr, &to->si_addr);
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(from->si_trapno, &to->si_trapno);
@@ -2748,30 +2791,25 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
err |= __put_user(from->si_pkey, &to->si_pkey);
#endif
break;
- case __SI_CHLD:
+ case SIL_CHLD:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_status, &to->si_status);
err |= __put_user(from->si_utime, &to->si_utime);
err |= __put_user(from->si_stime, &to->si_stime);
break;
- case __SI_RT: /* This is not generated by the kernel as of now. */
- case __SI_MESGQ: /* But this is */
+ case SIL_RT:
err |= __put_user(from->si_pid, &to->si_pid);
err |= __put_user(from->si_uid, &to->si_uid);
err |= __put_user(from->si_ptr, &to->si_ptr);
break;
#ifdef __ARCH_SIGSYS
- case __SI_SYS:
+ case SIL_SYS:
err |= __put_user(from->si_call_addr, &to->si_call_addr);
err |= __put_user(from->si_syscall, &to->si_syscall);
err |= __put_user(from->si_arch, &to->si_arch);
break;
#endif
- default: /* this is just in case for now ... */
- err |= __put_user(from->si_pid, &to->si_pid);
- err |= __put_user(from->si_uid, &to->si_uid);
- break;
}
return err;
}
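
A consumer of the new helper simply switches (or compares) on the returned layout instead of masking si_code. A tiny illustrative wrapper, not taken from the patch:

	/* hypothetical helper; the real callers are the
	 * copy_siginfo_to_user() variants shown above
	 */
	static bool siginfo_has_fault_addr(int sig, int si_code)
	{
		return siginfo_layout(sig, si_code) == SIL_FAULT;
	}

	/* siginfo_has_fault_addr(SIGSEGV, SEGV_MAPERR) -> true
	 * siginfo_has_fault_addr(SIGCHLD, CLD_EXITED)  -> false
	 */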
diff --git a/kernel/sys.c b/kernel/sys.c
index 2855ee73acd0..9aebc2935013 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1896,15 +1896,11 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
/*
* Finally, make sure the caller has the rights to
- * change /proc/pid/exe link: only local root should
+ * change /proc/pid/exe link: only local sys admin should
* be allowed to.
*/
if (prctl_map->exe_fd != (u32)-1) {
- struct user_namespace *ns = current_user_ns();
- const struct cred *cred = current_cred();
-
- if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
- !gid_eq(cred->gid, make_kgid(ns, 0)))
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
goto out;
}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 2f735cbe05e8..c490f1e4313b 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -986,17 +986,21 @@ bool userns_may_setgroups(const struct user_namespace *ns)
}
/*
- * Returns true if @ns is the same namespace as or a descendant of
- * @target_ns.
+ * Returns true if @child is the same namespace as, or a descendant of,
+ * @ancestor.
*/
+bool in_userns(const struct user_namespace *ancestor,
+ const struct user_namespace *child)
+{
+ const struct user_namespace *ns;
+ for (ns = child; ns->level > ancestor->level; ns = ns->parent)
+ ;
+ return (ns == ancestor);
+}
+
bool current_in_userns(const struct user_namespace *target_ns)
{
- struct user_namespace *ns;
- for (ns = current_user_ns(); ns; ns = ns->parent) {
- if (ns == target_ns)
- return true;
- }
- return false;
+ return in_userns(target_ns, current_user_ns());
}
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
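
The walk works because ns->level records depth (the initial namespace is level 0 and every child is one deeper), so only a descendant can step down to the ancestor's level, and pointer equality at that level settles membership in O(depth) with no NULL checks. A standalone model with the struct cut down to the two fields the function uses:

	#include <stdbool.h>
	#include <stdio.h>

	struct user_namespace {
		int level;			/* depth; root is 0 */
		const struct user_namespace *parent;
	};

	static bool in_userns(const struct user_namespace *ancestor,
			      const struct user_namespace *child)
	{
		const struct user_namespace *ns;

		for (ns = child; ns->level > ancestor->level; ns = ns->parent)
			;
		return ns == ancestor;
	}

	int main(void)
	{
		struct user_namespace root = { 0, NULL };
		struct user_namespace a = { 1, &root };
		struct user_namespace b = { 2, &a };
		struct user_namespace sib = { 1, &root };

		printf("%d %d %d\n",
		       in_userns(&root, &b),	/* 1 */
		       in_userns(&a, &b),	/* 1 */
		       in_userns(&sib, &b));	/* 0 */
		return 0;
	}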
diff --git a/lib/Kconfig b/lib/Kconfig
index 40b114a11d7c..a85e6f76add5 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -559,9 +559,6 @@ config ARCH_HAS_PMEM_API
config ARCH_HAS_UACCESS_FLUSHCACHE
bool
-config ARCH_HAS_MMIO_FLUSH
- bool
-
config STACKDEPOT
bool
select STACKTRACE
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 303c779bfe38..43ba91c440bc 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -58,7 +58,7 @@ static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
u8 code, u8 ident, u16 dlen, void *data);
static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
void *data);
-static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data);
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size);
static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err);
static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
@@ -1473,7 +1473,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
set_bit(CONF_REQ_SENT, &chan->conf_state);
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
@@ -2987,12 +2987,15 @@ static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen,
return len;
}
-static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val)
+static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val, size_t size)
{
struct l2cap_conf_opt *opt = *ptr;
BT_DBG("type 0x%2.2x len %u val 0x%lx", type, len, val);
+ if (size < L2CAP_CONF_OPT_SIZE + len)
+ return;
+
opt->type = type;
opt->len = len;
@@ -3017,7 +3020,7 @@ static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val)
*ptr += L2CAP_CONF_OPT_SIZE + len;
}
-static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan)
+static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan, size_t size)
{
struct l2cap_conf_efs efs;
@@ -3045,7 +3048,7 @@ static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan)
}
l2cap_add_conf_opt(ptr, L2CAP_CONF_EFS, sizeof(efs),
- (unsigned long) &efs);
+ (unsigned long) &efs, size);
}
static void l2cap_ack_timeout(struct work_struct *work)
@@ -3191,11 +3194,12 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan)
chan->ack_win = chan->tx_win;
}
-static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data)
+static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size)
{
struct l2cap_conf_req *req = data;
struct l2cap_conf_rfc rfc = { .mode = chan->mode };
void *ptr = req->data;
+ void *endptr = data + data_size;
u16 size;
BT_DBG("chan %p", chan);
@@ -3220,7 +3224,7 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data)
done:
if (chan->imtu != L2CAP_DEFAULT_MTU)
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr);
switch (chan->mode) {
case L2CAP_MODE_BASIC:
@@ -3239,7 +3243,7 @@ done:
rfc.max_pdu_size = 0;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
break;
case L2CAP_MODE_ERTM:
@@ -3259,21 +3263,21 @@ done:
L2CAP_DEFAULT_TX_WINDOW);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
if (test_bit(FLAG_EFS_ENABLE, &chan->flags))
- l2cap_add_opt_efs(&ptr, chan);
+ l2cap_add_opt_efs(&ptr, chan, endptr - ptr);
if (test_bit(FLAG_EXT_CTRL, &chan->flags))
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2,
- chan->tx_win);
+ chan->tx_win, endptr - ptr);
if (chan->conn->feat_mask & L2CAP_FEAT_FCS)
if (chan->fcs == L2CAP_FCS_NONE ||
test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) {
chan->fcs = L2CAP_FCS_NONE;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1,
- chan->fcs);
+ chan->fcs, endptr - ptr);
}
break;
@@ -3291,17 +3295,17 @@ done:
rfc.max_pdu_size = cpu_to_le16(size);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
if (test_bit(FLAG_EFS_ENABLE, &chan->flags))
- l2cap_add_opt_efs(&ptr, chan);
+ l2cap_add_opt_efs(&ptr, chan, endptr - ptr);
if (chan->conn->feat_mask & L2CAP_FEAT_FCS)
if (chan->fcs == L2CAP_FCS_NONE ||
test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) {
chan->fcs = L2CAP_FCS_NONE;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1,
- chan->fcs);
+ chan->fcs, endptr - ptr);
}
break;
}
@@ -3312,10 +3316,11 @@ done:
return ptr - data;
}
-static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data)
+static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data_size)
{
struct l2cap_conf_rsp *rsp = data;
void *ptr = rsp->data;
+ void *endptr = data + data_size;
void *req = chan->conf_req;
int len = chan->conf_len;
int type, hint, olen;
@@ -3417,7 +3422,7 @@ done:
return -ECONNREFUSED;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
}
if (result == L2CAP_CONF_SUCCESS) {
@@ -3430,7 +3435,7 @@ done:
chan->omtu = mtu;
set_bit(CONF_MTU_DONE, &chan->conf_state);
}
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu, endptr - ptr);
if (remote_efs) {
if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
@@ -3444,7 +3449,7 @@ done:
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS,
sizeof(efs),
- (unsigned long) &efs);
+ (unsigned long) &efs, endptr - ptr);
} else {
/* Send PENDING Conf Rsp */
result = L2CAP_CONF_PENDING;
@@ -3477,7 +3482,7 @@ done:
set_bit(CONF_MODE_DONE, &chan->conf_state);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
- sizeof(rfc), (unsigned long) &rfc);
+ sizeof(rfc), (unsigned long) &rfc, endptr - ptr);
if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
chan->remote_id = efs.id;
@@ -3491,7 +3496,7 @@ done:
le32_to_cpu(efs.sdu_itime);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS,
sizeof(efs),
- (unsigned long) &efs);
+ (unsigned long) &efs, endptr - ptr);
}
break;
@@ -3505,7 +3510,7 @@ done:
set_bit(CONF_MODE_DONE, &chan->conf_state);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
- (unsigned long) &rfc);
+ (unsigned long) &rfc, endptr - ptr);
break;
@@ -3527,10 +3532,11 @@ done:
}
static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
- void *data, u16 *result)
+ void *data, size_t size, u16 *result)
{
struct l2cap_conf_req *req = data;
void *ptr = req->data;
+ void *endptr = data + size;
int type, olen;
unsigned long val;
struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
@@ -3548,13 +3554,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
chan->imtu = L2CAP_DEFAULT_MIN_MTU;
} else
chan->imtu = val;
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr);
break;
case L2CAP_CONF_FLUSH_TO:
chan->flush_to = val;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO,
- 2, chan->flush_to);
+ 2, chan->flush_to, endptr - ptr);
break;
case L2CAP_CONF_RFC:
@@ -3568,13 +3574,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
chan->fcs = 0;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
- sizeof(rfc), (unsigned long) &rfc);
+ sizeof(rfc), (unsigned long) &rfc, endptr - ptr);
break;
case L2CAP_CONF_EWS:
chan->ack_win = min_t(u16, val, chan->ack_win);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2,
- chan->tx_win);
+ chan->tx_win, endptr - ptr);
break;
case L2CAP_CONF_EFS:
@@ -3587,7 +3593,7 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
return -ECONNREFUSED;
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs),
- (unsigned long) &efs);
+ (unsigned long) &efs, endptr - ptr);
break;
case L2CAP_CONF_FCS:
@@ -3692,7 +3698,7 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan)
return;
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
@@ -3900,7 +3906,7 @@ sendresp:
u8 buf[128];
set_bit(CONF_REQ_SENT, &chan->conf_state);
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
@@ -3978,7 +3984,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
break;
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, req), req);
+ l2cap_build_conf_req(chan, req, sizeof(req)), req);
chan->num_conf_req++;
break;
@@ -4090,7 +4096,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
}
/* Complete config. */
- len = l2cap_parse_conf_req(chan, rsp);
+ len = l2cap_parse_conf_req(chan, rsp, sizeof(rsp));
if (len < 0) {
l2cap_send_disconn_req(chan, ECONNRESET);
goto unlock;
@@ -4124,7 +4130,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) {
u8 buf[64];
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
@@ -4184,7 +4190,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
char buf[64];
len = l2cap_parse_conf_rsp(chan, rsp->data, len,
- buf, &result);
+ buf, sizeof(buf), &result);
if (len < 0) {
l2cap_send_disconn_req(chan, ECONNRESET);
goto done;
@@ -4214,7 +4220,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
/* throw out any old stored conf requests */
result = L2CAP_CONF_SUCCESS;
len = l2cap_parse_conf_rsp(chan, rsp->data, len,
- req, &result);
+ req, sizeof(req), &result);
if (len < 0) {
l2cap_send_disconn_req(chan, ECONNRESET);
goto done;
@@ -4791,7 +4797,7 @@ static void l2cap_do_create(struct l2cap_chan *chan, int result,
set_bit(CONF_REQ_SENT, &chan->conf_state);
l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn),
L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf), buf);
+ l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
chan->num_conf_req++;
}
}
@@ -7465,7 +7471,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
set_bit(CONF_REQ_SENT, &chan->conf_state);
l2cap_send_cmd(conn, l2cap_get_ident(conn),
L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf),
+ l2cap_build_conf_req(chan, buf, sizeof(buf)),
buf);
chan->num_conf_req++;
}
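
Every call site now threads the remaining room (endptr - ptr) down to l2cap_add_conf_opt(), which drops any option that will not fit instead of writing past the response buffer; this is the class of remote stack overflow behind the 2017 Bluetooth "BlueBorne" advisories. The guarded-append shape, reduced to a standalone sketch with illustrative names:

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	struct conf_opt {
		uint8_t type;
		uint8_t len;
		uint8_t val[];
	};

	#define CONF_OPT_SIZE	2	/* type + len header */

	static void add_conf_opt(void **ptr, uint8_t type, uint8_t len,
				 const void *val, size_t space)
	{
		struct conf_opt *opt = *ptr;

		if (space < CONF_OPT_SIZE + (size_t)len)
			return;		/* would not fit: drop the option */

		opt->type = type;
		opt->len = len;
		memcpy(opt->val, val, len);
		*ptr = (uint8_t *)*ptr + CONF_OPT_SIZE + len;
	}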
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 875675765531..63edc6e5f026 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -676,7 +676,8 @@ bad:
/*
* Do a synchronous statfs().
*/
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool,
+ struct ceph_statfs *buf)
{
struct ceph_mon_generic_request *req;
struct ceph_mon_statfs *h;
@@ -696,6 +697,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
goto out;
req->u.st = buf;
+ req->request->hdr.version = cpu_to_le16(2);
mutex_lock(&monc->mutex);
register_generic_request(req);
@@ -705,6 +707,8 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
h->monhdr.session_mon = cpu_to_le16(-1);
h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid;
+ h->contains_data_pool = (data_pool != CEPH_NOPOOL);
+ h->data_pool = cpu_to_le64(data_pool);
send_generic_request(monc, req);
mutex_unlock(&monc->mutex);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index dcfbdd74dfd1..e02f01f534e2 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -863,8 +863,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
dst->cls.method_len = src->cls.method_len;
dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
break;
- case CEPH_OSD_OP_STARTSYNC:
- break;
case CEPH_OSD_OP_WATCH:
dst->watch.cookie = cpu_to_le64(src->watch.cookie);
dst->watch.ver = cpu_to_le64(0);
@@ -916,9 +914,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
* if the file was recently truncated, we include information about its
* old and new size so that the object can be updated appropriately. (we
* avoid synchronously deleting truncated objects because it's slow.)
- *
- * if @do_sync, include a 'startsync' command so that the osd will flush
- * data quickly.
*/
struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
struct ceph_file_layout *layout,
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index ac701c28f44f..c2c68a15b59d 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -171,10 +171,10 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
/*
* Add the temporary list to the backchannel preallocation list
*/
- spin_lock_bh(&xprt->bc_pa_lock);
+ spin_lock(&xprt->bc_pa_lock);
list_splice(&tmp_list, &xprt->bc_pa_list);
xprt_inc_alloc_count(xprt, min_reqs);
- spin_unlock_bh(&xprt->bc_pa_lock);
+ spin_unlock(&xprt->bc_pa_lock);
dprintk("RPC: setup backchannel transport done\n");
return 0;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 2e49d1f892b7..2ad827db2704 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1903,6 +1903,14 @@ call_connect_status(struct rpc_task *task)
task->tk_status = 0;
switch (status) {
case -ECONNREFUSED:
+ /* A positive refusal suggests a rebind is needed. */
+ if (RPC_IS_SOFTCONN(task))
+ break;
+ if (clnt->cl_autobind) {
+ rpc_force_rebind(clnt);
+ task->tk_action = call_bind;
+ return;
+ }
case -ECONNRESET:
case -ECONNABORTED:
case -ENETUNREACH:
@@ -2139,10 +2147,6 @@ call_status(struct rpc_task *task)
rpc_delay(task, 3*HZ);
case -ETIMEDOUT:
task->tk_action = call_timeout;
- if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT)
- && task->tk_client->cl_discrtry)
- xprt_conditional_disconnect(req->rq_xprt,
- req->rq_connect_cookie);
break;
case -ECONNREFUSED:
case -ECONNRESET:
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 399fab5d1936..ff8e06cd067e 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1013,7 +1013,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
if (!bc_xprt)
return -EAGAIN;
- spin_lock_bh(&bc_xprt->transport_lock);
+ spin_lock(&bc_xprt->recv_lock);
req = xprt_lookup_rqst(bc_xprt, xid);
if (!req)
goto unlock_notfound;
@@ -1031,7 +1031,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp)
memcpy(dst->iov_base, src->iov_base, src->iov_len);
xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len);
rqstp->rq_arg.len = 0;
- spin_unlock_bh(&bc_xprt->transport_lock);
+ spin_unlock(&bc_xprt->recv_lock);
return 0;
unlock_notfound:
printk(KERN_NOTICE
@@ -1040,7 +1040,7 @@ unlock_notfound:
__func__, ntohl(calldir),
bc_xprt, ntohl(xid));
unlock_eagain:
- spin_unlock_bh(&bc_xprt->transport_lock);
+ spin_unlock(&bc_xprt->recv_lock);
return -EAGAIN;
}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 4654a9934269..e741ec2b4d8e 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -844,6 +844,50 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid)
}
EXPORT_SYMBOL_GPL(xprt_lookup_rqst);
+/**
+ * xprt_pin_rqst - Pin a request on the transport receive list
+ * @req: Request to pin
+ *
+ * Caller must ensure this is atomic with the call to xprt_lookup_rqst()
+ * so should be holding the xprt->recv_lock.
+ */
+void xprt_pin_rqst(struct rpc_rqst *req)
+{
+ set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate);
+}
+EXPORT_SYMBOL_GPL(xprt_pin_rqst);
+
+/**
+ * xprt_unpin_rqst - Unpin a request on the transport receive list
+ * @req: Request to unpin
+ *
+ * Caller should be holding the xprt->recv_lock.
+ */
+void xprt_unpin_rqst(struct rpc_rqst *req)
+{
+ struct rpc_task *task = req->rq_task;
+
+ clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate);
+ if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate))
+ wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV);
+}
+EXPORT_SYMBOL_GPL(xprt_unpin_rqst);
+
+static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req)
+__must_hold(&req->rq_xprt->recv_lock)
+{
+ struct rpc_task *task = req->rq_task;
+
+ if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) {
+ spin_unlock(&req->rq_xprt->recv_lock);
+ set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
+ wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV,
+ TASK_UNINTERRUPTIBLE);
+ clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate);
+ spin_lock(&req->rq_xprt->recv_lock);
+ }
+}
+
static void xprt_update_rtt(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
@@ -966,13 +1010,13 @@ void xprt_transmit(struct rpc_task *task)
/*
* Add to the list only if we're expecting a reply
*/
- spin_lock_bh(&xprt->transport_lock);
/* Update the softirq receive buffer */
memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
sizeof(req->rq_private_buf));
/* Add request to the receive list */
+ spin_lock(&xprt->recv_lock);
list_add_tail(&req->rq_list, &xprt->recv);
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
xprt_reset_majortimeo(req);
/* Turn off autodisconnect */
del_singleshot_timer_sync(&xprt->timer);
@@ -1287,12 +1331,16 @@ void xprt_release(struct rpc_task *task)
task->tk_ops->rpc_count_stats(task, task->tk_calldata);
else if (task->tk_client)
rpc_count_iostats(task, task->tk_client->cl_metrics);
+ spin_lock(&xprt->recv_lock);
+ if (!list_empty(&req->rq_list)) {
+ list_del(&req->rq_list);
+ xprt_wait_on_pinned_rqst(req);
+ }
+ spin_unlock(&xprt->recv_lock);
spin_lock_bh(&xprt->transport_lock);
xprt->ops->release_xprt(xprt, task);
if (xprt->ops->release_request)
xprt->ops->release_request(task);
- if (!list_empty(&req->rq_list))
- list_del(&req->rq_list);
xprt->last_used = jiffies;
xprt_schedule_autodisconnect(xprt);
spin_unlock_bh(&xprt->transport_lock);
@@ -1318,6 +1366,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
spin_lock_init(&xprt->transport_lock);
spin_lock_init(&xprt->reserve_lock);
+ spin_lock_init(&xprt->recv_lock);
INIT_LIST_HEAD(&xprt->free);
INIT_LIST_HEAD(&xprt->recv);
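
The pin API exists so a transport can drop recv_lock while copying a possibly large reply into the request without racing xprt_release(): look up and pin under the lock, copy unlocked, then complete and unpin under the lock again. A sketch of the expected receive-path shape; copy_skb_into_rqst() is an illustrative stand-in for the transport's actual copy routine:

	static void transport_data_ready(struct rpc_xprt *xprt, __be32 xid,
					 struct sk_buff *skb)
	{
		struct rpc_rqst *rovr;
		int copied;

		spin_lock(&xprt->recv_lock);
		rovr = xprt_lookup_rqst(xprt, xid);
		if (!rovr)
			goto out_unlock;
		xprt_pin_rqst(rovr);	/* keeps rovr valid across unlock */
		spin_unlock(&xprt->recv_lock);

		copied = copy_skb_into_rqst(rovr, skb);	/* may take a while */

		spin_lock(&xprt->recv_lock);
		if (copied >= 0)
			xprt_complete_rqst(rovr->rq_task, copied);
		xprt_unpin_rqst(rovr);	/* wakes xprt_release() if it waited */
	out_unlock:
		spin_unlock(&xprt->recv_lock);
	}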
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 03f6b5840764..d31d0ac5ada9 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -49,6 +49,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
if (IS_ERR(rb))
goto out_fail;
req->rl_rdmabuf = rb;
+ xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
size = r_xprt->rx_data.inline_rsize;
rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
@@ -202,20 +203,24 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
*/
int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
{
- struct rpc_xprt *xprt = rqst->rq_xprt;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
- struct rpcrdma_msg *headerp;
-
- headerp = rdmab_to_msg(req->rl_rdmabuf);
- headerp->rm_xid = rqst->rq_xid;
- headerp->rm_vers = rpcrdma_version;
- headerp->rm_credit =
- cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests);
- headerp->rm_type = rdma_msg;
- headerp->rm_body.rm_chunks[0] = xdr_zero;
- headerp->rm_body.rm_chunks[1] = xdr_zero;
- headerp->rm_body.rm_chunks[2] = xdr_zero;
+ __be32 *p;
+
+ rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
+ xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf,
+ req->rl_rdmabuf->rg_base);
+
+ p = xdr_reserve_space(&req->rl_stream, 28);
+ if (unlikely(!p))
+ return -EIO;
+ *p++ = rqst->rq_xid;
+ *p++ = rpcrdma_version;
+ *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests);
+ *p++ = rdma_msg;
+ *p++ = xdr_zero;
+ *p++ = xdr_zero;
+ *p = xdr_zero;
if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN,
&rqst->rq_snd_buf, rpcrdma_noch))
@@ -271,9 +276,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
* @xprt: transport receiving the call
* @rep: receive buffer containing the call
*
- * Called in the RPC reply handler, which runs in a tasklet.
- * Be quick about it.
- *
* Operational assumptions:
* o Backchannel credits are ignored, just as the NFS server
* forechannel currently does
@@ -284,7 +286,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_rep *rep)
{
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct rpcrdma_msg *headerp;
struct svc_serv *bc_serv;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
@@ -292,24 +293,15 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
size_t size;
__be32 *p;
- headerp = rdmab_to_msg(rep->rr_rdmabuf);
+ p = xdr_inline_decode(&rep->rr_stream, 0);
+ size = xdr_stream_remaining(&rep->rr_stream);
+
#ifdef RPCRDMA_BACKCHANNEL_DEBUG
pr_info("RPC: %s: callback XID %08x, length=%u\n",
- __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len);
- pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp);
+ __func__, be32_to_cpup(p), size);
+ pr_info("RPC: %s: %*ph\n", __func__, size, p);
#endif
- /* Sanity check:
- * Need at least enough bytes for RPC/RDMA header, as code
- * here references the header fields by array offset. Also,
- * backward calls are always inline, so ensure there
- * are some bytes beyond the RPC/RDMA header.
- */
- if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24)
- goto out_short;
- p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN);
- size = rep->rr_len - RPCRDMA_HDRLEN_MIN;
-
/* Grab a free bc rqst */
spin_lock(&xprt->bc_pa_lock);
if (list_empty(&xprt->bc_pa_list)) {
@@ -325,7 +317,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
/* Prepare rqst */
rqst->rq_reply_bytes_recvd = 0;
rqst->rq_bytes_sent = 0;
- rqst->rq_xid = headerp->rm_xid;
+ rqst->rq_xid = *p;
rqst->rq_private_buf.len = size;
set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
@@ -337,9 +329,9 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
buf->len = size;
/* The receive buffer has to be hooked to the rpcrdma_req
- * so that it can be reposted after the server is done
- * parsing it but just before sending the backward
- * direction reply.
+ * so that it is not released while the req is pointing
+ * to its buffer, and so that it can be reposted after
+ * the Upper Layer is done decoding it.
*/
req = rpcr_to_rdmar(rqst);
dprintk("RPC: %s: attaching rep %p to req %p\n",
@@ -367,13 +359,4 @@ out_overflow:
* when the connection is re-established.
*/
return;
-
-out_short:
- pr_warn("RPC/RDMA short backward direction call\n");
-
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
- xprt_disconnect_done(xprt);
- else
- pr_warn("RPC: %s: reposting rep %p\n",
- __func__, rep);
}
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index d3f84bb1d443..6c7151341194 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -177,7 +177,7 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
-static int
+static struct rpcrdma_mr_seg *
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, struct rpcrdma_mw **out)
{
@@ -188,7 +188,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
- return -ENOBUFS;
+ return ERR_PTR(-ENOBUFS);
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
@@ -232,13 +232,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
mw->mw_offset = dma_pages[0] + pageoff;
*out = mw;
- return mw->mw_nents;
+ return seg;
out_dmamap_err:
pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
mw->mw_sg, i);
rpcrdma_put_mw(r_xprt, mw);
- return -EIO;
+ return ERR_PTR(-EIO);
out_maperr:
pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
@@ -247,7 +247,7 @@ out_maperr:
ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
mw->mw_sg, mw->mw_nents, mw->mw_dir);
rpcrdma_put_mw(r_xprt, mw);
- return -EIO;
+ return ERR_PTR(-EIO);
}
/* Invalidate all memory regions that were registered for "req".
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 6aea36a38bfd..5a936a6a31a3 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -344,7 +344,7 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
/* Post a REG_MR Work Request to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
-static int
+static struct rpcrdma_mr_seg *
frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, struct rpcrdma_mw **out)
{
@@ -364,7 +364,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
rpcrdma_defer_mr_recovery(mw);
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
- return -ENOBUFS;
+ return ERR_PTR(-ENOBUFS);
} while (mw->frmr.fr_state != FRMR_IS_INVALID);
frmr = &mw->frmr;
frmr->fr_state = FRMR_IS_VALID;
@@ -429,25 +429,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
mw->mw_offset = mr->iova;
*out = mw;
- return mw->mw_nents;
+ return seg;
out_dmamap_err:
pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
mw->mw_sg, i);
frmr->fr_state = FRMR_IS_INVALID;
rpcrdma_put_mw(r_xprt, mw);
- return -EIO;
+ return ERR_PTR(-EIO);
out_mapmr_err:
pr_err("rpcrdma: failed to map mr %p (%d/%d)\n",
frmr->fr_mr, n, mw->mw_nents);
rpcrdma_defer_mr_recovery(mw);
- return -EIO;
+ return ERR_PTR(-EIO);
out_senderr:
pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc);
rpcrdma_defer_mr_recovery(mw);
- return -ENOTCONN;
+ return ERR_PTR(-ENOTCONN);
}
/* Invalidate all memory regions that were registered for "req".
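
Switching ->ro_map from an int count to a segment-cursor return means callers advance through the segment array with the value they get back, and errors ride in the pointer via ERR_PTR(). A standalone model of that convention, with ERR_PTR/IS_ERR/PTR_ERR restated in their usual shape:

	#include <errno.h>
	#include <stdio.h>

	#define MAX_ERRNO	4095

	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	struct seg { int len; };

	/* maps one segment; returns the advanced cursor or an error */
	static struct seg *map_one(struct seg *cursor, int ok)
	{
		if (!ok)
			return ERR_PTR(-EIO);
		return cursor + 1;
	}

	int main(void)
	{
		struct seg segs[4] = { {1}, {2}, {3}, {4} };
		struct seg *cur = segs;

		cur = map_one(cur, 1);
		if (!IS_ERR(cur))
			printf("advanced to seg[%td]\n", cur - segs);	/* 1 */

		cur = map_one(cur, 0);
		if (IS_ERR(cur))
			printf("errno %ld\n", PTR_ERR(cur));	/* -5, EIO */
		return 0;
	}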
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index ca4d6e4528f3..f1889f4d4803 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -169,40 +169,41 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
}
-/* Split "vec" on page boundaries into segments. FMR registers pages,
- * not a byte range. Other modes coalesce these segments into a single
- * MR when they can.
+/* Split @vec on page boundaries into SGEs. FMR registers pages, not
+ * a byte range. Other modes coalesce these SGEs into a single MR
+ * when they can.
+ *
+ * Returns pointer to next available SGE, and bumps the total number
+ * of SGEs consumed.
*/
-static int
-rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
+static struct rpcrdma_mr_seg *
+rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
+ unsigned int *n)
{
- size_t page_offset;
- u32 remaining;
+ u32 remaining, page_offset;
char *base;
base = vec->iov_base;
page_offset = offset_in_page(base);
remaining = vec->iov_len;
- while (remaining && n < RPCRDMA_MAX_SEGS) {
- seg[n].mr_page = NULL;
- seg[n].mr_offset = base;
- seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
- remaining -= seg[n].mr_len;
- base += seg[n].mr_len;
- ++n;
+ while (remaining) {
+ seg->mr_page = NULL;
+ seg->mr_offset = base;
+ seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
+ remaining -= seg->mr_len;
+ base += seg->mr_len;
+ ++seg;
+ ++(*n);
page_offset = 0;
}
- return n;
+ return seg;
}
-/*
- * Chunk assembly from upper layer xdr_buf.
- *
- * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk
- * elements. Segments are then coalesced when registered, if possible
- * within the selected memreg mode.
+/* Convert @xdrbuf into SGEs no larger than a page each. As they
+ * are registered, these SGEs are then coalesced into RDMA segments
+ * when the selected memreg mode supports it.
*
- * Returns positive number of segments converted, or a negative errno.
+ * Returns positive number of SGEs consumed, or a negative errno.
*/
static int
@@ -210,47 +211,41 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
unsigned int pos, enum rpcrdma_chunktype type,
struct rpcrdma_mr_seg *seg)
{
- int len, n, p, page_base;
+ unsigned long page_base;
+ unsigned int len, n;
struct page **ppages;
n = 0;
- if (pos == 0) {
- n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n);
- if (n == RPCRDMA_MAX_SEGS)
- goto out_overflow;
- }
+ if (pos == 0)
+ seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);
len = xdrbuf->page_len;
ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
page_base = offset_in_page(xdrbuf->page_base);
- p = 0;
- while (len && n < RPCRDMA_MAX_SEGS) {
- if (!ppages[p]) {
- /* alloc the pagelist for receiving buffer */
- ppages[p] = alloc_page(GFP_ATOMIC);
- if (!ppages[p])
+ while (len) {
+ if (unlikely(!*ppages)) {
+ /* XXX: Certain upper layer operations do
+ * not provide receive buffer pages.
+ */
+ *ppages = alloc_page(GFP_ATOMIC);
+ if (!*ppages)
return -EAGAIN;
}
- seg[n].mr_page = ppages[p];
- seg[n].mr_offset = (void *)(unsigned long) page_base;
- seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len);
- if (seg[n].mr_len > PAGE_SIZE)
- goto out_overflow;
- len -= seg[n].mr_len;
+ seg->mr_page = *ppages;
+ seg->mr_offset = (char *)page_base;
+ seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
+ len -= seg->mr_len;
+ ++ppages;
+ ++seg;
++n;
- ++p;
- page_base = 0; /* page offset only applies to first page */
+ page_base = 0;
}
- /* Message overflows the seg array */
- if (len && n == RPCRDMA_MAX_SEGS)
- goto out_overflow;
-
/* When encoding a Read chunk, the tail iovec contains an
* XDR pad and may be omitted.
*/
if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
- return n;
+ goto out;
/* When encoding a Write chunk, some servers need to see an
* extra segment for non-XDR-aligned Write chunks. The upper
@@ -258,30 +253,81 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
* for this purpose.
*/
if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
- return n;
+ goto out;
- if (xdrbuf->tail[0].iov_len) {
- n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n);
- if (n == RPCRDMA_MAX_SEGS)
- goto out_overflow;
- }
+ if (xdrbuf->tail[0].iov_len)
+ seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);
+out:
+ if (unlikely(n > RPCRDMA_MAX_SEGS))
+ return -EIO;
return n;
+}
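+
+/* For illustration (hypothetical sizes, PAGE_SIZE == 4096, neither
+ * implicit-roundup shortcut applies): an xdr_buf with a 120-byte
+ * head kvec, 8192 bytes of page data at page offset 0, and a 4-byte
+ * tail kvec converts into 1 + 2 + 1 = 4 SGEs, so 4 is returned.
+ */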
-out_overflow:
- pr_err("rpcrdma: segment array overflow\n");
- return -EIO;
+static inline int
+encode_item_present(struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EMSGSIZE;
+
+ *p = xdr_one;
+ return 0;
}
-static inline __be32 *
+static inline int
+encode_item_not_present(struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EMSGSIZE;
+
+ *p = xdr_zero;
+ return 0;
+}
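+
+/* These helpers emit the one-word discriminators that delimit
+ * RPC-over-RDMA chunk lists: xdr_one announces that an item follows,
+ * xdr_zero terminates the list. An empty Read list, for instance, is
+ * encoded as the single word 0.
+ */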
+
+static void
xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
{
*iptr++ = cpu_to_be32(mw->mw_handle);
*iptr++ = cpu_to_be32(mw->mw_length);
- return xdr_encode_hyper(iptr, mw->mw_offset);
+ xdr_encode_hyper(iptr, mw->mw_offset);
}
-/* XDR-encode the Read list. Supports encoding a list of read
+static int
+encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4 * sizeof(*p));
+ if (unlikely(!p))
+ return -EMSGSIZE;
+
+ xdr_encode_rdma_segment(p, mw);
+ return 0;
+}
+
+static int
+encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw,
+ u32 position)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 6 * sizeof(*p));
+ if (unlikely(!p))
+ return -EMSGSIZE;
+
+ *p++ = xdr_one; /* Item present */
+ *p++ = cpu_to_be32(position);
+ xdr_encode_rdma_segment(p, mw);
+ return 0;
+}
+
+/* Register and XDR encode the Read list. Supports encoding a list of read
* segments that belong to a single read chunk.
*
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
@@ -290,23 +336,20 @@ xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw)
* N elements, position P (same P for all chunks of same arg!):
* 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
*
- * Returns a pointer to the XDR word in the RDMA header following
- * the end of the Read list, or an error pointer.
+ * Returns zero on success, or a negative errno if a failure occurred.
+ * @xdr is advanced to the next position in the stream.
+ *
+ * Only a single @pos value is currently supported.
*/
-static __be32 *
-rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req, struct rpc_rqst *rqst,
- __be32 *iptr, enum rpcrdma_chunktype rtype)
+static noinline int
+rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype)
{
+ struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
unsigned int pos;
- int n, nsegs;
-
- if (rtype == rpcrdma_noch) {
- *iptr++ = xdr_zero; /* item not present */
- return iptr;
- }
+ int nsegs;
pos = rqst->rq_snd_buf.head[0].iov_len;
if (rtype == rpcrdma_areadch)
@@ -315,40 +358,33 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
rtype, seg);
if (nsegs < 0)
- return ERR_PTR(nsegs);
+ return nsegs;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
- false, &mw);
- if (n < 0)
- return ERR_PTR(n);
+ seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ false, &mw);
+ if (IS_ERR(seg))
+ return PTR_ERR(seg);
rpcrdma_push_mw(mw, &req->rl_registered);
- *iptr++ = xdr_one; /* item present */
-
- /* All read segments in this chunk
- * have the same "position".
- */
- *iptr++ = cpu_to_be32(pos);
- iptr = xdr_encode_rdma_segment(iptr, mw);
+ if (encode_read_segment(xdr, mw, pos) < 0)
+ return -EMSGSIZE;
dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__, pos,
mw->mw_length, (unsigned long long)mw->mw_offset,
- mw->mw_handle, n < nsegs ? "more" : "last");
+ mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
r_xprt->rx_stats.read_chunk_count++;
- seg += n;
- nsegs -= n;
+ nsegs -= mw->mw_nents;
} while (nsegs);
- /* Finish Read list */
- *iptr++ = xdr_zero; /* Next item not present */
- return iptr;
+ return 0;
}
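+
+/* Wire-format sketch (handles, lengths, and offsets illustrative):
+ * a Read list carrying one chunk of two segments at position 36 is
+ * encoded as
+ *
+ *	1 - P(36) - HLOO - 1 - P(36) - HLOO
+ *
+ * one XDR word per discriminator and position, four words per HLOO.
+ * The terminating xdr_zero is emitted by the caller via
+ * encode_item_not_present().
+ */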
-/* XDR-encode the Write list. Supports encoding a list containing
- * one array of plain segments that belong to a single write chunk.
+/* Register and XDR encode the Write list. Supports encoding a list
+ * containing one array of plain segments that belong to a single
+ * write chunk.
*
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
*
@@ -356,66 +392,65 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO - 0
*
- * Returns a pointer to the XDR word in the RDMA header following
- * the end of the Write list, or an error pointer.
+ * Returns zero on success, or a negative errno if a failure occurred.
+ * @xdr is advanced to the next position in the stream.
+ *
+ * Only a single Write chunk is currently supported.
*/
-static __be32 *
+static noinline int
rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
- struct rpc_rqst *rqst, __be32 *iptr,
- enum rpcrdma_chunktype wtype)
+ struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
{
+ struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- int n, nsegs, nchunks;
+ int nsegs, nchunks;
__be32 *segcount;
- if (wtype != rpcrdma_writech) {
- *iptr++ = xdr_zero; /* no Write list present */
- return iptr;
- }
-
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
rqst->rq_rcv_buf.head[0].iov_len,
wtype, seg);
if (nsegs < 0)
- return ERR_PTR(nsegs);
+ return nsegs;
- *iptr++ = xdr_one; /* Write list present */
- segcount = iptr++; /* save location of segment count */
+ if (encode_item_present(xdr) < 0)
+ return -EMSGSIZE;
+ segcount = xdr_reserve_space(xdr, sizeof(*segcount));
+ if (unlikely(!segcount))
+ return -EMSGSIZE;
+ /* Actual value encoded below */
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
- true, &mw);
- if (n < 0)
- return ERR_PTR(n);
+ seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (IS_ERR(seg))
+ return PTR_ERR(seg);
rpcrdma_push_mw(mw, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, mw);
+ if (encode_rdma_segment(xdr, mw) < 0)
+ return -EMSGSIZE;
dprintk("RPC: %5u %s: %u@0x016%llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
mw->mw_length, (unsigned long long)mw->mw_offset,
- mw->mw_handle, n < nsegs ? "more" : "last");
+ mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
r_xprt->rx_stats.write_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
nchunks++;
- seg += n;
- nsegs -= n;
+ nsegs -= mw->mw_nents;
} while (nsegs);
/* Update count of segments in this Write chunk */
*segcount = cpu_to_be32(nchunks);
- /* Finish Write list */
- *iptr++ = xdr_zero; /* Next item not present */
- return iptr;
+ return 0;
}
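+
+/* Wire-format sketch (illustrative): a Write list containing one
+ * chunk of three segments is encoded as
+ *
+ *	1 - 3 - HLOO - HLOO - HLOO
+ *
+ * The segment count word is reserved before the loop and
+ * back-patched once the number of coalesced MRs is known; the
+ * caller again supplies the terminating xdr_zero.
+ */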
-/* XDR-encode the Reply chunk. Supports encoding an array of plain
- * segments that belong to a single write (reply) chunk.
+/* Register and XDR encode the Reply chunk. Supports encoding an array
+ * of plain segments that belong to a single write (reply) chunk.
*
* Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
*
@@ -423,58 +458,57 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
* N elements:
* 1 - N - HLOO - HLOO - ... - HLOO
*
- * Returns a pointer to the XDR word in the RDMA header following
- * the end of the Reply chunk, or an error pointer.
+ * Returns zero on success, or a negative errno if a failure occurred.
+ * @xdr is advanced to the next position in the stream.
*/
-static __be32 *
-rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
- struct rpcrdma_req *req, struct rpc_rqst *rqst,
- __be32 *iptr, enum rpcrdma_chunktype wtype)
+static noinline int
+rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+ struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype)
{
+ struct xdr_stream *xdr = &req->rl_stream;
struct rpcrdma_mr_seg *seg;
struct rpcrdma_mw *mw;
- int n, nsegs, nchunks;
+ int nsegs, nchunks;
__be32 *segcount;
- if (wtype != rpcrdma_replych) {
- *iptr++ = xdr_zero; /* no Reply chunk present */
- return iptr;
- }
-
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
- return ERR_PTR(nsegs);
+ return nsegs;
- *iptr++ = xdr_one; /* Reply chunk present */
- segcount = iptr++; /* save location of segment count */
+ if (encode_item_present(xdr) < 0)
+ return -EMSGSIZE;
+ segcount = xdr_reserve_space(xdr, sizeof(*segcount));
+ if (unlikely(!segcount))
+ return -EMSGSIZE;
+ /* Actual value encoded below */
nchunks = 0;
do {
- n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
- true, &mw);
- if (n < 0)
- return ERR_PTR(n);
+ seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
+ true, &mw);
+ if (IS_ERR(seg))
+ return PTR_ERR(seg);
rpcrdma_push_mw(mw, &req->rl_registered);
- iptr = xdr_encode_rdma_segment(iptr, mw);
+ if (encode_rdma_segment(xdr, mw) < 0)
+ return -EMSGSIZE;
dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n",
rqst->rq_task->tk_pid, __func__,
mw->mw_length, (unsigned long long)mw->mw_offset,
- mw->mw_handle, n < nsegs ? "more" : "last");
+ mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last");
r_xprt->rx_stats.reply_chunk_count++;
r_xprt->rx_stats.total_rdma_request += seg->mr_len;
nchunks++;
- seg += n;
- nsegs -= n;
+ nsegs -= mw->mw_nents;
} while (nsegs);
/* Update count of segments in the Reply chunk */
*segcount = cpu_to_be32(nchunks);
- return iptr;
+ return 0;
}
/* Prepare the RPC-over-RDMA header SGE.
@@ -651,37 +685,52 @@ rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
req->rl_mapped_sges = 0;
}
-/*
- * Marshal a request: the primary job of this routine is to choose
- * the transfer modes. See comments below.
+/**
+ * rpcrdma_marshal_req - Marshal and send one RPC request
+ * @r_xprt: controlling transport
+ * @rqst: RPC request to be marshaled
+ *
+ * For the RPC in "rqst", this function:
+ * - Chooses the transfer mode (e.g., RDMA_MSG or RDMA_NOMSG)
+ * - Registers Read, Write, and Reply chunks
+ * - Constructs the transport header
+ * - Posts a Send WR to send the transport header and request
*
- * Returns zero on success, otherwise a negative errno.
+ * Returns:
+ * %0 if the RPC was sent successfully,
+ * %-ENOTCONN if the connection was lost,
+ * %-EAGAIN if not enough pages are available for the on-demand reply buffer,
+ * %-ENOBUFS if no MRs are available to register chunks,
+ * %-EMSGSIZE if the transport header buffer is too small,
+ * %-EIO if a permanent problem occurred while marshaling.
*/
-
int
-rpcrdma_marshal_req(struct rpc_rqst *rqst)
+rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
{
- struct rpc_xprt *xprt = rqst->rq_xprt;
- struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct xdr_stream *xdr = &req->rl_stream;
enum rpcrdma_chunktype rtype, wtype;
- struct rpcrdma_msg *headerp;
bool ddp_allowed;
- ssize_t hdrlen;
- size_t rpclen;
- __be32 *iptr;
+ __be32 *p;
+ int ret;
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
return rpcrdma_bc_marshal_reply(rqst);
#endif
- headerp = rdmab_to_msg(req->rl_rdmabuf);
- /* don't byte-swap XID, it's already done in request */
- headerp->rm_xid = rqst->rq_xid;
- headerp->rm_vers = rpcrdma_version;
- headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
- headerp->rm_type = rdma_msg;
+ rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
+ xdr_init_encode(xdr, &req->rl_hdrbuf,
+ req->rl_rdmabuf->rg_base);
+
+ /* Fixed header fields */
+ ret = -EMSGSIZE;
+ p = xdr_reserve_space(xdr, 4 * sizeof(*p));
+ if (!p)
+ goto out_err;
+ *p++ = rqst->rq_xid;
+ *p++ = rpcrdma_version;
+ *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests);
/* When the ULP employs a GSS flavor that guarantees integrity
* or privacy, direct data placement of individual data items
@@ -721,22 +770,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* by themselves are larger than the inline threshold.
*/
if (rpcrdma_args_inline(r_xprt, rqst)) {
+ *p++ = rdma_msg;
rtype = rpcrdma_noch;
- rpclen = rqst->rq_snd_buf.len;
} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
+ *p++ = rdma_msg;
rtype = rpcrdma_readch;
- rpclen = rqst->rq_snd_buf.head[0].iov_len +
- rqst->rq_snd_buf.tail[0].iov_len;
} else {
r_xprt->rx_stats.nomsg_call_count++;
- headerp->rm_type = htonl(RDMA_NOMSG);
+ *p++ = rdma_nomsg;
rtype = rpcrdma_areadch;
- rpclen = 0;
}
- req->rl_xid = rqst->rq_xid;
- rpcrdma_insert_req(&r_xprt->rx_buf, req);
-
/* This implementation supports the following combinations
* of chunk lists in one RPC-over-RDMA Call message:
*
@@ -759,79 +803,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
* send a Call message with a Position Zero Read chunk and a
* regular Read chunk at the same time.
*/
- iptr = headerp->rm_body.rm_chunks;
- iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype);
- if (IS_ERR(iptr))
+ if (rtype != rpcrdma_noch) {
+ ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
+ if (ret)
+ goto out_err;
+ }
+ ret = encode_item_not_present(xdr);
+ if (ret)
goto out_err;
- iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype);
- if (IS_ERR(iptr))
+
+ if (wtype == rpcrdma_writech) {
+ ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
+ if (ret)
+ goto out_err;
+ }
+ ret = encode_item_not_present(xdr);
+ if (ret)
goto out_err;
- iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype);
- if (IS_ERR(iptr))
+
+ if (wtype != rpcrdma_replych)
+ ret = encode_item_not_present(xdr);
+ else
+ ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
+ if (ret)
goto out_err;
- hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
- dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
+ dprintk("RPC: %5u %s: %s/%s: hdrlen %u rpclen\n",
rqst->rq_task->tk_pid, __func__,
transfertypes[rtype], transfertypes[wtype],
- hdrlen, rpclen);
+ xdr_stream_pos(xdr));
- if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
+ if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req,
+ xdr_stream_pos(xdr),
&rqst->rq_snd_buf, rtype)) {
- iptr = ERR_PTR(-EIO);
+ ret = -EIO;
goto out_err;
}
return 0;
out_err:
- if (PTR_ERR(iptr) != -ENOBUFS) {
- pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n",
- PTR_ERR(iptr));
+ if (ret != -ENOBUFS) {
+ pr_err("rpcrdma: header marshaling failed (%d)\n", ret);
r_xprt->rx_stats.failed_marshal_count++;
}
- return PTR_ERR(iptr);
-}
-
-/*
- * Chase down a received write or reply chunklist to get length
- * RDMA'd by server. See map at rpcrdma_create_chunks()! :-)
- */
-static int
-rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp)
-{
- unsigned int i, total_len;
- struct rpcrdma_write_chunk *cur_wchunk;
- char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf);
-
- i = be32_to_cpu(**iptrp);
- cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1);
- total_len = 0;
- while (i--) {
- struct rpcrdma_segment *seg = &cur_wchunk->wc_target;
- ifdebug(FACILITY) {
- u64 off;
- xdr_decode_hyper((__be32 *)&seg->rs_offset, &off);
- dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n",
- __func__,
- be32_to_cpu(seg->rs_length),
- (unsigned long long)off,
- be32_to_cpu(seg->rs_handle));
- }
- total_len += be32_to_cpu(seg->rs_length);
- ++cur_wchunk;
- }
- /* check and adjust for properly terminated write chunk */
- if (wrchunk) {
- __be32 *w = (__be32 *) cur_wchunk;
- if (*w++ != xdr_zero)
- return -1;
- cur_wchunk = (struct rpcrdma_write_chunk *) w;
- }
- if ((char *)cur_wchunk > base + rep->rr_len)
- return -1;
-
- *iptrp = (__be32 *) cur_wchunk;
- return total_len;
+ return ret;
}
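+
+/* Transport header sketch for an inline (rdma_msg) call with no
+ * chunks, as constructed above, one XDR word per field:
+ *
+ *	xid | vers | credits | rdma_msg | 0 | 0 | 0
+ *
+ * that is, RPCRDMA_HDRLEN_MIN (28) bytes, with the RPC call message
+ * itself carried in the same Send WR.
+ */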
/**
@@ -949,37 +964,254 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws,
}
}
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
/* By convention, backchannel calls arrive via rdma_msg type
* messages, and never populate the chunk lists. This makes
* the RPC/RDMA header small and fixed in size, so it is
* straightforward to check the RPC header's direction field.
*/
static bool
-rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
+rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ __be32 xid, __be32 proc)
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
{
- __be32 *p = (__be32 *)headerp;
+ struct xdr_stream *xdr = &rep->rr_stream;
+ __be32 *p;
- if (headerp->rm_type != rdma_msg)
+ if (proc != rdma_msg)
return false;
- if (headerp->rm_body.rm_chunks[0] != xdr_zero)
+
+ /* Peek at stream contents without advancing. */
+ p = xdr_inline_decode(xdr, 0);
+
+ /* Chunk lists */
+ if (*p++ != xdr_zero)
return false;
- if (headerp->rm_body.rm_chunks[1] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- if (headerp->rm_body.rm_chunks[2] != xdr_zero)
+ if (*p++ != xdr_zero)
return false;
- /* sanity */
- if (p[7] != headerp->rm_xid)
+ /* RPC header */
+ if (*p++ != xid)
return false;
- /* call direction */
- if (p[8] != cpu_to_be32(RPC_CALL))
+ if (*p != cpu_to_be32(RPC_CALL))
return false;
+ /* Now that we are sure this is a backchannel call,
+ * advance to the RPC header.
+ */
+ p = xdr_inline_decode(xdr, 3 * sizeof(*p));
+ if (unlikely(!p))
+ goto out_short;
+
+ rpcrdma_bc_receive_call(r_xprt, rep);
+ return true;
+
+out_short:
+ pr_warn("RPC/RDMA short backward direction call\n");
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
+ xprt_disconnect_done(&r_xprt->rx_xprt);
return true;
}
+#else /* CONFIG_SUNRPC_BACKCHANNEL */
+{
+ return false;
+}
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
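+
+/* Word map for the peek above (one XDR word each, relative to the
+ * decode position, which already sits past the four fixed header
+ * fields):
+ *
+ *	[0..2]	chunk lists - must all be xdr_zero
+ *	[3]	RPC XID     - must equal the transport header's XID
+ *	[4]	direction   - must be RPC_CALL
+ *
+ * Only when all five words match is the stream advanced past the
+ * empty chunk lists and the message passed to the backchannel.
+ */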
+static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4 * sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ ifdebug(FACILITY) {
+ u64 offset;
+ u32 handle;
+
+ handle = be32_to_cpup(p++);
+ *length = be32_to_cpup(p++);
+ xdr_decode_hyper(p, &offset);
+ dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n",
+ __func__, *length, (unsigned long long)offset,
+ handle);
+ } else {
+ *length = be32_to_cpup(p + 1);
+ }
+
+ return 0;
+}
+
+static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
+{
+ u32 segcount, seglength;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ *length = 0;
+ segcount = be32_to_cpup(p);
+ while (segcount--) {
+ if (decode_rdma_segment(xdr, &seglength))
+ return -EIO;
+ *length += seglength;
+ }
+
+ dprintk("RPC: %s: segcount=%u, %u bytes\n",
+ __func__, be32_to_cpup(p), *length);
+ return 0;
+}
+
+/* In RPC-over-RDMA Version One replies, a Read list is never
+ * expected. This decoder is a stub that returns an error if
+ * a Read list is present.
+ */
+static int decode_read_list(struct xdr_stream *xdr)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+ if (unlikely(*p != xdr_zero))
+ return -EIO;
+ return 0;
+}
+
+/* Supports only one Write chunk in the Write list
+ */
+static int decode_write_list(struct xdr_stream *xdr, u32 *length)
+{
+ u32 chunklen;
+ bool first;
+ __be32 *p;
+
+ *length = 0;
+ first = true;
+ do {
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+ if (*p == xdr_zero)
+ break;
+ if (!first)
+ return -EIO;
+
+ if (decode_write_chunk(xdr, &chunklen))
+ return -EIO;
+ *length += chunklen;
+ first = false;
+ } while (true);
+ return 0;
+}
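+
+/* Decode sketch (illustrative): a reply Write list of
+ *
+ *	1 - 2 - HLOO - HLOO - 0
+ *
+ * leaves *length equal to the sum of the two segment lengths. A
+ * second chunk appearing before the terminating xdr_zero fails
+ * with -EIO, since only one Write chunk is supported here.
+ */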
+
+static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ *length = 0;
+ if (*p != xdr_zero)
+ if (decode_write_chunk(xdr, length))
+ return -EIO;
+ return 0;
+}
+
+static int
+rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ u32 writelist, replychunk, rpclen;
+ char *base;
+
+ /* Decode the chunk lists */
+ if (decode_read_list(xdr))
+ return -EIO;
+ if (decode_write_list(xdr, &writelist))
+ return -EIO;
+ if (decode_reply_chunk(xdr, &replychunk))
+ return -EIO;
+
+ /* RDMA_MSG sanity checks */
+ if (unlikely(replychunk))
+ return -EIO;
+
+ /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
+ base = (char *)xdr_inline_decode(xdr, 0);
+ rpclen = xdr_stream_remaining(xdr);
+ r_xprt->rx_stats.fixup_copy_count +=
+ rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);
+
+ r_xprt->rx_stats.total_rdma_reply += writelist;
+ return rpclen + xdr_align_size(writelist);
+}
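+
+/* Example (illustrative sizes): an RDMA_MSG reply carrying 480
+ * inline RPC bytes plus a 1001-byte Write list returns 480 + 1004;
+ * xdr_align_size() restores the XDR pad that the sender may have
+ * omitted from the final Write chunk.
+ */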
+
+static noinline int
+rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ u32 writelist, replychunk;
+
+ /* Decode the chunk lists */
+ if (decode_read_list(xdr))
+ return -EIO;
+ if (decode_write_list(xdr, &writelist))
+ return -EIO;
+ if (decode_reply_chunk(xdr, &replychunk))
+ return -EIO;
+
+ /* RDMA_NOMSG sanity checks */
+ if (unlikely(writelist))
+ return -EIO;
+ if (unlikely(!replychunk))
+ return -EIO;
+
+ /* Reply chunk buffer already is the reply vector */
+ r_xprt->rx_stats.total_rdma_reply += replychunk;
+ return replychunk;
+}
+
+static noinline int
+rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_stream *xdr = &rep->rr_stream;
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, sizeof(*p));
+ if (unlikely(!p))
+ return -EIO;
+
+ switch (*p) {
+ case err_vers:
+ p = xdr_inline_decode(xdr, 2 * sizeof(*p));
+ if (!p)
+ break;
+ dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n",
+ rqst->rq_task->tk_pid, __func__,
+ be32_to_cpup(p), be32_to_cpup(p + 1));
+ break;
+ case err_chunk:
+ dprintk("RPC: %5u: %s: server reports header decoding error\n",
+ rqst->rq_task->tk_pid, __func__);
+ break;
+ default:
+ dprintk("RPC: %5u: %s: server reports unrecognized error %d\n",
+ rqst->rq_task->tk_pid, __func__, be32_to_cpup(p));
+ }
+
+ r_xprt->rx_stats.bad_reply_count++;
+ return -EREMOTEIO;
+}
+
/* Process received RPC/RDMA messages.
*
* Errors must result in the RPC task either being awakened, or
@@ -991,51 +1223,48 @@ rpcrdma_reply_handler(struct work_struct *work)
struct rpcrdma_rep *rep =
container_of(work, struct rpcrdma_rep, rr_work);
struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
- struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct rpcrdma_msg *headerp;
+ struct xdr_stream *xdr = &rep->rr_stream;
struct rpcrdma_req *req;
struct rpc_rqst *rqst;
- __be32 *iptr;
- int rdmalen, status, rmerr;
+ __be32 *p, xid, vers, proc;
unsigned long cwnd;
- struct list_head mws;
+ int status;
dprintk("RPC: %s: incoming rep %p\n", __func__, rep);
- if (rep->rr_len == RPCRDMA_BAD_LEN)
+ if (rep->rr_hdrbuf.head[0].iov_len == 0)
goto out_badstatus;
- if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
+
+ xdr_init_decode(xdr, &rep->rr_hdrbuf,
+ rep->rr_hdrbuf.head[0].iov_base);
+
+ /* Fixed transport header fields */
+ p = xdr_inline_decode(xdr, 4 * sizeof(*p));
+ if (unlikely(!p))
goto out_shortreply;
+ xid = *p++;
+ vers = *p++;
+ p++; /* credits */
+ proc = *p++;
- headerp = rdmab_to_msg(rep->rr_rdmabuf);
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
- if (rpcrdma_is_bcall(headerp))
- goto out_bcall;
-#endif
+ if (rpcrdma_is_bcall(r_xprt, rep, xid, proc))
+ return;
/* Match incoming rpcrdma_rep to an rpcrdma_req to
* get context for handling any incoming chunks.
*/
- spin_lock(&buf->rb_lock);
- req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf,
- headerp->rm_xid);
- if (!req)
- goto out_nomatch;
- if (req->rl_reply)
- goto out_duplicate;
-
- list_replace_init(&req->rl_registered, &mws);
- rpcrdma_mark_remote_invalidation(&mws, rep);
-
- /* Avoid races with signals and duplicate replies
- * by marking this req as matched.
- */
+ spin_lock(&xprt->recv_lock);
+ rqst = xprt_lookup_rqst(xprt, xid);
+ if (!rqst)
+ goto out_norqst;
+ xprt_pin_rqst(rqst);
+ spin_unlock(&xprt->recv_lock);
+ req = rpcr_to_rdmar(rqst);
req->rl_reply = rep;
- spin_unlock(&buf->rb_lock);
dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n",
- __func__, rep, req, be32_to_cpu(headerp->rm_xid));
+ __func__, rep, req, be32_to_cpu(xid));
/* Invalidate and unmap the data payloads before waking the
* waiting application. This guarantees the memory regions
@@ -1044,99 +1273,42 @@ rpcrdma_reply_handler(struct work_struct *work)
* waking the next RPC waits until this RPC has relinquished
* all its Send Queue entries.
*/
- if (!list_empty(&mws))
- r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &mws);
+ if (!list_empty(&req->rl_registered)) {
+ rpcrdma_mark_remote_invalidation(&req->rl_registered, rep);
+ r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt,
+ &req->rl_registered);
+ }
- /* Perform XID lookup, reconstruction of the RPC reply, and
- * RPC completion while holding the transport lock to ensure
- * the rep, rqst, and rq_task pointers remain stable.
- */
- spin_lock_bh(&xprt->transport_lock);
- rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
- if (!rqst)
- goto out_norqst;
xprt->reestablish_timeout = 0;
- if (headerp->rm_vers != rpcrdma_version)
+ if (vers != rpcrdma_version)
goto out_badversion;
- /* check for expected message types */
- /* The order of some of these tests is important. */
- switch (headerp->rm_type) {
+ switch (proc) {
case rdma_msg:
- /* never expect read chunks */
- /* never expect reply chunks (two ways to check) */
- if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
- (headerp->rm_body.rm_chunks[1] == xdr_zero &&
- headerp->rm_body.rm_chunks[2] != xdr_zero))
- goto badheader;
- if (headerp->rm_body.rm_chunks[1] != xdr_zero) {
- /* count any expected write chunks in read reply */
- /* start at write chunk array count */
- iptr = &headerp->rm_body.rm_chunks[2];
- rdmalen = rpcrdma_count_chunks(rep, 1, &iptr);
- /* check for validity, and no reply chunk after */
- if (rdmalen < 0 || *iptr++ != xdr_zero)
- goto badheader;
- rep->rr_len -=
- ((unsigned char *)iptr - (unsigned char *)headerp);
- status = rep->rr_len + rdmalen;
- r_xprt->rx_stats.total_rdma_reply += rdmalen;
- /* special case - last chunk may omit padding */
- if (rdmalen &= 3) {
- rdmalen = 4 - rdmalen;
- status += rdmalen;
- }
- } else {
- /* else ordinary inline */
- rdmalen = 0;
- iptr = (__be32 *)((unsigned char *)headerp +
- RPCRDMA_HDRLEN_MIN);
- rep->rr_len -= RPCRDMA_HDRLEN_MIN;
- status = rep->rr_len;
- }
-
- r_xprt->rx_stats.fixup_copy_count +=
- rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len,
- rdmalen);
+ status = rpcrdma_decode_msg(r_xprt, rep, rqst);
break;
-
case rdma_nomsg:
- /* never expect read or write chunks, always reply chunks */
- if (headerp->rm_body.rm_chunks[0] != xdr_zero ||
- headerp->rm_body.rm_chunks[1] != xdr_zero ||
- headerp->rm_body.rm_chunks[2] != xdr_one)
- goto badheader;
- iptr = (__be32 *)((unsigned char *)headerp +
- RPCRDMA_HDRLEN_MIN);
- rdmalen = rpcrdma_count_chunks(rep, 0, &iptr);
- if (rdmalen < 0)
- goto badheader;
- r_xprt->rx_stats.total_rdma_reply += rdmalen;
- /* Reply chunk buffer already is the reply vector - no fixup. */
- status = rdmalen;
+ status = rpcrdma_decode_nomsg(r_xprt, rep);
break;
-
case rdma_error:
- goto out_rdmaerr;
-
-badheader:
+ status = rpcrdma_decode_error(r_xprt, rep, rqst);
+ break;
default:
- dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
- rqst->rq_task->tk_pid, __func__,
- be32_to_cpu(headerp->rm_type));
status = -EIO;
- r_xprt->rx_stats.bad_reply_count++;
- break;
}
+ if (status < 0)
+ goto out_badheader;
out:
+ spin_lock(&xprt->recv_lock);
cwnd = xprt->cwnd;
xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
if (xprt->cwnd > cwnd)
xprt_release_rqst_cong(rqst->rq_task);
xprt_complete_rqst(rqst->rq_task, status);
- spin_unlock_bh(&xprt->transport_lock);
+ xprt_unpin_rqst(rqst);
+ spin_unlock(&xprt->recv_lock);
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
__func__, xprt, rqst, status);
return;
@@ -1149,72 +1321,38 @@ out_badstatus:
}
return;
-#if defined(CONFIG_SUNRPC_BACKCHANNEL)
-out_bcall:
- rpcrdma_bc_receive_call(r_xprt, rep);
- return;
-#endif
-
/* If the incoming reply terminated a pending RPC, the next
* RPC call will post a replacement receive buffer as it is
* being marshaled.
*/
out_badversion:
dprintk("RPC: %s: invalid version %d\n",
- __func__, be32_to_cpu(headerp->rm_vers));
+ __func__, be32_to_cpu(vers));
status = -EIO;
r_xprt->rx_stats.bad_reply_count++;
goto out;
-out_rdmaerr:
- rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err);
- switch (rmerr) {
- case ERR_VERS:
- pr_err("%s: server reports header version error (%u-%u)\n",
- __func__,
- be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low),
- be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high));
- break;
- case ERR_CHUNK:
- pr_err("%s: server reports header decoding error\n",
- __func__);
- break;
- default:
- pr_err("%s: server reports unknown error %d\n",
- __func__, rmerr);
- }
- status = -EREMOTEIO;
+out_badheader:
+ dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n",
+ rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc));
r_xprt->rx_stats.bad_reply_count++;
+ status = -EIO;
goto out;
-/* The req was still available, but by the time the transport_lock
+/* No rqst matched the incoming XID: by the time the recv_lock
* was acquired, the rqst and task had been released. Thus the RPC
* has already been terminated.
*/
out_norqst:
- spin_unlock_bh(&xprt->transport_lock);
- rpcrdma_buffer_put(req);
- dprintk("RPC: %s: race, no rqst left for req %p\n",
- __func__, req);
- return;
+ spin_unlock(&xprt->recv_lock);
+ dprintk("RPC: %s: no match for incoming xid 0x%08x\n",
+ __func__, be32_to_cpu(xid));
+ goto repost;
out_shortreply:
dprintk("RPC: %s: short/invalid reply\n", __func__);
goto repost;
-out_nomatch:
- spin_unlock(&buf->rb_lock);
- dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n",
- __func__, be32_to_cpu(headerp->rm_xid),
- rep->rr_len);
- goto repost;
-
-out_duplicate:
- spin_unlock(&buf->rb_lock);
- dprintk("RPC: %s: "
- "duplicate reply %p to RPC request %p: xid 0x%08x\n",
- __func__, rep, req, be32_to_cpu(headerp->rm_xid));
-
/* If no pending RPC transaction was matched, post a replacement
* receive buffer before returning.
*/
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index c676ed0efb5a..ec37ad83b068 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -52,7 +52,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
if (src->iov_len < 24)
goto out_shortreply;
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
req = xprt_lookup_rqst(xprt, xid);
if (!req)
goto out_notfound;
@@ -69,17 +69,20 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp,
else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
credits = r_xprt->rx_buf.rb_bc_max_requests;
+ spin_lock_bh(&xprt->transport_lock);
cwnd = xprt->cwnd;
xprt->cwnd = credits << RPC_CWNDSHIFT;
if (xprt->cwnd > cwnd)
xprt_release_rqst_cong(req->rq_task);
+ spin_unlock_bh(&xprt->transport_lock);
+
ret = 0;
xprt_complete_rqst(req->rq_task, rcvbuf->len);
rcvbuf->len = 0;
out_unlock:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
out:
return ret;
@@ -266,7 +269,7 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt)
module_put(THIS_MODULE);
}
-static struct rpc_xprt_ops xprt_rdma_bc_procs = {
+static const struct rpc_xprt_ops xprt_rdma_bc_procs = {
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong,
.alloc_slot = xprt_alloc_slot,
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index d1c458e5ec4d..c84e2b644e13 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -149,7 +149,7 @@ static struct ctl_table sunrpc_table[] = {
#endif
-static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */
+static const struct rpc_xprt_ops xprt_rdma_procs;
static void
xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
@@ -559,6 +559,7 @@ rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
r_xprt->rx_stats.hardway_register_count += size;
req->rl_rdmabuf = rb;
+ xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb));
return true;
}
@@ -684,7 +685,6 @@ xprt_rdma_free(struct rpc_task *task)
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
- rpcrdma_remove_req(&r_xprt->rx_buf, req);
if (!list_empty(&req->rl_registered))
ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
rpcrdma_unmap_sges(ia, req);
@@ -730,7 +730,7 @@ xprt_rdma_send_request(struct rpc_task *task)
if (unlikely(!list_empty(&req->rl_registered)))
r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
- rc = rpcrdma_marshal_req(rqst);
+ rc = rpcrdma_marshal_req(r_xprt, rqst);
if (rc < 0)
goto failed_marshal;
@@ -811,7 +811,7 @@ xprt_rdma_disable_swap(struct rpc_xprt *xprt)
* Plumbing for rpc transport switch and kernel module
*/
-static struct rpc_xprt_ops xprt_rdma_procs = {
+static const struct rpc_xprt_ops xprt_rdma_procs = {
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
.alloc_slot = xprt_alloc_slot,
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index e4171f2abe37..11a1fbf7e59e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -139,14 +139,11 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
static void
rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
{
- struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf);
struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf;
+ __be32 *p = rep->rr_rdmabuf->rg_base;
u32 credits;
- if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
- return;
-
- credits = be32_to_cpu(rmsgp->rm_credit);
+ credits = be32_to_cpup(p + 2);
if (credits == 0)
credits = 1; /* don't deadlock */
else if (credits > buffer->rb_max_requests)
@@ -173,21 +170,19 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
goto out_fail;
/* status == SUCCESS means all fields in wc are trustworthy */
- if (wc->opcode != IB_WC_RECV)
- return;
-
dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
__func__, rep, wc->byte_len);
- rep->rr_len = wc->byte_len;
+ rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len);
rep->rr_wc_flags = wc->wc_flags;
rep->rr_inv_rkey = wc->ex.invalidate_rkey;
ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf),
rdmab_addr(rep->rr_rdmabuf),
- rep->rr_len, DMA_FROM_DEVICE);
+ wc->byte_len, DMA_FROM_DEVICE);
- rpcrdma_update_granted_credits(rep);
+ if (wc->byte_len >= RPCRDMA_HDRLEN_ERR)
+ rpcrdma_update_granted_credits(rep);
out_schedule:
queue_work(rpcrdma_receive_wq, &rep->rr_work);
@@ -198,7 +193,7 @@ out_fail:
pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
ib_wc_status_msg(wc->status),
wc->status, wc->vendor_err);
- rep->rr_len = RPCRDMA_BAD_LEN;
+ rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0);
goto out_schedule;
}
@@ -974,6 +969,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
rc = PTR_ERR(rep->rr_rdmabuf);
goto out_free;
}
+ xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base,
+ rdmab_length(rep->rr_rdmabuf));
rep->rr_cqe.done = rpcrdma_wc_receive;
rep->rr_rxprt = r_xprt;
@@ -1004,7 +1001,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
spin_lock_init(&buf->rb_recovery_lock);
INIT_LIST_HEAD(&buf->rb_mws);
INIT_LIST_HEAD(&buf->rb_all);
- INIT_LIST_HEAD(&buf->rb_pending);
INIT_LIST_HEAD(&buf->rb_stale_mrs);
INIT_DELAYED_WORK(&buf->rb_refresh_worker,
rpcrdma_mr_refresh_worker);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index b282d3f8cdd8..e26a97d2f922 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -218,18 +218,17 @@ enum {
struct rpcrdma_rep {
struct ib_cqe rr_cqe;
- unsigned int rr_len;
int rr_wc_flags;
u32 rr_inv_rkey;
+ struct rpcrdma_regbuf *rr_rdmabuf;
struct rpcrdma_xprt *rr_rxprt;
struct work_struct rr_work;
+ struct xdr_buf rr_hdrbuf;
+ struct xdr_stream rr_stream;
struct list_head rr_list;
struct ib_recv_wr rr_recv_wr;
- struct rpcrdma_regbuf *rr_rdmabuf;
};
-#define RPCRDMA_BAD_LEN (~0U)
-
/*
* struct rpcrdma_mw - external memory region metadata
*
@@ -341,11 +340,12 @@ enum {
struct rpcrdma_buffer;
struct rpcrdma_req {
struct list_head rl_list;
- __be32 rl_xid;
unsigned int rl_mapped_sges;
unsigned int rl_connect_cookie;
struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;
+ struct xdr_stream rl_stream;
+ struct xdr_buf rl_hdrbuf;
struct ib_send_wr rl_send_wr;
struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES];
struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */
@@ -403,7 +403,6 @@ struct rpcrdma_buffer {
int rb_send_count, rb_recv_count;
struct list_head rb_send_bufs;
struct list_head rb_recv_bufs;
- struct list_head rb_pending;
u32 rb_max_requests;
atomic_t rb_credits; /* most recent credit grant */
@@ -440,24 +439,27 @@ struct rpcrdma_create_data_internal {
* Statistics for RPCRDMA
*/
struct rpcrdma_stats {
+ /* accessed when sending a call */
unsigned long read_chunk_count;
unsigned long write_chunk_count;
unsigned long reply_chunk_count;
-
unsigned long long total_rdma_request;
- unsigned long long total_rdma_reply;
+ /* rarely accessed error counters */
unsigned long long pullup_copy_count;
- unsigned long long fixup_copy_count;
unsigned long hardway_register_count;
unsigned long failed_marshal_count;
unsigned long bad_reply_count;
- unsigned long nomsg_call_count;
- unsigned long bcall_count;
unsigned long mrs_recovered;
unsigned long mrs_orphaned;
unsigned long mrs_allocated;
+
+ /* accessed when receiving a reply */
+ unsigned long long total_rdma_reply;
+ unsigned long long fixup_copy_count;
unsigned long local_inv_needed;
+ unsigned long nomsg_call_count;
+ unsigned long bcall_count;
};
/*
@@ -465,7 +467,8 @@ struct rpcrdma_stats {
*/
struct rpcrdma_xprt;
struct rpcrdma_memreg_ops {
- int (*ro_map)(struct rpcrdma_xprt *,
+ struct rpcrdma_mr_seg *
+ (*ro_map)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *, int, bool,
struct rpcrdma_mw **);
void (*ro_unmap_sync)(struct rpcrdma_xprt *,
@@ -552,34 +555,6 @@ void rpcrdma_destroy_req(struct rpcrdma_req *);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
-static inline void
-rpcrdma_insert_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
-{
- spin_lock(&buffers->rb_lock);
- if (list_empty(&req->rl_list))
- list_add_tail(&req->rl_list, &buffers->rb_pending);
- spin_unlock(&buffers->rb_lock);
-}
-
-static inline struct rpcrdma_req *
-rpcrdma_lookup_req_locked(struct rpcrdma_buffer *buffers, __be32 xid)
-{
- struct rpcrdma_req *pos;
-
- list_for_each_entry(pos, &buffers->rb_pending, rl_list)
- if (pos->rl_xid == xid)
- return pos;
- return NULL;
-}
-
-static inline void
-rpcrdma_remove_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req)
-{
- spin_lock(&buffers->rb_lock);
- list_del(&req->rl_list);
- spin_unlock(&buffers->rb_lock);
-}
-
struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *);
void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *);
struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *);
@@ -638,10 +613,16 @@ enum rpcrdma_chunktype {
bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
u32, struct xdr_buf *, enum rpcrdma_chunktype);
void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
-int rpcrdma_marshal_req(struct rpc_rqst *);
+int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
void rpcrdma_reply_handler(struct work_struct *work);
+static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len)
+{
+ xdr->head[0].iov_len = len;
+ xdr->len = len;
+}
+
/* RPC/RDMA module init - xprtrdma/transport.c
*/
extern unsigned int xprt_rdma_max_inline_read;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 4f154d388748..9b5de31aa429 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -969,10 +969,12 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt,
return;
/* Look up and lock the request corresponding to the given XID */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
rovr = xprt_lookup_rqst(xprt, *xp);
if (!rovr)
goto out_unlock;
+ xprt_pin_rqst(rovr);
+ spin_unlock(&xprt->recv_lock);
task = rovr->rq_task;
copied = rovr->rq_private_buf.buflen;
@@ -981,13 +983,16 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt,
if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) {
dprintk("RPC: sk_buff copy failed\n");
- goto out_unlock;
+ spin_lock(&xprt->recv_lock);
+ goto out_unpin;
}
+ spin_lock(&xprt->recv_lock);
xprt_complete_rqst(task, copied);
-
+out_unpin:
+ xprt_unpin_rqst(rovr);
out_unlock:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
}
static void xs_local_data_receive(struct sock_xprt *transport)
@@ -1050,10 +1055,12 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
return;
/* Look up and lock the request corresponding to the given XID */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
rovr = xprt_lookup_rqst(xprt, *xp);
if (!rovr)
goto out_unlock;
+ xprt_pin_rqst(rovr);
+ spin_unlock(&xprt->recv_lock);
task = rovr->rq_task;
if ((copied = rovr->rq_private_buf.buflen) > repsize)
@@ -1062,16 +1069,21 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
/* Suck it into the iovec, verify checksum if not done by hw. */
if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) {
__UDPX_INC_STATS(sk, UDP_MIB_INERRORS);
- goto out_unlock;
+ spin_lock(&xprt->recv_lock);
+ goto out_unpin;
}
__UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS);
+ spin_lock_bh(&xprt->transport_lock);
xprt_adjust_cwnd(xprt, task, copied);
+ spin_unlock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
xprt_complete_rqst(task, copied);
-
+out_unpin:
+ xprt_unpin_rqst(rovr);
out_unlock:
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
}
static void xs_udp_data_receive(struct sock_xprt *transport)
@@ -1277,25 +1289,12 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
}
len = desc->count;
- if (len > transport->tcp_reclen - transport->tcp_offset) {
- struct xdr_skb_reader my_desc;
-
- len = transport->tcp_reclen - transport->tcp_offset;
- memcpy(&my_desc, desc, sizeof(my_desc));
- my_desc.count = len;
- r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
- &my_desc, xdr_skb_read_bits);
- desc->count -= r;
- desc->offset += r;
- } else
- r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
+ if (len > transport->tcp_reclen - transport->tcp_offset)
+ desc->count = transport->tcp_reclen - transport->tcp_offset;
+ r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied,
desc, xdr_skb_read_bits);
- if (r > 0) {
- transport->tcp_copied += r;
- transport->tcp_offset += r;
- }
- if (r != len) {
+ if (desc->count) {
/* Error when copying to the receive buffer,
* usually because we weren't able to allocate
* additional buffer pages. All we can do now
@@ -1315,6 +1314,10 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt,
return;
}
+ transport->tcp_copied += r;
+ transport->tcp_offset += r;
+ desc->count = len - r;
+
dprintk("RPC: XID %08x read %zd bytes\n",
ntohl(transport->tcp_xid), r);
dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, "
@@ -1343,21 +1346,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
/* Find and lock the request corresponding to this xid */
- spin_lock_bh(&xprt->transport_lock);
+ spin_lock(&xprt->recv_lock);
req = xprt_lookup_rqst(xprt, transport->tcp_xid);
if (!req) {
dprintk("RPC: XID %08x request not found!\n",
ntohl(transport->tcp_xid));
- spin_unlock_bh(&xprt->transport_lock);
+ spin_unlock(&xprt->recv_lock);
return -1;
}
+ xprt_pin_rqst(req);
+ spin_unlock(&xprt->recv_lock);
xs_tcp_read_common(xprt, desc, req);
+ spin_lock(&xprt->recv_lock);
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_rqst(req->rq_task, transport->tcp_copied);
-
- spin_unlock_bh(&xprt->transport_lock);
+ xprt_unpin_rqst(req);
+ spin_unlock(&xprt->recv_lock);
return 0;
}
@@ -1376,11 +1382,9 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
container_of(xprt, struct sock_xprt, xprt);
struct rpc_rqst *req;
- /* Look up and lock the request corresponding to the given XID */
- spin_lock_bh(&xprt->transport_lock);
+ /* Look up the request corresponding to the given XID */
req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
if (req == NULL) {
- spin_unlock_bh(&xprt->transport_lock);
printk(KERN_WARNING "Callback slot table overflowed\n");
xprt_force_disconnect(xprt);
return -1;
@@ -1391,7 +1395,6 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_bc_request(req, transport->tcp_copied);
- spin_unlock_bh(&xprt->transport_lock);
return 0;
}
@@ -1516,6 +1519,7 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
.arg.data = xprt,
};
unsigned long total = 0;
+ int loop;
int read = 0;
mutex_lock(&transport->recv_mutex);
@@ -1524,20 +1528,20 @@ static void xs_tcp_data_receive(struct sock_xprt *transport)
goto out;
/* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
- for (;;) {
+ for (loop = 0; loop < 64; loop++) {
lock_sock(sk);
read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
if (read <= 0) {
clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state);
release_sock(sk);
- if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
- break;
- } else {
- release_sock(sk);
- total += read;
+ break;
}
+ release_sock(sk);
+ total += read;
rd_desc.count = 65536;
}
+ if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state))
+ queue_work(xprtiod_workqueue, &transport->recv_worker);
out:
mutex_unlock(&transport->recv_mutex);
trace_xs_tcp_data_ready(xprt, read, total);
@@ -2724,7 +2728,7 @@ static void bc_destroy(struct rpc_xprt *xprt)
module_put(THIS_MODULE);
}
-static struct rpc_xprt_ops xs_local_ops = {
+static const struct rpc_xprt_ops xs_local_ops = {
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xs_tcp_release_xprt,
.alloc_slot = xprt_alloc_slot,
@@ -2742,7 +2746,7 @@ static struct rpc_xprt_ops xs_local_ops = {
.disable_swap = xs_disable_swap,
};
-static struct rpc_xprt_ops xs_udp_ops = {
+static const struct rpc_xprt_ops xs_udp_ops = {
.set_buffer_size = xs_udp_set_buffer_size,
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong,
@@ -2764,7 +2768,7 @@ static struct rpc_xprt_ops xs_udp_ops = {
.inject_disconnect = xs_inject_disconnect,
};
-static struct rpc_xprt_ops xs_tcp_ops = {
+static const struct rpc_xprt_ops xs_tcp_ops = {
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xs_tcp_release_xprt,
.alloc_slot = xprt_lock_and_alloc_slot,
@@ -2795,7 +2799,7 @@ static struct rpc_xprt_ops xs_tcp_ops = {
* The rpc_xprt_ops for the server backchannel
*/
-static struct rpc_xprt_ops bc_tcp_ops = {
+static const struct rpc_xprt_ops bc_tcp_ops = {
.reserve_xprt = xprt_reserve_xprt,
.release_xprt = xprt_release_xprt,
.alloc_slot = xprt_alloc_slot,
diff --git a/scripts/selinux/genheaders/genheaders.c b/scripts/selinux/genheaders/genheaders.c
index 6a24569c3578..672b069dcfea 100644
--- a/scripts/selinux/genheaders/genheaders.c
+++ b/scripts/selinux/genheaders/genheaders.c
@@ -129,11 +129,16 @@ int main(int argc, char *argv[])
for (i = 0; secclass_map[i].name; i++) {
struct security_class_mapping *map = &secclass_map[i];
for (j = 0; map->perms[j]; j++) {
+ if (j >= 32) {
+ fprintf(stderr, "Too many permissions to fit into an access vector at (%s, %s).\n",
+ map->name, map->perms[j]);
+ exit(5);
+ }
fprintf(fout, "#define %s__%s", map->name,
map->perms[j]);
for (k = 0; k < max(1, 40 - strlen(map->name) - strlen(map->perms[j])); k++)
fprintf(fout, " ");
- fprintf(fout, "0x%08xUL\n", (1<<j));
+ fprintf(fout, "0x%08xU\n", (1<<j));
}
}
diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install
index 677756ae34c9..067459760a7b 100755
--- a/scripts/sphinx-pre-install
+++ b/scripts/sphinx-pre-install
@@ -40,7 +40,6 @@ my $virtualenv = 1;
#
my %texlive = (
- 'adjustbox.sty' => 'texlive-adjustbox',
'amsfonts.sty' => 'texlive-amsfonts',
'amsmath.sty' => 'texlive-amsmath',
'amssymb.sty' => 'texlive-amsfonts',
diff --git a/security/commoncap.c b/security/commoncap.c
index d8e26fb9781d..6bf72b175b49 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -82,8 +82,11 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
if (ns == cred->user_ns)
return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM;
- /* Have we tried all of the parent namespaces? */
- if (ns == &init_user_ns)
+ /*
+ * If we're already at a lower level than we're looking for,
+ * we're done searching.
+ */
+ if (ns->level <= cred->user_ns->level)
return -EPERM;
/*
@@ -323,6 +326,209 @@ int cap_inode_killpriv(struct dentry *dentry)
return error;
}
+static bool rootid_owns_currentns(kuid_t kroot)
+{
+ struct user_namespace *ns;
+
+ if (!uid_valid(kroot))
+ return false;
+
+ for (ns = current_user_ns(); ; ns = ns->parent) {
+ if (from_kuid(ns, kroot) == 0)
+ return true;
+ if (ns == &init_user_ns)
+ break;
+ }
+
+ return false;
+}
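+
+/* For example (hypothetical mapping): if @kroot is the kuid for
+ * host uid 100000, and the caller's user namespace maps uid 0 to
+ * that host uid, from_kuid() returns 0 on the first iteration and
+ * the rootid is treated as owning the current namespace.
+ */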
+
+static __u32 sansflags(__u32 m)
+{
+ return m & ~VFS_CAP_FLAGS_EFFECTIVE;
+}
+
+static bool is_v2header(size_t size, __le32 magic)
+{
+ __u32 m = le32_to_cpu(magic);
+ if (size != XATTR_CAPS_SZ_2)
+ return false;
+ return sansflags(m) == VFS_CAP_REVISION_2;
+}
+
+static bool is_v3header(size_t size, __le32 magic)
+{
+ __u32 m = le32_to_cpu(magic);
+
+ if (size != XATTR_CAPS_SZ_3)
+ return false;
+ return sansflags(m) == VFS_CAP_REVISION_3;
+}
+
+/*
+ * getsecurity: We are called for security.* before any attempt to read the
+ * xattr from the inode itself.
+ *
+ * This gives us a chance to read the on-disk value and convert it. If we
+ * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
+ *
+ * Note we are not called by vfs_getxattr_alloc(), but that is only called
+ * by the integrity subsystem, which really wants the unconverted values -
+ * so that's good.
+ */
+int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer,
+ bool alloc)
+{
+ int size, ret;
+ kuid_t kroot;
+ uid_t root, mappedroot;
+ char *tmpbuf = NULL;
+ struct vfs_cap_data *cap;
+ struct vfs_ns_cap_data *nscap;
+ struct dentry *dentry;
+ struct user_namespace *fs_ns;
+
+ if (strcmp(name, "capability") != 0)
+ return -EOPNOTSUPP;
+
+ dentry = d_find_alias(inode);
+ if (!dentry)
+ return -EINVAL;
+
+ size = sizeof(struct vfs_ns_cap_data);
+ ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS,
+ &tmpbuf, size, GFP_NOFS);
+ dput(dentry);
+
+ if (ret < 0)
+ return ret;
+
+ fs_ns = inode->i_sb->s_user_ns;
+ cap = (struct vfs_cap_data *) tmpbuf;
+ if (is_v2header((size_t) ret, cap->magic_etc)) {
+ /* If this is sizeof(vfs_cap_data) then we're ok with the
+ * on-disk value, so return that. */
+ if (alloc)
+ *buffer = tmpbuf;
+ else
+ kfree(tmpbuf);
+ return ret;
+ } else if (!is_v3header((size_t) ret, cap->magic_etc)) {
+ kfree(tmpbuf);
+ return -EINVAL;
+ }
+
+ nscap = (struct vfs_ns_cap_data *) tmpbuf;
+ root = le32_to_cpu(nscap->rootid);
+ kroot = make_kuid(fs_ns, root);
+
+ /* If the root kuid maps to a valid uid in current ns, then return
+ * this as a nscap. */
+ mappedroot = from_kuid(current_user_ns(), kroot);
+ if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
+ if (alloc) {
+ *buffer = tmpbuf;
+ nscap->rootid = cpu_to_le32(mappedroot);
+ } else
+ kfree(tmpbuf);
+ return size;
+ }
+
+ if (!rootid_owns_currentns(kroot)) {
+ kfree(tmpbuf);
+ return -EOPNOTSUPP;
+ }
+
+ /* This comes from a parent namespace. Return as a v2 capability */
+ size = sizeof(struct vfs_cap_data);
+ if (alloc) {
+ *buffer = kmalloc(size, GFP_ATOMIC);
+ if (*buffer) {
+ struct vfs_cap_data *cap = *buffer;
+ __le32 nsmagic, magic;
+ magic = VFS_CAP_REVISION_2;
+ nsmagic = le32_to_cpu(nscap->magic_etc);
+ if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
+ magic |= VFS_CAP_FLAGS_EFFECTIVE;
+ memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+ cap->magic_etc = cpu_to_le32(magic);
+ }
+ }
+ kfree(tmpbuf);
+ return size;
+}
+
+static kuid_t rootid_from_xattr(const void *value, size_t size,
+ struct user_namespace *task_ns)
+{
+ const struct vfs_ns_cap_data *nscap = value;
+ uid_t rootid = 0;
+
+ if (size == XATTR_CAPS_SZ_3)
+ rootid = le32_to_cpu(nscap->rootid);
+
+ return make_kuid(task_ns, rootid);
+}
+
+static bool validheader(size_t size, __le32 magic)
+{
+ return is_v2header(size, magic) || is_v3header(size, magic);
+}
+
+/*
+ * User requested a write of security.capability. If needed, update the
+ * xattr to change from v2 to v3, or to fixup the v3 rootid.
+ *
+ * If all is ok, we return the new size, on error return < 0.
+ */
+int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size)
+{
+ struct vfs_ns_cap_data *nscap;
+ uid_t nsrootid;
+ const struct vfs_cap_data *cap = *ivalue;
+ __u32 magic, nsmagic;
+ struct inode *inode = d_backing_inode(dentry);
+ struct user_namespace *task_ns = current_user_ns(),
+ *fs_ns = inode->i_sb->s_user_ns;
+ kuid_t rootid;
+ size_t newsize;
+
+ if (!*ivalue)
+ return -EINVAL;
+ if (!validheader(size, cap->magic_etc))
+ return -EINVAL;
+ if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+ return -EPERM;
+ if (size == XATTR_CAPS_SZ_2)
+ if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP))
+ /* user is privileged, just write the v2 */
+ return size;
+
+ rootid = rootid_from_xattr(*ivalue, size, task_ns);
+ if (!uid_valid(rootid))
+ return -EINVAL;
+
+ nsrootid = from_kuid(fs_ns, rootid);
+ if (nsrootid == -1)
+ return -EINVAL;
+
+ newsize = sizeof(struct vfs_ns_cap_data);
+ nscap = kmalloc(newsize, GFP_ATOMIC);
+ if (!nscap)
+ return -ENOMEM;
+ nscap->rootid = cpu_to_le32(nsrootid);
+ nsmagic = VFS_CAP_REVISION_3;
+ magic = le32_to_cpu(cap->magic_etc);
+ if (magic & VFS_CAP_FLAGS_EFFECTIVE)
+ nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
+ nscap->magic_etc = cpu_to_le32(nsmagic);
+ memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+
+ kvfree(*ivalue);
+ *ivalue = nscap;
+ return newsize;
+}
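+
+/* Example (hypothetical IDs): a task in a user namespace mapping
+ * uid 0 to host uid 100000 writes a v2 security.capability xattr.
+ * Lacking CAP_SETFCAP in the filesystem's namespace, the xattr is
+ * rewritten here as v3 with rootid == 100000, so the file capability
+ * is honored only where that kuid maps to root.
+ */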
+
/*
* Calculate the new process capability sets from the capability sets attached
* to a file.
@@ -376,7 +582,10 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
__u32 magic_etc;
unsigned tocopy, i;
int size;
- struct vfs_cap_data caps;
+ struct vfs_ns_cap_data data, *nscaps = &data;
+ struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
+ kuid_t rootkuid;
+ struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
@@ -384,18 +593,20 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
return -ENODATA;
size = __vfs_getxattr((struct dentry *)dentry, inode,
- XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ);
+ XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
if (size == -ENODATA || size == -EOPNOTSUPP)
/* no data, that's ok */
return -ENODATA;
+
if (size < 0)
return size;
if (size < sizeof(magic_etc))
return -EINVAL;
- cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc);
+ cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);
+ rootkuid = make_kuid(fs_ns, 0);
switch (magic_etc & VFS_CAP_REVISION_MASK) {
case VFS_CAP_REVISION_1:
if (size != XATTR_CAPS_SZ_1)
@@ -407,15 +618,27 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data
return -EINVAL;
tocopy = VFS_CAP_U32_2;
break;
+ case VFS_CAP_REVISION_3:
+ if (size != XATTR_CAPS_SZ_3)
+ return -EINVAL;
+ tocopy = VFS_CAP_U32_3;
+ rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
+ break;
+
default:
return -EINVAL;
}
+ /* Limit the caps to the mounter of the filesystem
+ * or the more limited uid specified in the xattr.
+ */
+ if (!rootid_owns_currentns(rootkuid))
+ return -ENODATA;
CAP_FOR_EACH_U32(i) {
if (i >= tocopy)
break;
- cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted);
- cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable);
+ cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted);
+ cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable);
}
cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
@@ -453,8 +676,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c
rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
if (rc < 0) {
if (rc == -EINVAL)
- printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
- __func__, rc, bprm->filename);
+ printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
+ bprm->filename);
else if (rc == -ENODATA)
rc = 0;
goto out;
@@ -633,15 +856,19 @@ skip:
int cap_inode_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (!strcmp(name, XATTR_NAME_CAPS)) {
- if (!capable(CAP_SETFCAP))
- return -EPERM;
+ /* Ignore non-security xattrs */
+ if (strncmp(name, XATTR_SECURITY_PREFIX,
+ sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
+ return 0;
+
+ /*
+ * For XATTR_NAME_CAPS the check will be done in
+ * cap_convert_nscap(), called by setxattr()
+ */
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
return 0;
- }
- if (!strncmp(name, XATTR_SECURITY_PREFIX,
- sizeof(XATTR_SECURITY_PREFIX) - 1) &&
- !capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
@@ -659,15 +886,22 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
*/
int cap_inode_removexattr(struct dentry *dentry, const char *name)
{
- if (!strcmp(name, XATTR_NAME_CAPS)) {
- if (!capable(CAP_SETFCAP))
+ /* Ignore non-security xattrs */
+ if (strncmp(name, XATTR_SECURITY_PREFIX,
+ sizeof(XATTR_SECURITY_PREFIX) - 1) != 0)
+ return 0;
+
+ if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+ /* security.capability gets namespaced */
+ struct inode *inode = d_backing_inode(dentry);
+ if (!inode)
+ return -EINVAL;
+ if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
return -EPERM;
return 0;
}
- if (!strncmp(name, XATTR_SECURITY_PREFIX,
- sizeof(XATTR_SECURITY_PREFIX) - 1) &&
- !capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
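
With the global capable(CAP_SETFCAP) test replaced by capable_wrt_inode_uidgid() plus the conversion in cap_convert_nscap(), a process that is privileged only within its own user namespace can now set file capabilities on files it owns there. A hedged userspace illustration (the path is a placeholder, and a little-endian host is assumed since the on-disk fields are __le32; portable code would use htole32()):

#include <linux/capability.h>
#include <sys/xattr.h>
#include <string.h>

/* Write a v2 cap xattr; the kernel rewrites it as v3 on the way down. */
static int set_dac_override_fcap(const char *path)
{
	struct vfs_cap_data cap;

	memset(&cap, 0, sizeof(cap));
	cap.magic_etc = VFS_CAP_REVISION_2 | VFS_CAP_FLAGS_EFFECTIVE;
	cap.data[0].permitted = 1u << CAP_DAC_OVERRIDE;

	return setxattr(path, "security.capability",
			&cap, XATTR_CAPS_SZ_2, 0);
}

setcap(8) performs the equivalent operation via libcap.
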
@@ -1054,6 +1288,7 @@ struct security_hook_list capability_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(bprm_set_creds, cap_bprm_set_creds),
LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
+ LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
LSM_HOOK_INIT(mmap_file, cap_mmap_file),
LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
index 28d4c3a528ab..67703dbe29ea 100644
--- a/security/lsm_audit.c
+++ b/security/lsm_audit.c
@@ -2,7 +2,7 @@
* common LSM auditing functions
*
* Based on code written for SELinux by :
- * Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Stephen Smalley, <sds@tycho.nsa.gov>
* James Morris <jmorris@redhat.com>
* Author : Etienne Basset, <etienne.basset@ensta.org>
*
diff --git a/security/security.c b/security/security.c
index afc34f46c6c5..4bf0f571b4ef 100644
--- a/security/security.c
+++ b/security/security.c
@@ -974,11 +974,6 @@ int security_file_open(struct file *file, const struct cred *cred)
return fsnotify_perm(file, MAY_OPEN);
}
-int security_task_create(unsigned long clone_flags)
-{
- return call_int_hook(task_create, 0, clone_flags);
-}
-
int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
{
return call_int_hook(task_alloc, 0, task, clone_flags);
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 4b4293194aee..2380b8d72cec 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -1,7 +1,7 @@
/*
* Implementation of the kernel access vector cache (AVC).
*
- * Authors: Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Authors: Stephen Smalley, <sds@tycho.nsa.gov>
* James Morris <jmorris@redhat.com>
*
* Update: KaiGai, Kohei <kaigai@ak.jp.nec.com>
@@ -346,27 +346,26 @@ static struct avc_xperms_decision_node
struct avc_xperms_decision_node *xpd_node;
struct extended_perms_decision *xpd;
- xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep,
- GFP_ATOMIC | __GFP_NOMEMALLOC);
+ xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT);
if (!xpd_node)
return NULL;
xpd = &xpd_node->xpd;
if (which & XPERMS_ALLOWED) {
xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep,
- GFP_ATOMIC | __GFP_NOMEMALLOC);
+ GFP_NOWAIT);
if (!xpd->allowed)
goto error;
}
if (which & XPERMS_AUDITALLOW) {
xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep,
- GFP_ATOMIC | __GFP_NOMEMALLOC);
+ GFP_NOWAIT);
if (!xpd->auditallow)
goto error;
}
if (which & XPERMS_DONTAUDIT) {
xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep,
- GFP_ATOMIC | __GFP_NOMEMALLOC);
+ GFP_NOWAIT);
if (!xpd->dontaudit)
goto error;
}
@@ -394,8 +393,7 @@ static struct avc_xperms_node *avc_xperms_alloc(void)
{
struct avc_xperms_node *xp_node;
- xp_node = kmem_cache_zalloc(avc_xperms_cachep,
- GFP_ATOMIC|__GFP_NOMEMALLOC);
+ xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT);
if (!xp_node)
return xp_node;
INIT_LIST_HEAD(&xp_node->xpd_head);
@@ -548,7 +546,7 @@ static struct avc_node *avc_alloc_node(void)
{
struct avc_node *node;
- node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC);
+ node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT);
if (!node)
goto out;
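
The flag change above is behavioural, not cosmetic: GFP_ATOMIC carries __GFP_HIGH, so even with __GFP_NOMEMALLOC these allocations competed at elevated priority, whereas GFP_NOWAIT simply fails fast under memory pressure without sleeping. That suits the AVC, which is only a cache; a failed allocation just means the slow path recomputes the answer. The pattern, sketched in kernel style:

/* Opportunistic cache insert: NULL is fine, the caller falls back
 * to the security server instead of caching the decision. */
node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT);
if (!node)
	goto out;
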
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index ad3b0f53ede0..f5d304736852 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3,7 +3,7 @@
*
* This file contains the SELinux hook function implementations.
*
- * Authors: Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Authors: Stephen Smalley, <sds@tycho.nsa.gov>
* Chris Vance, <cvance@nai.com>
* Wayne Salamon, <wsalamon@nai.com>
* James Morris <jmorris@redhat.com>
@@ -815,7 +815,9 @@ static int selinux_set_mnt_opts(struct super_block *sb,
if (!strcmp(sb->s_type->name, "debugfs") ||
!strcmp(sb->s_type->name, "tracefs") ||
!strcmp(sb->s_type->name, "sysfs") ||
- !strcmp(sb->s_type->name, "pstore"))
+ !strcmp(sb->s_type->name, "pstore") ||
+ !strcmp(sb->s_type->name, "cgroup") ||
+ !strcmp(sb->s_type->name, "cgroup2"))
sbsec->flags |= SE_SBGENFS;
if (!sbsec->behavior) {
@@ -1303,6 +1305,7 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc
case SOCK_SEQPACKET:
return SECCLASS_UNIX_STREAM_SOCKET;
case SOCK_DGRAM:
+ case SOCK_RAW:
return SECCLASS_UNIX_DGRAM_SOCKET;
}
break;
@@ -2317,6 +2320,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm,
int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS);
int nosuid = !mnt_may_suid(bprm->file->f_path.mnt);
int rc;
+ u32 av;
if (!nnp && !nosuid)
return 0; /* neither NNP nor nosuid */
@@ -2325,24 +2329,40 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm,
return 0; /* No change in credentials */
/*
- * The only transitions we permit under NNP or nosuid
- * are transitions to bounded SIDs, i.e. SIDs that are
- * guaranteed to only be allowed a subset of the permissions
- * of the current SID.
+ * If the policy enables the nnp_nosuid_transition policy capability,
+ * then we permit transitions under NNP or nosuid if the
+ * policy allows the corresponding permission between
+ * the old and new contexts.
*/
- rc = security_bounded_transition(old_tsec->sid, new_tsec->sid);
- if (rc) {
- /*
- * On failure, preserve the errno values for NNP vs nosuid.
- * NNP: Operation not permitted for caller.
- * nosuid: Permission denied to file.
- */
+ if (selinux_policycap_nnp_nosuid_transition) {
+ av = 0;
if (nnp)
- return -EPERM;
- else
- return -EACCES;
+ av |= PROCESS2__NNP_TRANSITION;
+ if (nosuid)
+ av |= PROCESS2__NOSUID_TRANSITION;
+ rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
+ SECCLASS_PROCESS2, av, NULL);
+ if (!rc)
+ return 0;
}
- return 0;
+
+ /*
+ * We also permit NNP or nosuid transitions to bounded SIDs,
+ * i.e. SIDs that are guaranteed to only be allowed a subset
+ * of the permissions of the current SID.
+ */
+ rc = security_bounded_transition(old_tsec->sid, new_tsec->sid);
+ if (!rc)
+ return 0;
+
+ /*
+ * On failure, preserve the errno values for NNP vs nosuid.
+ * NNP: Operation not permitted for caller.
+ * nosuid: Permission denied to file.
+ */
+ if (nnp)
+ return -EPERM;
+ return -EACCES;
}
static int selinux_bprm_set_creds(struct linux_binprm *bprm)
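
Previously an exec under NO_NEW_PRIVS, or from a nosuid mount, could only transition to a bounded SID; now policy may grant the transition explicitly through the new process2 permissions once the nnp_nosuid_transition capability is enabled. A hedged userspace illustration of the gated situation (the daemon path is hypothetical):

#include <sys/prctl.h>
#include <unistd.h>

int main(void)
{
	/* From here on, privilege-raising execs are restricted. */
	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);

	/* If policy defines a domain transition for this file, the exec
	 * now succeeds only when process2:nnp_transition is allowed
	 * between the domains (or the target SID is bounded, as before). */
	execl("/usr/sbin/mydaemon", "mydaemon", (char *)NULL);
	return 1;	/* exec failed */
}
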
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index 0999df03af8b..a5004e9de11a 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -1,7 +1,7 @@
/*
* Access vector cache interface for object managers.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SELINUX_AVC_H_
#define _SELINUX_AVC_H_
diff --git a/security/selinux/include/avc_ss.h b/security/selinux/include/avc_ss.h
index d5c328452df0..37d57dadd476 100644
--- a/security/selinux/include/avc_ss.h
+++ b/security/selinux/include/avc_ss.h
@@ -1,7 +1,7 @@
/*
* Access vector cache interface for the security server.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SELINUX_AVC_SS_H_
#define _SELINUX_AVC_SS_H_
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index b9fe3434b036..35ffb29a69cb 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -48,6 +48,8 @@ struct security_class_mapping secclass_map[] = {
"setrlimit", "rlimitinh", "dyntransition", "setcurrent",
"execmem", "execstack", "execheap", "setkeycreate",
"setsockcreate", "getrlimit", NULL } },
+ { "process2",
+ { "nnp_transition", "nosuid_transition", NULL } },
{ "system",
{ "ipc_info", "syslog_read", "syslog_mod",
"syslog_console", "module_request", "module_load", NULL } },
diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h
index 6ebc61e370ff..1649cd18eb0b 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -3,7 +3,7 @@
*
* This file contains the SELinux security data structures for kernel objects.
*
- * Author(s): Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author(s): Stephen Smalley, <sds@tycho.nsa.gov>
* Chris Vance, <cvance@nai.com>
* Wayne Salamon, <wsalamon@nai.com>
* James Morris <jmorris@redhat.com>
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index e91f08c16c0b..28dfb2f93e4d 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -1,7 +1,7 @@
/*
* Security server interface.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*
*/
@@ -73,6 +73,7 @@ enum {
POLICYDB_CAPABILITY_EXTSOCKCLASS,
POLICYDB_CAPABILITY_ALWAYSNETWORK,
POLICYDB_CAPABILITY_CGROUPSECLABEL,
+ POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION,
__POLICYDB_CAPABILITY_MAX
};
#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1)
@@ -84,6 +85,7 @@ extern int selinux_policycap_openperm;
extern int selinux_policycap_extsockclass;
extern int selinux_policycap_alwaysnetwork;
extern int selinux_policycap_cgroupseclabel;
+extern int selinux_policycap_nnp_nosuid_transition;
/*
* type_datum properties
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index 3628d3a868b6..2c3c7d010d8a 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -1,7 +1,7 @@
/*
* Implementation of the access vector table type.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
/* Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h
index d946c9dc3c9c..725853cadc42 100644
--- a/security/selinux/ss/avtab.h
+++ b/security/selinux/ss/avtab.h
@@ -5,7 +5,7 @@
* table is used to represent the type enforcement
* tables.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
/* Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
diff --git a/security/selinux/ss/constraint.h b/security/selinux/ss/constraint.h
index 96fd947c494b..33ae2aec4f36 100644
--- a/security/selinux/ss/constraint.h
+++ b/security/selinux/ss/constraint.h
@@ -10,7 +10,7 @@
* process from labeling an object with a different user
* identity.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SS_CONSTRAINT_H_
#define _SS_CONSTRAINT_H_
diff --git a/security/selinux/ss/context.h b/security/selinux/ss/context.h
index 212e3479a0d9..a2c0f37c42ae 100644
--- a/security/selinux/ss/context.h
+++ b/security/selinux/ss/context.h
@@ -10,7 +10,7 @@
* security server and can be changed without affecting
* clients of the security server.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SS_CONTEXT_H_
#define _SS_CONTEXT_H_
diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c
index ad38299164c3..fc28149a4f2e 100644
--- a/security/selinux/ss/ebitmap.c
+++ b/security/selinux/ss/ebitmap.c
@@ -1,7 +1,7 @@
/*
* Implementation of the extensible bitmap type.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
/*
* Updated: Hewlett-Packard <paul@paul-moore.com>
diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h
index 6d5a9ac4251f..da1325dda550 100644
--- a/security/selinux/ss/ebitmap.h
+++ b/security/selinux/ss/ebitmap.h
@@ -9,7 +9,7 @@
* an explicitly specified starting bit position within
* the total bitmap.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SS_EBITMAP_H_
#define _SS_EBITMAP_H_
diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c
index 3858706a29fb..686c3917064c 100644
--- a/security/selinux/ss/hashtab.c
+++ b/security/selinux/ss/hashtab.c
@@ -1,7 +1,7 @@
/*
* Implementation of the hash table type.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#include <linux/kernel.h>
#include <linux/slab.h>
diff --git a/security/selinux/ss/hashtab.h b/security/selinux/ss/hashtab.h
index 953872cd84ab..009fb5e06172 100644
--- a/security/selinux/ss/hashtab.h
+++ b/security/selinux/ss/hashtab.h
@@ -5,7 +5,7 @@
* functions for hash computation and key comparison are
* provided by the creator of the table.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SS_HASHTAB_H_
#define _SS_HASHTAB_H_
diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c
index e1088842232c..d9dc34f4fade 100644
--- a/security/selinux/ss/mls.c
+++ b/security/selinux/ss/mls.c
@@ -1,7 +1,7 @@
/*
* Implementation of the multi-level security (MLS) policy.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
/*
* Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
diff --git a/security/selinux/ss/mls.h b/security/selinux/ss/mls.h
index e4369e3e6366..0f0a1d65b2ce 100644
--- a/security/selinux/ss/mls.h
+++ b/security/selinux/ss/mls.h
@@ -1,7 +1,7 @@
/*
* Multi-level security (MLS) policy operations.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
/*
* Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
diff --git a/security/selinux/ss/mls_types.h b/security/selinux/ss/mls_types.h
index e93648774137..47f3702cd596 100644
--- a/security/selinux/ss/mls_types.h
+++ b/security/selinux/ss/mls_types.h
@@ -1,7 +1,7 @@
/*
* Type definitions for the multi-level security (MLS) policy.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
/*
* Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index aa6500abb178..6e8c8056d7ad 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -1,7 +1,7 @@
/*
* Implementation of the policy database.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
/*
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h
index 5d23eed35fa7..215f8f30ac5a 100644
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -2,7 +2,7 @@
* A policy database (policydb) specifies the
* configuration data for the security policy.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
/*
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 2f02fa67ec2e..e4a1c0dc561a 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -1,7 +1,7 @@
/*
* Implementation of the security services.
*
- * Authors : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Authors : Stephen Smalley, <sds@tycho.nsa.gov>
* James Morris <jmorris@redhat.com>
*
* Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
@@ -76,7 +76,8 @@ char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX] = {
"open_perms",
"extended_socket_class",
"always_check_network",
- "cgroup_seclabel"
+ "cgroup_seclabel",
+ "nnp_nosuid_transition"
};
int selinux_policycap_netpeer;
@@ -84,6 +85,7 @@ int selinux_policycap_openperm;
int selinux_policycap_extsockclass;
int selinux_policycap_alwaysnetwork;
int selinux_policycap_cgroupseclabel;
+int selinux_policycap_nnp_nosuid_transition;
static DEFINE_RWLOCK(policy_rwlock);
@@ -2009,6 +2011,9 @@ static void security_load_policycaps(void)
selinux_policycap_cgroupseclabel =
ebitmap_get_bit(&policydb.policycaps,
POLICYDB_CAPABILITY_CGROUPSECLABEL);
+ selinux_policycap_nnp_nosuid_transition =
+ ebitmap_get_bit(&policydb.policycaps,
+ POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION);
for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++)
pr_info("SELinux: policy capability %s=%d\n",
diff --git a/security/selinux/ss/services.h b/security/selinux/ss/services.h
index 6abcd8729ec3..3d9fa9556b4f 100644
--- a/security/selinux/ss/services.h
+++ b/security/selinux/ss/services.h
@@ -1,7 +1,7 @@
/*
* Implementation of the security services.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SS_SERVICES_H_
#define _SS_SERVICES_H_
diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c
index c5f436b15d19..6ae08efc5ae7 100644
--- a/security/selinux/ss/sidtab.c
+++ b/security/selinux/ss/sidtab.c
@@ -1,7 +1,7 @@
/*
* Implementation of the SID table type.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#include <linux/kernel.h>
#include <linux/slab.h>
diff --git a/security/selinux/ss/sidtab.h b/security/selinux/ss/sidtab.h
index 84dc154d9389..de5d0ea583d2 100644
--- a/security/selinux/ss/sidtab.h
+++ b/security/selinux/ss/sidtab.h
@@ -2,7 +2,7 @@
* A security identifier table (sidtab) is a hash table
* of security context structures indexed by SID value.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SS_SIDTAB_H_
#define _SS_SIDTAB_H_
diff --git a/security/selinux/ss/symtab.c b/security/selinux/ss/symtab.c
index 160326ee99e5..d1a6745849a7 100644
--- a/security/selinux/ss/symtab.c
+++ b/security/selinux/ss/symtab.c
@@ -1,7 +1,7 @@
/*
* Implementation of the symbol table type.
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#include <linux/kernel.h>
#include <linux/string.h>
diff --git a/security/selinux/ss/symtab.h b/security/selinux/ss/symtab.h
index ca422b42fbc0..0bc12d587d3a 100644
--- a/security/selinux/ss/symtab.h
+++ b/security/selinux/ss/symtab.h
@@ -4,7 +4,7 @@
* is arbitrary. The symbol table type is implemented
* using the hash table type (hashtab).
*
- * Author : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * Author : Stephen Smalley, <sds@tycho.nsa.gov>
*/
#ifndef _SS_SYMTAB_H_
#define _SS_SYMTAB_H_
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 2a37ae925d85..140ae638cfd6 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_PHYS_ADDR = 1U << 19,
- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};
/*
@@ -814,6 +815,7 @@ enum perf_event_type {
* { u64 transaction; } && PERF_SAMPLE_TRANSACTION
* { u64 abi; # enum perf_sample_regs_abi
* u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+ * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
* };
*/
PERF_RECORD_SAMPLE = 9,
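
PERF_SAMPLE_PHYS_ADDR claims bit 19 and appends one u64 to each PERF_RECORD_SAMPLE, as the layout comment shows. A minimal perf_event_open() sketch requesting it; this assumes a kernel new enough to know the bit (older kernels reject the attr with EINVAL):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int open_phys_addr_sampling(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
			   PERF_SAMPLE_PHYS_ADDR;

	/* measure this thread on any CPU, no group, no flags */
	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}
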
diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt
index ab1b0825130a..76971d2e4164 100644
--- a/tools/perf/Documentation/intel-pt.txt
+++ b/tools/perf/Documentation/intel-pt.txt
@@ -873,7 +873,7 @@ amended to take the number of elements as a parameter.
$ cat ~/.perfconfig
[intel-pt]
- mispred-all
+ mispred-all = on
$ perf record -e intel_pt//u ./sort 3000
Bubble sorting array of 3000 elements
diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index 73496320fca3..4be08a1e3f8d 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -59,6 +59,10 @@ OPTIONS
--ldload::
Specify desired latency for loads event.
+-p::
+--phys-data::
+ Record/Report sample physical addresses
+
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 9bdea047c5db..e397453e5a46 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -249,7 +249,10 @@ OPTIONS
-d::
--data::
- Record the sample addresses.
+ Record the sample virtual addresses.
+
+--phys-data::
+ Record the sample physical addresses.
-T::
--timestamp::
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 9fa84617181e..383a98d992ed 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -137,6 +137,7 @@ OPTIONS
- mem: type of memory access for the data at the time of the sample
- snoop: type of snoop (if any) for the data at the time of the sample
- dcacheline: the cacheline the data address is on at the time of the sample
+ - phys_daddr: physical address of the data at the time of the sample
And the default sort keys are changed to local_weight, mem, sym, dso,
symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 5ee8796be96e..18dfcfa38454 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -117,7 +117,7 @@ OPTIONS
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackoff,
- callindent, insn, insnlen, synth.
+ callindent, insn, insnlen, synth, phys_addr.
Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index c1e3288a2dfb..d53bea6bd571 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -37,7 +37,7 @@ OPTIONS
--expr::
--event::
List of syscalls and other perf events (tracepoints, HW cache events,
- etc) to show.
+ etc) to show. Globbing is supported, e.g.: "epoll_*", "*msg*", etc.
See 'perf list' for a complete list of events.
Prefixing with ! shows all syscalls but the ones specified. You may
need to escape it.
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index e001c0290793..0f15634ef82c 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -23,6 +23,7 @@ struct perf_mem {
bool hide_unresolved;
bool dump_raw;
bool force;
+ bool phys_addr;
int operation;
const char *cpu_list;
DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
@@ -101,6 +102,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
rec_argv[i++] = "-d";
+ if (mem->phys_addr)
+ rec_argv[i++] = "--phys-data";
+
for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) {
if (!perf_mem_events[j].record)
continue;
@@ -161,30 +165,60 @@ dump_raw_samples(struct perf_tool *tool,
if (al.map != NULL)
al.map->dso->hit = 1;
- if (symbol_conf.field_sep) {
- fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64
- "%s0x%"PRIx64"%s%s:%s\n";
+ if (mem->phys_addr) {
+ if (symbol_conf.field_sep) {
+ fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s0x%016"PRIx64
+ "%s%"PRIu64"%s0x%"PRIx64"%s%s:%s\n";
+ } else {
+ fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
+ "%s0x%016"PRIx64"%s%5"PRIu64"%s0x%06"PRIx64
+ "%s%s:%s\n";
+ symbol_conf.field_sep = " ";
+ }
+
+ printf(fmt,
+ sample->pid,
+ symbol_conf.field_sep,
+ sample->tid,
+ symbol_conf.field_sep,
+ sample->ip,
+ symbol_conf.field_sep,
+ sample->addr,
+ symbol_conf.field_sep,
+ sample->phys_addr,
+ symbol_conf.field_sep,
+ sample->weight,
+ symbol_conf.field_sep,
+ sample->data_src,
+ symbol_conf.field_sep,
+ al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
+ al.sym ? al.sym->name : "???");
} else {
- fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
- "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n";
- symbol_conf.field_sep = " ";
- }
+ if (symbol_conf.field_sep) {
+ fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64
+ "%s0x%"PRIx64"%s%s:%s\n";
+ } else {
+ fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
+ "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n";
+ symbol_conf.field_sep = " ";
+ }
- printf(fmt,
- sample->pid,
- symbol_conf.field_sep,
- sample->tid,
- symbol_conf.field_sep,
- sample->ip,
- symbol_conf.field_sep,
- sample->addr,
- symbol_conf.field_sep,
- sample->weight,
- symbol_conf.field_sep,
- sample->data_src,
- symbol_conf.field_sep,
- al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
- al.sym ? al.sym->name : "???");
+ printf(fmt,
+ sample->pid,
+ symbol_conf.field_sep,
+ sample->tid,
+ symbol_conf.field_sep,
+ sample->ip,
+ symbol_conf.field_sep,
+ sample->addr,
+ symbol_conf.field_sep,
+ sample->weight,
+ symbol_conf.field_sep,
+ sample->data_src,
+ symbol_conf.field_sep,
+ al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
+ al.sym ? al.sym->name : "???");
+ }
out_put:
addr_location__put(&al);
return 0;
@@ -224,7 +258,10 @@ static int report_raw_events(struct perf_mem *mem)
if (ret < 0)
goto out_delete;
- printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
+ if (mem->phys_addr)
+ printf("# PID, TID, IP, ADDR, PHYS ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
+ else
+ printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
ret = perf_session__process_events(session);
@@ -254,9 +291,16 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem)
* there is no weight (cost) associated with stores, so don't print
* the column
*/
- if (!(mem->operation & MEM_OPERATION_LOAD))
- rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
- "dso_daddr,tlb,locked";
+ if (!(mem->operation & MEM_OPERATION_LOAD)) {
+ if (mem->phys_addr)
+ rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
+ "dso_daddr,tlb,locked,phys_daddr";
+ else
+ rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr,"
+ "dso_daddr,tlb,locked";
+ } else if (mem->phys_addr)
+ rep_argv[i++] = "--sort=local_weight,mem,sym,dso,symbol_daddr,"
+ "dso_daddr,snoop,tlb,locked,phys_daddr";
for (j = 1; j < argc; j++, i++)
rep_argv[i] = argv[j];
@@ -373,6 +417,7 @@ int cmd_mem(int argc, const char **argv)
"separator for columns, no spaces will be added"
" between columns '.' is reserved."),
OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"),
+ OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"),
OPT_END()
};
const char *const mem_subcommands[] = { "record", "report", NULL };
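
Putting the new option together, a typical session looks like this (./workload stands in for any program):

  $ perf mem record --phys-data ./workload
  $ perf mem report --phys-data

record forwards -d plus --phys-data to perf record; report then adds phys_daddr to its sort keys as shown above.
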
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 36d7117a7562..56f8142ff97f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1604,6 +1604,8 @@ static struct option __record_options[] = {
OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
"per thread counts"),
OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
+ OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
+ "Record the sample physical addresses"),
OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
&record.opts.sample_time_set,
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 378f76cdf923..3d4c3b5e1868 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -87,6 +87,7 @@ enum perf_output_field {
PERF_OUTPUT_BRSTACKINSN = 1U << 23,
PERF_OUTPUT_BRSTACKOFF = 1U << 24,
PERF_OUTPUT_SYNTH = 1U << 25,
+ PERF_OUTPUT_PHYS_ADDR = 1U << 26,
};
struct output_option {
@@ -119,6 +120,7 @@ struct output_option {
{.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN},
{.str = "brstackoff", .field = PERF_OUTPUT_BRSTACKOFF},
{.str = "synth", .field = PERF_OUTPUT_SYNTH},
+ {.str = "phys_addr", .field = PERF_OUTPUT_PHYS_ADDR},
};
enum {
@@ -175,7 +177,8 @@ static struct {
PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
PERF_OUTPUT_SYM | PERF_OUTPUT_DSO |
PERF_OUTPUT_PERIOD | PERF_OUTPUT_ADDR |
- PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT,
+ PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT |
+ PERF_OUTPUT_PHYS_ADDR,
.invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT,
},
@@ -382,6 +385,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
PERF_OUTPUT_IREGS))
return -EINVAL;
+ if (PRINT_FIELD(PHYS_ADDR) &&
+ perf_evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR",
+ PERF_OUTPUT_PHYS_ADDR))
+ return -EINVAL;
+
return 0;
}
@@ -1446,6 +1454,9 @@ static void process_event(struct perf_script *script,
if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
print_sample_bpf_output(sample);
print_insn(sample, attr, thread, machine);
+
+ if (PRINT_FIELD(PHYS_ADDR))
+ printf("%16" PRIx64, sample->phys_addr);
printf("\n");
}
@@ -2729,7 +2740,7 @@ int cmd_script(int argc, const char **argv)
"Valid types: hw,sw,trace,raw,synth. "
"Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
"addr,symoff,period,iregs,brstack,brstacksym,flags,"
- "bpf-output,callindent,insn,insnlen,brstackinsn,synth",
+ "bpf-output,callindent,insn,insnlen,brstackinsn,synth,phys_addr",
parse_output_fields),
OPT_BOOLEAN('a', "all-cpus", &system_wide,
"system-wide collection from all CPUs"),
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 866da7aa54bf..85e992d9215b 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1257,7 +1257,7 @@ static bool collect_data(struct perf_evsel *counter,
if (counter->merged_stat)
return false;
cb(counter, data, true);
- if (!no_merge)
+ if (!no_merge && counter->auto_merge_stats)
collect_all_aliases(counter, cb, data);
return true;
}
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index d59cdadf3a79..771ddab94bb0 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1261,6 +1261,7 @@ static int trace__read_syscall_info(struct trace *trace, int id)
static int trace__validate_ev_qualifier(struct trace *trace)
{
int err = 0, i;
+ size_t nr_allocated;
struct str_node *pos;
trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
@@ -1274,13 +1275,18 @@ static int trace__validate_ev_qualifier(struct trace *trace)
goto out;
}
+ nr_allocated = trace->ev_qualifier_ids.nr;
i = 0;
strlist__for_each_entry(pos, trace->ev_qualifier) {
const char *sc = pos->s;
- int id = syscalltbl__id(trace->sctbl, sc);
+ int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
if (id < 0) {
+ id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
+ if (id >= 0)
+ goto matches;
+
if (err == 0) {
fputs("Error:\tInvalid syscall ", trace->output);
err = -EINVAL;
@@ -1290,13 +1296,37 @@ static int trace__validate_ev_qualifier(struct trace *trace)
fputs(sc, trace->output);
}
-
+matches:
trace->ev_qualifier_ids.entries[i++] = id;
+ if (match_next == -1)
+ continue;
+
+ while (1) {
+ id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
+ if (id < 0)
+ break;
+ if (nr_allocated == trace->ev_qualifier_ids.nr) {
+ void *entries;
+
+ nr_allocated += 8;
+ entries = realloc(trace->ev_qualifier_ids.entries,
+ nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
+ if (entries == NULL) {
+ err = -ENOMEM;
+ fputs("\nError:\t Not enough memory for parsing\n", trace->output);
+ goto out_free;
+ }
+ trace->ev_qualifier_ids.entries = entries;
+ }
+ trace->ev_qualifier_ids.nr++;
+ trace->ev_qualifier_ids.entries[i++] = id;
+ }
}
if (err < 0) {
fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
"\nHint:\tand: 'man syscalls'\n", trace->output);
+out_free:
zfree(&trace->ev_qualifier_ids.entries);
trace->ev_qualifier_ids.nr = 0;
}
@@ -2814,7 +2844,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
struct trace *trace = (struct trace *)opt->value;
const char *s = str;
char *sep = NULL, *lists[2] = { NULL, NULL, };
- int len = strlen(str) + 1, err = -1, list;
+ int len = strlen(str) + 1, err = -1, list, idx;
char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
char group_name[PATH_MAX];
@@ -2831,7 +2861,8 @@ static int trace__parse_events_option(const struct option *opt, const char *str,
*sep = '\0';
list = 0;
- if (syscalltbl__id(trace->sctbl, s) >= 0) {
+ if (syscalltbl__id(trace->sctbl, s) >= 0 ||
+ syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
list = 1;
} else {
path__join(group_name, sizeof(group_name), strace_groups_dir, s);
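
With globbing wired into both qualifier validation and option parsing, event lists now accept wildcards, e.g.:

  $ perf trace -e "epoll_*" ls
  $ perf trace -e "*msg*" ls

Each glob is expanded to every matching syscall id, growing ev_qualifier_ids in chunks of 8 via the realloc() loop above.
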
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 2c010dd6a79d..dc442ba21bf6 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -43,6 +43,7 @@ struct record_opts {
bool no_samples;
bool raw_samples;
bool sample_address;
+ bool sample_phys_addr;
bool sample_weight;
bool sample_time;
bool sample_time_set;
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/frontend.json b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json
index 7e62c46d7a20..c63a919eda98 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/frontend.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json
@@ -80,11 +80,6 @@
"BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load."
},
{,
- "EventCode": "0x400F0",
- "EventName": "PM_LD_MISS_L1",
- "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load."
- },
- {,
"EventCode": "0x2E01A",
"EventName": "PM_CMPLU_STALL_LSU_FLUSH_NEXT",
"BriefDescription": "Completion stall of one cycle because the LSU requested to flush the next iop in the sequence. It takes 1 cycle for the ISU to process this request before the LSU instruction is allowed to complete"
@@ -374,4 +369,4 @@
"EventName": "PM_IPTEG_FROM_L31_ECO_MOD",
"BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a instruction side request"
}
-]
\ No newline at end of file
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/other.json b/tools/perf/pmu-events/arch/powerpc/power9/other.json
index 00f3d2a21f31..54cc3be00fc2 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/other.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/other.json
@@ -605,11 +605,6 @@
"BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)"
},
{,
- "EventCode": "0x3689E",
- "EventName": "PM_L2_RTY_LD",
- "BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)"
- },
- {,
"EventCode": "0xE08C",
"EventName": "PM_LSU0_ERAT_HIT",
"BriefDescription": "Primary ERAT hit. There is no secondary ERAT"
@@ -715,11 +710,6 @@
"BriefDescription": "Lifetime, sample of RD machine 0 valid"
},
{,
- "EventCode": "0x468B4",
- "EventName": "PM_L3_RD0_BUSY",
- "BriefDescription": "Lifetime, sample of RD machine 0 valid"
- },
- {,
"EventCode": "0x46080",
"EventName": "PM_L2_DISP_ALL_L2MISS",
"BriefDescription": "All successful Ld/St dispatches for this thread that were an L2 miss (excludes i_l2mru_tch_reqs)"
@@ -850,21 +840,11 @@
"BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)"
},
{,
- "EventCode": "0x2608C",
- "EventName": "PM_RC0_BUSY",
- "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)"
- },
- {,
"EventCode": "0x36082",
"EventName": "PM_L2_LD_DISP",
"BriefDescription": "All successful I-or-D side load dispatches for this thread (excludes i_l2mru_tch_reqs)."
},
{,
- "EventCode": "0x1609E",
- "EventName": "PM_L2_LD_DISP",
- "BriefDescription": "All successful D side load dispatches for this thread (L2 miss + L2 hits)"
- },
- {,
"EventCode": "0xF8B0",
"EventName": "PM_L3_SW_PREF",
"BriefDescription": "L3 load prefetch, sourced from a software prefetch stream, was sent to the nest"
@@ -1040,11 +1020,6 @@
"BriefDescription": "L3 castouts in Mepf state for this thread"
},
{,
- "EventCode": "0x168A0",
- "EventName": "PM_L3_CO_MEPF",
- "BriefDescription": "L3 CO of line in Mep state (includes casthrough to memory). The Mepf state indicates that a line was brought in to satisfy an L3 prefetch request"
- },
- {,
"EventCode": "0x460A2",
"EventName": "PM_L3_LAT_CI_HIT",
"BriefDescription": "L3 Lateral Castins Hit"
@@ -1150,11 +1125,6 @@
"BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)"
},
{,
- "EventCode": "0x4689E",
- "EventName": "PM_L2_RTY_ST",
- "BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)"
- },
- {,
"EventCode": "0x24040",
"EventName": "PM_INST_FROM_L2_MEPF",
"BriefDescription": "The processor's Instruction cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to an instruction fetch (not prefetch)"
@@ -1255,11 +1225,6 @@
"BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point)"
},
{,
- "EventCode": "0x4608C",
- "EventName": "PM_CO0_BUSY",
- "BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point)"
- },
- {,
"EventCode": "0x2C122",
"EventName": "PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC",
"BriefDescription": "Duration in cycles to reload from local core's L3 with dispatch conflict due to a marked load"
@@ -1395,11 +1360,6 @@
"BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's Memory due to a instruction side request"
},
{,
- "EventCode": "0x40006",
- "EventName": "PM_ISLB_MISS",
- "BriefDescription": "Number of ISLB misses for this thread"
- },
- {,
"EventCode": "0xD8A8",
"EventName": "PM_ISLB_MISS",
"BriefDescription": "Instruction SLB miss - Total of all segment sizes"
@@ -1515,11 +1475,6 @@
"BriefDescription": "All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs)."
},
{,
- "EventCode": "0x3609E",
- "EventName": "PM_L2_INST",
- "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)"
- },
- {,
"EventCode": "0x3504C",
"EventName": "PM_IPTEG_FROM_DL4",
"BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a instruction side request"
@@ -1690,11 +1645,6 @@
"BriefDescription": "All successful I-or-D side load dispatches for this thread that were L2 hits (excludes i_l2mru_tch_reqs)"
},
{,
- "EventCode": "0x2609E",
- "EventName": "PM_L2_LD_HIT",
- "BriefDescription": "All successful D side load dispatches for this thread that were L2 hits for this thread"
- },
- {,
"EventCode": "0x168AC",
"EventName": "PM_L3_CI_USAGE",
"BriefDescription": "Rotating sample of 16 CI or CO actives"
@@ -1795,21 +1745,11 @@
"BriefDescription": "Rotating sample of 8 WI valid"
},
{,
- "EventCode": "0x260B6",
- "EventName": "PM_L3_WI0_BUSY",
- "BriefDescription": "Rotating sample of 8 WI valid (duplicate)"
- },
- {,
"EventCode": "0x368AC",
"EventName": "PM_L3_CO0_BUSY",
"BriefDescription": "Lifetime, sample of CO machine 0 valid"
},
{,
- "EventCode": "0x468AC",
- "EventName": "PM_L3_CO0_BUSY",
- "BriefDescription": "Lifetime, sample of CO machine 0 valid"
- },
- {,
"EventCode": "0x2E040",
"EventName": "PM_DPTEG_FROM_L2_MEPF",
"BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included"
@@ -1840,11 +1780,6 @@
"BriefDescription": "L3 PF received retry port 0, every retry counted"
},
{,
- "EventCode": "0x260AE",
- "EventName": "PM_L3_P0_PF_RTY",
- "BriefDescription": "L3 PF received retry port 0, every retry counted"
- },
- {,
"EventCode": "0x268B2",
"EventName": "PM_L3_LOC_GUESS_WRONG",
"BriefDescription": "Initial scope=node (LNS) but data from out side local node (near or far or rem). Prediction too Low"
@@ -1895,11 +1830,6 @@
"BriefDescription": "Lifetime, sample of snooper machine 0 valid"
},
{,
- "EventCode": "0x460AC",
- "EventName": "PM_L3_SN0_BUSY",
- "BriefDescription": "Lifetime, sample of snooper machine 0 valid"
- },
- {,
"EventCode": "0x3005C",
"EventName": "PM_BFU_BUSY",
"BriefDescription": "Cycles in which all 4 Binary Floating Point units are busy. The BFU is running at capacity"
@@ -1935,11 +1865,6 @@
"BriefDescription": "Lifetime, sample of PF machine 0 valid"
},
{,
- "EventCode": "0x460B4",
- "EventName": "PM_L3_PF0_BUSY",
- "BriefDescription": "Lifetime, sample of PF machine 0 valid"
- },
- {,
"EventCode": "0xC0B0",
"EventName": "PM_LSU_FLUSH_UE",
"BriefDescription": "Correctable ECC error on reload data, reported at critical data forward time"
@@ -2085,11 +2010,6 @@
"BriefDescription": "L3 CO received retry port 1 (memory only), every retry counted"
},
{,
- "EventCode": "0x468AE",
- "EventName": "PM_L3_P1_CO_RTY",
- "BriefDescription": "L3 CO received retry port 3 (memory only), every retry counted"
- },
- {,
"EventCode": "0xC0AC",
"EventName": "PM_LSU_FLUSH_EMSH",
"BriefDescription": "An ERAT miss was detected after a set-p hit. Erat tracker indicates fail due to tlbmiss and the instruction gets flushed because the instruction was working on the wrong address"
@@ -2195,11 +2115,6 @@
"BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)"
},
{,
- "EventCode": "0x46886",
- "EventName": "PM_L2_SN_M_WR_DONE",
- "BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)"
- },
- {,
"EventCode": "0x489C",
"EventName": "PM_BR_CORECT_PRED_TAKEN_CMPL",
"BriefDescription": "Conditional Branch Completed in which the HW correctly predicted the direction as taken. Counted at completion time"
@@ -2290,21 +2205,11 @@
"BriefDescription": "SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point)"
},
{,
- "EventCode": "0x26090",
- "EventName": "PM_SN0_BUSY",
- "BriefDescription": "SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point)"
- },
- {,
"EventCode": "0x360AE",
"EventName": "PM_L3_P0_CO_RTY",
"BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted"
},
{,
- "EventCode": "0x460AE",
- "EventName": "PM_L3_P0_CO_RTY",
- "BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted"
- },
- {,
"EventCode": "0x168A8",
"EventName": "PM_L3_WI_USAGE",
"BriefDescription": "Lifetime, sample of Write Inject machine 0 valid"
@@ -2340,26 +2245,11 @@
"BriefDescription": "L3 PF received retry port 1, every retry counted"
},
{,
- "EventCode": "0x268AE",
- "EventName": "PM_L3_P1_PF_RTY",
- "BriefDescription": "L3 PF received retry port 3, every retry counted"
- },
- {,
"EventCode": "0x46082",
"EventName": "PM_L2_ST_DISP",
"BriefDescription": "All successful D-side store dispatches for this thread "
},
{,
- "EventCode": "0x1689E",
- "EventName": "PM_L2_ST_DISP",
- "BriefDescription": "All successful D-side store dispatches for this thread (L2 miss + L2 hits)"
- },
- {,
- "EventCode": "0x36880",
- "EventName": "PM_L2_INST_MISS",
- "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)"
- },
- {,
"EventCode": "0x4609E",
"EventName": "PM_L2_INST_MISS",
"BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)"
@@ -2430,11 +2320,6 @@
"BriefDescription": "# PPC Dispatched"
},
{,
- "EventCode": "0x300F2",
- "EventName": "PM_INST_DISP",
- "BriefDescription": "# PPC Dispatched"
- },
- {,
"EventCode": "0x4E05E",
"EventName": "PM_TM_OUTER_TBEGIN_DISP",
"BriefDescription": "Number of outer tbegin instructions dispatched. The dispatch unit determines whether the tbegin instruction is outer or nested. This is a speculative count, which includes flushed instructions"
@@ -2460,11 +2345,6 @@
"BriefDescription": "All successful D-side store dispatches for this thread that were L2 hits"
},
{,
- "EventCode": "0x2689E",
- "EventName": "PM_L2_ST_HIT",
- "BriefDescription": "All successful D-side store dispatches that were L2 hits for this thread"
- },
- {,
"EventCode": "0x360A8",
"EventName": "PM_L3_CO",
"BriefDescription": "L3 castout occurring (does not include casthrough or log writes (cinj/dmaw))"
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json
index 47a82568a8df..bc2db636dabf 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json
@@ -420,11 +420,6 @@
"BriefDescription": "Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for an instruction fetch"
},
{,
- "EventCode": "0x10016",
- "EventName": "PM_DSLB_MISS",
- "BriefDescription": "Data SLB Miss - Total of all segment sizes"
- },
- {,
"EventCode": "0xD0A8",
"EventName": "PM_DSLB_MISS",
"BriefDescription": "Data SLB Miss - Total of all segment sizes"
@@ -554,4 +549,4 @@
"EventName": "PM_MRK_DATA_FROM_L21_SHR_CYC",
"BriefDescription": "Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load"
}
-]
\ No newline at end of file
+]
diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pmc.json b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json
index a2c95a99e168..3ef8a10aac86 100644
--- a/tools/perf/pmu-events/arch/powerpc/power9/pmc.json
+++ b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json
@@ -5,11 +5,6 @@
"BriefDescription": "Branches that are not strongly biased"
},
{,
- "EventCode": "0x40036",
- "EventName": "PM_BR_2PATH",
- "BriefDescription": "Branches that are not strongly biased"
- },
- {,
"EventCode": "0x40056",
"EventName": "PM_MEM_LOC_THRESH_LSU_HIGH",
"BriefDescription": "Local memory above threshold for LSU medium"
@@ -124,4 +119,4 @@
"EventName": "PM_1FLOP_CMPL",
"BriefDescription": "one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation completed"
}
-]
\ No newline at end of file
+]
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index 761c5a448c56..466a462b26d1 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -237,6 +237,11 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode,
thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al);
if (!al.map || !al.map->dso) {
+ if (cpumode == PERF_RECORD_MISC_HYPERVISOR) {
+ pr_debug("Hypervisor address can not be resolved - skipping\n");
+ return 0;
+ }
+
pr_debug("thread__find_addr_map failed\n");
return -1;
}
diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c
index 6d028f42b3cf..c3858487159d 100644
--- a/tools/perf/tests/sample-parsing.c
+++ b/tools/perf/tests/sample-parsing.c
@@ -141,6 +141,9 @@ static bool samples_same(const struct perf_sample *s1,
}
}
+ if (type & PERF_SAMPLE_PHYS_ADDR)
+ COMP(phys_addr);
+
return true;
}
@@ -206,6 +209,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format)
.mask = sample_regs,
.regs = regs,
},
+ .phys_addr = 113,
};
struct sample_read_value values[] = {{1, 5}, {9, 3}, {2, 7}, {6, 4},};
struct perf_sample sample_out;
@@ -305,7 +309,7 @@ int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_u
* were added. Please actually update the test rather than just change
* the condition below.
*/
- if (PERF_SAMPLE_MAX > PERF_SAMPLE_REGS_INTR << 1) {
+ if (PERF_SAMPLE_MAX > PERF_SAMPLE_PHYS_ADDR << 1) {
pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n");
return -1;
}
diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c
index ba0aee576a2b..786fecaf578e 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -829,7 +829,8 @@ static int annotate_browser__run(struct annotate_browser *browser,
"q/ESC/CTRL+C Exit\n\n"
"ENTER Go to target\n"
"ESC Exit\n"
- "H Cycle thru hottest instructions\n"
+ "H Go to hottest instruction\n"
+ "TAB/shift+TAB Cycle thru hottest instructions\n"
"j Toggle showing jump to target arrows\n"
"J Toggle showing number of jump sources on targets\n"
"n Search next string\n"
diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c
index f4bc2462bc2c..13dfb0a0bdeb 100644
--- a/tools/perf/ui/browsers/hists.c
+++ b/tools/perf/ui/browsers/hists.c
@@ -931,12 +931,8 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser,
browser->show_dso);
if (symbol_conf.show_branchflag_count) {
- if (need_percent)
- callchain_list_counts__printf_value(node, chain, NULL,
- buf, sizeof(buf));
- else
- callchain_list_counts__printf_value(NULL, chain, NULL,
- buf, sizeof(buf));
+ callchain_list_counts__printf_value(chain, NULL,
+ buf, sizeof(buf));
if (asprintf(&alloc_str2, "%s%s", str, buf) < 0)
str = "Not enough memory!";
diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c
index 5c95b8301c67..8bdb7a500181 100644
--- a/tools/perf/ui/stdio/hist.c
+++ b/tools/perf/ui/stdio/hist.c
@@ -124,12 +124,8 @@ static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_node *node,
str = callchain_list__sym_name(chain, bf, sizeof(bf), false);
if (symbol_conf.show_branchflag_count) {
- if (!period)
- callchain_list_counts__printf_value(node, chain, NULL,
- buf, sizeof(buf));
- else
- callchain_list_counts__printf_value(NULL, chain, NULL,
- buf, sizeof(buf));
+ callchain_list_counts__printf_value(chain, NULL,
+ buf, sizeof(buf));
if (asprintf(&alloc_str, "%s%s", str, buf) < 0)
str = "Not enough memory!";
@@ -313,7 +309,7 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root,
if (symbol_conf.show_branchflag_count)
ret += callchain_list_counts__printf_value(
- NULL, chain, fp, NULL, 0);
+ chain, fp, NULL, 0);
ret += fprintf(fp, "\n");
if (++entries_printed == callchain_param.print_limit)
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index f320b0777e0d..510b513e0f01 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -588,7 +588,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor)
call->cycles_count =
cursor_node->branch_flags.cycles;
call->iter_count = cursor_node->nr_loop_iter;
- call->samples_count = cursor_node->samples;
+ call->iter_cycles = cursor_node->iter_cycles;
}
}
@@ -722,7 +722,7 @@ static enum match_result match_chain(struct callchain_cursor_node *node,
cnode->cycles_count +=
node->branch_flags.cycles;
cnode->iter_count += node->nr_loop_iter;
- cnode->samples_count += node->samples;
+ cnode->iter_cycles += node->iter_cycles;
}
}
@@ -998,7 +998,7 @@ int callchain_merge(struct callchain_cursor *cursor,
int callchain_cursor_append(struct callchain_cursor *cursor,
u64 ip, struct map *map, struct symbol *sym,
bool branch, struct branch_flags *flags,
- int nr_loop_iter, int samples, u64 branch_from)
+ int nr_loop_iter, u64 iter_cycles, u64 branch_from)
{
struct callchain_cursor_node *node = *cursor->last;
@@ -1016,7 +1016,7 @@ int callchain_cursor_append(struct callchain_cursor *cursor,
node->sym = sym;
node->branch = branch;
node->nr_loop_iter = nr_loop_iter;
- node->samples = samples;
+ node->iter_cycles = iter_cycles;
if (flags)
memcpy(&node->branch_flags, flags,
@@ -1306,7 +1306,7 @@ static int branch_to_str(char *bf, int bfsize,
static int branch_from_str(char *bf, int bfsize,
u64 branch_count,
u64 cycles_count, u64 iter_count,
- u64 samples_count)
+ u64 iter_cycles)
{
int printed = 0, i = 0;
u64 cycles;
@@ -1318,9 +1318,13 @@ static int branch_from_str(char *bf, int bfsize,
bf + printed, bfsize - printed);
}
- if (iter_count && samples_count) {
- printed += count_pri64_printf(i++, "iterations",
- iter_count / samples_count,
+ if (iter_count) {
+ printed += count_pri64_printf(i++, "iter",
+ iter_count,
+ bf + printed, bfsize - printed);
+
+ printed += count_pri64_printf(i++, "avg_cycles",
+ iter_cycles / iter_count,
bf + printed, bfsize - printed);
}
@@ -1333,7 +1337,7 @@ static int branch_from_str(char *bf, int bfsize,
static int counts_str_build(char *bf, int bfsize,
u64 branch_count, u64 predicted_count,
u64 abort_count, u64 cycles_count,
- u64 iter_count, u64 samples_count,
+ u64 iter_count, u64 iter_cycles,
struct branch_type_stat *brtype_stat)
{
int printed;
@@ -1346,7 +1350,7 @@ static int counts_str_build(char *bf, int bfsize,
predicted_count, abort_count, brtype_stat);
} else {
printed = branch_from_str(bf, bfsize, branch_count,
- cycles_count, iter_count, samples_count);
+ cycles_count, iter_count, iter_cycles);
}
if (!printed)
@@ -1358,14 +1362,14 @@ static int counts_str_build(char *bf, int bfsize,
static int callchain_counts_printf(FILE *fp, char *bf, int bfsize,
u64 branch_count, u64 predicted_count,
u64 abort_count, u64 cycles_count,
- u64 iter_count, u64 samples_count,
+ u64 iter_count, u64 iter_cycles,
struct branch_type_stat *brtype_stat)
{
char str[256];
counts_str_build(str, sizeof(str), branch_count,
predicted_count, abort_count, cycles_count,
- iter_count, samples_count, brtype_stat);
+ iter_count, iter_cycles, brtype_stat);
if (fp)
return fprintf(fp, "%s", str);
@@ -1373,31 +1377,23 @@ static int callchain_counts_printf(FILE *fp, char *bf, int bfsize,
return scnprintf(bf, bfsize, "%s", str);
}
-int callchain_list_counts__printf_value(struct callchain_node *node,
- struct callchain_list *clist,
+int callchain_list_counts__printf_value(struct callchain_list *clist,
FILE *fp, char *bf, int bfsize)
{
u64 branch_count, predicted_count;
u64 abort_count, cycles_count;
- u64 iter_count = 0, samples_count = 0;
+ u64 iter_count, iter_cycles;
branch_count = clist->branch_count;
predicted_count = clist->predicted_count;
abort_count = clist->abort_count;
cycles_count = clist->cycles_count;
-
- if (node) {
- struct callchain_list *call;
-
- list_for_each_entry(call, &node->val, list) {
- iter_count += call->iter_count;
- samples_count += call->samples_count;
- }
- }
+ iter_count = clist->iter_count;
+ iter_cycles = clist->iter_cycles;
return callchain_counts_printf(fp, bf, bfsize, branch_count,
predicted_count, abort_count,
- cycles_count, iter_count, samples_count,
+ cycles_count, iter_count, iter_cycles,
&clist->brtype_stat);
}
@@ -1523,7 +1519,8 @@ int callchain_cursor__copy(struct callchain_cursor *dst,
rc = callchain_cursor_append(dst, node->ip, node->map, node->sym,
node->branch, &node->branch_flags,
- node->nr_loop_iter, node->samples,
+ node->nr_loop_iter,
+ node->iter_cycles,
node->branch_from);
if (rc)
break;
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 97738201464a..1ed6fc61d0a5 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -119,7 +119,7 @@ struct callchain_list {
u64 abort_count;
u64 cycles_count;
u64 iter_count;
- u64 samples_count;
+ u64 iter_cycles;
struct branch_type_stat brtype_stat;
char *srcline;
struct list_head list;
@@ -139,7 +139,7 @@ struct callchain_cursor_node {
struct branch_flags branch_flags;
u64 branch_from;
int nr_loop_iter;
- int samples;
+ u64 iter_cycles;
struct callchain_cursor_node *next;
};
@@ -201,7 +201,7 @@ static inline void callchain_cursor_reset(struct callchain_cursor *cursor)
int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip,
struct map *map, struct symbol *sym,
bool branch, struct branch_flags *flags,
- int nr_loop_iter, int samples, u64 branch_from);
+ int nr_loop_iter, u64 iter_cycles, u64 branch_from);
/* Close a cursor writing session. Initialize for the reader */
static inline void callchain_cursor_commit(struct callchain_cursor *cursor)
@@ -282,8 +282,7 @@ char *callchain_node__scnprintf_value(struct callchain_node *node,
int callchain_node__fprintf_value(struct callchain_node *node,
FILE *fp, u64 total);
-int callchain_list_counts__printf_value(struct callchain_node *node,
- struct callchain_list *clist,
+int callchain_list_counts__printf_value(struct callchain_list *clist,
FILE *fp, char *bf, int bfsize);
void free_callchain(struct callchain_root *root);
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 423ac82605f3..ee7bcc898d35 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -200,6 +200,7 @@ struct perf_sample {
u32 cpu;
u32 raw_size;
u64 data_src;
+ u64 phys_addr;
u32 flags;
u16 insn_len;
u8 cpumode;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index d9bd632ed7db..4bb89373eb52 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -955,6 +955,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts,
if (opts->sample_address)
perf_evsel__set_sample_bit(evsel, DATA_SRC);
+ if (opts->sample_phys_addr)
+ perf_evsel__set_sample_bit(evsel, PHYS_ADDR);
+
if (opts->no_buffering) {
attr->watermark = 0;
attr->wakeup_events = 1;
@@ -1464,7 +1467,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value)
bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW),
bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER),
bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC),
- bit_name(WEIGHT),
+ bit_name(WEIGHT), bit_name(PHYS_ADDR),
{ .name = NULL, }
};
#undef bit_name
@@ -2206,6 +2209,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
}
}
+ data->phys_addr = 0;
+ if (type & PERF_SAMPLE_PHYS_ADDR) {
+ data->phys_addr = *array;
+ array++;
+ }
+
return 0;
}
@@ -2311,6 +2320,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type,
}
}
+ if (type & PERF_SAMPLE_PHYS_ADDR)
+ result += sizeof(u64);
+
return result;
}
@@ -2500,6 +2512,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type,
}
}
+ if (type & PERF_SAMPLE_PHYS_ADDR) {
+ *array = sample->phys_addr;
+ array++;
+ }
+
return 0;
}
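
The three evsel.c hunks (parse, size, synthesize) stay symmetric because
PERF_SAMPLE_PHYS_ADDR contributes exactly one trailing u64 to the sample
record. A simplified standalone sketch of the parse step (the bit value
follows uapi/linux/perf_event.h; the record contents are hypothetical):

    #include <inttypes.h>
    #include <stdio.h>

    #define PERF_SAMPLE_PHYS_ADDR (1ULL << 19)  /* uapi/linux/perf_event.h */

    /* Consume the optional trailing u64, as perf_evsel__parse_sample() does. */
    static uint64_t parse_phys_addr(uint64_t type, const uint64_t **array)
    {
            uint64_t phys_addr = 0;

            if (type & PERF_SAMPLE_PHYS_ADDR)
                    phys_addr = *(*array)++;
            return phys_addr;
    }

    int main(void)
    {
            uint64_t record[] = { 0x1fdf00420ULL };  /* hypothetical tail */
            const uint64_t *array = record;

            printf("phys_addr: 0x%" PRIx64 "\n",
                   parse_phys_addr(PERF_SAMPLE_PHYS_ADDR, &array));
            return 0;
    }
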
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 351d3b2d8887..dd2c4b5112a5 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -131,6 +131,7 @@ struct perf_evsel {
bool cmdline_group_boundary;
struct list_head config_terms;
int bpf_fd;
+ bool auto_merge_stats;
bool merged_stat;
const char * metric_expr;
const char * metric_name;
diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c
index 9453b2e27015..e60d8d8ea4c2 100644
--- a/tools/perf/util/hist.c
+++ b/tools/perf/util/hist.c
@@ -167,6 +167,10 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h)
symlen = unresolved_col_width + 4 + 2;
hists__set_unres_dso_col_len(hists, HISTC_MEM_DADDR_DSO);
}
+
+ hists__new_col_len(hists, HISTC_MEM_PHYS_DADDR,
+ unresolved_col_width + 4 + 2);
+
} else {
symlen = unresolved_col_width + 4 + 2;
hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen);
diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h
index ee3670a388df..e60dda26a920 100644
--- a/tools/perf/util/hist.h
+++ b/tools/perf/util/hist.h
@@ -47,6 +47,7 @@ enum hist_column {
HISTC_GLOBAL_WEIGHT,
HISTC_MEM_DADDR_SYMBOL,
HISTC_MEM_DADDR_DSO,
+ HISTC_MEM_PHYS_DADDR,
HISTC_MEM_LOCKED,
HISTC_MEM_TLB,
HISTC_MEM_LVL,
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 5c8eacaca4f4..df709363ef69 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -1635,10 +1635,12 @@ static void ip__resolve_ams(struct thread *thread,
ams->al_addr = al.addr;
ams->sym = al.sym;
ams->map = al.map;
+ ams->phys_addr = 0;
}
static void ip__resolve_data(struct thread *thread,
- u8 m, struct addr_map_symbol *ams, u64 addr)
+ u8 m, struct addr_map_symbol *ams,
+ u64 addr, u64 phys_addr)
{
struct addr_location al;
@@ -1658,6 +1660,7 @@ static void ip__resolve_data(struct thread *thread,
ams->al_addr = al.addr;
ams->sym = al.sym;
ams->map = al.map;
+ ams->phys_addr = phys_addr;
}
struct mem_info *sample__resolve_mem(struct perf_sample *sample,
@@ -1669,12 +1672,18 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample,
return NULL;
ip__resolve_ams(al->thread, &mi->iaddr, sample->ip);
- ip__resolve_data(al->thread, al->cpumode, &mi->daddr, sample->addr);
+ ip__resolve_data(al->thread, al->cpumode, &mi->daddr,
+ sample->addr, sample->phys_addr);
mi->data_src.val = sample->data_src;
return mi;
}
+struct iterations {
+ int nr_loop_iter;
+ u64 cycles;
+};
+
static int add_callchain_ip(struct thread *thread,
struct callchain_cursor *cursor,
struct symbol **parent,
@@ -1683,11 +1692,12 @@ static int add_callchain_ip(struct thread *thread,
u64 ip,
bool branch,
struct branch_flags *flags,
- int nr_loop_iter,
- int samples,
+ struct iterations *iter,
u64 branch_from)
{
struct addr_location al;
+ int nr_loop_iter = 0;
+ u64 iter_cycles = 0;
al.filtered = 0;
al.sym = NULL;
@@ -1737,9 +1747,15 @@ static int add_callchain_ip(struct thread *thread,
if (symbol_conf.hide_unresolved && al.sym == NULL)
return 0;
+
+ if (iter) {
+ nr_loop_iter = iter->nr_loop_iter;
+ iter_cycles = iter->cycles;
+ }
+
return callchain_cursor_append(cursor, al.addr, al.map, al.sym,
- branch, flags, nr_loop_iter, samples,
- branch_from);
+ branch, flags, nr_loop_iter,
+ iter_cycles, branch_from);
}
struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
@@ -1760,6 +1776,18 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
return bi;
}
+static void save_iterations(struct iterations *iter,
+ struct branch_entry *be, int nr)
+{
+ int i;
+
+ iter->nr_loop_iter = nr;
+ iter->cycles = 0;
+
+ for (i = 0; i < nr; i++)
+ iter->cycles += be[i].flags.cycles;
+}
+
#define CHASHSZ 127
#define CHASHBITS 7
#define NO_ENTRY 0xff
@@ -1767,7 +1795,8 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
#define PERF_MAX_BRANCH_DEPTH 127
/* Remove loops. */
-static int remove_loops(struct branch_entry *l, int nr)
+static int remove_loops(struct branch_entry *l, int nr,
+ struct iterations *iter)
{
int i, j, off;
unsigned char chash[CHASHSZ];
@@ -1792,8 +1821,18 @@ static int remove_loops(struct branch_entry *l, int nr)
break;
}
if (is_loop) {
- memmove(l + i, l + i + off,
- (nr - (i + off)) * sizeof(*l));
+ j = nr - (i + off);
+ if (j > 0) {
+ save_iterations(iter + i + off,
+ l + i, off);
+
+ memmove(iter + i, iter + i + off,
+ j * sizeof(*iter));
+
+ memmove(l + i, l + i + off,
+ j * sizeof(*l));
+ }
+
nr -= off;
}
}
@@ -1883,7 +1922,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread,
err = add_callchain_ip(thread, cursor, parent,
root_al, &cpumode, ip,
- branch, flags, 0, 0,
+ branch, flags, NULL,
branch_from);
if (err)
return (err < 0) ? err : 0;
@@ -1909,7 +1948,6 @@ static int thread__resolve_callchain_sample(struct thread *thread,
int i, j, err, nr_entries;
int skip_idx = -1;
int first_call = 0;
- int nr_loop_iter;
if (chain)
chain_nr = chain->nr;
@@ -1942,6 +1980,7 @@ static int thread__resolve_callchain_sample(struct thread *thread,
if (branch && callchain_param.branch_callstack) {
int nr = min(max_stack, (int)branch->nr);
struct branch_entry be[nr];
+ struct iterations iter[nr];
if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
pr_warning("corrupted branch chain. skipping...\n");
@@ -1972,38 +2011,21 @@ static int thread__resolve_callchain_sample(struct thread *thread,
be[i] = branch->entries[branch->nr - i - 1];
}
- nr_loop_iter = nr;
- nr = remove_loops(be, nr);
-
- /*
- * Get the number of iterations.
- * It's only approximation, but good enough in practice.
- */
- if (nr_loop_iter > nr)
- nr_loop_iter = nr_loop_iter - nr + 1;
- else
- nr_loop_iter = 0;
+ memset(iter, 0, sizeof(struct iterations) * nr);
+ nr = remove_loops(be, nr, iter);
for (i = 0; i < nr; i++) {
- if (i == nr - 1)
- err = add_callchain_ip(thread, cursor, parent,
- root_al,
- NULL, be[i].to,
- true, &be[i].flags,
- nr_loop_iter, 1,
- be[i].from);
- else
- err = add_callchain_ip(thread, cursor, parent,
- root_al,
- NULL, be[i].to,
- true, &be[i].flags,
- 0, 0, be[i].from);
+ err = add_callchain_ip(thread, cursor, parent,
+ root_al,
+ NULL, be[i].to,
+ true, &be[i].flags,
+ NULL, be[i].from);
if (!err)
err = add_callchain_ip(thread, cursor, parent, root_al,
NULL, be[i].from,
true, &be[i].flags,
- 0, 0, 0);
+ &iter[i], 0);
if (err == -EINVAL)
break;
if (err)
@@ -2037,7 +2059,7 @@ check_calls:
err = add_callchain_ip(thread, cursor, parent,
root_al, &cpumode, ip,
- false, NULL, 0, 0, 0);
+ false, NULL, NULL, 0);
if (err)
return (err < 0) ? err : 0;
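
Where the removed code attached one approximate iteration count to the last
collapsed branch, remove_loops() now fills a parallel iterations[] array, so
each surviving entry keeps its own iteration count plus the summed cycles of
the removed copies, which add_callchain_ip() forwards as iter_cycles. A
standalone sketch (simplified types, hypothetical cycle costs) of what
save_iterations() accumulates for one collapsed loop:

    #include <inttypes.h>
    #include <stdio.h>

    struct iterations { int nr_loop_iter; uint64_t cycles; };

    int main(void)
    {
            uint64_t loop_cycles[] = { 10, 12, 11 };  /* per-branch costs */
            struct iterations it = { .nr_loop_iter = 3 };

            /* same accumulation as save_iterations() */
            for (int i = 0; i < it.nr_loop_iter; i++)
                    it.cycles += loop_cycles[i];

            printf("iterations=%d total=%" PRIu64 " avg=%" PRIu64 "\n",
                   it.nr_loop_iter, it.cycles,
                   it.cycles / it.nr_loop_iter);
            return 0;
    }
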
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index f44aeba51d1f..f6257fb4f08c 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -310,7 +310,7 @@ static struct perf_evsel *
__add_event(struct list_head *list, int *idx,
struct perf_event_attr *attr,
char *name, struct cpu_map *cpus,
- struct list_head *config_terms)
+ struct list_head *config_terms, bool auto_merge_stats)
{
struct perf_evsel *evsel;
@@ -324,6 +324,7 @@ __add_event(struct list_head *list, int *idx,
evsel->cpus = cpu_map__get(cpus);
evsel->own_cpus = cpu_map__get(cpus);
evsel->system_wide = !!cpus;
+ evsel->auto_merge_stats = auto_merge_stats;
if (name)
evsel->name = strdup(name);
@@ -339,7 +340,7 @@ static int add_event(struct list_head *list, int *idx,
struct perf_event_attr *attr, char *name,
struct list_head *config_terms)
{
- return __add_event(list, idx, attr, name, NULL, config_terms) ? 0 : -ENOMEM;
+ return __add_event(list, idx, attr, name, NULL, config_terms, false) ? 0 : -ENOMEM;
}
static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES], int size)
@@ -1209,9 +1210,9 @@ int parse_events_add_numeric(struct parse_events_state *parse_state,
get_config_name(head_config), &config_terms);
}
-int parse_events_add_pmu(struct parse_events_state *parse_state,
+static int __parse_events_add_pmu(struct parse_events_state *parse_state,
struct list_head *list, char *name,
- struct list_head *head_config)
+ struct list_head *head_config, bool auto_merge_stats)
{
struct perf_event_attr attr;
struct perf_pmu_info info;
@@ -1232,7 +1233,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state,
if (!head_config) {
attr.type = pmu->type;
- evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL);
+ evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL, auto_merge_stats);
return evsel ? 0 : -ENOMEM;
}
@@ -1254,7 +1255,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state,
evsel = __add_event(list, &parse_state->idx, &attr,
get_config_name(head_config), pmu->cpus,
- &config_terms);
+ &config_terms, auto_merge_stats);
if (evsel) {
evsel->unit = info.unit;
evsel->scale = info.scale;
@@ -1267,6 +1268,13 @@ int parse_events_add_pmu(struct parse_events_state *parse_state,
return evsel ? 0 : -ENOMEM;
}
+int parse_events_add_pmu(struct parse_events_state *parse_state,
+ struct list_head *list, char *name,
+ struct list_head *head_config)
+{
+ return __parse_events_add_pmu(parse_state, list, name, head_config, false);
+}
+
int parse_events_multi_pmu_add(struct parse_events_state *parse_state,
char *str, struct list_head **listp)
{
@@ -1296,8 +1304,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state,
return -1;
list_add_tail(&term->list, head);
- if (!parse_events_add_pmu(parse_state, list,
- pmu->name, head)) {
+ if (!__parse_events_add_pmu(parse_state, list,
+ pmu->name, head, true)) {
pr_debug("%s -> %s/%s/\n", str,
pmu->name, alias->str);
ok++;
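
A note on auto_merge_stats, judging from the call sites above: the flag is
true only for evsels created through parse_events_multi_pmu_add(), i.e. when
one event alias expands into an evsel per matching PMU (the typical uncore
case), while events the user binds to an explicit PMU via
parse_events_add_pmu() keep it false. Only the alias-expanded siblings are
thus marked as candidates for automatic stat aggregation.
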
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index ac863691605f..a7ebd9fe8e40 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -1120,6 +1120,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event,
if (sample_type & PERF_SAMPLE_DATA_SRC)
printf(" . data_src: 0x%"PRIx64"\n", sample->data_src);
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr);
+
if (sample_type & PERF_SAMPLE_TRANSACTION)
printf("... transaction: %" PRIx64 "\n", sample->transaction);
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c
index 12359bd986db..eb3ab902a1c0 100644
--- a/tools/perf/util/sort.c
+++ b/tools/perf/util/sort.c
@@ -1316,6 +1316,47 @@ struct sort_entry sort_mem_dcacheline = {
};
static int64_t
+sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+ uint64_t l = 0, r = 0;
+
+ if (left->mem_info)
+ l = left->mem_info->daddr.phys_addr;
+ if (right->mem_info)
+ r = right->mem_info->daddr.phys_addr;
+
+ return (int64_t)(r - l);
+}
+
+static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf,
+ size_t size, unsigned int width)
+{
+ uint64_t addr = 0;
+ size_t ret = 0;
+ size_t len = BITS_PER_LONG / 4;
+
+ addr = he->mem_info->daddr.phys_addr;
+
+ ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", he->level);
+
+ ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx", len, addr);
+
+ ret += repsep_snprintf(bf + ret, size - ret, "%-*s", width - ret, "");
+
+ if (ret > width)
+ bf[width] = '\0';
+
+ return width;
+}
+
+struct sort_entry sort_mem_phys_daddr = {
+ .se_header = "Data Physical Address",
+ .se_cmp = sort__phys_daddr_cmp,
+ .se_snprintf = hist_entry__phys_daddr_snprintf,
+ .se_width_idx = HISTC_MEM_PHYS_DADDR,
+};
+
+static int64_t
sort__abort_cmp(struct hist_entry *left, struct hist_entry *right)
{
if (!left->branch_info || !right->branch_info)
@@ -1547,6 +1588,7 @@ static struct sort_dimension memory_sort_dimensions[] = {
DIM(SORT_MEM_LVL, "mem", sort_mem_lvl),
DIM(SORT_MEM_SNOOP, "snoop", sort_mem_snoop),
DIM(SORT_MEM_DCACHELINE, "dcacheline", sort_mem_dcacheline),
+ DIM(SORT_MEM_PHYS_DADDR, "phys_daddr", sort_mem_phys_daddr),
};
#undef DIM
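
The snprintf helper above prints the physical address with "%-#.*llx":
left-aligned, 0x-prefixed, zero-padded to BITS_PER_LONG / 4 hex digits. A
standalone sketch of that formatting (hypothetical address):

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned long long addr = 0x1fdf00420ULL;  /* hypothetical */
            int len = sizeof(long) * CHAR_BIT / 4;     /* BITS_PER_LONG / 4 */

            /* '[k]' stands in for the hist_entry level character */
            printf("[%c] %-#.*llx\n", 'k', len, addr);
            return 0;
    }

Per the DIM() entry, the new column is selected by the sort key name
"phys_daddr".
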
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index b7c75597e18f..f36dc4980a6c 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -245,6 +245,7 @@ enum sort_type {
SORT_MEM_SNOOP,
SORT_MEM_DCACHELINE,
SORT_MEM_IADDR_SYMBOL,
+ SORT_MEM_PHYS_DADDR,
};
/*
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index d00a012cfdfb..2bd6a1f01a1c 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -186,6 +186,7 @@ struct addr_map_symbol {
struct symbol *sym;
u64 addr;
u64 al_addr;
+ u64 phys_addr;
};
struct branch_info {
diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c
index bbb4c1957578..19e5db90394c 100644
--- a/tools/perf/util/syscalltbl.c
+++ b/tools/perf/util/syscalltbl.c
@@ -19,6 +19,7 @@
#ifdef HAVE_SYSCALL_TABLE
#include <linux/compiler.h>
#include <string.h>
+#include "string2.h"
#include "util.h"
#if defined(__x86_64__)
@@ -105,6 +106,27 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name)
return sc ? sc->id : -1;
}
+int syscalltbl__strglobmatch_next(struct syscalltbl *tbl, const char *syscall_glob, int *idx)
+{
+ int i;
+ struct syscall *syscalls = tbl->syscalls.entries;
+
+ for (i = *idx + 1; i < tbl->syscalls.nr_entries; ++i) {
+ if (strglobmatch(syscalls[i].name, syscall_glob)) {
+ *idx = i;
+ return syscalls[i].id;
+ }
+ }
+
+ return -1;
+}
+
+int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx)
+{
+ *idx = -1;
+ return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx);
+}
+
#else /* HAVE_SYSCALL_TABLE */
#include <libaudit.h>
@@ -131,4 +153,15 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name)
{
return audit_name_to_syscall(name, tbl->audit_machine);
}
+
+int syscalltbl__strglobmatch_next(struct syscalltbl *tbl __maybe_unused,
+ const char *syscall_glob __maybe_unused, int *idx __maybe_unused)
+{
+ return -1;
+}
+
+int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx)
+{
+ return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx);
+}
#endif /* HAVE_SYSCALL_TABLE */
diff --git a/tools/perf/util/syscalltbl.h b/tools/perf/util/syscalltbl.h
index e2951510484f..e9fb8786da7c 100644
--- a/tools/perf/util/syscalltbl.h
+++ b/tools/perf/util/syscalltbl.h
@@ -17,4 +17,7 @@ void syscalltbl__delete(struct syscalltbl *tbl);
const char *syscalltbl__name(const struct syscalltbl *tbl, int id);
int syscalltbl__id(struct syscalltbl *tbl, const char *name);
+int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx);
+int syscalltbl__strglobmatch_next(struct syscalltbl *tbl, const char *syscall_glob, int *idx);
+
#endif /* __PERF_SYSCALLTBL_H */
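
The new pair behaves like an iterator: _first() resets *idx and _next()
resumes past it, each returning a syscall id or -1 once the glob stops
matching (the libaudit fallback always returns -1). A usage sketch against
the declarations above, assuming a tbl obtained from the existing
syscalltbl__new() constructor:

    #include <stdio.h>
    #include "syscalltbl.h"

    static void list_matching(struct syscalltbl *tbl, const char *glob)
    {
            int id, idx;

            for (id = syscalltbl__strglobmatch_first(tbl, glob, &idx);
                 id >= 0;
                 id = syscalltbl__strglobmatch_next(tbl, glob, &idx))
                    printf("%d %s\n", id, syscalltbl__name(tbl, id));
    }
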
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 4c2fa98ef39d..d20791c3f499 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -1546,8 +1546,8 @@ static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
else {
memcpy(iobuf, mmio->addr.base + dpa, len);
- /* give us some some coverage of the mmio_flush_range() API */
- mmio_flush_range(mmio->addr.base + dpa, len);
+ /* give us some coverage of the arch_invalidate_pmem() API */
+ arch_invalidate_pmem(mmio->addr.base + dpa, len);
}
nd_region_release_lane(nd_region, lane);
diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c
index a8df159a8924..ec0f6b45ce8b 100644
--- a/tools/testing/selftests/x86/mpx-mini-test.c
+++ b/tools/testing/selftests/x86/mpx-mini-test.c
@@ -391,8 +391,7 @@ void handler(int signum, siginfo_t *si, void *vucontext)
br_count++;
dprintf1("#BR 0x%jx (total seen: %d)\n", status, br_count);
-#define __SI_FAULT (3 << 16)
-#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */
+#define SEGV_BNDERR 3 /* failed address bound checks */
dprintf2("Saw a #BR! status 0x%jx at %016lx br_reason: %jx\n",
status, ip, br_reason);
diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c
index 3237bc010e1c..23927845518d 100644
--- a/tools/testing/selftests/x86/protection_keys.c
+++ b/tools/testing/selftests/x86/protection_keys.c
@@ -212,19 +212,18 @@ void dump_mem(void *dumpme, int len_bytes)
}
}
-#define __SI_FAULT (3 << 16)
-#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */
-#define SEGV_PKUERR (__SI_FAULT|4)
+#define SEGV_BNDERR 3 /* failed address bound checks */
+#define SEGV_PKUERR 4
static char *si_code_str(int si_code)
{
- if (si_code & SEGV_MAPERR)
+ if (si_code == SEGV_MAPERR)
return "SEGV_MAPERR";
- if (si_code & SEGV_ACCERR)
+ if (si_code == SEGV_ACCERR)
return "SEGV_ACCERR";
- if (si_code & SEGV_BNDERR)
+ if (si_code == SEGV_BNDERR)
return "SEGV_BNDERR";
- if (si_code & SEGV_PKUERR)
+ if (si_code == SEGV_PKUERR)
return "SEGV_PKUERR";
return "UNKNOWN";
}
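
The switch from '&' to '==' matters because the plain si_code values are
small enumerations, not flag bits: SEGV_MAPERR is 1, SEGV_ACCERR is 2 and
SEGV_BNDERR is 3, so a bitwise test lets SEGV_BNDERR (3) also match
SEGV_MAPERR (1). A standalone sketch:

    #include <stdio.h>

    #define SEGV_MAPERR 1
    #define SEGV_BNDERR 3

    int main(void)
    {
            int si_code = SEGV_BNDERR;

            /* Bitwise: 3 & 1 != 0, so BNDERR wrongly matches MAPERR. */
            printf("& : %s\n", (si_code & SEGV_MAPERR) ? "MAPERR" : "no");
            /* Equality: only the exact code matches. */
            printf("==: %s\n", (si_code == SEGV_MAPERR) ? "MAPERR" : "no");
            return 0;
    }
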