361 files changed, 8234 insertions, 4138 deletions
@@ -2606,11 +2606,9 @@ E: tmolina@cablespeed.com
D: bug fixes, documentation, minor hackery
N: Paul Moore
-E: paul.moore@hp.com
-D: NetLabel author
-S: Hewlett-Packard
-S: 110 Spit Brook Road
-S: Nashua, NH 03062
+E: paul@paul-moore.com
+W: http://www.paul-moore.com
+D: NetLabel, SELinux, audit
N: James Morris
E: jmorris@namei.org
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 84c606fb3ca4..11b7f4ebea7c 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -57,6 +57,15 @@ Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
Description: Controls the issue rate of small discard commands.
+What: /sys/fs/f2fs/<disk>/discard_granularity
+Date: July 2017
+Contact: "Chao Yu" <yuchao0@huawei.com>
+Description:
+ Controls the discard granularity of the inner discard thread; the
+ thread will not issue discards smaller than this granularity.
+ The unit size is one block, and only values in the range [1, 512]
+ are currently supported.
+
What: /sys/fs/f2fs/<disk>/max_victim_search
Date: January 2014
Contact: "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@@ -130,3 +139,15 @@ Date: June 2017
Contact: "Chao Yu" <yuchao0@huawei.com>
Description: Controls current reserved blocks in system.
+
+What: /sys/fs/f2fs/<disk>/gc_urgent
+Date: August 2017
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+ Do background GC aggressively
+
+What: /sys/fs/f2fs/<disk>/gc_urgent_sleep_time
+Date: August 2017
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+ Controls the sleep time of GC urgent mode
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 45b29326d719..ac66ae2509a9 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -515,14 +515,15 @@ API at all.
::
void *
- dma_alloc_noncoherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t flag)
+ dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+ gfp_t flag, unsigned long attrs)
-Identical to dma_alloc_coherent() except that the platform will
-choose to return either consistent or non-consistent memory as it sees
-fit. By using this API, you are guaranteeing to the platform that you
-have all the correct and necessary sync points for this memory in the
-driver should it choose to return non-consistent memory.
+Identical to dma_alloc_coherent() except that when the
+DMA_ATTR_NON_CONSISTENT flag is passed in the attrs argument, the
+platform will choose to return either consistent or non-consistent memory
+as it sees fit. By using this API, you are guaranteeing to the platform
+that you have all the correct and necessary sync points for this memory
+in the driver should it choose to return non-consistent memory.
Note: where the platform can return consistent memory, it will
guarantee that the sync points become nops.
@@ -535,12 +536,13 @@ that simply cannot make consistent memory.
::
void
- dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
- dma_addr_t dma_handle)
+ dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
+ dma_addr_t dma_handle, unsigned long attrs)
-Free memory allocated by the nonconsistent API. All parameters must
-be identical to those passed in (and returned by
-dma_alloc_noncoherent()).
+Free memory allocated by dma_alloc_attrs(). All common parameters must
+be identical to those otherwise passed to dma_free_coherent(), and the
+attrs argument must be identical to the attrs passed to
+dma_alloc_attrs().
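As a rough usage sketch of this pair (not part of the patch text; the device pointer, buffer size, and surrounding driver code are illustrative assumptions only), a driver opting into possibly non-consistent memory might look like::

	/* Sketch only: assumes a valid struct device *dev from probe(). */
	static int example_dma_setup(struct device *dev, size_t size)
	{
		dma_addr_t handle;
		void *cpu_addr;

		/*
		 * With DMA_ATTR_NON_CONSISTENT the platform may return
		 * non-consistent memory, so CPU accesses must be bracketed
		 * with dma_cache_sync() as described below.
		 */
		cpu_addr = dma_alloc_attrs(dev, size, &handle, GFP_KERNEL,
					   DMA_ATTR_NON_CONSISTENT);
		if (!cpu_addr)
			return -ENOMEM;

		/* ... hand "handle" to the device, access "cpu_addr" ... */

		/* The attrs must match those used at allocation time. */
		dma_free_attrs(dev, size, cpu_addr, handle,
			       DMA_ATTR_NON_CONSISTENT);
		return 0;
	}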
:: @@ -564,8 +566,8 @@ memory or doing partial flushes. dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) -Do a partial sync of memory that was allocated by -dma_alloc_noncoherent(), starting at virtual address vaddr and +Do a partial sync of memory that was allocated by dma_alloc_attrs() with +the DMA_ATTR_NON_CONSISTENT flag starting at virtual address vaddr and continuing on for size. Again, you *must* observe the cache line boundaries when doing this. @@ -590,34 +592,11 @@ size is the size of the area (must be multiples of PAGE_SIZE). flags can be ORed together and are: -- DMA_MEMORY_MAP - request that the memory returned from - dma_alloc_coherent() be directly writable. - -- DMA_MEMORY_IO - request that the memory returned from - dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc. - -One or both of these flags must be present. - -- DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by - dma_alloc_coherent of any child devices of this one (for memory residing - on a bridge). - - DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions. Do not allow dma_alloc_coherent() to fall back to system memory when it's out of memory in the declared region. -The return value will be either DMA_MEMORY_MAP or DMA_MEMORY_IO and -must correspond to a passed in flag (i.e. no returning DMA_MEMORY_IO -if only DMA_MEMORY_MAP were passed in) for success or zero for -failure. - -Note, for DMA_MEMORY_IO returns, all subsequent memory returned by -dma_alloc_coherent() may no longer be accessed directly, but instead -must be accessed using the correct bus functions. If your driver -isn't prepared to handle this contingency, it should not specify -DMA_MEMORY_IO in the input flags. - -As a simplification for the platforms, only **one** such region of +As a simplification for the platforms, only *one* such region of memory may be declared per device. For reasons of efficiency, most platforms choose to track the declared diff --git a/Documentation/admin-guide/LSM/tomoyo.rst b/Documentation/admin-guide/LSM/tomoyo.rst index a5947218fa64..e2d6b6e15082 100644 --- a/Documentation/admin-guide/LSM/tomoyo.rst +++ b/Documentation/admin-guide/LSM/tomoyo.rst @@ -9,8 +9,8 @@ TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel. LiveCD-based tutorials are available at -http://tomoyo.sourceforge.jp/1.7/1st-step/ubuntu10.04-live/ -http://tomoyo.sourceforge.jp/1.7/1st-step/centos5-live/ +http://tomoyo.sourceforge.jp/1.8/ubuntu12.04-live.html +http://tomoyo.sourceforge.jp/1.8/centos6-live.html Though these tutorials use non-LSM version of TOMOYO, they are useful for you to know what TOMOYO is. @@ -21,35 +21,35 @@ How to enable TOMOYO? Build the kernel with ``CONFIG_SECURITY_TOMOYO=y`` and pass ``security=tomoyo`` on kernel's command line. -Please see http://tomoyo.sourceforge.jp/2.3/ for details. +Please see http://tomoyo.osdn.jp/2.5/ for details. Where is documentation? ======================= User <-> Kernel interface documentation is available at -http://tomoyo.sourceforge.jp/2.3/policy-reference.html . +http://tomoyo.osdn.jp/2.5/policy-specification/index.html . Materials we prepared for seminars and symposiums are available at -http://sourceforge.jp/projects/tomoyo/docs/?category_id=532&language_id=1 . +http://osdn.jp/projects/tomoyo/docs/?category_id=532&language_id=1 . Below lists are chosen from three aspects. What is TOMOYO? 
TOMOYO Linux Overview - http://sourceforge.jp/projects/tomoyo/docs/lca2009-takeda.pdf + http://osdn.jp/projects/tomoyo/docs/lca2009-takeda.pdf TOMOYO Linux: pragmatic and manageable security for Linux - http://sourceforge.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf + http://osdn.jp/projects/tomoyo/docs/freedomhectaipei-tomoyo.pdf TOMOYO Linux: A Practical Method to Understand and Protect Your Own Linux Box - http://sourceforge.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf + http://osdn.jp/projects/tomoyo/docs/PacSec2007-en-no-demo.pdf What can TOMOYO do? Deep inside TOMOYO Linux - http://sourceforge.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf + http://osdn.jp/projects/tomoyo/docs/lca2009-kumaneko.pdf The role of "pathname based access control" in security. - http://sourceforge.jp/projects/tomoyo/docs/lfj2008-bof.pdf + http://osdn.jp/projects/tomoyo/docs/lfj2008-bof.pdf History of TOMOYO? Realities of Mainlining - http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf + http://osdn.jp/projects/tomoyo/docs/lfj2008.pdf What is future plan? ==================== @@ -60,6 +60,6 @@ multiple LSM modules at the same time. We feel sorry that you have to give up SELinux/SMACK/AppArmor etc. when you want to use TOMOYO. We hope that LSM becomes stackable in future. Meanwhile, you can use non-LSM -version of TOMOYO, available at http://tomoyo.sourceforge.jp/1.7/ . +version of TOMOYO, available at http://tomoyo.osdn.jp/1.8/ . LSM version of TOMOYO is a subset of non-LSM version of TOMOYO. We are planning to port non-LSM version's functionalities to LSM versions. diff --git a/Documentation/conf.py b/Documentation/conf.py index f9054ab60cb1..63857d33778c 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -271,10 +271,29 @@ latex_elements = { # Additional stuff for the LaTeX preamble. 
'preamble': '''
- \\usepackage{ifthen}
+ % Use some font with UTF-8 support with XeLaTeX
+ \\usepackage{fontspec}
+ \\setsansfont{DejaVu Serif}
+ \\setromanfont{DejaVu Sans}
+ \\setmonofont{DejaVu Sans Mono}
+
+ '''
+}
+
+# Fix reference escape troubles with Sphinx 1.4.x
+if major == 1 and minor > 3:
+ latex_elements['preamble'] += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
+
+if major == 1 and minor <= 4:
+ latex_elements['preamble'] += '\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}'
+elif major == 1 and (minor > 5 or (minor == 5 and patch >= 3)):
+ latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=1in'
+ latex_elements['preamble'] += '\\fvset{fontsize=auto}\n'
- % Allow generate some pages in landscape
- \\usepackage{lscape}
+# Customize notice background colors on Sphinx < 1.6:
+if major == 1 and minor < 6:
+ latex_elements['preamble'] += '''
+ \\usepackage{ifthen}
% Put notes in color and let them be inside a table
\\definecolor{NoteColor}{RGB}{204,255,255}
@@ -325,27 +344,26 @@ latex_elements = {
}
\\makeatother
- % Use some font with UTF-8 support with XeLaTeX
- \\usepackage{fontspec}
- \\setsansfont{DejaVu Serif}
- \\setromanfont{DejaVu Sans}
- \\setmonofont{DejaVu Sans Mono}
-
- % To allow adjusting table sizes
- \\usepackage{adjustbox}
- '''
-}
-
-# Fix reference escape troubles with Sphinx 1.4.x
-if major == 1 and minor > 3:
- latex_elements['preamble'] += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
-if major == 1 and minor <= 4:
- latex_elements['preamble'] += '\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}'
-elif major == 1 and (minor > 5 or (minor == 5 and patch >= 3)):
- latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=1in'
- latex_elements['preamble'] += '\\fvset{fontsize=auto}\n'
+# With Sphinx 1.6, it is possible to change the Bg color directly
+# by using:
+# \definecolor{sphinxnoteBgColor}{RGB}{204,255,255}
+# \definecolor{sphinxwarningBgColor}{RGB}{255,204,204}
+# \definecolor{sphinxattentionBgColor}{RGB}{255,255,204}
+# \definecolor{sphinximportantBgColor}{RGB}{192,255,204}
+#
+# However, it requires using the sphinx heavy box with:
+#
+# \renewenvironment{sphinxlightbox} {%
+# \\begin{sphinxheavybox}
+# }
+# \\end{sphinxheavybox}
+# }
+#
+# Unfortunately, the implementation is buggy: if a note is inside a
+# table, it isn't displayed well. So, for now, let's use boring
+# black and white notes.
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
diff --git a/Documentation/devicetree/bindings/pwm/pwm-bcm2835.txt b/Documentation/devicetree/bindings/pwm/pwm-bcm2835.txt
index cf573e85b11d..8cf87d1bfca5 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-bcm2835.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-bcm2835.txt
@@ -6,7 +6,7 @@ Required properties:
- clocks: This clock defines the base clock frequency of the PWM hardware
system, the period and the duty_cycle of the PWM signal is a multiple of
the base period.
-- #pwm-cells: Should be 2. See pwm.txt in this directory for a description of
+- #pwm-cells: Should be 3. See pwm.txt in this directory for a description of
the cells format.
Examples:
@@ -15,7 +15,7 @@ pwm@2020c000 {
compatible = "brcm,bcm2835-pwm";
reg = <0x2020c000 0x28>;
clocks = <&clk_pwm>;
- #pwm-cells = <2>;
+ #pwm-cells = <3>;
};
clocks {
diff --git a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
index 54c59b0560ad..ef8bd3cb67ab 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
@@ -2,6 +2,8 @@ MediaTek PWM controller
Required properties:
- compatible: should be "mediatek,<name>-pwm":
+ - "mediatek,mt2712-pwm": found on mt2712 SoC.
+ - "mediatek,mt7622-pwm": found on mt7622 SoC.
- "mediatek,mt7623-pwm": found on mt7623 SoC.
- reg: physical base address and length of the controller's registers.
- #pwm-cells: must be 2. See pwm.txt in this directory for a description of
@@ -10,7 +12,9 @@ Required properties:
- clock-names: must contain the following:
- "top": the top clock generator
- "main": clock used by the PWM core
- - "pwm1-5": the five per PWM clocks
+ - "pwm1-8": the eight per PWM clocks for mt2712
+ - "pwm1-6": the six per PWM clocks for mt7622
+ - "pwm1-5": the five per PWM clocks for mt7623
- pinctrl-names: Must contain a "default" entry.
- pinctrl-0: One property must exist for each entry in pinctrl-names.
See pinctrl/pinctrl-bindings.txt for details of the property values.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt
index b8be3d09ee26..2c5e52a5bede 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt
@@ -3,10 +3,17 @@ Rockchip PWM controller
Required properties:
- compatible: should be "rockchip,<name>-pwm"
"rockchip,rk2928-pwm": found on RK29XX,RK3066 and RK3188 SoCs
- "rockchip,rk3288-pwm": found on RK3288 SoC
+ "rockchip,rk3288-pwm": found on RK3288 SOC
+ "rockchip,rv1108-pwm", "rockchip,rk3288-pwm": found on RV1108 SoC
"rockchip,vop-pwm": found integrated in VOP on RK3288 SoC
- reg: physical base address and length of the controller's registers
- - clocks: phandle and clock specifier of the PWM reference clock
+ - clocks: See ../clock/clock-bindings.txt
+ - For older hardware (rk2928, rk3066, rk3188, rk3228, rk3288, rk3399):
+ - There is one clock that's used both to derive the functional clock
+ for the device and as the bus clock.
+ - For newer hardware (rk3328 and future SoCs): specified by name
+ - "pwm": This is used to derive the functional clock.
+ - "pclk": This is the APB bus clock.
- #pwm-cells: must be 2 (rk2928) or 3 (rk3288). See pwm.txt in this directory
for a description of the cell format.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt b/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt
index 8007e839a716..06a363d9ccef 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-tiecap.txt
@@ -6,6 +6,7 @@ Required properties:
for am4372 - compatible = "ti,am4372-ecap", "ti,am3352-ecap", "ti,am33xx-ecap";
for da850 - compatible = "ti,da850-ecap", "ti,am3352-ecap", "ti,am33xx-ecap";
for dra746 - compatible = "ti,dra746-ecap", "ti,am3352-ecap";
+ for 66ak2g - compatible = "ti,k2g-ecap", "ti,am3352-ecap";
- #pwm-cells: should be 3. See pwm.txt in this directory for a description of
the cells format. The PWM channel index ranges from 0 to 4. The only third
cell flag supported by this binding is PWM_POLARITY_INVERTED.
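To make the new 66ak2g entry above concrete, a node following this binding could look like the sketch below; the node label, unit address, and register length are illustrative assumptions, not taken from a real board file:

	ecap0: pwm@21d1800 { /* placeholder unit address */
		compatible = "ti,k2g-ecap", "ti,am3352-ecap";
		#pwm-cells = <3>;
		reg = <0x21d1800 0x60>;
	};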
diff --git a/Documentation/devicetree/bindings/pwm/pwm-zx.txt b/Documentation/devicetree/bindings/pwm/pwm-zx.txt new file mode 100644 index 000000000000..a6bcc75c9164 --- /dev/null +++ b/Documentation/devicetree/bindings/pwm/pwm-zx.txt @@ -0,0 +1,22 @@ +ZTE ZX PWM controller + +Required properties: + - compatible: Should be "zte,zx296718-pwm". + - reg: Physical base address and length of the controller's registers. + - clocks : The phandle and specifier referencing the controller's clocks. + - clock-names: "pclk" for PCLK, "wclk" for WCLK to the PWM controller. The + PCLK is for register access, while WCLK is the reference clock for + calculating period and duty cycles. + - #pwm-cells: Should be 3. See pwm.txt in this directory for a description of + the cells format. + +Example: + + pwm: pwm@1439000 { + compatible = "zte,zx296718-pwm"; + reg = <0x1439000 0x1000>; + clocks = <&lsp1crm LSP1_PWM_PCLK>, + <&lsp1crm LSP1_PWM_WCLK>; + clock-names = "pclk", "wclk"; + #pwm-cells = <3>; + }; diff --git a/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt b/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt index b067e84a94b5..1aadc804dae4 100644 --- a/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt +++ b/Documentation/devicetree/bindings/pwm/renesas,tpu-pwm.txt @@ -6,7 +6,6 @@ Required Properties: - "renesas,tpu-r8a73a4": for R8A77A4 (R-Mobile APE6) compatible PWM controller. - "renesas,tpu-r8a7740": for R8A7740 (R-Mobile A1) compatible PWM controller. - "renesas,tpu-r8a7790": for R8A7790 (R-Car H2) compatible PWM controller. - - "renesas,tpu-sh7372": for SH7372 (SH-Mobile AP4) compatible PWM controller. - "renesas,tpu": for generic R-Car TPU PWM controller. - reg: Base address and length of each memory resource used by the PWM diff --git a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt index e2f494d74d8a..0d73ea5e9c0c 100644 --- a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt @@ -11,6 +11,7 @@ Required properties: - compatible: - "mediatek,mt8173-thermal" : For MT8173 family of SoCs - "mediatek,mt2701-thermal" : For MT2701 family of SoCs + - "mediatek,mt2712-thermal" : For MT2712 family of SoCs - reg: Address range of the thermal controller - interrupts: IRQ for the thermal controller - clocks, clock-names: Clocks needed for the thermal controller. 
required diff --git a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt index 43003aec94bd..e3a6234fb1ac 100644 --- a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt @@ -4,6 +4,7 @@ Required properties: - compatible : should be "rockchip,<name>-tsadc" "rockchip,rk3228-tsadc": found on RK3228 SoCs "rockchip,rk3288-tsadc": found on RK3288 SoCs + "rockchip,rk3328-tsadc": found on RK3328 SoCs "rockchip,rk3368-tsadc": found on RK3368 SoCs "rockchip,rk3399-tsadc": found on RK3399 SoCs - reg : physical base address of the controller and length of memory mapped diff --git a/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt b/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt new file mode 100644 index 000000000000..686c0b42ed3f --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/uniphier-thermal.txt @@ -0,0 +1,64 @@ +* UniPhier Thermal bindings + +This describes the devicetree bindings for thermal monitor supported by +PVT(Process, Voltage and Temperature) monitoring unit implemented on Socionext +UniPhier SoCs. + +Required properties: +- compatible : + - "socionext,uniphier-pxs2-thermal" : For UniPhier PXs2 SoC + - "socionext,uniphier-ld20-thermal" : For UniPhier LD20 SoC +- interrupts : IRQ for the temperature alarm +- #thermal-sensor-cells : Should be 0. See ./thermal.txt for details. + +Optional properties: +- socionext,tmod-calibration: A pair of calibrated values referred from PVT, + in case that the values aren't set on SoC, + like a reference board. + +Example: + + sysctrl@61840000 { + compatible = "socionext,uniphier-ld20-sysctrl", + "simple-mfd", "syscon"; + reg = <0x61840000 0x10000>; + ... + pvtctl: pvtctl { + compatible = "socionext,uniphier-ld20-thermal"; + interrupts = <0 3 1>; + #thermal-sensor-cells = <0>; + }; + ... + }; + + thermal-zones { + cpu_thermal { + polling-delay-passive = <250>; /* 250ms */ + polling-delay = <1000>; /* 1000ms */ + thermal-sensors = <&pvtctl>; + + trips { + cpu_crit: cpu_crit { + temperature = <110000>; /* 110C */ + hysteresis = <2000>; + type = "critical"; + }; + cpu_alert: cpu_alert { + temperature = <100000>; /* 100C */ + hysteresis = <2000>; + type = "passive"; + }; + }; + + cooling-maps { + map0 { + trip = <&cpu_alert>; + cooling-device = <&cpu0 (-1) (-1)>; + }; + map1 { + trip = <&cpu_alert>; + cooling-device = <&cpu2 (-1) (-1)>; + }; + }; + }; + }; diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index fe25787ff6d4..75d2d57e2c44 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -22,7 +22,7 @@ prototypes: struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int); + unsigned int, unsigned int); locking rules: rename_lock ->d_lock may block rcu-walk diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 273ccb26885e..13c2ff034348 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -164,6 +164,16 @@ io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". usrquota Enable plain user disk quota accounting. grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting. 
+usrjquota=<file> Specify the quota file and type during mount, so that quota
+grpjquota=<file> information can be properly updated during the recovery flow,
+prjjquota=<file> <quota file>: must be in root directory;
+jqfmt=<quota type> <quota type>: [vfsold,vfsv0,vfsv1].
+offusrjquota Turn off user journalled quota.
+offgrpjquota Turn off group journalled quota.
+offprjjquota Turn off project journalled quota.
+quota Enable plain user disk quota accounting.
+noquota Disable all plain disk quota options.
================================================================================
DEBUGFS ENTRIES
@@ -209,6 +219,15 @@ Files in /sys/fs/f2fs/<devname>
gc_idle = 1 will select the Cost Benefit approach & setting gc_idle = 2 will select the greedy approach.
+ gc_urgent This parameter controls triggering background GCs
+ urgently or not. Setting gc_urgent = 0 [default]
+ keeps the default behavior, while setting it to 1
+ makes the background thread do GC at the given
+ gc_urgent_sleep_time interval.
+
+ gc_urgent_sleep_time This parameter controls the sleep time for gc_urgent.
+ 500 ms is set by default. See above gc_urgent.
+
reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree segments is larger than the number of segments
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 405a3df759b3..5fd325df59e2 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -988,7 +988,7 @@ struct dentry_operations {
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(const struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *,
- unsigned int);
+ unsigned int, unsigned int);
};
d_revalidate: called when the VFS needs to revalidate a dentry. This
diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index bc80fc0e210f..a7a813258013 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -523,11 +523,11 @@ There are some minimal guarantees that may be expected of a CPU
In other words, ACQUIRE acts as a minimal "acquire" operation and RELEASE
acts as a minimal "release" operation.
-Among the atomic operations described in core-api/atomic_ops.rst there are,
-besides the fully ordered ones and the (barrier-less) relaxed-order ones,
-ACQUIRE and RELEASE variants as well. For combined atomic operations that
-perform both a load and a store, ACQUIRE applies only to the load part of
-the operation and RELEASE applies only to the store part of the operation.
+Some of the atomic operations described in atomic_t.txt have, besides the
+fully ordered ones and the (barrier-less) relaxed-order ones, ACQUIRE and
+RELEASE variants as well. For combined atomic operations that perform both
+a load and a store, ACQUIRE applies only to the load part of the operation
+and RELEASE applies only to the store part of the operation.
Memory barriers are only required where there's a possibility of interaction
between two CPUs or between a CPU and a device. If it can be guaranteed that
there won't be any such interaction in a given piece of code, then that
@@ -617,7 +617,22 @@ RELEASE variants as well. For combined atomic operations that perform
This change ensures that only one of the first two outcomes can occur, and
that the third outcome cannot.
-Data dependency barriers also order dependent writes:
+
+[!]
This rather counterintuitive situation arises most easily on machines
+with split caches, so that, for example, one cache bank processes
+even-numbered cache lines and the other bank processes odd-numbered cache
+lines. The pointer P might be stored in an even-numbered cache line, and
+the variable B might be stored in an odd-numbered cache line. If the
+odd-numbered bank of the reading CPU's cache is very busy while the
+even-numbered bank is idle, one can see the new value of the pointer
+P (&B) but the old value of the variable B (2).
+
+
+No data dependency barrier is needed to order dependent writes, because
+the CPUs the Linux kernel supports don't perform a write until they are
+certain (1) that the write will actually happen, (2) of the location of
+the write, and (3) of the value to be written. But please read the
+"CONTROL DEPENDENCIES" section and Documentation/RCU/rcu_dereference.txt
+carefully: the compiler can break dependencies in a great many highly
+creative ways.

CPU 1 CPU 2
=============== ===============
@@ -626,28 +641,19 @@ RELEASE variants as well. For combined atomic operations that perform
<write barrier>
WRITE_ONCE(P, &B); Q = READ_ONCE(P);
- <data dependency barrier>
- *Q = 5;
+ WRITE_ONCE(*Q, 5);

-This data dependency barrier makes the read into Q be ordered with the
-store into *Q. It prevents the following outcome:
+Therefore, no data dependency barrier is required between the read into Q
+and the write into *Q. In other words, even without a data dependency
+barrier, the following outcome cannot occur:

(Q == &B) && (B == 4)

Note that this pattern should be rare. After all, the intent of the
dependency ordering rule is to -prevent- writes and thereby also eliminate
the expensive cache misses they would cause. This pattern can be used to
record things such as rarely occurring error conditions,
-and ordering them with a barrier in this way keeps such records from
-being lost.
-
-
-[!] This quite counterintuitive situation arises most easily on machines
-with split caches, for example one cache bank handling the even-numbered
-cache lines and the other handling the odd-numbered ones. Suppose the
-pointer P is in an odd-numbered cache line and the variable B is in an
-even-numbered cache line. If the even-numbered bank of the reading CPU
-is piled up with work while the odd-numbered bank has nothing to do, the
-pointer P may be seen holding its new value (&B) while the variable B is
-seen holding its old value (2).
+and the CPUs' natural ordering guarantees keep such records from being
+lost.

Data dependency barriers are very important, for example, in the RCU
system.

@@ -1848,8 +1854,7 @@ Mandatory barriers should not be used to control SMP effects on both SMP
This code guarantees that the object's updated death mark will be seen
*before* the reference counter decrement.

- For more information see Documentation/core-api/atomic_ops.rst. If you
- are wondering where to use these, see the "Atomic operations" subsection
+ For more information see Documentation/atomic_{t,bitops}.txt.

@@ -2468,86 +2473,7 @@ does _not_.
imply a full memory barrier and some do not, but they are one of the
facilities the kernel relies on quite heavily.

-Any atomic operation that modifies some state in memory and returns
-information about the state (old or new) implies an SMP-conditional
-general memory barrier (smp_mb()) on each side of the actual operation.
-Such operations include the following:
-
- xchg();
- atomic_xchg(); atomic_long_xchg();
- atomic_inc_return(); atomic_long_inc_return();
- atomic_dec_return(); atomic_long_dec_return();
- atomic_add_return(); atomic_long_add_return();
- atomic_sub_return(); atomic_long_sub_return();
- atomic_inc_and_test(); atomic_long_inc_and_test();
- atomic_dec_and_test(); atomic_long_dec_and_test();
- atomic_sub_and_test(); atomic_long_sub_and_test();
- atomic_add_negative(); atomic_long_add_negative();
- test_and_set_bit();
- test_and_clear_bit();
- test_and_change_bit();
-
- /* when the exchange succeeds */
- cmpxchg();
- atomic_cmpxchg(); atomic_long_cmpxchg();
- atomic_add_unless(); atomic_long_add_unless();
-
-These are used for such things as implementing ACQUIRE-class and
-RELEASE-class operations that need memory-barrier effects, and for
-adjusting reference counters towards object destruction, and other places
-where implicit memory-barrier effects are necessary.
-
-
-The following operations are potentially problematic because they do _not_
-imply memory barriers, but they may be used to implement such things as
-RELEASE-class operations:
-
- atomic_set();
- set_bit();
- clear_bit();
- change_bit();
-
-When using these, an appropriate explicit memory barrier (for example
-smp_mb__before_atomic()) should be used alongside them if necessary.
-
-
-The following also do _not_ imply memory barriers, and so in some
-circumstances an explicit memory barrier (for example
-smp_mb__before_atomic()) is required:
-
- atomic_add();
- atomic_sub();
- atomic_inc();
- atomic_dec();
-
-If they are used for statistics generation, and there is no relation
-between the pieces of statistical data, no memory barrier will be needed.
-
-If they are used for reference counting to manage an object's lifetime,
-memory barriers are probably unnecessary, since the reference counter will
-either be adjusted only inside lock-protected sections or the caller will
-already hold sufficient references.
-
-If they are used to construct some kind of lock, memory barriers may be
-needed, since lock operations generally have to perform their work in a
-specific order.
-
-Basically, each usage case has to be carefully considered as to whether
-memory barriers are needed or not.
-
-The following operations are special locking primitives:
-
- test_and_set_bit_lock();
- clear_bit_unlock();
- __clear_bit_unlock();
-
-These implement ACQUIRE-class and RELEASE-class operations. These should
-be preferred over other operations when implementing locking primitives,
-because their implementations can be optimised on many architectures.
-
-[!] There are special memory barrier primitives available for these
-situations, but on some CPUs the atomic instructions used themselves imply
-a memory barrier, making a separate memory barrier alongside the atomic
-operation redundant; in such cases the special barrier primitives are
-no-ops and effectively do nothing.
-
-For more information see Documentation/core-api/atomic_ops.rst.
+For more information see Documentation/atomic_t.txt.
ACCESSING DEVICES
diff --git a/MAINTAINERS b/MAINTAINERS
index fbb269415f06..f46a3225e398 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4219,7 +4219,7 @@ DMA MAPPING HELPERS
M: Christoph Hellwig <hch@lst.de>
M: Marek Szyprowski <m.szyprowski@samsung.com>
R: Robin Murphy <robin.murphy@arm.com>
-L: linux-kernel@vger.kernel.org
+L: iommu@lists.linux-foundation.org
T: git git://git.infradead.org/users/hch/dma-mapping.git
W: http://git.infradead.org/users/hch/dma-mapping.git
S: Supported
@@ -9298,15 +9298,6 @@ F: net/*/netfilter/
F: net/netfilter/
F: net/bridge/br_netfilter*.c
-NETLABEL
-M: Paul Moore <paul@paul-moore.com>
-W: http://netlabel.sf.net
-L: netdev@vger.kernel.org
-S: Maintained
-F: Documentation/netlabel/
-F: include/net/netlabel.h
-F: net/netlabel/
-
NETROM NETWORK LAYER
M: Ralf Baechle <ralf@linux-mips.org>
L: linux-hams@vger.kernel.org
@@ -9434,10 +9425,23 @@ F: net/ipv6/
F: include/net/ip*
F: arch/x86/net/*
-NETWORKING [LABELED] (NetLabel, CIPSO, Labeled IPsec, SECMARK)
+NETWORKING [LABELED] (NetLabel, Labeled IPsec, SECMARK)
M: Paul Moore <paul@paul-moore.com>
+W: https://github.com/netlabel
L: netdev@vger.kernel.org
+L: linux-security-module@vger.kernel.org
S: Maintained
+F: Documentation/netlabel/
+F: include/net/calipso.h
+F: include/net/cipso_ipv4.h
+F: include/net/netlabel.h
+F: include/uapi/linux/netfilter/xt_SECMARK.h
+F: include/uapi/linux/netfilter/xt_CONNSECMARK.h
+F: net/netlabel/
+F: net/ipv4/cipso_ipv4.c
+F: net/ipv6/calipso.c
+F: net/netfilter/xt_CONNSECMARK.c
+F: net/netfilter/xt_SECMARK.c
NETWORKING [TLS]
M: Ilya Lesokhin <ilyal@mellanox.com>
@@ -12023,8 +12027,9 @@ M: Paul Moore <paul@paul-moore.com>
M: Stephen Smalley <sds@tycho.nsa.gov>
M: Eric Paris <eparis@parisplace.org>
L: selinux@tycho.nsa.gov (moderated for non-subscribers)
-W: http://selinuxproject.org
-T: git git://git.infradead.org/users/pcmoore/selinux
+W: https://selinuxproject.org
+W: https://github.com/SELinuxProject
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git
S: Supported
F: include/linux/selinux*
F: security/selinux/
diff --git a/arch/alpha/include/uapi/asm/siginfo.h b/arch/alpha/include/uapi/asm/siginfo.h
index 9822362a8424..70494d1d8f29 100644
--- a/arch/alpha/include/uapi/asm/siginfo.h
+++ b/arch/alpha/include/uapi/asm/siginfo.h
@@ -6,4 +6,18 @@
#include <asm-generic/siginfo.h>
+/*
+ * SIGFPE si_codes
+ */
+#ifdef __KERNEL__
+#define FPE_FIXME 0 /* Broken dup of SI_USER */
+#endif /* __KERNEL__ */
+
+/*
+ * SIGTRAP si_codes
+ */
+#ifdef __KERNEL__
+#define TRAP_FIXME 0 /* Broken dup of SI_USER */
+#endif /* __KERNEL__ */
+
#endif
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index ddb89a18cf26..49d3b1e63ce5 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -280,7 +280,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
case 1: /* bugcheck */
info.si_signo = SIGTRAP;
info.si_errno = 0;
- info.si_code = __SI_FAULT;
+ info.si_code = TRAP_FIXME;
info.si_addr = (void __user *) regs->pc;
info.si_trapno = 0;
send_sig_info(SIGTRAP, &info, current);
@@ -320,7 +320,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
break;
case GEN_ROPRAND:
signo = SIGFPE;
- code = __SI_FAULT;
+ code = FPE_FIXME;
break;
case GEN_DECOVF:
@@ -342,7 +342,7 @@ do_entIF(unsigned long type, struct pt_regs *regs)
case GEN_SUBRNG7:
default:
signo = SIGTRAP;
- code = TRAP_FIXME;
break;
}
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f1b3f1d575d4..7888c9803eb0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@
-1531,7 +1531,6 @@ config THUMB2_KERNEL bool "Compile the kernel in Thumb-2 mode" if !CPU_THUMBONLY depends on (CPU_V7 || CPU_V7M) && !CPU_V6 && !CPU_V6K default y if CPU_THUMBONLY - select AEABI select ARM_ASM_UNIFIED select ARM_UNWIND help @@ -1594,7 +1593,8 @@ config ARM_PATCH_IDIV code to do integer division. config AEABI - bool "Use the ARM EABI to compile the kernel" + bool "Use the ARM EABI to compile the kernel" if !CPU_V7 && !CPU_V7M && !CPU_V6 && !CPU_V6K + default CPU_V7 || CPU_V7M || CPU_V6 || CPU_V6K help This option allows for the kernel to be compiled using the latest ARM ABI (aka EABI). This is only useful if you are using a user diff --git a/arch/arm/include/asm/smp_scu.h b/arch/arm/include/asm/smp_scu.h index bfe163c40024..5983f6bc62d5 100644 --- a/arch/arm/include/asm/smp_scu.h +++ b/arch/arm/include/asm/smp_scu.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLER__ +#include <linux/errno.h> #include <asm/cputype.h> static inline bool scu_a9_has_base(void) diff --git a/arch/arm/include/asm/suspend.h b/arch/arm/include/asm/suspend.h index 6c7182f32cef..a61905c86732 100644 --- a/arch/arm/include/asm/suspend.h +++ b/arch/arm/include/asm/suspend.h @@ -1,6 +1,8 @@ #ifndef __ASM_ARM_SUSPEND_H #define __ASM_ARM_SUSPEND_H +#include <linux/types.h> + struct sleep_save_sp { u32 *save_ptr_stash; u32 save_ptr_stash_phys; diff --git a/arch/arm/include/debug/omap2plus.S b/arch/arm/include/debug/omap2plus.S index 8be08d907a16..192a7583999c 100644 --- a/arch/arm/include/debug/omap2plus.S +++ b/arch/arm/include/debug/omap2plus.S @@ -22,6 +22,7 @@ #define UART_OFFSET(addr) ((addr) & 0x00ffffff) .pushsection .data + .align 2 omap_uart_phys: .word 0 omap_uart_virt: .word 0 omap_uart_lsr: .word 0 diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index c731f0d2b2af..fbc707626b3e 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -721,6 +721,7 @@ do_fpe: */ .pushsection .data + .align 2 ENTRY(fp_enter) .word no_fp .popsection @@ -1224,6 +1225,7 @@ vector_addrexcptn: W(b) vector_fiq .data + .align 2 .globl cr_alignment cr_alignment: diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index e33c32d56193..ca3614dc6938 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -27,6 +27,14 @@ #include "entry-header.S" +saved_psr .req r8 +#if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING) +saved_pc .req r9 +#define TRACE(x...) x +#else +saved_pc .req lr +#define TRACE(x...) +#endif .align 5 #if !(IS_ENABLED(CONFIG_TRACE_IRQFLAGS) || IS_ENABLED(CONFIG_CONTEXT_TRACKING)) @@ -146,16 +154,17 @@ ENTRY(vector_swi) ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr THUMB( mov r8, sp ) THUMB( store_user_sp_lr r8, r10, S_SP ) @ calling sp, lr - mrs r8, spsr @ called from non-FIQ mode, so ok. - str lr, [sp, #S_PC] @ Save calling PC - str r8, [sp, #S_PSR] @ Save CPSR + mrs saved_psr, spsr @ called from non-FIQ mode, so ok. + TRACE( mov saved_pc, lr ) + str saved_pc, [sp, #S_PC] @ Save calling PC + str saved_psr, [sp, #S_PSR] @ Save CPSR str r0, [sp, #S_OLD_R0] @ Save OLD_R0 #endif zero_fp alignment_trap r10, ip, __cr_alignment - enable_irq - ct_user_exit - get_thread_info tsk + asm_trace_hardirqs_on save=0 + enable_irq_notrace + ct_user_exit save=0 /* * Get the system call number. @@ -168,11 +177,11 @@ ENTRY(vector_swi) * value to determine if it is an EABI or an old ABI call. 
*/ #ifdef CONFIG_ARM_THUMB - tst r8, #PSR_T_BIT + tst saved_psr, #PSR_T_BIT movne r10, #0 @ no thumb OABI emulation - USER( ldreq r10, [lr, #-4] ) @ get SWI instruction + USER( ldreq r10, [saved_pc, #-4] ) @ get SWI instruction #else - USER( ldr r10, [lr, #-4] ) @ get SWI instruction + USER( ldr r10, [saved_pc, #-4] ) @ get SWI instruction #endif ARM_BE8(rev r10, r10) @ little endian instruction @@ -183,15 +192,17 @@ ENTRY(vector_swi) */ #elif defined(CONFIG_ARM_THUMB) /* Legacy ABI only, possibly thumb mode. */ - tst r8, #PSR_T_BIT @ this is SPSR from save_user_regs + tst saved_psr, #PSR_T_BIT @ this is SPSR from save_user_regs addne scno, r7, #__NR_SYSCALL_BASE @ put OS number in - USER( ldreq scno, [lr, #-4] ) + USER( ldreq scno, [saved_pc, #-4] ) #else /* Legacy ABI only. */ - USER( ldr scno, [lr, #-4] ) @ get SWI instruction + USER( ldr scno, [saved_pc, #-4] ) @ get SWI instruction #endif + /* saved_psr and saved_pc are now dead */ + uaccess_disable tbl adr tbl, sys_call_table @ load syscall table pointer @@ -210,6 +221,12 @@ ENTRY(vector_swi) bic scno, scno, #0xff000000 @ mask off SWI op-code eor scno, scno, #__NR_SYSCALL_BASE @ check OS number #endif + get_thread_info tsk + /* + * Reload the registers that may have been corrupted on entry to + * the syscall assembly (by tracing or context tracking.) + */ + TRACE( ldmia sp, {r0 - r3} ) local_restart: ldr r10, [tsk, #TI_FLAGS] @ check for syscall tracing @@ -239,8 +256,9 @@ local_restart: * current task. */ 9001: - sub lr, lr, #4 + sub lr, saved_pc, #4 str lr, [sp, #S_PC] + get_thread_info tsk b ret_fast_syscall #endif ENDPROC(vector_swi) diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 04286fd9e09c..6b1148cafffd 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -556,6 +556,7 @@ ENDPROC(__fixup_smp) .word __smpalt_end .pushsection .data + .align 2 .globl smp_on_up smp_on_up: ALT_SMP(.long 1) @@ -716,6 +717,7 @@ ENTRY(fixup_pv_table) ENDPROC(fixup_pv_table) .data + .align 2 .globl __pv_phys_pfn_offset .type __pv_phys_pfn_offset, %object __pv_phys_pfn_offset: diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index ec7e7377d423..60146e32619a 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -31,6 +31,7 @@ * zeroing of .bss would clobber it. */ .data + .align 2 ENTRY(__boot_cpu_mode) .long 0 .text diff --git a/arch/arm/kernel/iwmmxt.S b/arch/arm/kernel/iwmmxt.S index 49fadbda8c63..81cd4d43b3ec 100644 --- a/arch/arm/kernel/iwmmxt.S +++ b/arch/arm/kernel/iwmmxt.S @@ -367,6 +367,7 @@ ENTRY(iwmmxt_task_release) ENDPROC(iwmmxt_task_release) .data + .align 2 concan_owner: .word 0 diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S index 0f6c1000582c..9f08d214d05a 100644 --- a/arch/arm/kernel/sleep.S +++ b/arch/arm/kernel/sleep.S @@ -171,6 +171,7 @@ mpidr_hash_ptr: .long mpidr_hash - . 
@ mpidr_hash struct offset .data + .align 2 .type sleep_save_sp, #object ENTRY(sleep_save_sp) .space SLEEP_SAVE_SP_SZ @ struct sleep_save_sp diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index 3a2fa203637a..65228bf4c6df 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -171,6 +171,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { __save_stack_trace(tsk, trace, 1); } +EXPORT_SYMBOL(save_stack_trace_tsk); void save_stack_trace(struct stack_trace *trace) { diff --git a/arch/arm/mach-exynos/sleep.S b/arch/arm/mach-exynos/sleep.S index cf950790fbdc..4292cae43f3c 100644 --- a/arch/arm/mach-exynos/sleep.S +++ b/arch/arm/mach-exynos/sleep.S @@ -124,6 +124,7 @@ _cp15_save_diag: #endif /* CONFIG_CACHE_L2X0 */ .data + .align 2 .globl cp15_save_diag cp15_save_diag: .long 0 @ cp15 diagnostic diff --git a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c index dd75a4756761..5169dfba9718 100644 --- a/arch/arm/mach-imx/mach-imx27_visstrim_m10.c +++ b/arch/arm/mach-imx/mach-imx27_visstrim_m10.c @@ -245,7 +245,6 @@ static phys_addr_t mx2_camera_base __initdata; static void __init visstrim_analog_camera_init(void) { struct platform_device *pdev; - int dma; gpio_set_value(TVP5150_PWDN, 1); ndelay(1); @@ -258,12 +257,9 @@ static void __init visstrim_analog_camera_init(void) if (IS_ERR(pdev)) return; - dma = dma_declare_coherent_memory(&pdev->dev, - mx2_camera_base, mx2_camera_base, - MX2_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) - return; + dma_declare_coherent_memory(&pdev->dev, mx2_camera_base, + mx2_camera_base, MX2_CAMERA_BUF_SIZE, + DMA_MEMORY_EXCLUSIVE); } static void __init visstrim_reserve(void) @@ -444,16 +440,13 @@ static const struct imx_ssi_platform_data visstrim_m10_ssi_pdata __initconst = { static void __init visstrim_coda_init(void) { struct platform_device *pdev; - int dma; pdev = imx27_add_coda(); - dma = dma_declare_coherent_memory(&pdev->dev, - mx2_camera_base + MX2_CAMERA_BUF_SIZE, - mx2_camera_base + MX2_CAMERA_BUF_SIZE, - MX2_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) - return; + dma_declare_coherent_memory(&pdev->dev, + mx2_camera_base + MX2_CAMERA_BUF_SIZE, + mx2_camera_base + MX2_CAMERA_BUF_SIZE, + MX2_CAMERA_BUF_SIZE, + DMA_MEMORY_EXCLUSIVE); } /* DMA deinterlace */ @@ -466,24 +459,21 @@ static void __init visstrim_deinterlace_init(void) { int ret = -ENOMEM; struct platform_device *pdev = &visstrim_deinterlace; - int dma; ret = platform_device_register(pdev); - dma = dma_declare_coherent_memory(&pdev->dev, - mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE, - mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE, - MX2_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) - return; + dma_declare_coherent_memory(&pdev->dev, + mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE, + mx2_camera_base + 2 * MX2_CAMERA_BUF_SIZE, + MX2_CAMERA_BUF_SIZE, + DMA_MEMORY_EXCLUSIVE); } /* Emma-PrP for format conversion */ static void __init visstrim_emmaprp_init(void) { struct platform_device *pdev; - int dma; + int ret; pdev = imx27_add_mx2_emmaprp(); if (IS_ERR(pdev)) @@ -493,11 +483,11 @@ static void __init visstrim_emmaprp_init(void) * Use the same memory area as the analog camera since both * devices are, by nature, exclusive. 
*/ - dma = dma_declare_coherent_memory(&pdev->dev, + ret = dma_declare_coherent_memory(&pdev->dev, mx2_camera_base, mx2_camera_base, MX2_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) + DMA_MEMORY_EXCLUSIVE); + if (ret) pr_err("Failed to declare memory for emmaprp\n"); } diff --git a/arch/arm/mach-imx/mach-mx31moboard.c b/arch/arm/mach-imx/mach-mx31moboard.c index bde9a9af6714..7716f83aecdd 100644 --- a/arch/arm/mach-imx/mach-mx31moboard.c +++ b/arch/arm/mach-imx/mach-mx31moboard.c @@ -475,7 +475,7 @@ static phys_addr_t mx3_camera_base __initdata; static int __init mx31moboard_init_cam(void) { - int dma, ret = -ENOMEM; + int ret; struct platform_device *pdev; imx31_add_ipu_core(); @@ -484,11 +484,11 @@ static int __init mx31moboard_init_cam(void) if (IS_ERR(pdev)) return PTR_ERR(pdev); - dma = dma_declare_coherent_memory(&pdev->dev, - mx3_camera_base, mx3_camera_base, - MX3_CAMERA_BUF_SIZE, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!(dma & DMA_MEMORY_MAP)) + ret = dma_declare_coherent_memory(&pdev->dev, + mx3_camera_base, mx3_camera_base, + MX3_CAMERA_BUF_SIZE, + DMA_MEMORY_EXCLUSIVE); + if (ret) goto err; ret = platform_device_add(pdev); diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S index 1b9f0520dea9..fa5fd24f524c 100644 --- a/arch/arm/mach-omap2/sleep34xx.S +++ b/arch/arm/mach-omap2/sleep34xx.S @@ -530,10 +530,12 @@ l2dis_3630_offset: .long l2dis_3630 - . .data + .align 2 l2dis_3630: .word 0 .data + .align 2 l2_inv_api_params: .word 0x1, 0x00 diff --git a/arch/arm/mach-omap2/sleep44xx.S b/arch/arm/mach-omap2/sleep44xx.S index c7a3b4aab4b5..56dfa2d5d0a8 100644 --- a/arch/arm/mach-omap2/sleep44xx.S +++ b/arch/arm/mach-omap2/sleep44xx.S @@ -385,6 +385,7 @@ ppa_zero_params_offset: ENDPROC(omap_do_wfi) .data + .align 2 ppa_zero_params: .word 0 diff --git a/arch/arm/mach-pxa/mioa701_bootresume.S b/arch/arm/mach-pxa/mioa701_bootresume.S index 81591491ab94..42d93f40a59f 100644 --- a/arch/arm/mach-pxa/mioa701_bootresume.S +++ b/arch/arm/mach-pxa/mioa701_bootresume.S @@ -16,6 +16,7 @@ * insist on it to be truly read-only. */ .data + .align 2 ENTRY(mioa701_bootstrap) 0: b 1f @@ -34,4 +35,5 @@ ENTRY(mioa701_jumpaddr) ENTRY(mioa701_bootstrap_lg) .data + .align 2 .word 2b-0b diff --git a/arch/arm/mach-rockchip/sleep.S b/arch/arm/mach-rockchip/sleep.S index 2eec9a341f05..9927f06f52fe 100644 --- a/arch/arm/mach-rockchip/sleep.S +++ b/arch/arm/mach-rockchip/sleep.S @@ -23,7 +23,7 @@ * ddr to sram for system resumeing. * so it is ".data section". */ -.align + .align 2 ENTRY(rockchip_slp_cpu_resume) setmode PSR_I_BIT | PSR_F_BIT | SVC_MODE, r1 @ set svc, irqs off diff --git a/arch/arm/mm/cache-v4wb.S b/arch/arm/mm/cache-v4wb.S index 2522f8c8fbb1..a5084ec70c6e 100644 --- a/arch/arm/mm/cache-v4wb.S +++ b/arch/arm/mm/cache-v4wb.S @@ -47,6 +47,7 @@ #define CACHE_DLIMIT (CACHE_DSIZE * 4) .data + .align 2 flush_base: .long FLUSH_BASE .text diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index ff8b0aa2dfde..42f585379e19 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -315,8 +315,11 @@ retry: * signal first. We do not need to release the mmap_sem because * it would already be released in __lock_page_or_retry in * mm/filemap.c. 
*/ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + if (!user_mode(regs)) + goto no_context; return 0; + } /* * Major/minor page fault accounting is only done on the diff --git a/arch/arm/mm/proc-v7-3level.S b/arch/arm/mm/proc-v7-3level.S index 5e5720e8bc5f..7d16bbc4102b 100644 --- a/arch/arm/mm/proc-v7-3level.S +++ b/arch/arm/mm/proc-v7-3level.S @@ -129,8 +129,7 @@ ENDPROC(cpu_v7_set_pte_ext) .macro v7_ttb_setup, zero, ttbr0l, ttbr0h, ttbr1, tmp ldr \tmp, =swapper_pg_dir @ swapper_pg_dir virtual address cmp \ttbr1, \tmp, lsr #12 @ PHYS_OFFSET > PAGE_OFFSET? - mrc p15, 0, \tmp, c2, c0, 2 @ TTB control egister - orr \tmp, \tmp, #TTB_EAE + mov \tmp, #TTB_EAE @ for TTB control egister ALT_SMP(orr \tmp, \tmp, #TTB_FLAGS_SMP) ALT_UP(orr \tmp, \tmp, #TTB_FLAGS_UP) ALT_SMP(orr \tmp, \tmp, #TTB_FLAGS_SMP << 16) diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S index b6bbfdb6dfdc..3d75b7972fd1 100644 --- a/arch/arm/mm/proc-xscale.S +++ b/arch/arm/mm/proc-xscale.S @@ -104,6 +104,7 @@ .endm .data + .align 2 clean_addr: .word CLEAN_ADDR .text diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 4e5a664be04b..e09bf5d15606 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -142,25 +142,25 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); - else switch (from->si_code & __SI_MASK) { - case __SI_KILL: + else switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_TIMER: + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_POLL: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_FAULT: + case SIL_FAULT: err |= __put_user((compat_uptr_t)(unsigned long)from->si_addr, &to->si_addr); #ifdef BUS_MCEERR_AO @@ -173,29 +173,24 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif break; - case __SI_CHLD: + case SIL_CHLD: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_status, &to->si_status); err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); break; - case __SI_RT: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ: /* But this is */ + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_SYS: + case SIL_SYS: err |= __put_user((compat_uptr_t)(unsigned long) from->si_call_addr, &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); break; - default: /* this is just in case for now ... 
*/ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; } return err; } diff --git a/arch/blackfin/include/uapi/asm/siginfo.h b/arch/blackfin/include/uapi/asm/siginfo.h index c72f4e6e386f..79dfe3979123 100644 --- a/arch/blackfin/include/uapi/asm/siginfo.h +++ b/arch/blackfin/include/uapi/asm/siginfo.h @@ -14,28 +14,36 @@ #define si_uid16 _sifields._kill._uid -#define ILL_ILLPARAOP (__SI_FAULT|2) /* illegal opcode combine ********** */ -#define ILL_ILLEXCPT (__SI_FAULT|4) /* unrecoverable exception ********** */ -#define ILL_CPLB_VI (__SI_FAULT|9) /* D/I CPLB protect violation ******** */ -#define ILL_CPLB_MISS (__SI_FAULT|10) /* D/I CPLB miss ******** */ -#define ILL_CPLB_MULHIT (__SI_FAULT|11) /* D/I CPLB multiple hit ******** */ +#define ILL_ILLPARAOP 2 /* illegal opcode combine ********** */ +#define ILL_ILLEXCPT 4 /* unrecoverable exception ********** */ +#define ILL_CPLB_VI 9 /* D/I CPLB protect violation ******** */ +#define ILL_CPLB_MISS 10 /* D/I CPLB miss ******** */ +#define ILL_CPLB_MULHIT 11 /* D/I CPLB multiple hit ******** */ +#undef NSIGILL +#define NSIGILL 11 /* * SIGBUS si_codes */ -#define BUS_OPFETCH (__SI_FAULT|4) /* error from instruction fetch ******** */ +#define BUS_OPFETCH 4 /* error from instruction fetch ******** */ +#undef NSIGBUS +#define NSIGBUS 4 /* * SIGTRAP si_codes */ -#define TRAP_STEP (__SI_FAULT|1) /* single-step breakpoint************* */ -#define TRAP_TRACEFLOW (__SI_FAULT|2) /* trace buffer overflow ************* */ -#define TRAP_WATCHPT (__SI_FAULT|3) /* watchpoint match ************* */ -#define TRAP_ILLTRAP (__SI_FAULT|4) /* illegal trap ************* */ +#define TRAP_STEP 1 /* single-step breakpoint************* */ +#define TRAP_TRACEFLOW 2 /* trace buffer overflow ************* */ +#define TRAP_WATCHPT 3 /* watchpoint match ************* */ +#define TRAP_ILLTRAP 4 /* illegal trap ************* */ +#undef NSIGTRAP +#define NSIGTRAP 4 /* * SIGSEGV si_codes */ -#define SEGV_STACKFLOW (__SI_FAULT|3) /* stack overflow */ +#define SEGV_STACKFLOW 3 /* stack overflow */ +#undef NSIGSEGV +#define NSIGSEGV 3 #endif /* _UAPI_BFIN_SIGINFO_H */ diff --git a/arch/frv/include/uapi/asm/siginfo.h b/arch/frv/include/uapi/asm/siginfo.h index d3fd1ca45653..f55d9e0e9068 100644 --- a/arch/frv/include/uapi/asm/siginfo.h +++ b/arch/frv/include/uapi/asm/siginfo.h @@ -4,7 +4,7 @@ #include <linux/types.h> #include <asm-generic/siginfo.h> -#define FPE_MDAOVF (__SI_FAULT|9) /* media overflow */ +#define FPE_MDAOVF 9 /* media overflow */ #undef NSIGFPE #define NSIGFPE 9 diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h index 4694c64252d6..33389fc36f23 100644 --- a/arch/ia64/include/uapi/asm/siginfo.h +++ b/arch/ia64/include/uapi/asm/siginfo.h @@ -98,27 +98,30 @@ typedef struct siginfo { /* * SIGILL si_codes */ -#define ILL_BADIADDR (__SI_FAULT|9) /* unimplemented instruction address */ -#define __ILL_BREAK (__SI_FAULT|10) /* illegal break */ -#define __ILL_BNDMOD (__SI_FAULT|11) /* bundle-update (modification) in progress */ +#define ILL_BADIADDR 9 /* unimplemented instruction address */ +#define __ILL_BREAK 10 /* illegal break */ +#define __ILL_BNDMOD 11 /* bundle-update (modification) in progress */ #undef NSIGILL #define NSIGILL 11 /* * SIGFPE si_codes */ -#define __FPE_DECOVF (__SI_FAULT|9) /* decimal overflow */ -#define __FPE_DECDIV (__SI_FAULT|10) /* decimal division by zero */ -#define __FPE_DECERR (__SI_FAULT|11) /* packed decimal error */ -#define __FPE_INVASC (__SI_FAULT|12) 
/* invalid ASCII digit */ -#define __FPE_INVDEC (__SI_FAULT|13) /* invalid decimal digit */ +#ifdef __KERNEL__ +#define FPE_FIXME 0 /* Broken dup of SI_USER */ +#endif /* __KERNEL__ */ +#define __FPE_DECOVF 9 /* decimal overflow */ +#define __FPE_DECDIV 10 /* decimal division by zero */ +#define __FPE_DECERR 11 /* packed decimal error */ +#define __FPE_INVASC 12 /* invalid ASCII digit */ +#define __FPE_INVDEC 13 /* invalid decimal digit */ #undef NSIGFPE #define NSIGFPE 13 /* * SIGSEGV si_codes */ -#define __SEGV_PSTKOVF (__SI_FAULT|4) /* paragraph stack overflow */ +#define __SEGV_PSTKOVF 4 /* paragraph stack overflow */ #undef NSIGSEGV #define NSIGSEGV 4 diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 5db52c6813c4..6146d53b6ad7 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -124,31 +124,30 @@ copy_siginfo_to_user (siginfo_t __user *to, const siginfo_t *from) */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: + err |= __put_user(from->si_code, &to->si_code); + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_FAULT: err |= __put_user(from->si_flags, &to->si_flags); err |= __put_user(from->si_isr, &to->si_isr); - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_addr, &to->si_addr); err |= __put_user(from->si_imm, &to->si_imm); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_ptr, &to->si_ptr); break; - case __SI_RT >> 16: /* Not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_ptr, &to->si_ptr); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); - default: + case SIL_KILL: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_pid, &to->si_pid); break; diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index 7b1fe9462158..3cb17cf9b362 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -349,7 +349,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) } siginfo.si_signo = SIGFPE; siginfo.si_errno = 0; - siginfo.si_code = __SI_FAULT; /* default code */ + siginfo.si_code = FPE_FIXME; /* default code */ siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); if (isr & 0x11) { siginfo.si_code = FPE_FLTINV; @@ -373,7 +373,7 @@ handle_fpu_swa (int fp_fault, struct pt_regs *regs, unsigned long isr) /* raise exception */ siginfo.si_signo = SIGFPE; siginfo.si_errno = 0; - siginfo.si_code = __SI_FAULT; /* default code */ + siginfo.si_code = FPE_FIXME; /* default code */ siginfo.si_addr = (void __user *) (regs->cr_iip + ia64_psr(regs)->ri); if (isr & 0x880) { siginfo.si_code = FPE_FLTOVF; diff --git a/arch/metag/include/asm/dma-mapping.h b/arch/metag/include/asm/dma-mapping.h index fad3dc3cb210..ea573be2b6d0 100644 --- a/arch/metag/include/asm/dma-mapping.h +++ b/arch/metag/include/asm/dma-mapping.h @@ -9,7 +9,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) } /* - * dma_alloc_noncoherent() returns non-cacheable 
memory, so there's no need to + * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to * do any flushing here. */ static inline void diff --git a/arch/mips/include/uapi/asm/siginfo.h b/arch/mips/include/uapi/asm/siginfo.h index 8069cf766603..cf6113bbcb98 100644 --- a/arch/mips/include/uapi/asm/siginfo.h +++ b/arch/mips/include/uapi/asm/siginfo.h @@ -120,7 +120,7 @@ typedef struct siginfo { #undef SI_TIMER #undef SI_MESGQ #define SI_ASYNCIO -2 /* sent by AIO completion */ -#define SI_TIMER __SI_CODE(__SI_TIMER, -3) /* sent by timer expiration */ -#define SI_MESGQ __SI_CODE(__SI_MESGQ, -4) /* sent by real time mesq state change */ +#define SI_TIMER -3 /* sent by timer expiration */ +#define SI_MESGQ -4 /* sent by real time mesq state change */ #endif /* _UAPI_ASM_SIGINFO_H */ diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 84165f2b31ff..cf5c7c05e5a3 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -93,38 +93,37 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) at the same time. */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (from->si_code >> 16) { - case __SI_TIMER >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); - default: + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __put_user((unsigned long)from->si_addr, &to->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_SYS >> 16: + case SIL_SYS: err |= __copy_to_user(&to->si_call_addr, &from->si_call_addr, sizeof(compat_uptr_t)); err |= __put_user(from->si_syscall, &to->si_syscall); diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index b68b4d0726d3..2bf414993347 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -735,7 +735,7 @@ void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr, else if (fcr31 & FPU_CSR_INE_X) si.si_code = FPE_FLTRES; else - si.si_code = __SI_FAULT; + return; /* Broken hardware? 
*/ force_sig_info(SIGFPE, &si, tsk); } diff --git a/arch/nios2/include/asm/dma-mapping.h b/arch/nios2/include/asm/dma-mapping.h index 7b3c6f280293..f8dc62222741 100644 --- a/arch/nios2/include/asm/dma-mapping.h +++ b/arch/nios2/include/asm/dma-mapping.h @@ -18,7 +18,7 @@ static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) } /* - * dma_alloc_noncoherent() returns non-cacheable memory, so there's no need to + * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to * do any flushing here. */ static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c index 70aaabb8b3cb..9e0cb6a577d6 100644 --- a/arch/parisc/kernel/signal32.c +++ b/arch/parisc/kernel/signal32.c @@ -290,25 +290,25 @@ copy_siginfo_from_user32 (siginfo_t *to, compat_siginfo_t __user *from) if (to->si_code < 0) err |= __copy_from_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (to->si_code >> 16) { - case __SI_CHLD >> 16: + switch (siginfo_layout(to->si_signo, to->si_code)) { + case SIL_CHLD: err |= __get_user(to->si_utime, &from->si_utime); err |= __get_user(to->si_stime, &from->si_stime); err |= __get_user(to->si_status, &from->si_status); default: + case SIL_KILL: err |= __get_user(to->si_pid, &from->si_pid); err |= __get_user(to->si_uid, &from->si_uid); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __get_user(addr, &from->si_addr); to->si_addr = compat_ptr(addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __get_user(to->si_band, &from->si_band); err |= __get_user(to->si_fd, &from->si_fd); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __get_user(to->si_pid, &from->si_pid); err |= __get_user(to->si_uid, &from->si_uid); err |= __get_user(to->si_int, &from->si_int); @@ -337,41 +337,40 @@ copy_siginfo_to_user32 (compat_siginfo_t __user *to, const siginfo_t *from) at the same time. */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (from->si_code >> 16) { - case __SI_CHLD >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); - default: + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_FAULT >> 16: + case SIL_FAULT: addr = ptr_to_compat(from->si_addr); err |= __put_user(addr, &to->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); val = (compat_int_t)from->si_int; err |= __put_user(val, &to->si_int); break; - case __SI_RT >> 16: /* Not generated by the kernel as of now. 
*/ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_pid, &to->si_pid); val = (compat_int_t)from->si_int; err |= __put_user(val, &to->si_int); break; - case __SI_SYS >> 16: + case SIL_SYS: err |= __put_user(ptr_to_compat(from->si_call_addr), &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 97bb1385e771..92fb1c8dbbd8 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -913,42 +913,40 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s) */ err = __put_user(s->si_signo, &d->si_signo); err |= __put_user(s->si_errno, &d->si_errno); - err |= __put_user((short)s->si_code, &d->si_code); + err |= __put_user(s->si_code, &d->si_code); if (s->si_code < 0) err |= __copy_to_user(&d->_sifields._pad, &s->_sifields._pad, SI_PAD_SIZE32); - else switch(s->si_code >> 16) { - case __SI_CHLD >> 16: + else switch(siginfo_layout(s->si_signo, s->si_code)) { + case SIL_CHLD: err |= __put_user(s->si_pid, &d->si_pid); err |= __put_user(s->si_uid, &d->si_uid); err |= __put_user(s->si_utime, &d->si_utime); err |= __put_user(s->si_stime, &d->si_stime); err |= __put_user(s->si_status, &d->si_status); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __put_user((unsigned int)(unsigned long)s->si_addr, &d->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(s->si_band, &d->si_band); err |= __put_user(s->si_fd, &d->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(s->si_tid, &d->si_tid); err |= __put_user(s->si_overrun, &d->si_overrun); err |= __put_user(s->si_int, &d->si_int); break; - case __SI_SYS >> 16: + case SIL_SYS: err |= __put_user(ptr_to_compat(s->si_call_addr), &d->si_call_addr); err |= __put_user(s->si_syscall, &d->si_syscall); err |= __put_user(s->si_arch, &d->si_arch); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(s->si_int, &d->si_int); /* fallthrough */ - case __SI_KILL >> 16: - default: + case SIL_KILL: err |= __put_user(s->si_pid, &d->si_pid); err |= __put_user(s->si_uid, &d->si_uid); break; diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h new file mode 100644 index 000000000000..c02f4aba88a6 --- /dev/null +++ b/arch/s390/include/asm/ap.h @@ -0,0 +1,126 @@ +/* + * Adjunct processor (AP) interfaces + * + * Copyright IBM Corp. 2017 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + * + * Author(s): Tony Krowiak <akrowia@linux.vnet.ibm.com> + * Martin Schwidefsky <schwidefsky@de.ibm.com> + * Harald Freudenberger <freude@de.ibm.com> + */ + +#ifndef _ASM_S390_AP_H_ +#define _ASM_S390_AP_H_ + +/** + * The ap_qid_t identifier of an ap queue. + * If the AP facilities test (APFT) facility is available, + * card and queue index are 8 bit values, otherwise + * card index is 6 bit and queue index a 4 bit value. + */ +typedef unsigned int ap_qid_t; + +#define AP_MKQID(_card, _queue) (((_card) & 63) << 8 | ((_queue) & 255)) +#define AP_QID_CARD(_qid) (((_qid) >> 8) & 63) +#define AP_QID_QUEUE(_qid) ((_qid) & 255) + +/** + * struct ap_queue_status - Holds the AP queue status. 
+ * @queue_empty: Shows if queue is empty + * @replies_waiting: Waiting replies + * @queue_full: Is 1 if the queue is full + * @irq_enabled: Shows if interrupts are enabled for the AP + * @response_code: Holds the 8 bit response code + * + * The ap queue status word is returned by all three AP functions + * (PQAP, NQAP and DQAP). There's a set of flags in the first + * byte, followed by a 1 byte response code. + */ +struct ap_queue_status { + unsigned int queue_empty : 1; + unsigned int replies_waiting : 1; + unsigned int queue_full : 1; + unsigned int _pad1 : 4; + unsigned int irq_enabled : 1; + unsigned int response_code : 8; + unsigned int _pad2 : 16; +}; + +/** + * ap_test_queue(): Test adjunct processor queue. + * @qid: The AP queue number + * @tbit: Test facilities bit + * @info: Pointer to queue descriptor + * + * Returns AP queue status structure. + */ +struct ap_queue_status ap_test_queue(ap_qid_t qid, + int tbit, + unsigned long *info); + +struct ap_config_info { + unsigned int apsc : 1; /* S bit */ + unsigned int apxa : 1; /* N bit */ + unsigned int qact : 1; /* C bit */ + unsigned int rc8a : 1; /* R bit */ + unsigned char _reserved1 : 4; + unsigned char _reserved2[3]; + unsigned char Na; /* max # of APs - 1 */ + unsigned char Nd; /* max # of Domains - 1 */ + unsigned char _reserved3[10]; + unsigned int apm[8]; /* AP ID mask */ + unsigned int aqm[8]; /* AP queue mask */ + unsigned int adm[8]; /* AP domain mask */ + unsigned char _reserved4[16]; +} __aligned(8); + +/* + * ap_query_configuration(): Fetch cryptographic config info + * + * Returns the ap configuration info fetched via PQAP(QCI). + * On success 0 is returned, on failure a negative errno + * is returned, e.g. if the PQAP(QCI) instruction is not + * available, the return value will be -EOPNOTSUPP. + */ +int ap_query_configuration(struct ap_config_info *info); + +/* + * struct ap_qirq_ctrl - convenient struct for easy invocation + * of the ap_queue_irq_ctrl() function. This struct is passed + * as GR1 parameter to the PQAP(AQIC) instruction. For details + * please see the AR documentation. + */ +struct ap_qirq_ctrl { + unsigned int _res1 : 8; + unsigned int zone : 8; /* zone info */ + unsigned int ir : 1; /* ir flag: enable (1) or disable (0) irq */ + unsigned int _res2 : 4; + unsigned int gisc : 3; /* guest isc field */ + unsigned int _res3 : 6; + unsigned int gf : 2; /* gisa format */ + unsigned int _res4 : 1; + unsigned int gisa : 27; /* gisa origin */ + unsigned int _res5 : 1; + unsigned int isc : 3; /* irq sub class */ +}; + +/** + * ap_queue_irq_ctrl(): Control interruption on an AP queue. + * @qid: The AP queue number + * @qirqctrl: struct ap_qirq_ctrl, see above + * @ind: The notification indicator byte + * + * Returns AP queue status. + * + * Control interruption on the given AP queue. + * Just a simple wrapper function for the low-level PQAP(AQIC) + * instruction available for other kernel modules.
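For illustration: the AP_MKQID()/AP_QID_CARD()/AP_QID_QUEUE() helpers introduced by this header pack a 6-bit card index and an 8-bit queue index into a single ap_qid_t. A minimal user-space sketch of the round trip, copying just the three macros from the new header (illustrative only, not part of the patch):

#include <stdio.h>

/* Copies of the macros added in arch/s390/include/asm/ap.h. */
#define AP_MKQID(_card, _queue) (((_card) & 63) << 8 | ((_queue) & 255))
#define AP_QID_CARD(_qid)       (((_qid) >> 8) & 63)
#define AP_QID_QUEUE(_qid)      ((_qid) & 255)

int main(void)
{
	unsigned int qid = AP_MKQID(5, 17);	/* card 5, queue 17 */

	/* Prints "qid=0x511 card=5 queue=17". */
	printf("qid=%#x card=%u queue=%u\n",
	       qid, AP_QID_CARD(qid), AP_QID_QUEUE(qid));
	return 0;
}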
+ */ +struct ap_queue_status ap_queue_irq_ctrl(ap_qid_t qid, + struct ap_qirq_ctrl qirqctrl, + void *ind); + +#endif /* _ASM_S390_AP_H_ */ diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bd6f30304518..3f46a6577b8d 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -5,12 +5,11 @@ #include <linux/errno.h> typedef struct { + spinlock_t lock; cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; - spinlock_t pgtable_lock; struct list_head pgtable_list; - spinlock_t gmap_lock; struct list_head gmap_list; unsigned long gmap_asce; unsigned long asce; @@ -27,10 +26,8 @@ typedef struct { } mm_context_t; #define INIT_MM_CONTEXT(name) \ - .context.pgtable_lock = \ - __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ + .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \ - .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \ .context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list), static inline int tprot(unsigned long addr) diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 72e9ca83a668..3c9abedc323c 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -17,9 +17,8 @@ static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - spin_lock_init(&mm->context.pgtable_lock); + spin_lock_init(&mm->context.lock); INIT_LIST_HEAD(&mm->context.pgtable_list); - spin_lock_init(&mm->context.gmap_lock); INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); @@ -103,7 +102,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (prev == next) return; cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); - cpumask_set_cpu(cpu, mm_cpumask(next)); /* Clear old ASCE by loading the kernel ASCE. */ __ctl_load(S390_lowcore.kernel_asce, 1, 1); __ctl_load(S390_lowcore.kernel_asce, 7, 7); @@ -121,9 +119,8 @@ static inline void finish_arch_post_lock_switch(void) preempt_disable(); while (atomic_read(&mm->context.flush_count)) cpu_relax(); - - if (mm->context.flush_mm) - __tlb_flush_mm(mm); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); + __tlb_flush_mm_lazy(mm); preempt_enable(); } set_fs(current->thread.mm_segment); @@ -136,6 +133,7 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) { switch_mm(prev, next, current); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); set_user_asce(next); } diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h index 4d759f8f4bc7..b08d5bc2666e 100644 --- a/arch/s390/include/asm/tlbflush.h +++ b/arch/s390/include/asm/tlbflush.h @@ -48,23 +48,6 @@ static inline void __tlb_flush_global(void) * Flush TLB entries for a specific mm on all CPUs (in case gmap is used * this implicates multiple ASCEs!). 
*/ -static inline void __tlb_flush_full(struct mm_struct *mm) -{ - preempt_disable(); - atomic_inc(&mm->context.flush_count); - if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { - /* Local TLB flush */ - __tlb_flush_local(); - } else { - /* Global TLB flush */ - __tlb_flush_global(); - /* Reset TLB flush mask */ - cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); - } - atomic_dec(&mm->context.flush_count); - preempt_enable(); -} - static inline void __tlb_flush_mm(struct mm_struct *mm) { unsigned long gmap_asce; @@ -76,16 +59,18 @@ static inline void __tlb_flush_mm(struct mm_struct *mm) */ preempt_disable(); atomic_inc(&mm->context.flush_count); + /* Reset TLB flush mask */ + cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); + barrier(); gmap_asce = READ_ONCE(mm->context.gmap_asce); if (MACHINE_HAS_IDTE && gmap_asce != -1UL) { if (gmap_asce) __tlb_flush_idte(gmap_asce); __tlb_flush_idte(mm->context.asce); } else { - __tlb_flush_full(mm); + /* Global TLB flush */ + __tlb_flush_global(); } - /* Reset TLB flush mask */ - cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); atomic_dec(&mm->context.flush_count); preempt_enable(); } @@ -99,7 +84,6 @@ static inline void __tlb_flush_kernel(void) } #else #define __tlb_flush_global() __tlb_flush_local() -#define __tlb_flush_full(mm) __tlb_flush_local() /* * Flush TLB entries for a specific ASCE on all CPUs. @@ -117,10 +101,12 @@ static inline void __tlb_flush_kernel(void) static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) { + spin_lock(&mm->context.lock); if (mm->context.flush_mm) { - __tlb_flush_mm(mm); mm->context.flush_mm = 0; + __tlb_flush_mm(mm); } + spin_unlock(&mm->context.lock); } /* diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index c620049c61f2..f549c4657376 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -75,35 +75,34 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) at the same time. */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (from->si_code >> 16) { - case __SI_RT >> 16: /* This is not generated by the kernel as of now. 
*/ - case __SI_MESGQ >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_RT: err |= __put_user(from->si_int, &to->si_int); /* fallthrough */ - case __SI_KILL >> 16: + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __put_user((unsigned long) from->si_addr, &to->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); @@ -127,32 +126,31 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) if (to->si_code < 0) err |= __copy_from_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (to->si_code >> 16) { - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ >> 16: + switch (siginfo_layout(to->si_signo, to->si_code)) { + case SIL_RT: err |= __get_user(to->si_int, &from->si_int); /* fallthrough */ - case __SI_KILL >> 16: + case SIL_KILL: err |= __get_user(to->si_pid, &from->si_pid); err |= __get_user(to->si_uid, &from->si_uid); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __get_user(to->si_pid, &from->si_pid); err |= __get_user(to->si_uid, &from->si_uid); err |= __get_user(to->si_utime, &from->si_utime); err |= __get_user(to->si_stime, &from->si_stime); err |= __get_user(to->si_status, &from->si_status); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __get_user(tmp, &from->si_addr); to->si_addr = (void __force __user *) (u64) (tmp & PSW32_ADDR_INSN); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __get_user(to->si_band, &from->si_band); err |= __get_user(to->si_fd, &from->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __get_user(to->si_tid, &from->si_tid); err |= __get_user(to->si_overrun, &from->si_overrun); err |= __get_user(to->si_int, &from->si_int); diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 9e1494e3d849..2f66290c9b92 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -100,14 +100,14 @@ struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit) if (!gmap) return NULL; gmap->mm = mm; - spin_lock(&mm->context.gmap_lock); + spin_lock(&mm->context.lock); list_add_rcu(&gmap->list, &mm->context.gmap_list); if (list_is_singular(&mm->context.gmap_list)) gmap_asce = gmap->asce; else gmap_asce = -1UL; WRITE_ONCE(mm->context.gmap_asce, gmap_asce); - spin_unlock(&mm->context.gmap_lock); + spin_unlock(&mm->context.lock); return gmap; } EXPORT_SYMBOL_GPL(gmap_create); @@ -248,7 +248,7 @@ void gmap_remove(struct gmap *gmap) spin_unlock(&gmap->shadow_lock); } /* Remove gmap from the pre-mm list */ - spin_lock(&gmap->mm->context.gmap_lock); + spin_lock(&gmap->mm->context.lock); list_del_rcu(&gmap->list); if (list_empty(&gmap->mm->context.gmap_list)) gmap_asce = 0; @@ -258,7 +258,7 @@ void gmap_remove(struct gmap *gmap) else gmap_asce = -1UL; WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce); - 
spin_unlock(&gmap->mm->context.gmap_lock); + spin_unlock(&gmap->mm->context.lock); synchronize_rcu(); /* Put reference */ gmap_put(gmap); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index c5b74dd61197..05f1f27e6708 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -83,7 +83,7 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end) int rc, notify; /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */ - BUG_ON(mm->context.asce_limit < _REGION2_SIZE); + VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE); if (end >= TASK_SIZE_MAX) return -ENOMEM; rc = 0; @@ -124,7 +124,7 @@ void crst_table_downgrade(struct mm_struct *mm) pgd_t *pgd; /* downgrade should only happen from 3 to 2 levels (compat only) */ - BUG_ON(mm->context.asce_limit != _REGION2_SIZE); + VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE); if (current->active_mm == mm) { clear_user_asce(); @@ -188,7 +188,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Try to get a fragment of a 4K page as a 2K page table */ if (!mm_alloc_pgste(mm)) { table = NULL; - spin_lock_bh(&mm->context.pgtable_lock); + spin_lock_bh(&mm->context.lock); if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); @@ -203,7 +203,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) list_del(&page->lru); } } - spin_unlock_bh(&mm->context.pgtable_lock); + spin_unlock_bh(&mm->context.lock); if (table) return table; } @@ -227,9 +227,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) /* Return the first 2K fragment of the page */ atomic_set(&page->_mapcount, 1); clear_table(table, _PAGE_INVALID, PAGE_SIZE); - spin_lock_bh(&mm->context.pgtable_lock); + spin_lock_bh(&mm->context.lock); list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.pgtable_lock); + spin_unlock_bh(&mm->context.lock); } return table; } @@ -243,13 +243,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.pgtable_lock); + spin_lock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_mapcount, 1U << bit); if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.pgtable_lock); + spin_unlock_bh(&mm->context.lock); if (mask != 0) return; } @@ -275,13 +275,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, return; } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); - spin_lock_bh(&mm->context.pgtable_lock); + spin_lock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else list_del(&page->lru); - spin_unlock_bh(&mm->context.pgtable_lock); + spin_unlock_bh(&mm->context.lock); table = (unsigned long *) (__pa(table) | (1U << bit)); tlb_remove_table(tlb, table); } diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c index 9d597f7ab8dd..48aaefd8f5d6 100644 --- a/arch/sh/drivers/pci/fixups-dreamcast.c +++ b/arch/sh/drivers/pci/fixups-dreamcast.c @@ -63,11 +63,10 @@ static void gapspci_fixup_resources(struct pci_dev *dev) res.end = GAPSPCI_DMA_BASE + GAPSPCI_DMA_SIZE - 1; res.flags = IORESOURCE_MEM; pcibios_resource_to_bus(dev->bus, ®ion, &res); - BUG_ON(!dma_declare_coherent_memory(&dev->dev, + 
BUG_ON(dma_declare_coherent_memory(&dev->dev, res.start, region.start, resource_size(&res), - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE)); break; default: diff --git a/arch/sparc/include/uapi/asm/siginfo.h b/arch/sparc/include/uapi/asm/siginfo.h index 2d9b79ccaa50..157f46fe374f 100644 --- a/arch/sparc/include/uapi/asm/siginfo.h +++ b/arch/sparc/include/uapi/asm/siginfo.h @@ -17,9 +17,16 @@ #define SI_NOINFO 32767 /* no information in siginfo_t */ /* + * SIGFPE si_codes + */ +#ifdef __KERNEL__ +#define FPE_FIXME 0 /* Broken dup of SI_USER */ +#endif /* __KERNEL__ */ + +/* * SIGEMT si_codes */ -#define EMT_TAGOVF (__SI_FAULT|1) /* tag overflow */ +#define EMT_TAGOVF 1 /* tag overflow */ #define NSIGEMT 1 #endif /* _UAPI__SPARC_SIGINFO_H */ diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index b4096bb665b2..0e4c08c45a37 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -85,34 +85,34 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) at the same time. */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { - switch (from->si_code >> 16) { - case __SI_TIMER >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_TIMER: err |= __put_user(from->si_tid, &to->si_tid); err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); default: + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_FAULT >> 16: + case SIL_FAULT: err |= __put_user(from->si_trapno, &to->si_trapno); err |= __put_user((unsigned long)from->si_addr, &to->si_addr); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. 
*/ - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c index 466d4aed06c7..581cf35ee7e3 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -306,7 +306,7 @@ void do_fpe_trap(struct pt_regs *regs, unsigned long pc, unsigned long npc, info.si_errno = 0; info.si_addr = (void __user *)pc; info.si_trapno = 0; - info.si_code = __SI_FAULT; + info.si_code = FPE_FIXME; if ((fsr & 0x1c000) == (1 << 14)) { if (fsr & 0x10) info.si_code = FPE_FLTINV; diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index c74f2dffcc13..0a56dc257cb9 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2303,7 +2303,7 @@ static void do_fpe_common(struct pt_regs *regs) info.si_errno = 0; info.si_addr = (void __user *)regs->tpc; info.si_trapno = 0; - info.si_code = __SI_FAULT; + info.si_code = FPE_FIXME; if ((fsr & 0x1c000) == (1 << 14)) { if (fsr & 0x10) info.si_code = FPE_FLTINV; diff --git a/arch/tile/include/asm/dma-mapping.h b/arch/tile/include/asm/dma-mapping.h index bbc71a29b2c6..7061dc8af43a 100644 --- a/arch/tile/include/asm/dma-mapping.h +++ b/arch/tile/include/asm/dma-mapping.h @@ -68,8 +68,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) int dma_set_mask(struct device *dev, u64 mask); /* - * dma_alloc_noncoherent() is #defined to return coherent memory, - * so there's no need to do any flushing here. + * dma_alloc_attrs() always returns non-cacheable memory, so there's no need to + * do any flushing here. */ static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, enum dma_data_direction direction) diff --git a/arch/tile/include/uapi/asm/siginfo.h b/arch/tile/include/uapi/asm/siginfo.h index 56d661bb010b..e83f931aa1f0 100644 --- a/arch/tile/include/uapi/asm/siginfo.h +++ b/arch/tile/include/uapi/asm/siginfo.h @@ -26,8 +26,8 @@ /* * Additional Tile-specific SIGILL si_codes */ -#define ILL_DBLFLT (__SI_FAULT|9) /* double fault */ -#define ILL_HARDWALL (__SI_FAULT|10) /* user networks hardwall violation */ +#define ILL_DBLFLT 9 /* double fault */ +#define ILL_HARDWALL 10 /* user networks hardwall violation */ #undef NSIGILL #define NSIGILL 10 diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index 0e863f1ee08c..971d87a1d8cf 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -64,7 +64,7 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *fr 3 ints plus the relevant union member. 
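All of these per-arch compat paths convert the same idiom: instead of packing a layout tag into the high 16 bits of si_code (the old __SI_* values) and switching on si_code >> 16, they ask siginfo_layout() which union layout applies. A simplified user-space sketch of what such a classifier does, assuming glibc's SI_* constants; the kernel's real siginfo_layout() in kernel/signal.c handles more special cases:

#include <signal.h>

/* Union layouts a siginfo can take, mirroring the SIL_* tags above. */
enum siginfo_layout {
	SIL_KILL, SIL_TIMER, SIL_POLL, SIL_FAULT,
	SIL_CHLD, SIL_RT, SIL_SYS,
};

static enum siginfo_layout classify(int sig, int si_code)
{
	if (si_code == SI_TIMER)	/* negative; must test first */
		return SIL_TIMER;
	if (si_code <= 0)		/* SI_USER, SI_QUEUE, ... */
		return si_code == SI_USER ? SIL_KILL : SIL_RT;
	switch (sig) {			/* positive, kernel-generated codes */
	case SIGCHLD:
		return SIL_CHLD;
	case SIGIO:			/* SIGIO aliases SIGPOLL on Linux */
		return SIL_POLL;
	case SIGSYS:
		return SIL_SYS;
	case SIGILL: case SIGFPE: case SIGSEGV:
	case SIGBUS: case SIGTRAP:
		return SIL_FAULT;
	default:
		return SIL_KILL;
	}
}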
*/ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user(from->si_code, &to->si_code); if (from->si_code < 0) { err |= __put_user(from->si_pid, &to->si_pid); @@ -77,28 +77,26 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *fr */ err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_FAULT: break; - case __SI_CHLD >> 16: + case SIL_CHLD: err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); err |= __put_user(from->si_status, &to->si_status); /* FALL THROUGH */ default: - case __SI_KILL >> 16: + case SIL_KILL: err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_POLL >> 16: + case SIL_POLL: err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(from->si_int, &to->si_int); break; - /* This is not generated by the kernel as of now. */ - case __SI_RT >> 16: - case __SI_MESGQ >> 16: + case SIL_RT: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); break; diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c index 54804866f238..9b08c6055f15 100644 --- a/arch/tile/kernel/traps.c +++ b/arch/tile/kernel/traps.c @@ -188,7 +188,7 @@ static int special_ill(tile_bundle_bits bundle, int *sigp, int *codep) /* Make it the requested signal. */ *sigp = sig; - *codep = code | __SI_FAULT; + *codep = code; return 1; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a3e6e6136a47..971feac13506 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -53,7 +53,6 @@ config X86 select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 - select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_PMEM_API if X86_64 # Causing hangs/crashes, see the commit that added this change for details. 
select ARCH_HAS_REFCOUNT if BROKEN diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 8b4140f6724f..cb9a1af109b4 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -7,6 +7,4 @@ void clflush_cache_range(void *addr, unsigned int size); -#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 8e618fcf1f7c..6a77c63540f7 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -21,7 +21,7 @@ #ifdef CONFIG_AMD_MEM_ENCRYPT -extern unsigned long sme_me_mask; +extern u64 sme_me_mask; void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, unsigned long decrypted_kernel_vaddr, @@ -49,7 +49,7 @@ void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); #else /* !CONFIG_AMD_MEM_ENCRYPT */ -#define sme_me_mask 0UL +#define sme_me_mask 0ULL static inline void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) { } diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 831eb7895535..c471ca1f9412 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -86,7 +86,6 @@ static inline void e820__memblock_alloc_reserved_mpc_new(void) { } #endif int generic_processor_info(int apicid, int version); -int __generic_processor_info(int apicid, int version, bool enabled); #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 8315e2f517a7..d705c769f77d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2130,7 +2130,7 @@ int generic_processor_info(int apicid, int version) * Since fixing handling of boot_cpu_physical_apicid requires * another discussion and tests on each platform, we leave it * for now and here we use read_apic_id() directly in this - * function, __generic_processor_info(). + * function, generic_processor_info(). */ if (disabled_cpu_apicid != BAD_APICID && disabled_cpu_apicid != read_apic_id() && diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 71beb28600d4..ab9feb5887b1 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -129,7 +129,7 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, 3 ints plus the relevant union member. 
*/ put_user_ex(from->si_signo, &to->si_signo); put_user_ex(from->si_errno, &to->si_errno); - put_user_ex((short)from->si_code, &to->si_code); + put_user_ex(from->si_code, &to->si_code); if (from->si_code < 0) { put_user_ex(from->si_pid, &to->si_pid); @@ -142,8 +142,8 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, */ put_user_ex(from->_sifields._pad[0], &to->_sifields._pad[0]); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_FAULT: if (from->si_signo == SIGBUS && (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) @@ -160,11 +160,11 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, put_user_ex(from->si_pkey, &to->si_pkey); } break; - case __SI_SYS >> 16: + case SIL_SYS: put_user_ex(from->si_syscall, &to->si_syscall); put_user_ex(from->si_arch, &to->si_arch); break; - case __SI_CHLD >> 16: + case SIL_CHLD: if (!x32_ABI) { put_user_ex(from->si_utime, &to->si_utime); put_user_ex(from->si_stime, &to->si_stime); @@ -174,21 +174,18 @@ int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, } put_user_ex(from->si_status, &to->si_status); /* FALL THROUGH */ - default: - case __SI_KILL >> 16: + case SIL_KILL: put_user_ex(from->si_uid, &to->si_uid); break; - case __SI_POLL >> 16: + case SIL_POLL: put_user_ex(from->si_fd, &to->si_fd); break; - case __SI_TIMER >> 16: + case SIL_TIMER: put_user_ex(from->si_overrun, &to->si_overrun); put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); break; - /* This is not generated by the kernel as of now. */ - case __SI_RT >> 16: - case __SI_MESGQ >> 16: + case SIL_RT: put_user_ex(from->si_uid, &to->si_uid); put_user_ex(from->si_int, &to->si_int); break; diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 0fbd09269757..3fcc8e01683b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -37,7 +37,7 @@ static char sme_cmdline_off[] __initdata = "off"; * reside in the .data section so as not to be zeroed out when the .bss * section is later cleared. 
*/ -unsigned long sme_me_mask __section(.data) = 0; +u64 sme_me_mask __section(.data) = 0; EXPORT_SYMBOL_GPL(sme_me_mask); /* Buffer used for early in-place encryption by BSP, no locking needed */ diff --git a/drivers/acpi/nfit/Kconfig b/drivers/acpi/nfit/Kconfig index 6d3351452ea2..929ba4da0b30 100644 --- a/drivers/acpi/nfit/Kconfig +++ b/drivers/acpi/nfit/Kconfig @@ -2,7 +2,7 @@ config ACPI_NFIT tristate "ACPI NVDIMM Firmware Interface Table (NFIT)" depends on PHYS_ADDR_T_64BIT depends on BLK_DEV - depends on ARCH_HAS_MMIO_FLUSH + depends on ARCH_HAS_PMEM_API select LIBNVDIMM help Infrastructure to probe ACPI 6 compliant platforms for diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 1893e416e7c0..9c2c49b6a240 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -228,6 +228,10 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, if (cmd == ND_CMD_CALL) { call_pkg = buf; func = call_pkg->nd_command; + + for (i = 0; i < ARRAY_SIZE(call_pkg->nd_reserved2); i++) + if (call_pkg->nd_reserved2[i]) + return -EINVAL; } if (nvdimm) { @@ -1674,8 +1678,19 @@ static ssize_t range_index_show(struct device *dev, } static DEVICE_ATTR_RO(range_index); +static ssize_t ecc_unit_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region); + + return sprintf(buf, "%d\n", nfit_spa->clear_err_unit); +} +static DEVICE_ATTR_RO(ecc_unit_size); + static struct attribute *acpi_nfit_region_attributes[] = { &dev_attr_range_index.attr, + &dev_attr_ecc_unit_size.attr, NULL, }; @@ -1804,6 +1819,7 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); struct acpi_nfit_memory_map *memdev = memdev_from_spa(acpi_desc, spa->range_index, i); + struct acpi_nfit_control_region *dcr = nfit_mem->dcr; if (!memdev || !nfit_mem->dcr) { dev_err(dev, "%s: failed to find DCR\n", __func__); @@ -1811,13 +1827,13 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, } map->region_offset = memdev->region_offset; - map->serial_number = nfit_mem->dcr->serial_number; + map->serial_number = dcr->serial_number; map2->region_offset = memdev->region_offset; - map2->serial_number = nfit_mem->dcr->serial_number; - map2->vendor_id = nfit_mem->dcr->vendor_id; - map2->manufacturing_date = nfit_mem->dcr->manufacturing_date; - map2->manufacturing_location = nfit_mem->dcr->manufacturing_location; + map2->serial_number = dcr->serial_number; + map2->vendor_id = dcr->vendor_id; + map2->manufacturing_date = dcr->manufacturing_date; + map2->manufacturing_location = dcr->manufacturing_location; } /* v1.1 namespaces */ @@ -1835,6 +1851,28 @@ static int acpi_nfit_init_interleave_set(struct acpi_nfit_desc *acpi_desc, cmp_map_compat, NULL); nd_set->altcookie = nd_fletcher64(info, sizeof_nfit_set_info(nr), 0); + /* record the result of the sort for the mapping position */ + for (i = 0; i < nr; i++) { + struct nfit_set_info_map2 *map2 = &info2->mapping[i]; + int j; + + for (j = 0; j < nr; j++) { + struct nd_mapping_desc *mapping = &ndr_desc->mapping[j]; + struct nvdimm *nvdimm = mapping->nvdimm; + struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); + struct acpi_nfit_control_region *dcr = nfit_mem->dcr; + + if (map2->serial_number == dcr->serial_number && + map2->vendor_id == dcr->vendor_id && + map2->manufacturing_date == dcr->manufacturing_date && + 
map2->manufacturing_location + == dcr->manufacturing_location) { + mapping->position = i; + break; + } + } + } + ndr_desc->nd_set = nd_set; devm_kfree(dev, info); devm_kfree(dev, info2); @@ -1930,7 +1968,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk *nfit_blk, memcpy_flushcache(mmio->addr.aperture + offset, iobuf + copied, c); else { if (nfit_blk->dimm_flags & NFIT_BLK_READ_FLUSH) - mmio_flush_range((void __force *) + arch_invalidate_pmem((void __force *) mmio->addr.aperture + offset, c); memcpy(iobuf + copied, mmio->addr.aperture + offset, c); diff --git a/drivers/base/dma-coherent.c b/drivers/base/dma-coherent.c index 1c152aed6b82..a39b2166b145 100644 --- a/drivers/base/dma-coherent.c +++ b/drivers/base/dma-coherent.c @@ -37,7 +37,7 @@ static inline dma_addr_t dma_get_device_base(struct device *dev, return mem->device_base; } -static bool dma_init_coherent_memory( +static int dma_init_coherent_memory( phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags, struct dma_coherent_mem **mem) { @@ -45,25 +45,28 @@ static bool dma_init_coherent_memory( void __iomem *mem_base = NULL; int pages = size >> PAGE_SHIFT; int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + int ret; - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) + if (!size) { + ret = -EINVAL; goto out; + } - if (flags & DMA_MEMORY_MAP) - mem_base = memremap(phys_addr, size, MEMREMAP_WC); - else - mem_base = ioremap(phys_addr, size); - if (!mem_base) + mem_base = memremap(phys_addr, size, MEMREMAP_WC); + if (!mem_base) { + ret = -EINVAL; goto out; - + } dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dma_mem) + if (!dma_mem) { + ret = -ENOMEM; goto out; + } dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dma_mem->bitmap) + if (!dma_mem->bitmap) { + ret = -ENOMEM; goto out; + } dma_mem->virt_base = mem_base; dma_mem->device_base = device_addr; @@ -73,17 +76,13 @@ static bool dma_init_coherent_memory( spin_lock_init(&dma_mem->spinlock); *mem = dma_mem; - return true; + return 0; out: kfree(dma_mem); - if (mem_base) { - if (flags & DMA_MEMORY_MAP) - memunmap(mem_base); - else - iounmap(mem_base); - } - return false; + if (mem_base) + memunmap(mem_base); + return ret; } static void dma_release_coherent_memory(struct dma_coherent_mem *mem) @@ -91,10 +90,7 @@ static void dma_release_coherent_memory(struct dma_coherent_mem *mem) if (!mem) return; - if (mem->flags & DMA_MEMORY_MAP) - memunmap(mem->virt_base); - else - iounmap(mem->virt_base); + memunmap(mem->virt_base); kfree(mem->bitmap); kfree(mem); } @@ -109,8 +105,6 @@ static int dma_assign_coherent_memory(struct device *dev, return -EBUSY; dev->dma_mem = mem; - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - return 0; } @@ -118,16 +112,16 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags) { struct dma_coherent_mem *mem; + int ret; - if (!dma_init_coherent_memory(phys_addr, device_addr, size, flags, - &mem)) - return 0; - - if (dma_assign_coherent_memory(dev, mem) == 0) - return flags & DMA_MEMORY_MAP ? 
DMA_MEMORY_MAP : DMA_MEMORY_IO; + ret = dma_init_coherent_memory(phys_addr, device_addr, size, flags, &mem); + if (ret) + return ret; - dma_release_coherent_memory(mem); - return 0; + ret = dma_assign_coherent_memory(dev, mem); + if (ret) + dma_release_coherent_memory(mem); + return ret; } EXPORT_SYMBOL(dma_declare_coherent_memory); @@ -171,7 +165,6 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, int order = get_order(size); unsigned long flags; int pageno; - int dma_memory_map; void *ret; spin_lock_irqsave(&mem->spinlock, flags); @@ -188,15 +181,9 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, */ *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); ret = mem->virt_base + (pageno << PAGE_SHIFT); - dma_memory_map = (mem->flags & DMA_MEMORY_MAP); spin_unlock_irqrestore(&mem->spinlock, flags); - if (dma_memory_map) - memset(ret, 0, size); - else - memset_io(ret, 0, size); - + memset(ret, 0, size); return ret; - err: spin_unlock_irqrestore(&mem->spinlock, flags); return NULL; @@ -359,14 +346,18 @@ static struct reserved_mem *dma_reserved_default_memory __initdata; static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) { struct dma_coherent_mem *mem = rmem->priv; + int ret; - if (!mem && - !dma_init_coherent_memory(rmem->base, rmem->base, rmem->size, - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE, - &mem)) { + if (!mem) + return -ENODEV; + + ret = dma_init_coherent_memory(rmem->base, rmem->base, rmem->size, + DMA_MEMORY_EXCLUSIVE, &mem); + + if (ret) { pr_err("Reserved memory: failed to init DMA memory pool at %pa, size %ld MiB\n", &rmem->base, (unsigned long)rmem->size / SZ_1M); - return -ENODEV; + return ret; } mem->use_dev_dma_pfn_offset = true; rmem->priv = mem; diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index b555ff9dd8fc..e584eddef0a7 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c @@ -176,13 +176,10 @@ int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, rc = dma_declare_coherent_memory(dev, phys_addr, device_addr, size, flags); - if (rc) { + if (!rc) devres_add(dev, res); - rc = 0; - } else { + else devres_free(res); - rc = -ENOMEM; - } return rc; } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index b008b6a98098..b640ad8a6d20 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3435,7 +3435,7 @@ static void rbd_acquire_lock(struct work_struct *work) struct rbd_device *rbd_dev = container_of(to_delayed_work(work), struct rbd_device, lock_dwork); enum rbd_lock_state lock_state; - int ret; + int ret = 0; dout("%s rbd_dev %p\n", __func__, rbd_dev); again: diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 23f33f95d4a6..d1aed2513bd9 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -451,9 +451,6 @@ static struct port_buffer *alloc_buf(struct virtqueue *vq, size_t buf_size, * device is created by remoteproc, the DMA memory is * associated with the grandparent device: * vdev => rproc => platform-dev. 
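The dreamcast and CEU hunks elsewhere in this series show the knock-on effect of this rework: dma_declare_coherent_memory() now returns 0 or a negative errno rather than a truthy DMA_MEMORY_* flag value, so callers invert their checks and can propagate the real error. A hedged sketch of the new calling convention in a probe path (the function and message below are made up for illustration):

#include <linux/dma-mapping.h>
#include <linux/platform_device.h>

/* Illustrative probe fragment, not from the patch: declare a
 * device-local coherent pool and propagate the errno on failure
 * instead of testing for a DMA_MEMORY_* flag value. */
static int example_probe(struct platform_device *pdev,
			 struct resource *res)
{
	int err;

	err = dma_declare_coherent_memory(&pdev->dev, res->start,
					  res->start, resource_size(res),
					  DMA_MEMORY_EXCLUSIVE);
	if (err) {
		dev_err(&pdev->dev, "no coherent pool: %d\n", err);
		return err;	/* was: return -ENXIO on !err */
	}
	return 0;
}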
- * The code here would have been less quirky if - * DMA_MEMORY_INCLUDES_CHILDREN had been supported - * in dma-coherent.c */ if (!vq->vdev->dev.parent || !vq->vdev->dev.parent->parent) goto free_buf; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 938eb4868f7f..3600ff786646 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -46,6 +46,8 @@ void dax_read_unlock(int id) EXPORT_SYMBOL_GPL(dax_read_unlock); #ifdef CONFIG_BLOCK +#include <linux/blkdev.h> + int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, pgoff_t *pgoff) { @@ -59,6 +61,16 @@ int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size, } EXPORT_SYMBOL(bdev_dax_pgoff); +#if IS_ENABLED(CONFIG_FS_DAX) +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) +{ + if (!blk_queue_dax(bdev->bd_queue)) + return NULL; + return fs_dax_get_by_host(bdev->bd_disk->disk_name); +} +EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); +#endif + /** * __bdev_dax_supported() - Check if the device supports dax for filesystem * @sb: The superblock of the device diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index bf3672a81e49..d2fcafcea07e 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -534,7 +534,7 @@ static void cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata, int sec_no) { - uuid_le *sec_type = (uuid_le *)gdata->section_type; + guid_t *sec_type = (guid_t *)gdata->section_type; __u16 severity; char newpfx[64]; @@ -545,12 +545,12 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata printk("%s""Error %d, type: %s\n", pfx, sec_no, cper_severity_str(severity)); if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) - printk("%s""fru_id: %pUl\n", pfx, (uuid_le *)gdata->fru_id); + printk("%s""fru_id: %pUl\n", pfx, gdata->fru_id); if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) printk("%s""fru_text: %.20s\n", pfx, gdata->fru_text); snprintf(newpfx, sizeof(newpfx), "%s%s", pfx, INDENT_SP); - if (!uuid_le_cmp(*sec_type, CPER_SEC_PROC_GENERIC)) { + if (guid_equal(sec_type, &CPER_SEC_PROC_GENERIC)) { struct cper_sec_proc_generic *proc_err = acpi_hest_get_payload(gdata); printk("%s""section_type: general processor error\n", newpfx); @@ -558,7 +558,7 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata cper_print_proc_generic(newpfx, proc_err); else goto err_section_too_small; - } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PLATFORM_MEM)) { + } else if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); printk("%s""section_type: memory error\n", newpfx); @@ -568,7 +568,7 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata gdata->error_data_length); else goto err_section_too_small; - } else if (!uuid_le_cmp(*sec_type, CPER_SEC_PCIE)) { + } else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { struct cper_sec_pcie *pcie = acpi_hest_get_payload(gdata); printk("%s""section_type: PCIe error\n", newpfx); diff --git a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c b/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c index 96dc01750bc0..36762ec954e7 100644 --- a/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c +++ b/drivers/media/platform/soc_camera/sh_mobile_ceu_camera.c @@ -1708,11 +1708,10 @@ static int sh_mobile_ceu_probe(struct platform_device *pdev) err = dma_declare_coherent_memory(&pdev->dev, res->start, res->start, resource_size(res), - 
DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE); - if (!err) { + if (err) { dev_err(&pdev->dev, "Unable to declare CEU memory.\n"); - return -ENXIO; + return err; } pcdev->video_limit = resource_size(res); diff --git a/drivers/net/ethernet/amd/au1000_eth.c b/drivers/net/ethernet/amd/au1000_eth.c index a3c90fe5de00..73ca8879ada7 100644 --- a/drivers/net/ethernet/amd/au1000_eth.c +++ b/drivers/net/ethernet/amd/au1000_eth.c @@ -1180,9 +1180,10 @@ static int au1000_probe(struct platform_device *pdev) /* Allocate the data buffers * Snooping works fine with eth on all au1xxx */ - aup->vaddr = (u32)dma_alloc_noncoherent(NULL, MAX_BUF_SIZE * - (NUM_TX_BUFFS + NUM_RX_BUFFS), - &aup->dma_addr, 0); + aup->vaddr = (u32)dma_alloc_attrs(NULL, MAX_BUF_SIZE * + (NUM_TX_BUFFS + NUM_RX_BUFFS), + &aup->dma_addr, 0, + DMA_ATTR_NON_CONSISTENT); if (!aup->vaddr) { dev_err(&pdev->dev, "failed to allocate data buffers\n"); err = -ENOMEM; @@ -1361,8 +1362,9 @@ err_remap3: err_remap2: iounmap(aup->mac); err_remap1: - dma_free_noncoherent(NULL, MAX_BUF_SIZE * (NUM_TX_BUFFS + NUM_RX_BUFFS), - (void *)aup->vaddr, aup->dma_addr); + dma_free_attrs(NULL, MAX_BUF_SIZE * (NUM_TX_BUFFS + NUM_RX_BUFFS), + (void *)aup->vaddr, aup->dma_addr, + DMA_ATTR_NON_CONSISTENT); err_vaddr: free_netdev(dev); err_alloc: @@ -1394,9 +1396,9 @@ static int au1000_remove(struct platform_device *pdev) if (aup->tx_db_inuse[i]) au1000_ReleaseDB(aup, aup->tx_db_inuse[i]); - dma_free_noncoherent(NULL, MAX_BUF_SIZE * - (NUM_TX_BUFFS + NUM_RX_BUFFS), - (void *)aup->vaddr, aup->dma_addr); + dma_free_attrs(NULL, MAX_BUF_SIZE * (NUM_TX_BUFFS + NUM_RX_BUFFS), + (void *)aup->vaddr, aup->dma_addr, + DMA_ATTR_NON_CONSISTENT); iounmap(aup->macdma); iounmap(aup->mac); diff --git a/drivers/net/ethernet/i825xx/lasi_82596.c b/drivers/net/ethernet/i825xx/lasi_82596.c index aa22e108f09b..b69c622ba8b2 100644 --- a/drivers/net/ethernet/i825xx/lasi_82596.c +++ b/drivers/net/ethernet/i825xx/lasi_82596.c @@ -96,8 +96,6 @@ #define OPT_SWAP_PORT 0x0001 /* Need to wordswp on the MPU port */ -#define DMA_ALLOC dma_alloc_noncoherent -#define DMA_FREE dma_free_noncoherent #define DMA_WBACK(ndev, addr, len) \ do { dma_cache_sync((ndev)->dev.parent, (void *)addr, len, DMA_TO_DEVICE); } while (0) @@ -200,8 +198,8 @@ static int __exit lan_remove_chip(struct parisc_device *pdev) struct i596_private *lp = netdev_priv(dev); unregister_netdev (dev); - DMA_FREE(&pdev->dev, sizeof(struct i596_private), - (void *)lp->dma, lp->dma_addr); + dma_free_attrs(&pdev->dev, sizeof(struct i596_private), lp->dma, + lp->dma_addr, DMA_ATTR_NON_CONSISTENT); free_netdev (dev); return 0; } diff --git a/drivers/net/ethernet/i825xx/lib82596.c b/drivers/net/ethernet/i825xx/lib82596.c index 8449c58f01fd..f00a1dc2128c 100644 --- a/drivers/net/ethernet/i825xx/lib82596.c +++ b/drivers/net/ethernet/i825xx/lib82596.c @@ -1063,8 +1063,9 @@ static int i82596_probe(struct net_device *dev) if (!dev->base_addr || !dev->irq) return -ENODEV; - dma = (struct i596_dma *) DMA_ALLOC(dev->dev.parent, - sizeof(struct i596_dma), &lp->dma_addr, GFP_KERNEL); + dma = dma_alloc_attrs(dev->dev.parent, sizeof(struct i596_dma), + &lp->dma_addr, GFP_KERNEL, + DMA_ATTR_NON_CONSISTENT); if (!dma) { printk(KERN_ERR "%s: Couldn't get shared memory\n", __FILE__); return -ENOMEM; @@ -1085,8 +1086,8 @@ static int i82596_probe(struct net_device *dev) i = register_netdev(dev); if (i) { - DMA_FREE(dev->dev.parent, sizeof(struct i596_dma), - (void *)dma, lp->dma_addr); + dma_free_attrs(dev->dev.parent, sizeof(struct i596_dma), + dma, lp->dma_addr, 
DMA_ATTR_NON_CONSISTENT); return i; } diff --git a/drivers/net/ethernet/i825xx/sni_82596.c b/drivers/net/ethernet/i825xx/sni_82596.c index 2af7f77345fb..b2c04a789744 100644 --- a/drivers/net/ethernet/i825xx/sni_82596.c +++ b/drivers/net/ethernet/i825xx/sni_82596.c @@ -23,8 +23,6 @@ static const char sni_82596_string[] = "snirm_82596"; -#define DMA_ALLOC dma_alloc_coherent -#define DMA_FREE dma_free_coherent #define DMA_WBACK(priv, addr, len) do { } while (0) #define DMA_INV(priv, addr, len) do { } while (0) #define DMA_WBACK_INV(priv, addr, len) do { } while (0) @@ -152,8 +150,8 @@ static int sni_82596_driver_remove(struct platform_device *pdev) struct i596_private *lp = netdev_priv(dev); unregister_netdev(dev); - DMA_FREE(dev->dev.parent, sizeof(struct i596_private), - lp->dma, lp->dma_addr); + dma_free_attrs(dev->dev.parent, sizeof(struct i596_private), lp->dma, + lp->dma_addr, DMA_ATTR_NON_CONSISTENT); iounmap(lp->ca); iounmap(lp->mpu_port); free_netdev (dev); diff --git a/drivers/net/ethernet/seeq/sgiseeq.c b/drivers/net/ethernet/seeq/sgiseeq.c index 70347720fdf9..573691bc3b71 100644 --- a/drivers/net/ethernet/seeq/sgiseeq.c +++ b/drivers/net/ethernet/seeq/sgiseeq.c @@ -737,8 +737,8 @@ static int sgiseeq_probe(struct platform_device *pdev) sp = netdev_priv(dev); /* Make private data page aligned */ - sr = dma_alloc_noncoherent(&pdev->dev, sizeof(*sp->srings), - &sp->srings_dma, GFP_KERNEL); + sr = dma_alloc_attrs(&pdev->dev, sizeof(*sp->srings), &sp->srings_dma, + GFP_KERNEL, DMA_ATTR_NON_CONSISTENT); if (!sr) { printk(KERN_ERR "Sgiseeq: Page alloc failed, aborting.\n"); err = -ENOMEM; @@ -813,8 +813,8 @@ static int sgiseeq_remove(struct platform_device *pdev) struct sgiseeq_private *sp = netdev_priv(dev); unregister_netdev(dev); - dma_free_noncoherent(&pdev->dev, sizeof(*sp->srings), sp->srings, - sp->srings_dma); + dma_free_attrs(&pdev->dev, sizeof(*sp->srings), sp->srings, + sp->srings_dma, DMA_ATTR_NON_CONSISTENT); free_netdev(dev); return 0; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 60491641a8d6..d5612bd1cc81 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -31,6 +31,16 @@ enum log_ent_request { LOG_OLD_ENT }; +static struct device *to_dev(struct arena_info *arena) +{ + return &arena->nd_btt->dev; +} + +static u64 adjust_initial_offset(struct nd_btt *nd_btt, u64 offset) +{ + return offset + nd_btt->initial_offset; +} + static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, void *buf, size_t n, unsigned long flags) { @@ -38,7 +48,7 @@ static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, struct nd_namespace_common *ndns = nd_btt->ndns; /* arena offsets may be shifted from the base of the device */ - offset += arena->nd_btt->initial_offset; + offset = adjust_initial_offset(nd_btt, offset); return nvdimm_read_bytes(ndns, offset, buf, n, flags); } @@ -49,7 +59,7 @@ static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, struct nd_namespace_common *ndns = nd_btt->ndns; /* arena offsets may be shifted from the base of the device */ - offset += arena->nd_btt->initial_offset; + offset = adjust_initial_offset(nd_btt, offset); return nvdimm_write_bytes(ndns, offset, buf, n, flags); } @@ -62,8 +72,10 @@ static int btt_info_write(struct arena_info *arena, struct btt_sb *super) * We rely on that to make sure rw_bytes does error clearing * correctly, so make sure that is the case. 
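The au1000, lasi/lib82596, sni and sgiseeq conversions above all land on the same pairing for the removed dma_{alloc,free}_noncoherent() helpers. A minimal sketch of that pairing (function names are illustrative; the driver still owns its dma_cache_sync() calls if the platform hands back non-consistent memory):

#include <linux/dma-mapping.h>

/* Allocate and release a descriptor ring with the attrs API;
 * DMA_ATTR_NON_CONSISTENT lets the platform return non-consistent
 * memory, so the caller guarantees the needed sync points. */
static void *ring_alloc(struct device *dev, size_t size,
			dma_addr_t *handle)
{
	return dma_alloc_attrs(dev, size, handle, GFP_KERNEL,
			       DMA_ATTR_NON_CONSISTENT);
}

static void ring_free(struct device *dev, size_t size, void *cpu,
		      dma_addr_t handle)
{
	/* size, cpu and handle must match the allocation above. */
	dma_free_attrs(dev, size, cpu, handle, DMA_ATTR_NON_CONSISTENT);
}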
*/ - WARN_ON_ONCE(!IS_ALIGNED(arena->infooff, 512)); - WARN_ON_ONCE(!IS_ALIGNED(arena->info2off, 512)); + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->infooff, 512), + "arena->infooff: %#llx is unaligned\n", arena->infooff); + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->info2off, 512), + "arena->info2off: %#llx is unaligned\n", arena->info2off); ret = arena_write_bytes(arena, arena->info2off, super, sizeof(struct btt_sb), 0); @@ -76,7 +88,6 @@ static int btt_info_write(struct arena_info *arena, struct btt_sb *super) static int btt_info_read(struct arena_info *arena, struct btt_sb *super) { - WARN_ON(!super); return arena_read_bytes(arena, arena->infooff, super, sizeof(struct btt_sb), 0); } @@ -92,7 +103,10 @@ static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping, { u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); - WARN_ON(lba >= arena->external_nlba); + if (unlikely(lba >= arena->external_nlba)) + dev_err_ratelimited(to_dev(arena), + "%s: lba %#x out of range (max: %#x)\n", + __func__, lba, arena->external_nlba); return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE, flags); } @@ -106,7 +120,7 @@ static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, * This 'mapping' is supposed to be just the LBA mapping, without * any flags set, so strip the flag bits. */ - mapping &= MAP_LBA_MASK; + mapping = ent_lba(mapping); ze = (z_flag << 1) + e_flag; switch (ze) { @@ -131,7 +145,8 @@ static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, * construed as a valid 'normal' case, but we decide not to, * to avoid confusion */ - WARN_ONCE(1, "Invalid use of Z and E flags\n"); + dev_err_ratelimited(to_dev(arena), + "Invalid use of Z and E flags\n"); return -EIO; } @@ -147,7 +162,10 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, u32 raw_mapping, postmap, ze, z_flag, e_flag; u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); - WARN_ON(lba >= arena->external_nlba); + if (unlikely(lba >= arena->external_nlba)) + dev_err_ratelimited(to_dev(arena), + "%s: lba %#x out of range (max: %#x)\n", + __func__, lba, arena->external_nlba); ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE, rwb_flags); if (ret) @@ -155,10 +173,10 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, raw_mapping = le32_to_cpu(in); - z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT; - e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT; + z_flag = ent_z_flag(raw_mapping); + e_flag = ent_e_flag(raw_mapping); ze = (z_flag << 1) + e_flag; - postmap = raw_mapping & MAP_LBA_MASK; + postmap = ent_lba(raw_mapping); /* Reuse the {z,e}_flag variables for *trim and *error */ z_flag = 0; @@ -195,7 +213,6 @@ static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, static int btt_log_read_pair(struct arena_info *arena, u32 lane, struct log_entry *ent) { - WARN_ON(!ent); return arena_read_bytes(arena, arena->logoff + (2 * lane * LOG_ENT_SIZE), ent, 2 * LOG_ENT_SIZE, 0); @@ -299,11 +316,6 @@ static int btt_log_get_old(struct log_entry *ent) return old; } -static struct device *to_dev(struct arena_info *arena) -{ - return &arena->nd_btt->dev; -} - /* * This function copies the desired (old/new) log entry into ent if * it is not NULL. 
It returns the sub-slot number (0 or 1) @@ -381,7 +393,9 @@ static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; if (++(arena->freelist[lane].seq) == 4) arena->freelist[lane].seq = 1; - arena->freelist[lane].block = le32_to_cpu(ent->old_map); + if (ent_e_flag(ent->old_map)) + arena->freelist[lane].has_err = 1; + arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map)); return ret; } @@ -407,12 +421,14 @@ static int btt_map_init(struct arena_info *arena) * make sure rw_bytes does error clearing correctly, so make sure that * is the case. */ - WARN_ON_ONCE(!IS_ALIGNED(arena->mapoff, 512)); + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->mapoff, 512), + "arena->mapoff: %#llx is unaligned\n", arena->mapoff); while (mapsize) { size_t size = min(mapsize, chunk_size); - WARN_ON_ONCE(size < 512); + dev_WARN_ONCE(to_dev(arena), size < 512, + "chunk size: %#zx is unaligned\n", size); ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf, size, 0); if (ret) @@ -449,12 +465,14 @@ static int btt_log_init(struct arena_info *arena) * make sure rw_bytes does error clearing correctly, so make sure that * is the case. */ - WARN_ON_ONCE(!IS_ALIGNED(arena->logoff, 512)); + dev_WARN_ONCE(to_dev(arena), !IS_ALIGNED(arena->logoff, 512), + "arena->logoff: %#llx is unaligned\n", arena->logoff); while (logsize) { size_t size = min(logsize, chunk_size); - WARN_ON_ONCE(size < 512); + dev_WARN_ONCE(to_dev(arena), size < 512, + "chunk size: %#zx is unaligned\n", size); ret = arena_write_bytes(arena, arena->logoff + offset, zerobuf, size, 0); if (ret) @@ -480,6 +498,40 @@ static int btt_log_init(struct arena_info *arena) return ret; } +static u64 to_namespace_offset(struct arena_info *arena, u64 lba) +{ + return arena->dataoff + ((u64)lba * arena->internal_lbasize); +} + +static int arena_clear_freelist_error(struct arena_info *arena, u32 lane) +{ + int ret = 0; + + if (arena->freelist[lane].has_err) { + void *zero_page = page_address(ZERO_PAGE(0)); + u32 lba = arena->freelist[lane].block; + u64 nsoff = to_namespace_offset(arena, lba); + unsigned long len = arena->sector_size; + + mutex_lock(&arena->err_lock); + + while (len) { + unsigned long chunk = min(len, PAGE_SIZE); + + ret = arena_write_bytes(arena, nsoff, zero_page, + chunk, 0); + if (ret) + break; + len -= chunk; + nsoff += chunk; + if (len == 0) + arena->freelist[lane].has_err = 0; + } + mutex_unlock(&arena->err_lock); + } + return ret; +} + static int btt_freelist_init(struct arena_info *arena) { int old, new, ret; @@ -505,6 +557,17 @@ static int btt_freelist_init(struct arena_info *arena) arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq)); arena->freelist[i].block = le32_to_cpu(log_new.old_map); + /* + * FIXME: if error clearing fails during init, we want to make + * the BTT read-only + */ + if (ent_e_flag(log_new.old_map)) { + ret = arena_clear_freelist_error(arena, i); + if (ret) + dev_err_ratelimited(to_dev(arena), + "Unable to clear known errors\n"); + } + /* This implies a newly created or untouched flog entry */ if (log_new.old_map == log_new.new_map) continue; @@ -525,7 +588,6 @@ static int btt_freelist_init(struct arena_info *arena) if (ret) return ret; } - } return 0; @@ -566,6 +628,7 @@ static struct arena_info *alloc_arena(struct btt *btt, size_t size, if (!arena) return NULL; arena->nd_btt = btt->nd_btt; + arena->sector_size = btt->sector_size; if (!size) return arena; @@ -694,6 +757,7 @@ static int discover_arenas(struct btt *btt) 
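/*
 * discover_arenas() initializes the new per-arena err_lock before
 * btt_freelist_init() runs, since freelist init may already need to
 * clear media errors recorded in the flog. The write path (sketch,
 * condensed from btt_write_pg() below) then keeps errored blocks off
 * the freelist:
 *
 *	if (btt_is_badblock(btt, arena, arena->freelist[lane].block))
 *		arena->freelist[lane].has_err = 1;
 *	if (mutex_is_locked(&arena->err_lock) || freelist has_err)
 *		release the lane, arena_clear_freelist_error(), retry;
 *
 * arena_clear_freelist_error() takes err_lock and zeroes the bad free
 * block in chunks of at most PAGE_SIZE.
 */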
arena->external_lba_start = cur_nlba; parse_arena_meta(arena, super, cur_off); + mutex_init(&arena->err_lock); ret = btt_freelist_init(arena); if (ret) goto out; @@ -904,11 +968,6 @@ static void unlock_map(struct arena_info *arena, u32 premap) spin_unlock(&arena->map_locks[idx].lock); } -static u64 to_namespace_offset(struct arena_info *arena, u64 lba) -{ - return arena->dataoff + ((u64)lba * arena->internal_lbasize); -} - static int btt_data_read(struct arena_info *arena, struct page *page, unsigned int off, u32 lba, u32 len) { @@ -1032,6 +1091,7 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, */ while (1) { u32 new_map; + int new_t, new_e; if (t_flag) { zero_fill_data(page, off, cur_len); @@ -1050,20 +1110,29 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, */ barrier(); - ret = btt_map_read(arena, premap, &new_map, &t_flag, - &e_flag, NVDIMM_IO_ATOMIC); + ret = btt_map_read(arena, premap, &new_map, &new_t, + &new_e, NVDIMM_IO_ATOMIC); if (ret) goto out_rtt; - if (postmap == new_map) + if ((postmap == new_map) && (t_flag == new_t) && + (e_flag == new_e)) break; postmap = new_map; + t_flag = new_t; + e_flag = new_e; } ret = btt_data_read(arena, page, off, postmap, cur_len); - if (ret) + if (ret) { + int rc; + + /* Media error - set the e_flag */ + rc = btt_map_write(arena, premap, postmap, 0, 1, + NVDIMM_IO_ATOMIC); goto out_rtt; + } if (bip) { ret = btt_rw_integrity(btt, bip, arena, postmap, READ); @@ -1088,6 +1157,21 @@ static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, return ret; } +/* + * Normally, arena_{read,write}_bytes will take care of the initial offset + * adjustment, but in the case of btt_is_badblock, where we query is_bad_pmem, + * we need the final, raw namespace offset here + */ +static bool btt_is_badblock(struct btt *btt, struct arena_info *arena, + u32 postmap) +{ + u64 nsoff = adjust_initial_offset(arena->nd_btt, + to_namespace_offset(arena, postmap)); + sector_t phys_sector = nsoff >> 9; + + return is_bad_pmem(btt->phys_bb, phys_sector, arena->internal_lbasize); +} + static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, sector_t sector, struct page *page, unsigned int off, unsigned int len) @@ -1100,7 +1184,9 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, while (len) { u32 cur_len; + int e_flag; + retry: lane = nd_region_acquire_lane(btt->nd_region); ret = lba_to_arena(btt, sector, &premap, &arena); @@ -1113,6 +1199,21 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, goto out_lane; } + if (btt_is_badblock(btt, arena, arena->freelist[lane].block)) + arena->freelist[lane].has_err = 1; + + if (mutex_is_locked(&arena->err_lock) + || arena->freelist[lane].has_err) { + nd_region_release_lane(btt->nd_region, lane); + + ret = arena_clear_freelist_error(arena, lane); + if (ret) + return ret; + + /* OK to acquire a different lane/free block */ + goto retry; + } + new_postmap = arena->freelist[lane].block; /* Wait if the new block is being read from */ @@ -1138,7 +1239,7 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, } lock_map(arena, premap); - ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL, + ret = btt_map_read(arena, premap, &old_postmap, NULL, &e_flag, NVDIMM_IO_ATOMIC); if (ret) goto out_map; @@ -1146,6 +1247,8 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, ret = -EIO; goto out_map; } + if (e_flag) + set_e_flag(old_postmap); log.lba = 
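	/*
	 * The flog entry built here carries old_postmap with its e_flag
	 * set, so btt_flog_write() will mark freelist[lane].has_err for
	 * the recycled block and arena_clear_freelist_error() can zero
	 * it before reuse.
	 */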
cpu_to_le32(premap); log.old_map = cpu_to_le32(old_postmap); @@ -1156,13 +1259,20 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, if (ret) goto out_map; - ret = btt_map_write(arena, premap, new_postmap, 0, 0, 0); + ret = btt_map_write(arena, premap, new_postmap, 0, 0, + NVDIMM_IO_ATOMIC); if (ret) goto out_map; unlock_map(arena, premap); nd_region_release_lane(btt->nd_region, lane); + if (e_flag) { + ret = arena_clear_freelist_error(arena, lane); + if (ret) + return ret; + } + len -= cur_len; off += cur_len; sector += btt->sector_size >> SECTOR_SHIFT; @@ -1211,11 +1321,13 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; - BUG_ON(len > PAGE_SIZE); - /* Make sure len is in multiples of sector size. */ - /* XXX is this right? */ - BUG_ON(len < btt->sector_size); - BUG_ON(len % btt->sector_size); + if (len > PAGE_SIZE || len < btt->sector_size || + len % btt->sector_size) { + dev_err_ratelimited(&btt->nd_btt->dev, + "unaligned bio segment (len: %d)\n", len); + bio->bi_status = BLK_STS_IOERR; + break; + } err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, op_is_write(bio_op(bio)), iter.bi_sector); @@ -1345,6 +1457,7 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, { int ret; struct btt *btt; + struct nd_namespace_io *nsio; struct device *dev = &nd_btt->dev; btt = devm_kzalloc(dev, sizeof(struct btt), GFP_KERNEL); @@ -1358,6 +1471,8 @@ static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, INIT_LIST_HEAD(&btt->arena_list); mutex_init(&btt->init_lock); btt->nd_region = nd_region; + nsio = to_nd_namespace_io(&nd_btt->ndns->dev); + btt->phys_bb = &nsio->bb; ret = discover_arenas(btt); if (ret) { @@ -1431,6 +1546,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) } btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL); + if (!btt_sb) + return -ENOMEM; /* * If this returns < 0, that is ok as it just means there wasn't diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h index 888e862907a0..578c2057524d 100644 --- a/drivers/nvdimm/btt.h +++ b/drivers/nvdimm/btt.h @@ -15,6 +15,7 @@ #ifndef _LINUX_BTT_H #define _LINUX_BTT_H +#include <linux/badblocks.h> #include <linux/types.h> #define BTT_SIG_LEN 16 @@ -38,6 +39,11 @@ #define IB_FLAG_ERROR 0x00000001 #define IB_FLAG_ERROR_MASK 0x00000001 +#define ent_lba(ent) (ent & MAP_LBA_MASK) +#define ent_e_flag(ent) (!!(ent & MAP_ERR_MASK)) +#define ent_z_flag(ent) (!!(ent & MAP_TRIM_MASK)) +#define set_e_flag(ent) (ent |= MAP_ERR_MASK) + enum btt_init_state { INIT_UNCHECKED = 0, INIT_NOTFOUND, @@ -78,6 +84,7 @@ struct free_entry { u32 block; u8 sub; u8 seq; + u8 has_err; }; struct aligned_lock { @@ -104,6 +111,7 @@ struct aligned_lock { * handle incoming writes. * @version_major: Metadata layout version major. * @version_minor: Metadata layout version minor. + * @sector_size: The Linux sector size - 512 or 4096 * @nextoff: Offset in bytes to the start of the next arena. * @infooff: Offset in bytes to the info block of this arena. * @dataoff: Offset in bytes to the data area of this arena. 
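For reference, the ent_*() helpers added to btt.h above replace the open-coded mask/shift arithmetic that btt_map_read() previously used. A minimal illustrative sketch of decoding one raw 32-bit map entry with them (btt_show_map_ent() is a hypothetical name, not part of this patch):

	/* Illustrative only: decode a raw on-media BTT map entry. */
	static void btt_show_map_ent(u32 raw)
	{
		u32 postmap = ent_lba(raw);	/* post-translation block */
		int z_flag = ent_z_flag(raw);	/* zero/trim flag */
		int e_flag = ent_e_flag(raw);	/* media-error flag */

		pr_debug("postmap: %#x z: %d e: %d\n", postmap, z_flag, e_flag);
	}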
@@ -131,6 +139,7 @@ struct arena_info { u32 nfree; u16 version_major; u16 version_minor; + u32 sector_size; /* Byte offsets to the different on-media structures */ u64 nextoff; u64 infooff; @@ -147,6 +156,7 @@ struct arena_info { struct dentry *debugfs_dir; /* Arena flags */ u32 flags; + struct mutex err_lock; }; /** @@ -181,6 +191,7 @@ struct btt { struct mutex init_lock; int init_state; int num_arenas; + struct badblocks *phys_bb; }; bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super); diff --git a/drivers/nvdimm/btt_devs.c b/drivers/nvdimm/btt_devs.c index 3e359d282f8e..d58925295aa7 100644 --- a/drivers/nvdimm/btt_devs.c +++ b/drivers/nvdimm/btt_devs.c @@ -61,7 +61,7 @@ static ssize_t sector_size_show(struct device *dev, { struct nd_btt *nd_btt = to_nd_btt(dev); - return nd_sector_size_show(nd_btt->lbasize, btt_lbasize_supported, buf); + return nd_size_select_show(nd_btt->lbasize, btt_lbasize_supported, buf); } static ssize_t sector_size_store(struct device *dev, @@ -72,7 +72,7 @@ static ssize_t sector_size_store(struct device *dev, device_lock(dev); nvdimm_bus_lock(dev); - rc = nd_sector_size_store(dev, buf, &nd_btt->lbasize, + rc = nd_size_select_store(dev, buf, &nd_btt->lbasize, btt_lbasize_supported); dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, rc, buf, buf[len - 1] == '\n' ? "" : "\n"); diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c index 937fafa1886a..baf283986a7e 100644 --- a/drivers/nvdimm/bus.c +++ b/drivers/nvdimm/bus.c @@ -11,6 +11,7 @@ * General Public License for more details. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/sched/mm.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> #include <linux/module.h> @@ -234,6 +235,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, struct nd_cmd_clear_error clear_err; struct nd_cmd_ars_cap ars_cap; u32 clear_err_unit, mask; + unsigned int noio_flag; int cmd_rc, rc; if (!nvdimm_bus) @@ -250,8 +252,10 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, memset(&ars_cap, 0, sizeof(ars_cap)); ars_cap.address = phys; ars_cap.length = len; + noio_flag = memalloc_noio_save(); rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_ARS_CAP, &ars_cap, sizeof(ars_cap), &cmd_rc); + memalloc_noio_restore(noio_flag); if (rc < 0) return rc; if (cmd_rc < 0) @@ -266,8 +270,10 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys, memset(&clear_err, 0, sizeof(clear_err)); clear_err.address = phys; clear_err.length = len; + noio_flag = memalloc_noio_save(); rc = nd_desc->ndctl(nd_desc, NULL, ND_CMD_CLEAR_ERROR, &clear_err, sizeof(clear_err), &cmd_rc); + memalloc_noio_restore(noio_flag); if (rc < 0) return rc; if (cmd_rc < 0) @@ -905,19 +911,20 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, int read_only, unsigned int ioctl_cmd, unsigned long arg) { struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; - size_t buf_len = 0, in_len = 0, out_len = 0; static char out_env[ND_CMD_MAX_ENVELOPE]; static char in_env[ND_CMD_MAX_ENVELOPE]; const struct nd_cmd_desc *desc = NULL; unsigned int cmd = _IOC_NR(ioctl_cmd); - unsigned int func = cmd; - void __user *p = (void __user *) arg; struct device *dev = &nvdimm_bus->dev; - struct nd_cmd_pkg pkg; + void __user *p = (void __user *) arg; const char *cmd_name, *dimm_name; + u32 in_len = 0, out_len = 0; + unsigned int func = cmd; unsigned long cmd_mask; - void *buf; + struct nd_cmd_pkg pkg; int rc, i, cmd_rc; + u64 buf_len = 0; + void *buf; if (nvdimm) { desc = nd_cmd_dimm_desc(cmd); @@ 
-977,13 +984,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, if (cmd == ND_CMD_CALL) { func = pkg.nd_command; - dev_dbg(dev, "%s:%s, idx: %llu, in: %zu, out: %zu, len %zu\n", + dev_dbg(dev, "%s:%s, idx: %llu, in: %u, out: %u, len %llu\n", __func__, dimm_name, pkg.nd_command, in_len, out_len, buf_len); - - for (i = 0; i < ARRAY_SIZE(pkg.nd_reserved2); i++) - if (pkg.nd_reserved2[i]) - return -EINVAL; } /* process an output envelope */ @@ -1007,9 +1010,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, out_len += out_size; } - buf_len = out_len + in_len; + buf_len = (u64) out_len + (u64) in_len; if (buf_len > ND_IOCTL_MAX_BUFLEN) { - dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__, + dev_dbg(dev, "%s:%s cmd: %s buf_len: %llu > %d\n", __func__, dimm_name, cmd_name, buf_len, ND_IOCTL_MAX_BUFLEN); return -EINVAL; diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c index 47770460f3d3..b2fc29b8279b 100644 --- a/drivers/nvdimm/claim.c +++ b/drivers/nvdimm/claim.c @@ -280,18 +280,11 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns, } if (unlikely(is_bad_pmem(&nsio->bb, sector, sz_align))) { - /* - * FIXME: nsio_rw_bytes() may be called from atomic - * context in the btt case and the ACPI DSM path for - * clearing the error takes sleeping locks and allocates - * memory. An explicit error clearing path, and support - * for tracking badblocks in BTT metadata is needed to - * work around this collision. - */ if (IS_ALIGNED(offset, 512) && IS_ALIGNED(size, 512) && !(flags & NVDIMM_IO_ATOMIC)) { long cleared; + might_sleep(); cleared = nvdimm_clear_poison(&ndns->dev, nsio->res.start + offset, size); if (cleared < size) diff --git a/drivers/nvdimm/core.c b/drivers/nvdimm/core.c index 75bc08c6838c..bb71f0cf8f5d 100644 --- a/drivers/nvdimm/core.c +++ b/drivers/nvdimm/core.c @@ -277,14 +277,14 @@ int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, return 0; } -ssize_t nd_sector_size_show(unsigned long current_lbasize, +ssize_t nd_size_select_show(unsigned long current_size, const unsigned long *supported, char *buf) { ssize_t len = 0; int i; for (i = 0; supported[i]; i++) - if (current_lbasize == supported[i]) + if (current_size == supported[i]) len += sprintf(buf + len, "[%ld] ", supported[i]); else len += sprintf(buf + len, "%ld ", supported[i]); @@ -292,8 +292,8 @@ ssize_t nd_sector_size_show(unsigned long current_lbasize, return len; } -ssize_t nd_sector_size_store(struct device *dev, const char *buf, - unsigned long *current_lbasize, const unsigned long *supported) +ssize_t nd_size_select_store(struct device *dev, const char *buf, + unsigned long *current_size, const unsigned long *supported) { unsigned long lbasize; int rc, i; @@ -310,7 +310,7 @@ ssize_t nd_sector_size_store(struct device *dev, const char *buf, break; if (supported[i]) { - *current_lbasize = lbasize; + *current_size = lbasize; return 0; } else { return -EINVAL; diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 87796f840777..9c5f108910e3 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -45,12 +45,14 @@ unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd) return ndd->nslabel_size; } -size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) { - u32 index_span; + return ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1); +} - if (ndd->nsindex_size) - return ndd->nsindex_size; +size_t sizeof_namespace_index(struct 
nvdimm_drvdata *ndd) +{ + u32 nslot, space, size; /* * The minimum index space is 512 bytes, with that amount of @@ -60,16 +62,16 @@ size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) * starts to waste space at larger config_sizes, but it's * unlikely we'll ever see anything but 128K. */ - index_span = ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1); - index_span /= NSINDEX_ALIGN * 2; - ndd->nsindex_size = index_span * NSINDEX_ALIGN; - - return ndd->nsindex_size; -} - -int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) -{ - return ndd->nsarea.config_size / (sizeof_namespace_label(ndd) + 1); + nslot = nvdimm_num_label_slots(ndd); + space = ndd->nsarea.config_size - nslot * sizeof_namespace_label(ndd); + size = ALIGN(sizeof(struct nd_namespace_index) + DIV_ROUND_UP(nslot, 8), + NSINDEX_ALIGN) * 2; + if (size <= space) + return size / 2; + + dev_err(ndd->dev, "label area (%d) too small to host (%d byte) labels\n", + ndd->nsarea.config_size, sizeof_namespace_label(ndd)); + return 0; } static int __nd_label_validate(struct nvdimm_drvdata *ndd) diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c index 5f1c6756e57c..1427a386a033 100644 --- a/drivers/nvdimm/namespace_devs.c +++ b/drivers/nvdimm/namespace_devs.c @@ -1313,14 +1313,14 @@ static ssize_t sector_size_show(struct device *dev, if (is_namespace_blk(dev)) { struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); - return nd_sector_size_show(nsblk->lbasize, + return nd_size_select_show(nsblk->lbasize, blk_lbasize_supported, buf); } if (is_namespace_pmem(dev)) { struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); - return nd_sector_size_show(nspm->lbasize, + return nd_size_select_show(nspm->lbasize, pmem_lbasize_supported, buf); } return -ENXIO; @@ -1352,7 +1352,7 @@ static ssize_t sector_size_store(struct device *dev, if (to_ndns(dev)->claim) rc = -EBUSY; if (rc >= 0) - rc = nd_sector_size_store(dev, buf, lbasize, supported); + rc = nd_size_select_store(dev, buf, lbasize, supported); if (rc >= 0) rc = nd_namespace_label_update(nd_region, dev); dev_dbg(dev, "%s: result: %zd %s: %s%s", __func__, diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index a87f793f2945..9c758a91372b 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -42,7 +42,7 @@ struct nd_poison { struct nvdimm_drvdata { struct device *dev; - int nsindex_size, nslabel_size; + int nslabel_size; struct nd_cmd_get_config_size nsarea; void *data; int ns_current, ns_next; @@ -134,6 +134,7 @@ struct nd_mapping { struct nvdimm *nvdimm; u64 start; u64 size; + int position; struct list_head labels; struct mutex lock; /* @@ -233,10 +234,10 @@ void nd_device_unregister(struct device *dev, enum nd_async_mode mode); void nd_device_notify(struct device *dev, enum nvdimm_event event); int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, size_t len); -ssize_t nd_sector_size_show(unsigned long current_lbasize, +ssize_t nd_size_select_show(unsigned long current_size, const unsigned long *supported, char *buf); -ssize_t nd_sector_size_store(struct device *dev, const char *buf, - unsigned long *current_lbasize, const unsigned long *supported); +ssize_t nd_size_select_store(struct device *dev, const char *buf, + unsigned long *current_size, const unsigned long *supported); int __init nvdimm_init(void); int __init nd_region_init(void); int __init nd_label_init(void); @@ -285,6 +286,13 @@ static inline struct device *nd_btt_create(struct nd_region *nd_region) struct nd_pfn *to_nd_pfn(struct device *dev); #if 
IS_ENABLED(CONFIG_NVDIMM_PFN) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define PFN_DEFAULT_ALIGNMENT HPAGE_PMD_SIZE +#else +#define PFN_DEFAULT_ALIGNMENT PAGE_SIZE +#endif + int nd_pfn_probe(struct device *dev, struct nd_namespace_common *ndns); bool is_nd_pfn(struct device *dev); struct device *nd_pfn_create(struct nd_region *nd_region); diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 5fcb6f5b22a2..9576c444f0ab 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -111,24 +111,27 @@ static ssize_t align_show(struct device *dev, return sprintf(buf, "%ld\n", nd_pfn->align); } -static ssize_t __align_store(struct nd_pfn *nd_pfn, const char *buf) +static const unsigned long *nd_pfn_supported_alignments(void) { - unsigned long val; - int rc; - - rc = kstrtoul(buf, 0, &val); - if (rc) - return rc; - - if (!is_power_of_2(val) || val < PAGE_SIZE || val > SZ_1G) - return -EINVAL; + /* + * This needs to be a non-static variable because the *_SIZE + * macros aren't always constants. + */ + const unsigned long supported_alignments[] = { + PAGE_SIZE, +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + HPAGE_PMD_SIZE, +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD + HPAGE_PUD_SIZE, +#endif +#endif + 0, + }; + static unsigned long data[ARRAY_SIZE(supported_alignments)]; - if (nd_pfn->dev.driver) - return -EBUSY; - else - nd_pfn->align = val; + memcpy(data, supported_alignments, sizeof(data)); - return 0; + return data; } static ssize_t align_store(struct device *dev, @@ -139,7 +142,8 @@ static ssize_t align_store(struct device *dev, device_lock(dev); nvdimm_bus_lock(dev); - rc = __align_store(nd_pfn, buf); + rc = nd_size_select_store(dev, buf, &nd_pfn->align, + nd_pfn_supported_alignments()); dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, rc, buf, buf[len - 1] == '\n' ? "" : "\n"); nvdimm_bus_unlock(dev); @@ -260,6 +264,13 @@ static ssize_t size_show(struct device *dev, } static DEVICE_ATTR_RO(size); +static ssize_t supported_alignments_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return nd_size_select_show(0, nd_pfn_supported_alignments(), buf); +} +static DEVICE_ATTR_RO(supported_alignments); + static struct attribute *nd_pfn_attributes[] = { &dev_attr_mode.attr, &dev_attr_namespace.attr, @@ -267,6 +278,7 @@ static struct attribute *nd_pfn_attributes[] = { &dev_attr_align.attr, &dev_attr_resource.attr, &dev_attr_size.attr, + &dev_attr_supported_alignments.attr, NULL, }; @@ -290,7 +302,7 @@ struct device *nd_pfn_devinit(struct nd_pfn *nd_pfn, return NULL; nd_pfn->mode = PFN_MODE_NONE; - nd_pfn->align = HPAGE_SIZE; + nd_pfn->align = PFN_DEFAULT_ALIGNMENT; dev = &nd_pfn->dev; device_initialize(&nd_pfn->dev); if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) { @@ -638,11 +650,12 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) / PAGE_SIZE); if (nd_pfn->mode == PFN_MODE_PMEM) { /* - * vmemmap_populate_hugepages() allocates the memmap array in - * HPAGE_SIZE chunks. + * The altmap should be padded out to the block size used + * when populating the vmemmap. This *should* be equal to + * PMD_SIZE for most architectures. 
*/ offset = ALIGN(start + SZ_8K + 64 * npfns + dax_label_reserve, - max(nd_pfn->align, HPAGE_SIZE)) - start; + max(nd_pfn->align, PMD_SIZE)) - start; } else if (nd_pfn->mode == PFN_MODE_RAM) offset = ALIGN(start + SZ_8K + dax_label_reserve, nd_pfn->align) - start; diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 5434321cad67..c5917f040fa7 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -5,20 +5,6 @@ #include <linux/pfn_t.h> #include <linux/fs.h> -#ifdef CONFIG_ARCH_HAS_PMEM_API -#define ARCH_MEMREMAP_PMEM MEMREMAP_WB -void arch_wb_cache_pmem(void *addr, size_t size); -void arch_invalidate_pmem(void *addr, size_t size); -#else -#define ARCH_MEMREMAP_PMEM MEMREMAP_WT -static inline void arch_wb_cache_pmem(void *addr, size_t size) -{ -} -static inline void arch_invalidate_pmem(void *addr, size_t size) -{ -} -#endif - /* this definition is in it's own header for tools/testing/nvdimm to consume */ struct pmem_device { /* One contiguous memory region per device */ diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 5954cfbea3fc..829d760f651c 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -723,8 +723,9 @@ static ssize_t mappingN(struct device *dev, char *buf, int n) nd_mapping = &nd_region->mapping[n]; nvdimm = nd_mapping->nvdimm; - return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev), - nd_mapping->start, nd_mapping->size); + return sprintf(buf, "%s,%llu,%llu,%d\n", dev_name(&nvdimm->dev), + nd_mapping->start, nd_mapping->size, + nd_mapping->position); } #define REGION_MAPPING(idx) \ @@ -965,6 +966,7 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, nd_region->mapping[i].nvdimm = nvdimm; nd_region->mapping[i].start = mapping->start; nd_region->mapping[i].size = mapping->size; + nd_region->mapping[i].position = mapping->position; INIT_LIST_HEAD(&nd_region->mapping[i].labels); mutex_init(&nd_region->mapping[i].lock); diff --git a/drivers/of/device.c b/drivers/of/device.c index 17b66e9715d2..64b710265d39 100644 --- a/drivers/of/device.c +++ b/drivers/of/device.c @@ -9,6 +9,9 @@ #include <linux/module.h> #include <linux/mod_devicetable.h> #include <linux/slab.h> +#include <linux/pci.h> +#include <linux/platform_device.h> +#include <linux/amba/bus.h> #include <asm/errno.h> #include "of_private.h" @@ -84,31 +87,28 @@ int of_device_add(struct platform_device *ofdev) */ int of_dma_configure(struct device *dev, struct device_node *np) { - u64 dma_addr, paddr, size; + u64 dma_addr, paddr, size = 0; int ret; bool coherent; unsigned long offset; const struct iommu_ops *iommu; u64 mask; - /* - * Set default coherent_dma_mask to 32 bit. Drivers are expected to - * setup the correct supported mask. - */ - if (!dev->coherent_dma_mask) - dev->coherent_dma_mask = DMA_BIT_MASK(32); - - /* - * Set it to coherent_dma_mask by default if the architecture - * code has not set it. - */ - if (!dev->dma_mask) - dev->dma_mask = &dev->coherent_dma_mask; - ret = of_dma_get_range(np, &dma_addr, &paddr, &size); if (ret < 0) { + /* + * For legacy reasons, we have to assume some devices need + * DMA configuration regardless of whether "dma-ranges" is + * correctly specified or not. + */ + if (!dev_is_pci(dev) && +#ifdef CONFIG_ARM_AMBA + dev->bus != &amba_bustype && +#endif + dev->bus != &platform_bus_type) + return ret == -ENODEV ? 
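			/* a missing "dma-ranges" (-ENODEV) is not fatal here */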
0 : ret; + dma_addr = offset = 0; - size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1); } else { offset = PFN_DOWN(paddr - dma_addr); @@ -129,6 +129,22 @@ int of_dma_configure(struct device *dev, struct device_node *np) dev_dbg(dev, "dma_pfn_offset(%#08lx)\n", offset); } + /* + * Set default coherent_dma_mask to 32 bit. Drivers are expected to + * setup the correct supported mask. + */ + if (!dev->coherent_dma_mask) + dev->coherent_dma_mask = DMA_BIT_MASK(32); + /* + * Set it to coherent_dma_mask by default if the architecture + * code has not set it. + */ + if (!dev->dma_mask) + dev->dma_mask = &dev->coherent_dma_mask; + + if (!size) + size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1); + dev->dma_pfn_offset = offset; /* diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 7cb982b54c8c..763ee50ea57d 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -300,7 +300,7 @@ config PWM_MEDIATEK Generic PWM framework driver for Mediatek ARM SoC. To compile this driver as a module, choose M here: the module - will be called pwm-mxs. + will be called pwm-mediatek. config PWM_MXS tristate "Freescale MXS PWM support" @@ -456,7 +456,7 @@ config PWM_TEGRA config PWM_TIECAP tristate "ECAP PWM support" - depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX + depends on ARCH_OMAP2PLUS || ARCH_DAVINCI_DA8XX || ARCH_KEYSTONE help PWM driver support for the ECAP APWM controller found on AM33XX TI SOC @@ -510,4 +510,13 @@ config PWM_VT8500 To compile this driver as a module, choose M here: the module will be called pwm-vt8500. +config PWM_ZX + tristate "ZTE ZX PWM support" + depends on ARCH_ZX + help + Generic PWM framework driver for ZTE ZX family SoCs. + + To compile this driver as a module, choose M here: the module + will be called pwm-zx. + endif diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index a3a4beef6daa..ebefba5f528b 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -50,3 +50,4 @@ obj-$(CONFIG_PWM_TIPWMSS) += pwm-tipwmss.o obj-$(CONFIG_PWM_TWL) += pwm-twl.o obj-$(CONFIG_PWM_TWL_LED) += pwm-twl-led.o obj-$(CONFIG_PWM_VT8500) += pwm-vt8500.o +obj-$(CONFIG_PWM_ZX) += pwm-zx.o diff --git a/drivers/pwm/pwm-bcm2835.c b/drivers/pwm/pwm-bcm2835.c index c5dbf16d810b..db001cba937f 100644 --- a/drivers/pwm/pwm-bcm2835.c +++ b/drivers/pwm/pwm-bcm2835.c @@ -167,6 +167,8 @@ static int bcm2835_pwm_probe(struct platform_device *pdev) pc->chip.dev = &pdev->dev; pc->chip.ops = &bcm2835_pwm_ops; pc->chip.npwm = 2; + pc->chip.of_xlate = of_pwm_xlate_with_flags; + pc->chip.of_pwm_n_cells = 3; platform_set_drvdata(pdev, pc); diff --git a/drivers/pwm/pwm-hibvt.c b/drivers/pwm/pwm-hibvt.c index 8dadc58d6cdf..27c107e78d59 100644 --- a/drivers/pwm/pwm-hibvt.c +++ b/drivers/pwm/pwm-hibvt.c @@ -208,7 +208,7 @@ static int hibvt_pwm_probe(struct platform_device *pdev) if (ret < 0) return ret; - pwm_chip->rstc = devm_reset_control_get(&pdev->dev, NULL); + pwm_chip->rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL); if (IS_ERR(pwm_chip->rstc)) { clk_disable_unprepare(pwm_chip->clk); return PTR_ERR(pwm_chip->rstc); diff --git a/drivers/pwm/pwm-mediatek.c b/drivers/pwm/pwm-mediatek.c index 5c11bc708a3c..b52f3afb2ba1 100644 --- a/drivers/pwm/pwm-mediatek.c +++ b/drivers/pwm/pwm-mediatek.c @@ -2,6 +2,7 @@ * Mediatek Pulse Width Modulator driver * * Copyright (C) 2015 John Crispin <blogic@openwrt.org> + * Copyright (C) 2017 Zhi Mao <zhi.mao@mediatek.com> * * This file is licensed under the terms of the GNU General Public * License version 2. 
This program is licensed "as is" without any @@ -29,6 +30,8 @@ #define PWMDWIDTH 0x2c #define PWMTHRES 0x30 +#define PWM_CLK_DIV_MAX 7 + enum { MTK_CLK_MAIN = 0, MTK_CLK_TOP, @@ -61,6 +64,42 @@ static inline struct mtk_pwm_chip *to_mtk_pwm_chip(struct pwm_chip *chip) return container_of(chip, struct mtk_pwm_chip, chip); } +static int mtk_pwm_clk_enable(struct pwm_chip *chip, struct pwm_device *pwm) +{ + struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); + int ret; + + ret = clk_prepare_enable(pc->clks[MTK_CLK_TOP]); + if (ret < 0) + return ret; + + ret = clk_prepare_enable(pc->clks[MTK_CLK_MAIN]); + if (ret < 0) + goto disable_clk_top; + + ret = clk_prepare_enable(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]); + if (ret < 0) + goto disable_clk_main; + + return 0; + +disable_clk_main: + clk_disable_unprepare(pc->clks[MTK_CLK_MAIN]); +disable_clk_top: + clk_disable_unprepare(pc->clks[MTK_CLK_TOP]); + + return ret; +} + +static void mtk_pwm_clk_disable(struct pwm_chip *chip, struct pwm_device *pwm) +{ + struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); + + clk_disable_unprepare(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]); + clk_disable_unprepare(pc->clks[MTK_CLK_MAIN]); + clk_disable_unprepare(pc->clks[MTK_CLK_TOP]); +} + static inline u32 mtk_pwm_readl(struct mtk_pwm_chip *chip, unsigned int num, unsigned int offset) { @@ -80,6 +119,11 @@ static int mtk_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, struct mtk_pwm_chip *pc = to_mtk_pwm_chip(chip); struct clk *clk = pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]; u32 resolution, clkdiv = 0; + int ret; + + ret = mtk_pwm_clk_enable(chip, pwm); + if (ret < 0) + return ret; resolution = NSEC_PER_SEC / clk_get_rate(clk); @@ -88,13 +132,18 @@ static int mtk_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, clkdiv++; } - if (clkdiv > 7) + if (clkdiv > PWM_CLK_DIV_MAX) { + mtk_pwm_clk_disable(chip, pwm); + dev_err(chip->dev, "period %d not supported\n", period_ns); return -EINVAL; + } - mtk_pwm_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | BIT(3) | clkdiv); + mtk_pwm_writel(pc, pwm->hwpwm, PWMCON, BIT(15) | clkdiv); mtk_pwm_writel(pc, pwm->hwpwm, PWMDWIDTH, period_ns / resolution); mtk_pwm_writel(pc, pwm->hwpwm, PWMTHRES, duty_ns / resolution); + mtk_pwm_clk_disable(chip, pwm); + return 0; } @@ -104,7 +153,7 @@ static int mtk_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) u32 value; int ret; - ret = clk_prepare(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]); + ret = mtk_pwm_clk_enable(chip, pwm); if (ret < 0) return ret; @@ -124,7 +173,7 @@ static void mtk_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) value &= ~BIT(pwm->hwpwm); writel(value, pc->regs); - clk_unprepare(pc->clks[MTK_CLK_PWM1 + pwm->hwpwm]); + mtk_pwm_clk_disable(chip, pwm); } static const struct pwm_ops mtk_pwm_ops = { @@ -156,14 +205,6 @@ static int mtk_pwm_probe(struct platform_device *pdev) return PTR_ERR(pc->clks[i]); } - ret = clk_prepare(pc->clks[MTK_CLK_TOP]); - if (ret < 0) - return ret; - - ret = clk_prepare(pc->clks[MTK_CLK_MAIN]); - if (ret < 0) - goto disable_clk_top; - platform_set_drvdata(pdev, pc); pc->chip.dev = &pdev->dev; @@ -174,26 +215,15 @@ static int mtk_pwm_probe(struct platform_device *pdev) ret = pwmchip_add(&pc->chip); if (ret < 0) { dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret); - goto disable_clk_main; + return ret; } return 0; - -disable_clk_main: - clk_unprepare(pc->clks[MTK_CLK_MAIN]); -disable_clk_top: - clk_unprepare(pc->clks[MTK_CLK_TOP]); - - return ret; } static int mtk_pwm_remove(struct platform_device *pdev) { struct mtk_pwm_chip *pc = 
platform_get_drvdata(pdev); - unsigned int i; - - for (i = 0; i < pc->chip.npwm; i++) - pwm_disable(&pc->chip.pwms[i]); return pwmchip_remove(&pc->chip); } diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c index cb845edfe2b4..d589331d1884 100644 --- a/drivers/pwm/pwm-meson.c +++ b/drivers/pwm/pwm-meson.c @@ -441,7 +441,7 @@ static int meson_pwm_init_channels(struct meson_pwm *meson, for (i = 0; i < meson->chip.npwm; i++) { struct meson_pwm_channel *channel = &channels[i]; - snprintf(name, sizeof(name), "%s#mux%u", np->full_name, i); + snprintf(name, sizeof(name), "%pOF#mux%u", np, i); init.name = name; init.ops = &clk_mux_ops; diff --git a/drivers/pwm/pwm-pca9685.c b/drivers/pwm/pwm-pca9685.c index 5f55cfab9b1c..a7eaf962a95b 100644 --- a/drivers/pwm/pwm-pca9685.c +++ b/drivers/pwm/pwm-pca9685.c @@ -241,11 +241,11 @@ static inline int pca9685_pwm_gpio_probe(struct pca9685 *pca) } #endif -static void pca9685_set_sleep_mode(struct pca9685 *pca, int sleep) +static void pca9685_set_sleep_mode(struct pca9685 *pca, bool enable) { regmap_update_bits(pca->regmap, PCA9685_MODE1, - MODE1_SLEEP, sleep ? MODE1_SLEEP : 0); - if (!sleep) { + MODE1_SLEEP, enable ? MODE1_SLEEP : 0); + if (!enable) { /* Wait 500us for the oscillator to be back up */ udelay(500); } @@ -272,13 +272,13 @@ static int pca9685_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, * state is guaranteed active here. */ /* Put chip into sleep mode */ - pca9685_set_sleep_mode(pca, 1); + pca9685_set_sleep_mode(pca, true); /* Change the chip-wide output frequency */ regmap_write(pca->regmap, PCA9685_PRESCALE, prescale); /* Wake the chip up */ - pca9685_set_sleep_mode(pca, 0); + pca9685_set_sleep_mode(pca, false); pca->period_ns = period_ns; } else { @@ -534,7 +534,7 @@ static int pca9685_pwm_runtime_suspend(struct device *dev) struct i2c_client *client = to_i2c_client(dev); struct pca9685 *pca = i2c_get_clientdata(client); - pca9685_set_sleep_mode(pca, 1); + pca9685_set_sleep_mode(pca, true); return 0; } @@ -543,7 +543,7 @@ static int pca9685_pwm_runtime_resume(struct device *dev) struct i2c_client *client = to_i2c_client(dev); struct pca9685 *pca = i2c_get_clientdata(client); - pca9685_set_sleep_mode(pca, 0); + pca9685_set_sleep_mode(pca, false); return 0; } #endif diff --git a/drivers/pwm/pwm-renesas-tpu.c b/drivers/pwm/pwm-renesas-tpu.c index 075c1a764ba2..29267d12fb4c 100644 --- a/drivers/pwm/pwm-renesas-tpu.c +++ b/drivers/pwm/pwm-renesas-tpu.c @@ -455,7 +455,6 @@ static const struct of_device_id tpu_of_table[] = { { .compatible = "renesas,tpu-r8a73a4", }, { .compatible = "renesas,tpu-r8a7740", }, { .compatible = "renesas,tpu-r8a7790", }, - { .compatible = "renesas,tpu-sh7372", }, { .compatible = "renesas,tpu", }, { }, }; diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index 744d56197286..4d99d468df09 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -27,12 +27,15 @@ #define PWM_DUTY_NEGATIVE (0 << 3) #define PWM_INACTIVE_NEGATIVE (0 << 4) #define PWM_INACTIVE_POSITIVE (1 << 4) +#define PWM_POLARITY_MASK (PWM_DUTY_POSITIVE | PWM_INACTIVE_POSITIVE) #define PWM_OUTPUT_LEFT (0 << 5) +#define PWM_LOCK_EN (1 << 6) #define PWM_LP_DISABLE (0 << 8) struct rockchip_pwm_chip { struct pwm_chip chip; struct clk *clk; + struct clk *pclk; const struct rockchip_pwm_data *data; void __iomem *base; }; @@ -48,13 +51,8 @@ struct rockchip_pwm_data { struct rockchip_pwm_regs regs; unsigned int prescaler; bool supports_polarity; - const struct pwm_ops *ops; - - void (*set_enable)(struct 
pwm_chip *chip, - struct pwm_device *pwm, bool enable, - enum pwm_polarity polarity); - void (*get_state)(struct pwm_chip *chip, struct pwm_device *pwm, - struct pwm_state *state); + bool supports_lock; + u32 enable_conf; }; static inline struct rockchip_pwm_chip *to_rockchip_pwm_chip(struct pwm_chip *c) @@ -62,90 +60,18 @@ static inline struct rockchip_pwm_chip *to_rockchip_pwm_chip(struct pwm_chip *c) return container_of(c, struct rockchip_pwm_chip, chip); } -static void rockchip_pwm_set_enable_v1(struct pwm_chip *chip, - struct pwm_device *pwm, bool enable, - enum pwm_polarity polarity) -{ - struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); - u32 enable_conf = PWM_CTRL_OUTPUT_EN | PWM_CTRL_TIMER_EN; - u32 val; - - val = readl_relaxed(pc->base + pc->data->regs.ctrl); - - if (enable) - val |= enable_conf; - else - val &= ~enable_conf; - - writel_relaxed(val, pc->base + pc->data->regs.ctrl); -} - -static void rockchip_pwm_get_state_v1(struct pwm_chip *chip, - struct pwm_device *pwm, - struct pwm_state *state) -{ - struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); - u32 enable_conf = PWM_CTRL_OUTPUT_EN | PWM_CTRL_TIMER_EN; - u32 val; - - val = readl_relaxed(pc->base + pc->data->regs.ctrl); - if ((val & enable_conf) == enable_conf) - state->enabled = true; -} - -static void rockchip_pwm_set_enable_v2(struct pwm_chip *chip, - struct pwm_device *pwm, bool enable, - enum pwm_polarity polarity) -{ - struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); - u32 enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE | - PWM_CONTINUOUS; - u32 val; - - if (polarity == PWM_POLARITY_INVERSED) - enable_conf |= PWM_DUTY_NEGATIVE | PWM_INACTIVE_POSITIVE; - else - enable_conf |= PWM_DUTY_POSITIVE | PWM_INACTIVE_NEGATIVE; - - val = readl_relaxed(pc->base + pc->data->regs.ctrl); - - if (enable) - val |= enable_conf; - else - val &= ~enable_conf; - - writel_relaxed(val, pc->base + pc->data->regs.ctrl); -} - -static void rockchip_pwm_get_state_v2(struct pwm_chip *chip, - struct pwm_device *pwm, - struct pwm_state *state) -{ - struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); - u32 enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE | - PWM_CONTINUOUS; - u32 val; - - val = readl_relaxed(pc->base + pc->data->regs.ctrl); - if ((val & enable_conf) != enable_conf) - return; - - state->enabled = true; - - if (!(val & PWM_DUTY_POSITIVE)) - state->polarity = PWM_POLARITY_INVERSED; -} - static void rockchip_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, struct pwm_state *state) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); + u32 enable_conf = pc->data->enable_conf; unsigned long clk_rate; u64 tmp; + u32 val; int ret; - ret = clk_enable(pc->clk); + ret = clk_enable(pc->pclk); if (ret) return; @@ -157,19 +83,31 @@ static void rockchip_pwm_get_state(struct pwm_chip *chip, tmp = readl_relaxed(pc->base + pc->data->regs.duty); tmp *= pc->data->prescaler * NSEC_PER_SEC; - state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, clk_rate); + state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, clk_rate); + + val = readl_relaxed(pc->base + pc->data->regs.ctrl); + if (pc->data->supports_polarity) + state->enabled = ((val & enable_conf) != enable_conf) ? + false : true; + else + state->enabled = ((val & enable_conf) == enable_conf) ? 
+ true : false; - pc->data->get_state(chip, pwm, state); + if (pc->data->supports_polarity) { + if (!(val & PWM_DUTY_POSITIVE)) + state->polarity = PWM_POLARITY_INVERSED; + } - clk_disable(pc->clk); + clk_disable(pc->pclk); } -static int rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) +static void rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); unsigned long period, duty; u64 clk_rate, div; + u32 ctrl; clk_rate = clk_get_rate(pc->clk); @@ -178,26 +116,53 @@ static int rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, * bits, every possible input period can be obtained using the * default prescaler value for all practical clock rate values. */ - div = clk_rate * period_ns; + div = clk_rate * state->period; period = DIV_ROUND_CLOSEST_ULL(div, pc->data->prescaler * NSEC_PER_SEC); - div = clk_rate * duty_ns; + div = clk_rate * state->duty_cycle; duty = DIV_ROUND_CLOSEST_ULL(div, pc->data->prescaler * NSEC_PER_SEC); + /* + * Lock the period and duty of previous configuration, then + * change the duty and period, that would not be effective. + */ + ctrl = readl_relaxed(pc->base + pc->data->regs.ctrl); + if (pc->data->supports_lock) { + ctrl |= PWM_LOCK_EN; + writel_relaxed(ctrl, pc->base + pc->data->regs.ctrl); + } + writel(period, pc->base + pc->data->regs.period); writel(duty, pc->base + pc->data->regs.duty); - return 0; + if (pc->data->supports_polarity) { + ctrl &= ~PWM_POLARITY_MASK; + if (state->polarity == PWM_POLARITY_INVERSED) + ctrl |= PWM_DUTY_NEGATIVE | PWM_INACTIVE_POSITIVE; + else + ctrl |= PWM_DUTY_POSITIVE | PWM_INACTIVE_NEGATIVE; + } + + /* + * Unlock and set polarity at the same time, + * the configuration of duty, period and polarity + * would be effective together at next period. 
+ */ + if (pc->data->supports_lock) + ctrl &= ~PWM_LOCK_EN; + + writel(ctrl, pc->base + pc->data->regs.ctrl); } static int rockchip_pwm_enable(struct pwm_chip *chip, - struct pwm_device *pwm, - bool enable, - enum pwm_polarity polarity) + struct pwm_device *pwm, + bool enable) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); + u32 enable_conf = pc->data->enable_conf; int ret; + u32 val; if (enable) { ret = clk_enable(pc->clk); @@ -205,7 +170,14 @@ static int rockchip_pwm_enable(struct pwm_chip *chip, return ret; } - pc->data->set_enable(chip, pwm, enable, polarity); + val = readl_relaxed(pc->base + pc->data->regs.ctrl); + + if (enable) + val |= enable_conf; + else + val &= ~enable_conf; + + writel_relaxed(val, pc->base + pc->data->regs.ctrl); if (!enable) clk_disable(pc->clk); @@ -219,33 +191,26 @@ static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); struct pwm_state curstate; bool enabled; - int ret; + int ret = 0; - pwm_get_state(pwm, &curstate); - enabled = curstate.enabled; - - ret = clk_enable(pc->clk); + ret = clk_enable(pc->pclk); if (ret) return ret; - if (state->polarity != curstate.polarity && enabled) { - ret = rockchip_pwm_enable(chip, pwm, false, state->polarity); + pwm_get_state(pwm, &curstate); + enabled = curstate.enabled; + + if (state->polarity != curstate.polarity && enabled && + !pc->data->supports_lock) { + ret = rockchip_pwm_enable(chip, pwm, false); if (ret) goto out; enabled = false; } - ret = rockchip_pwm_config(chip, pwm, state->duty_cycle, state->period); - if (ret) { - if (enabled != curstate.enabled) - rockchip_pwm_enable(chip, pwm, !enabled, - state->polarity); - goto out; - } - + rockchip_pwm_config(chip, pwm, state); if (state->enabled != enabled) { - ret = rockchip_pwm_enable(chip, pwm, state->enabled, - state->polarity); + ret = rockchip_pwm_enable(chip, pwm, state->enabled); if (ret) goto out; } @@ -257,18 +222,12 @@ static int rockchip_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, rockchip_pwm_get_state(chip, pwm, state); out: - clk_disable(pc->clk); + clk_disable(pc->pclk); return ret; } -static const struct pwm_ops rockchip_pwm_ops_v1 = { - .get_state = rockchip_pwm_get_state, - .apply = rockchip_pwm_apply, - .owner = THIS_MODULE, -}; - -static const struct pwm_ops rockchip_pwm_ops_v2 = { +static const struct pwm_ops rockchip_pwm_ops = { .get_state = rockchip_pwm_get_state, .apply = rockchip_pwm_apply, .owner = THIS_MODULE, @@ -282,9 +241,9 @@ static const struct rockchip_pwm_data pwm_data_v1 = { .ctrl = 0x0c, }, .prescaler = 2, - .ops = &rockchip_pwm_ops_v1, - .set_enable = rockchip_pwm_set_enable_v1, - .get_state = rockchip_pwm_get_state_v1, + .supports_polarity = false, + .supports_lock = false, + .enable_conf = PWM_CTRL_OUTPUT_EN | PWM_CTRL_TIMER_EN, }; static const struct rockchip_pwm_data pwm_data_v2 = { @@ -296,9 +255,9 @@ static const struct rockchip_pwm_data pwm_data_v2 = { }, .prescaler = 1, .supports_polarity = true, - .ops = &rockchip_pwm_ops_v2, - .set_enable = rockchip_pwm_set_enable_v2, - .get_state = rockchip_pwm_get_state_v2, + .supports_lock = false, + .enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE | + PWM_CONTINUOUS, }; static const struct rockchip_pwm_data pwm_data_vop = { @@ -310,15 +269,30 @@ static const struct rockchip_pwm_data pwm_data_vop = { }, .prescaler = 1, .supports_polarity = true, - .ops = &rockchip_pwm_ops_v2, - .set_enable = rockchip_pwm_set_enable_v2, - .get_state = rockchip_pwm_get_state_v2, + 
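	/*
	 * supports_lock: the controller can latch period/duty/polarity
	 * via PWM_LOCK_EN so all three take effect together at the next
	 * period; only the rk3328 "v3" variant below sets it.
	 */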
.supports_lock = false, + .enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE | + PWM_CONTINUOUS, +}; + +static const struct rockchip_pwm_data pwm_data_v3 = { + .regs = { + .duty = 0x08, + .period = 0x04, + .cntr = 0x00, + .ctrl = 0x0c, + }, + .prescaler = 1, + .supports_polarity = true, + .supports_lock = true, + .enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE | + PWM_CONTINUOUS, }; static const struct of_device_id rockchip_pwm_dt_ids[] = { { .compatible = "rockchip,rk2928-pwm", .data = &pwm_data_v1}, { .compatible = "rockchip,rk3288-pwm", .data = &pwm_data_v2}, { .compatible = "rockchip,vop-pwm", .data = &pwm_data_vop}, + { .compatible = "rockchip,rk3328-pwm", .data = &pwm_data_v3}, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, rockchip_pwm_dt_ids); @@ -328,7 +302,7 @@ static int rockchip_pwm_probe(struct platform_device *pdev) const struct of_device_id *id; struct rockchip_pwm_chip *pc; struct resource *r; - int ret; + int ret, count; id = of_match_device(rockchip_pwm_dt_ids, &pdev->dev); if (!id) @@ -343,19 +317,49 @@ static int rockchip_pwm_probe(struct platform_device *pdev) if (IS_ERR(pc->base)) return PTR_ERR(pc->base); - pc->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(pc->clk)) - return PTR_ERR(pc->clk); + pc->clk = devm_clk_get(&pdev->dev, "pwm"); + if (IS_ERR(pc->clk)) { + pc->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(pc->clk)) { + ret = PTR_ERR(pc->clk); + if (ret != -EPROBE_DEFER) + dev_err(&pdev->dev, "Can't get bus clk: %d\n", + ret); + return ret; + } + } + + count = of_count_phandle_with_args(pdev->dev.of_node, + "clocks", "#clock-cells"); + if (count == 2) + pc->pclk = devm_clk_get(&pdev->dev, "pclk"); + else + pc->pclk = pc->clk; + + if (IS_ERR(pc->pclk)) { + ret = PTR_ERR(pc->pclk); + if (ret != -EPROBE_DEFER) + dev_err(&pdev->dev, "Can't get APB clk: %d\n", ret); + return ret; + } ret = clk_prepare_enable(pc->clk); - if (ret) + if (ret) { + dev_err(&pdev->dev, "Can't prepare enable bus clk: %d\n", ret); return ret; + } + + ret = clk_prepare(pc->pclk); + if (ret) { + dev_err(&pdev->dev, "Can't prepare APB clk: %d\n", ret); + goto err_clk; + } platform_set_drvdata(pdev, pc); pc->data = id->data; pc->chip.dev = &pdev->dev; - pc->chip.ops = pc->data->ops; + pc->chip.ops = &rockchip_pwm_ops; pc->chip.base = -1; pc->chip.npwm = 1; @@ -368,12 +372,20 @@ static int rockchip_pwm_probe(struct platform_device *pdev) if (ret < 0) { clk_unprepare(pc->clk); dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret); + goto err_pclk; } /* Keep the PWM clk enabled if the PWM appears to be up and running. */ if (!pwm_is_enabled(pc->chip.pwms)) clk_disable(pc->clk); + return 0; + +err_pclk: + clk_unprepare(pc->pclk); +err_clk: + clk_disable_unprepare(pc->clk); + return ret; } @@ -395,6 +407,7 @@ static int rockchip_pwm_remove(struct platform_device *pdev) if (pwm_is_enabled(pc->chip.pwms)) clk_disable(pc->clk); + clk_unprepare(pc->pclk); clk_unprepare(pc->clk); return pwmchip_remove(&pc->chip); diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c index f113cda47032..062f2cfc45ec 100644 --- a/drivers/pwm/pwm-samsung.c +++ b/drivers/pwm/pwm-samsung.c @@ -3,6 +3,7 @@ * Copyright (c) 2008 Simtec Electronics * Ben Dooks <ben@simtec.co.uk>, <ben-linux@fluff.org> * Copyright (c) 2013 Tomasz Figa <tomasz.figa@gmail.com> + * Copyright (c) 2017 Samsung Electronics Co., Ltd. 
* * PWM driver for Samsung SoCs * @@ -74,6 +75,7 @@ struct samsung_pwm_channel { * @chip: generic PWM chip * @variant: local copy of hardware variant data * @inverter_mask: inverter status for all channels - one bit per channel + * @disabled_mask: disabled status for all channels - one bit per channel * @base: base address of mapped PWM registers * @base_clk: base clock used to drive the timers * @tclk0: external clock 0 (can be ERR_PTR if not present) @@ -83,6 +85,7 @@ struct samsung_pwm_chip { struct pwm_chip chip; struct samsung_pwm_variant variant; u8 inverter_mask; + u8 disabled_mask; void __iomem *base; struct clk *base_clk; @@ -257,6 +260,8 @@ static int pwm_samsung_enable(struct pwm_chip *chip, struct pwm_device *pwm) tcon |= TCON_START(tcon_chan) | TCON_AUTORELOAD(tcon_chan); writel(tcon, our_chip->base + REG_TCON); + our_chip->disabled_mask &= ~BIT(pwm->hwpwm); + spin_unlock_irqrestore(&samsung_pwm_lock, flags); return 0; @@ -275,6 +280,8 @@ static void pwm_samsung_disable(struct pwm_chip *chip, struct pwm_device *pwm) tcon &= ~TCON_AUTORELOAD(tcon_chan); writel(tcon, our_chip->base + REG_TCON); + our_chip->disabled_mask |= BIT(pwm->hwpwm); + spin_unlock_irqrestore(&samsung_pwm_lock, flags); } @@ -297,8 +304,8 @@ static void pwm_samsung_manual_update(struct samsung_pwm_chip *chip, spin_unlock_irqrestore(&samsung_pwm_lock, flags); } -static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) +static int __pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, + int duty_ns, int period_ns, bool force_period) { struct samsung_pwm_chip *our_chip = to_samsung_pwm_chip(chip); struct samsung_pwm_channel *chan = pwm_get_chip_data(pwm); @@ -312,9 +319,6 @@ static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, if (period_ns > NSEC_PER_SEC) return -ERANGE; - if (period_ns == chan->period_ns && duty_ns == chan->duty_ns) - return 0; - tcnt = readl(our_chip->base + REG_TCNTB(pwm->hwpwm)); oldtcmp = readl(our_chip->base + REG_TCMPB(pwm->hwpwm)); @@ -322,7 +326,7 @@ static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, ++tcnt; /* Check to see if we are changing the clock rate of the PWM. */ - if (chan->period_ns != period_ns) { + if (chan->period_ns != period_ns || force_period) { unsigned long tin_rate; u32 period; @@ -381,6 +385,12 @@ static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } +static int pwm_samsung_config(struct pwm_chip *chip, struct pwm_device *pwm, + int duty_ns, int period_ns) +{ + return __pwm_samsung_config(chip, pwm, duty_ns, period_ns, false); +} + static void pwm_samsung_set_invert(struct samsung_pwm_chip *chip, unsigned int channel, bool invert) { @@ -592,51 +602,41 @@ static int pwm_samsung_remove(struct platform_device *pdev) } #ifdef CONFIG_PM_SLEEP -static int pwm_samsung_suspend(struct device *dev) +static int pwm_samsung_resume(struct device *dev) { - struct samsung_pwm_chip *chip = dev_get_drvdata(dev); + struct samsung_pwm_chip *our_chip = dev_get_drvdata(dev); + struct pwm_chip *chip = &our_chip->chip; unsigned int i; - /* - * No one preserves these values during suspend so reset them. - * Otherwise driver leaves PWM unconfigured if same values are - * passed to pwm_config() next time. 
- */ - for (i = 0; i < SAMSUNG_PWM_NUM; ++i) { - struct pwm_device *pwm = &chip->chip.pwms[i]; + for (i = 0; i < SAMSUNG_PWM_NUM; i++) { + struct pwm_device *pwm = &chip->pwms[i]; struct samsung_pwm_channel *chan = pwm_get_chip_data(pwm); if (!chan) continue; - chan->period_ns = 0; - chan->duty_ns = 0; - } - - return 0; -} + if (our_chip->variant.output_mask & BIT(i)) + pwm_samsung_set_invert(our_chip, i, + our_chip->inverter_mask & BIT(i)); -static int pwm_samsung_resume(struct device *dev) -{ - struct samsung_pwm_chip *chip = dev_get_drvdata(dev); - unsigned int chan; + if (chan->period_ns) { + __pwm_samsung_config(chip, pwm, chan->duty_ns, + chan->period_ns, true); + /* needed to make PWM disable work on Odroid-XU3 */ + pwm_samsung_manual_update(our_chip, pwm); + } - /* - * Inverter setting must be preserved across suspend/resume - * as nobody really seems to configure it more than once. - */ - for (chan = 0; chan < SAMSUNG_PWM_NUM; ++chan) { - if (chip->variant.output_mask & BIT(chan)) - pwm_samsung_set_invert(chip, chan, - chip->inverter_mask & BIT(chan)); + if (our_chip->disabled_mask & BIT(i)) + pwm_samsung_disable(chip, pwm); + else + pwm_samsung_enable(chip, pwm); } return 0; } #endif -static SIMPLE_DEV_PM_OPS(pwm_samsung_pm_ops, pwm_samsung_suspend, - pwm_samsung_resume); +static SIMPLE_DEV_PM_OPS(pwm_samsung_pm_ops, NULL, pwm_samsung_resume); static struct platform_driver pwm_samsung_driver = { .driver = { diff --git a/drivers/pwm/pwm-tegra.c b/drivers/pwm/pwm-tegra.c index e9b33f09ff09..f8ebbece57b7 100644 --- a/drivers/pwm/pwm-tegra.c +++ b/drivers/pwm/pwm-tegra.c @@ -218,7 +218,7 @@ static int tegra_pwm_probe(struct platform_device *pdev) */ pwm->clk_rate = clk_get_rate(pwm->clk); - pwm->rst = devm_reset_control_get(&pdev->dev, "pwm"); + pwm->rst = devm_reset_control_get_exclusive(&pdev->dev, "pwm"); if (IS_ERR(pwm->rst)) { ret = PTR_ERR(pwm->rst); dev_err(&pdev->dev, "Reset control is not found: %d\n", ret); diff --git a/drivers/pwm/pwm-tiecap.c b/drivers/pwm/pwm-tiecap.c index 6ec342dd3eea..34b228626bd5 100644 --- a/drivers/pwm/pwm-tiecap.c +++ b/drivers/pwm/pwm-tiecap.c @@ -39,15 +39,15 @@ #define ECCTL2_TSCTR_FREERUN BIT(4) struct ecap_context { - u32 cap3; - u32 cap4; - u16 ecctl2; + u32 cap3; + u32 cap4; + u16 ecctl2; }; struct ecap_pwm_chip { - struct pwm_chip chip; - unsigned int clk_rate; - void __iomem *mmio_base; + struct pwm_chip chip; + unsigned int clk_rate; + void __iomem *mmio_base; struct ecap_context ctx; }; @@ -64,9 +64,9 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip); + u32 period_cycles, duty_cycles; unsigned long long c; - unsigned long period_cycles, duty_cycles; - unsigned int reg_val; + u16 value; if (period_ns > NSEC_PER_SEC) return -ERANGE; @@ -74,7 +74,7 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, c = pc->clk_rate; c = c * period_ns; do_div(c, NSEC_PER_SEC); - period_cycles = (unsigned long)c; + period_cycles = (u32)c; if (period_cycles < 1) { period_cycles = 1; @@ -83,17 +83,17 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, c = pc->clk_rate; c = c * duty_ns; do_div(c, NSEC_PER_SEC); - duty_cycles = (unsigned long)c; + duty_cycles = (u32)c; } pm_runtime_get_sync(pc->chip.dev); - reg_val = readw(pc->mmio_base + ECCTL2); + value = readw(pc->mmio_base + ECCTL2); /* Configure APWM mode & disable sync option */ - reg_val |= ECCTL2_APWM_MODE | ECCTL2_SYNC_SEL_DISA; + value |= 
ECCTL2_APWM_MODE | ECCTL2_SYNC_SEL_DISA; - writew(reg_val, pc->mmio_base + ECCTL2); + writew(value, pc->mmio_base + ECCTL2); if (!pwm_is_enabled(pwm)) { /* Update active registers if not running */ @@ -110,40 +110,45 @@ static int ecap_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, } if (!pwm_is_enabled(pwm)) { - reg_val = readw(pc->mmio_base + ECCTL2); + value = readw(pc->mmio_base + ECCTL2); /* Disable APWM mode to put APWM output Low */ - reg_val &= ~ECCTL2_APWM_MODE; - writew(reg_val, pc->mmio_base + ECCTL2); + value &= ~ECCTL2_APWM_MODE; + writew(value, pc->mmio_base + ECCTL2); } pm_runtime_put_sync(pc->chip.dev); + return 0; } static int ecap_pwm_set_polarity(struct pwm_chip *chip, struct pwm_device *pwm, - enum pwm_polarity polarity) + enum pwm_polarity polarity) { struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip); - unsigned short reg_val; + u16 value; pm_runtime_get_sync(pc->chip.dev); - reg_val = readw(pc->mmio_base + ECCTL2); + + value = readw(pc->mmio_base + ECCTL2); + if (polarity == PWM_POLARITY_INVERSED) /* Duty cycle defines LOW period of PWM */ - reg_val |= ECCTL2_APWM_POL_LOW; + value |= ECCTL2_APWM_POL_LOW; else /* Duty cycle defines HIGH period of PWM */ - reg_val &= ~ECCTL2_APWM_POL_LOW; + value &= ~ECCTL2_APWM_POL_LOW; + + writew(value, pc->mmio_base + ECCTL2); - writew(reg_val, pc->mmio_base + ECCTL2); pm_runtime_put_sync(pc->chip.dev); + return 0; } static int ecap_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) { struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip); - unsigned int reg_val; + u16 value; /* Leave clock enabled on enabling PWM */ pm_runtime_get_sync(pc->chip.dev); @@ -152,24 +157,25 @@ static int ecap_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) * Enable 'Free run Time stamp counter mode' to start counter * and 'APWM mode' to enable APWM output */ - reg_val = readw(pc->mmio_base + ECCTL2); - reg_val |= ECCTL2_TSCTR_FREERUN | ECCTL2_APWM_MODE; - writew(reg_val, pc->mmio_base + ECCTL2); + value = readw(pc->mmio_base + ECCTL2); + value |= ECCTL2_TSCTR_FREERUN | ECCTL2_APWM_MODE; + writew(value, pc->mmio_base + ECCTL2); + return 0; } static void ecap_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) { struct ecap_pwm_chip *pc = to_ecap_pwm_chip(chip); - unsigned int reg_val; + u16 value; /* * Disable 'Free run Time stamp counter mode' to stop counter * and 'APWM mode' to put APWM output to low */ - reg_val = readw(pc->mmio_base + ECCTL2); - reg_val &= ~(ECCTL2_TSCTR_FREERUN | ECCTL2_APWM_MODE); - writew(reg_val, pc->mmio_base + ECCTL2); + value = readw(pc->mmio_base + ECCTL2); + value &= ~(ECCTL2_TSCTR_FREERUN | ECCTL2_APWM_MODE); + writew(value, pc->mmio_base + ECCTL2); /* Disable clock on PWM disable */ pm_runtime_put_sync(pc->chip.dev); @@ -184,12 +190,12 @@ static void ecap_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) } static const struct pwm_ops ecap_pwm_ops = { - .free = ecap_pwm_free, - .config = ecap_pwm_config, - .set_polarity = ecap_pwm_set_polarity, - .enable = ecap_pwm_enable, - .disable = ecap_pwm_disable, - .owner = THIS_MODULE, + .free = ecap_pwm_free, + .config = ecap_pwm_config, + .set_polarity = ecap_pwm_set_polarity, + .enable = ecap_pwm_enable, + .disable = ecap_pwm_disable, + .owner = THIS_MODULE, }; static const struct of_device_id ecap_of_match[] = { @@ -202,10 +208,10 @@ MODULE_DEVICE_TABLE(of, ecap_of_match); static int ecap_pwm_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; - int ret; + struct ecap_pwm_chip *pc; struct resource *r; struct clk 
*clk; - struct ecap_pwm_chip *pc; + int ret; pc = devm_kzalloc(&pdev->dev, sizeof(*pc), GFP_KERNEL); if (!pc) @@ -248,9 +254,9 @@ static int ecap_pwm_probe(struct platform_device *pdev) return ret; } + platform_set_drvdata(pdev, pc); pm_runtime_enable(&pdev->dev); - platform_set_drvdata(pdev, pc); return 0; } @@ -259,6 +265,7 @@ static int ecap_pwm_remove(struct platform_device *pdev) struct ecap_pwm_chip *pc = platform_get_drvdata(pdev); pm_runtime_disable(&pdev->dev); + return pwmchip_remove(&pc->chip); } @@ -311,14 +318,13 @@ static SIMPLE_DEV_PM_OPS(ecap_pwm_pm_ops, ecap_pwm_suspend, ecap_pwm_resume); static struct platform_driver ecap_pwm_driver = { .driver = { - .name = "ecap", + .name = "ecap", .of_match_table = ecap_of_match, - .pm = &ecap_pwm_pm_ops, + .pm = &ecap_pwm_pm_ops, }, .probe = ecap_pwm_probe, .remove = ecap_pwm_remove, }; - module_platform_driver(ecap_pwm_driver); MODULE_DESCRIPTION("ECAP PWM driver"); diff --git a/drivers/pwm/pwm-tiehrpwm.c b/drivers/pwm/pwm-tiehrpwm.c index b5c6b0636893..4c22cb395040 100644 --- a/drivers/pwm/pwm-tiehrpwm.c +++ b/drivers/pwm/pwm-tiehrpwm.c @@ -122,12 +122,12 @@ struct ehrpwm_context { }; struct ehrpwm_pwm_chip { - struct pwm_chip chip; - unsigned int clk_rate; - void __iomem *mmio_base; + struct pwm_chip chip; + unsigned long clk_rate; + void __iomem *mmio_base; unsigned long period_cycles[NUM_PWM_CHANNEL]; enum pwm_polarity polarity[NUM_PWM_CHANNEL]; - struct clk *tbclk; + struct clk *tbclk; struct ehrpwm_context ctx; }; @@ -136,25 +136,26 @@ static inline struct ehrpwm_pwm_chip *to_ehrpwm_pwm_chip(struct pwm_chip *chip) return container_of(chip, struct ehrpwm_pwm_chip, chip); } -static inline u16 ehrpwm_read(void __iomem *base, int offset) +static inline u16 ehrpwm_read(void __iomem *base, unsigned int offset) { return readw(base + offset); } -static inline void ehrpwm_write(void __iomem *base, int offset, unsigned int val) +static inline void ehrpwm_write(void __iomem *base, unsigned int offset, + u16 value) { - writew(val & 0xFFFF, base + offset); + writew(value, base + offset); } -static void ehrpwm_modify(void __iomem *base, int offset, - unsigned short mask, unsigned short val) +static void ehrpwm_modify(void __iomem *base, unsigned int offset, u16 mask, + u16 value) { - unsigned short regval; + unsigned short val; - regval = readw(base + offset); - regval &= ~mask; - regval |= val & mask; - writew(regval, base + offset); + val = readw(base + offset); + val &= ~mask; + val |= value & mask; + writew(val, base + offset); } /** @@ -163,14 +164,13 @@ static void ehrpwm_modify(void __iomem *base, int offset, * @prescale_div: prescaler value set * @tb_clk_div: Time Base Control prescaler bits */ -static int set_prescale_div(unsigned long rqst_prescaler, - unsigned short *prescale_div, unsigned short *tb_clk_div) +static int set_prescale_div(unsigned long rqst_prescaler, u16 *prescale_div, + u16 *tb_clk_div) { unsigned int clkdiv, hspclkdiv; for (clkdiv = 0; clkdiv <= CLKDIV_MAX; clkdiv++) { for (hspclkdiv = 0; hspclkdiv <= HSPCLKDIV_MAX; hspclkdiv++) { - /* * calculations for prescaler value : * prescale_div = HSPCLKDIVIDER * CLKDIVIDER. 
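A note on what the set_prescale_div() loop below is doing: it scans (clkdiv, hspclkdiv) pairs until the combined prescaler covers the requested division. A standalone sketch, assuming the usual eHRPWM encoding in which CLKDIV divides by 2^clkdiv and HSPCLKDIV divides by 1 for a zero field or by 2*field otherwise; the helper and its limits are illustrative only:

#define SKETCH_CLKDIV_MAX       7
#define SKETCH_HSPCLKDIV_MAX    7

/* Returns 0 and fills the fields on success, 1 if no pair is big enough. */
static int sketch_prescale_div(unsigned long wanted, unsigned int *clkdiv,
                               unsigned int *hspclkdiv)
{
        unsigned int c, h;

        for (c = 0; c <= SKETCH_CLKDIV_MAX; c++) {
                for (h = 0; h <= SKETCH_HSPCLKDIV_MAX; h++) {
                        /* total division: 2^c * (h ? 2 * h : 1) */
                        unsigned long div = (1UL << c) * (h ? 2 * h : 1);

                        if (div >= wanted) {
                                *clkdiv = c;
                                *hspclkdiv = h;
                                return 0;
                        }
                }
        }

        return 1;
}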
@@ -191,13 +191,14 @@ static int set_prescale_div(unsigned long rqst_prescaler, } } } + return 1; } static void configure_polarity(struct ehrpwm_pwm_chip *pc, int chan) { - int aqctl_reg; - unsigned short aqctl_val, aqctl_mask; + u16 aqctl_val, aqctl_mask; + unsigned int aqctl_reg; /* * Configure PWM output to HIGH/LOW level on counter @@ -232,13 +233,13 @@ static void configure_polarity(struct ehrpwm_pwm_chip *pc, int chan) * duty_ns = 10^9 * (ps_divval * duty_cycles) / PWM_CLK_RATE */ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, - int duty_ns, int period_ns) + int duty_ns, int period_ns) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); + u32 period_cycles, duty_cycles; + u16 ps_divval, tb_divval; + unsigned int i, cmp_reg; unsigned long long c; - unsigned long period_cycles, duty_cycles; - unsigned short ps_divval, tb_divval; - int i, cmp_reg; if (period_ns > NSEC_PER_SEC) return -ERANGE; @@ -272,8 +273,9 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, if (i == pwm->hwpwm) continue; - dev_err(chip->dev, "Period value conflicts with channel %d\n", - i); + dev_err(chip->dev, + "period value conflicts with channel %u\n", + i); return -EINVAL; } } @@ -282,7 +284,7 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, /* Configure clock prescaler to support Low frequency PWM wave */ if (set_prescale_div(period_cycles/PERIOD_MAX, &ps_divval, - &tb_divval)) { + &tb_divval)) { dev_err(chip->dev, "Unsupported values\n"); return -EINVAL; } @@ -303,7 +305,7 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, /* Configure ehrpwm counter for up-count mode */ ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_CTRMODE_MASK, - TBCTL_CTRMODE_UP); + TBCTL_CTRMODE_UP); if (pwm->hwpwm == 1) /* Channel 1 configured with compare B register */ @@ -315,23 +317,26 @@ static int ehrpwm_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, ehrpwm_write(pc->mmio_base, cmp_reg, duty_cycles); pm_runtime_put_sync(chip->dev); + return 0; } static int ehrpwm_pwm_set_polarity(struct pwm_chip *chip, - struct pwm_device *pwm, enum pwm_polarity polarity) + struct pwm_device *pwm, + enum pwm_polarity polarity) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); /* Configuration of polarity in hardware delayed, do at enable */ pc->polarity[pwm->hwpwm] = polarity; + return 0; } static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) { struct ehrpwm_pwm_chip *pc = to_ehrpwm_pwm_chip(chip); - unsigned short aqcsfrc_val, aqcsfrc_mask; + u16 aqcsfrc_val, aqcsfrc_mask; int ret; /* Leave clock enabled on enabling PWM */ @@ -348,7 +353,7 @@ static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) /* Changes to shadow mode */ ehrpwm_modify(pc->mmio_base, AQSFRC, AQSFRC_RLDCSF_MASK, - AQSFRC_RLDCSF_ZRO); + AQSFRC_RLDCSF_ZRO); ehrpwm_modify(pc->mmio_base, AQCSFRC, aqcsfrc_mask, aqcsfrc_val); @@ -358,20 +363,21 @@ static int ehrpwm_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) /* Enable TBCLK before enabling PWM device */ ret = clk_enable(pc->tbclk); if (ret) { - dev_err(chip->dev, "Failed to enable TBCLK for %s\n", - dev_name(pc->chip.dev)); + dev_err(chip->dev, "Failed to enable TBCLK for %s: %d\n", + dev_name(pc->chip.dev), ret); return ret; } /* Enable time counter for free_run */ ehrpwm_modify(pc->mmio_base, TBCTL, TBCTL_RUN_MASK, TBCTL_FREE_RUN); + return 0; } static void ehrpwm_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) { struct ehrpwm_pwm_chip 
*pc = to_ehrpwm_pwm_chip(chip); - unsigned short aqcsfrc_val, aqcsfrc_mask; + u16 aqcsfrc_val, aqcsfrc_mask; /* Action Qualifier puts PWM output low forcefully */ if (pwm->hwpwm) { @@ -387,7 +393,7 @@ static void ehrpwm_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) * Action Qualifier control on PWM output from next TBCLK */ ehrpwm_modify(pc->mmio_base, AQSFRC, AQSFRC_RLDCSF_MASK, - AQSFRC_RLDCSF_IMDT); + AQSFRC_RLDCSF_IMDT); ehrpwm_modify(pc->mmio_base, AQCSFRC, aqcsfrc_mask, aqcsfrc_val); @@ -415,17 +421,17 @@ static void ehrpwm_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm) } static const struct pwm_ops ehrpwm_pwm_ops = { - .free = ehrpwm_pwm_free, - .config = ehrpwm_pwm_config, - .set_polarity = ehrpwm_pwm_set_polarity, - .enable = ehrpwm_pwm_enable, - .disable = ehrpwm_pwm_disable, - .owner = THIS_MODULE, + .free = ehrpwm_pwm_free, + .config = ehrpwm_pwm_config, + .set_polarity = ehrpwm_pwm_set_polarity, + .enable = ehrpwm_pwm_enable, + .disable = ehrpwm_pwm_disable, + .owner = THIS_MODULE, }; static const struct of_device_id ehrpwm_of_match[] = { - { .compatible = "ti,am3352-ehrpwm" }, - { .compatible = "ti,am33xx-ehrpwm" }, + { .compatible = "ti,am3352-ehrpwm" }, + { .compatible = "ti,am33xx-ehrpwm" }, {}, }; MODULE_DEVICE_TABLE(of, ehrpwm_of_match); @@ -433,10 +439,10 @@ MODULE_DEVICE_TABLE(of, ehrpwm_of_match); static int ehrpwm_pwm_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; - int ret; + struct ehrpwm_pwm_chip *pc; struct resource *r; struct clk *clk; - struct ehrpwm_pwm_chip *pc; + int ret; pc = devm_kzalloc(&pdev->dev, sizeof(*pc), GFP_KERNEL); if (!pc) @@ -489,13 +495,18 @@ static int ehrpwm_pwm_probe(struct platform_device *pdev) ret = pwmchip_add(&pc->chip); if (ret < 0) { dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret); - return ret; + goto err_clk_unprepare; } + platform_set_drvdata(pdev, pc); pm_runtime_enable(&pdev->dev); - platform_set_drvdata(pdev, pc); return 0; + +err_clk_unprepare: + clk_unprepare(pc->tbclk); + + return ret; } static int ehrpwm_pwm_remove(struct platform_device *pdev) @@ -504,8 +515,8 @@ static int ehrpwm_pwm_remove(struct platform_device *pdev) clk_unprepare(pc->tbclk); - pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); + return pwmchip_remove(&pc->chip); } @@ -513,6 +524,7 @@ static int ehrpwm_pwm_remove(struct platform_device *pdev) static void ehrpwm_pwm_save_context(struct ehrpwm_pwm_chip *pc) { pm_runtime_get_sync(pc->chip.dev); + pc->ctx.tbctl = ehrpwm_read(pc->mmio_base, TBCTL); pc->ctx.tbprd = ehrpwm_read(pc->mmio_base, TBPRD); pc->ctx.cmpa = ehrpwm_read(pc->mmio_base, CMPA); @@ -521,6 +533,7 @@ static void ehrpwm_pwm_save_context(struct ehrpwm_pwm_chip *pc) pc->ctx.aqctlb = ehrpwm_read(pc->mmio_base, AQCTLB); pc->ctx.aqsfrc = ehrpwm_read(pc->mmio_base, AQSFRC); pc->ctx.aqcsfrc = ehrpwm_read(pc->mmio_base, AQCSFRC); + pm_runtime_put_sync(pc->chip.dev); } @@ -539,9 +552,10 @@ static void ehrpwm_pwm_restore_context(struct ehrpwm_pwm_chip *pc) static int ehrpwm_pwm_suspend(struct device *dev) { struct ehrpwm_pwm_chip *pc = dev_get_drvdata(dev); - int i; + unsigned int i; ehrpwm_pwm_save_context(pc); + for (i = 0; i < pc->chip.npwm; i++) { struct pwm_device *pwm = &pc->chip.pwms[i]; @@ -551,13 +565,14 @@ static int ehrpwm_pwm_suspend(struct device *dev) /* Disable explicitly if PWM is running */ pm_runtime_put_sync(dev); } + return 0; } static int ehrpwm_pwm_resume(struct device *dev) { struct ehrpwm_pwm_chip *pc = dev_get_drvdata(dev); - int i; + unsigned int i; 
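The resume loop that follows re-takes one runtime-PM reference for every PWM that is still enabled, mirroring the put issued per enabled PWM in the suspend path above; that keeps the device's usage count balanced across a sleep cycle. A minimal sketch of the pairing, with the loop bodies reduced to the reference operations:

#include <linux/pm_runtime.h>
#include <linux/pwm.h>

static void sketch_suspend(struct device *dev, struct pwm_chip *chip)
{
        unsigned int i;

        for (i = 0; i < chip->npwm; i++)
                if (pwm_is_enabled(&chip->pwms[i]))
                        pm_runtime_put_sync(dev); /* drop the ref taken at enable */
}

static void sketch_resume(struct device *dev, struct pwm_chip *chip)
{
        unsigned int i;

        for (i = 0; i < chip->npwm; i++)
                if (pwm_is_enabled(&chip->pwms[i]))
                        pm_runtime_get_sync(dev); /* re-take it before restoring */
}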
for (i = 0; i < pc->chip.npwm; i++) { struct pwm_device *pwm = &pc->chip.pwms[i]; @@ -568,24 +583,25 @@ static int ehrpwm_pwm_resume(struct device *dev) /* Enable explicitly if PWM was running */ pm_runtime_get_sync(dev); } + ehrpwm_pwm_restore_context(pc); + return 0; } #endif static SIMPLE_DEV_PM_OPS(ehrpwm_pwm_pm_ops, ehrpwm_pwm_suspend, - ehrpwm_pwm_resume); + ehrpwm_pwm_resume); static struct platform_driver ehrpwm_pwm_driver = { .driver = { - .name = "ehrpwm", + .name = "ehrpwm", .of_match_table = ehrpwm_of_match, - .pm = &ehrpwm_pwm_pm_ops, + .pm = &ehrpwm_pwm_pm_ops, }, .probe = ehrpwm_pwm_probe, .remove = ehrpwm_pwm_remove, }; - module_platform_driver(ehrpwm_pwm_driver); MODULE_DESCRIPTION("EHRPWM PWM driver"); diff --git a/drivers/pwm/pwm-vt8500.c b/drivers/pwm/pwm-vt8500.c index 8141a4984126..3a78dd09ac81 100644 --- a/drivers/pwm/pwm-vt8500.c +++ b/drivers/pwm/pwm-vt8500.c @@ -241,6 +241,7 @@ static int vt8500_pwm_probe(struct platform_device *pdev) ret = pwmchip_add(&chip->chip); if (ret < 0) { dev_err(&pdev->dev, "failed to add PWM chip\n"); + clk_unprepare(chip->clk); return ret; } diff --git a/drivers/pwm/pwm-zx.c b/drivers/pwm/pwm-zx.c new file mode 100644 index 000000000000..5d27c16edfb1 --- /dev/null +++ b/drivers/pwm/pwm-zx.c @@ -0,0 +1,282 @@ +/* + * Copyright (C) 2017 Sanechips Technology Co., Ltd. + * Copyright 2017 Linaro Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/clk.h> +#include <linux/err.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/pwm.h> +#include <linux/slab.h> + +#define ZX_PWM_MODE 0x0 +#define ZX_PWM_CLKDIV_SHIFT 2 +#define ZX_PWM_CLKDIV_MASK GENMASK(11, 2) +#define ZX_PWM_CLKDIV(x) (((x) << ZX_PWM_CLKDIV_SHIFT) & \ + ZX_PWM_CLKDIV_MASK) +#define ZX_PWM_POLAR BIT(1) +#define ZX_PWM_EN BIT(0) +#define ZX_PWM_PERIOD 0x4 +#define ZX_PWM_DUTY 0x8 + +#define ZX_PWM_CLKDIV_MAX 1023 +#define ZX_PWM_PERIOD_MAX 65535 + +struct zx_pwm_chip { + struct pwm_chip chip; + struct clk *pclk; + struct clk *wclk; + void __iomem *base; +}; + +static inline struct zx_pwm_chip *to_zx_pwm_chip(struct pwm_chip *chip) +{ + return container_of(chip, struct zx_pwm_chip, chip); +} + +static inline u32 zx_pwm_readl(struct zx_pwm_chip *zpc, unsigned int hwpwm, + unsigned int offset) +{ + return readl(zpc->base + (hwpwm + 1) * 0x10 + offset); +} + +static inline void zx_pwm_writel(struct zx_pwm_chip *zpc, unsigned int hwpwm, + unsigned int offset, u32 value) +{ + writel(value, zpc->base + (hwpwm + 1) * 0x10 + offset); +} + +static void zx_pwm_set_mask(struct zx_pwm_chip *zpc, unsigned int hwpwm, + unsigned int offset, u32 mask, u32 value) +{ + u32 data; + + data = zx_pwm_readl(zpc, hwpwm, offset); + data &= ~mask; + data |= value & mask; + zx_pwm_writel(zpc, hwpwm, offset, data); +} + +static void zx_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip); + unsigned long rate; + unsigned int div; + u32 value; + u64 tmp; + + value = zx_pwm_readl(zpc, pwm->hwpwm, ZX_PWM_MODE); + + if (value & ZX_PWM_POLAR) + state->polarity = PWM_POLARITY_NORMAL; + else + state->polarity = PWM_POLARITY_INVERSED; + + if (value & ZX_PWM_EN) + state->enabled = true; + else + state->enabled = false; + + div = (value & ZX_PWM_CLKDIV_MASK) >> ZX_PWM_CLKDIV_SHIFT; + 
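The statements that follow turn raw hardware counts back into nanoseconds: a register count scaled by the clock divider, converted against the wclk rate. The conversion in isolation, with an assumed 24 MHz rate in the example (not a property of the SoC):

#include <linux/kernel.h>       /* DIV_ROUND_CLOSEST_ULL */
#include <linux/time64.h>       /* NSEC_PER_SEC */

static u64 counts_to_ns(u32 counts, unsigned int div, unsigned long rate)
{
        u64 tmp = (u64)counts * div * NSEC_PER_SEC;

        return DIV_ROUND_CLOSEST_ULL(tmp, rate);
}

/* e.g. counts = 24000, div = 1, rate = 24000000 -> 1000000 ns, i.e. 1 ms */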
rate = clk_get_rate(zpc->wclk); + + tmp = zx_pwm_readl(zpc, pwm->hwpwm, ZX_PWM_PERIOD); + tmp *= div * NSEC_PER_SEC; + state->period = DIV_ROUND_CLOSEST_ULL(tmp, rate); + + tmp = zx_pwm_readl(zpc, pwm->hwpwm, ZX_PWM_DUTY); + tmp *= div * NSEC_PER_SEC; + state->duty_cycle = DIV_ROUND_CLOSEST_ULL(tmp, rate); +} + +static int zx_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, + unsigned int duty_ns, unsigned int period_ns) +{ + struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip); + unsigned int period_cycles, duty_cycles; + unsigned long long c; + unsigned int div = 1; + unsigned long rate; + + /* Find out the best divider */ + rate = clk_get_rate(zpc->wclk); + + while (1) { + c = rate / div; + c = c * period_ns; + do_div(c, NSEC_PER_SEC); + + if (c < ZX_PWM_PERIOD_MAX) + break; + + div++; + + if (div > ZX_PWM_CLKDIV_MAX) + return -ERANGE; + } + + /* Calculate duty cycles */ + period_cycles = c; + c *= duty_ns; + do_div(c, period_ns); + duty_cycles = c; + + /* + * If the PWM is being enabled, we have to temporarily disable it + * before configuring the registers. + */ + if (pwm_is_enabled(pwm)) + zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, ZX_PWM_EN, 0); + + /* Set up registers */ + zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, ZX_PWM_CLKDIV_MASK, + ZX_PWM_CLKDIV(div)); + zx_pwm_writel(zpc, pwm->hwpwm, ZX_PWM_PERIOD, period_cycles); + zx_pwm_writel(zpc, pwm->hwpwm, ZX_PWM_DUTY, duty_cycles); + + /* Re-enable the PWM if needed */ + if (pwm_is_enabled(pwm)) + zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, + ZX_PWM_EN, ZX_PWM_EN); + + return 0; +} + +static int zx_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm, + struct pwm_state *state) +{ + struct zx_pwm_chip *zpc = to_zx_pwm_chip(chip); + struct pwm_state cstate; + int ret; + + pwm_get_state(pwm, &cstate); + + if (state->polarity != cstate.polarity) + zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, ZX_PWM_POLAR, + (state->polarity == PWM_POLARITY_INVERSED) ? 
+ 0 : ZX_PWM_POLAR); + + if (state->period != cstate.period || + state->duty_cycle != cstate.duty_cycle) { + ret = zx_pwm_config(chip, pwm, state->duty_cycle, + state->period); + if (ret) + return ret; + } + + if (state->enabled != cstate.enabled) { + if (state->enabled) { + ret = clk_prepare_enable(zpc->wclk); + if (ret) + return ret; + + zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, + ZX_PWM_EN, ZX_PWM_EN); + } else { + zx_pwm_set_mask(zpc, pwm->hwpwm, ZX_PWM_MODE, + ZX_PWM_EN, 0); + clk_disable_unprepare(zpc->wclk); + } + } + + return 0; +} + +static const struct pwm_ops zx_pwm_ops = { + .apply = zx_pwm_apply, + .get_state = zx_pwm_get_state, + .owner = THIS_MODULE, +}; + +static int zx_pwm_probe(struct platform_device *pdev) +{ + struct zx_pwm_chip *zpc; + struct resource *res; + unsigned int i; + int ret; + + zpc = devm_kzalloc(&pdev->dev, sizeof(*zpc), GFP_KERNEL); + if (!zpc) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + zpc->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(zpc->base)) + return PTR_ERR(zpc->base); + + zpc->pclk = devm_clk_get(&pdev->dev, "pclk"); + if (IS_ERR(zpc->pclk)) + return PTR_ERR(zpc->pclk); + + zpc->wclk = devm_clk_get(&pdev->dev, "wclk"); + if (IS_ERR(zpc->wclk)) + return PTR_ERR(zpc->wclk); + + ret = clk_prepare_enable(zpc->pclk); + if (ret) + return ret; + + zpc->chip.dev = &pdev->dev; + zpc->chip.ops = &zx_pwm_ops; + zpc->chip.base = -1; + zpc->chip.npwm = 4; + zpc->chip.of_xlate = of_pwm_xlate_with_flags; + zpc->chip.of_pwm_n_cells = 3; + + /* + * PWM devices may be enabled by firmware, and let's disable all of + * them initially to save power. + */ + for (i = 0; i < zpc->chip.npwm; i++) + zx_pwm_set_mask(zpc, i, ZX_PWM_MODE, ZX_PWM_EN, 0); + + ret = pwmchip_add(&zpc->chip); + if (ret < 0) { + dev_err(&pdev->dev, "failed to add PWM chip: %d\n", ret); + return ret; + } + + platform_set_drvdata(pdev, zpc); + + return 0; +} + +static int zx_pwm_remove(struct platform_device *pdev) +{ + struct zx_pwm_chip *zpc = platform_get_drvdata(pdev); + int ret; + + ret = pwmchip_remove(&zpc->chip); + clk_disable_unprepare(zpc->pclk); + + return ret; +} + +static const struct of_device_id zx_pwm_dt_ids[] = { + { .compatible = "zte,zx296718-pwm", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, zx_pwm_dt_ids); + +static struct platform_driver zx_pwm_driver = { + .driver = { + .name = "zx-pwm", + .of_match_table = zx_pwm_dt_ids, + }, + .probe = zx_pwm_probe, + .remove = zx_pwm_remove, +}; +module_platform_driver(zx_pwm_driver); + +MODULE_ALIAS("platform:zx-pwm"); +MODULE_AUTHOR("Shawn Guo <shawn.guo@linaro.org>"); +MODULE_DESCRIPTION("ZTE ZX PWM Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 9c97ad1ee121..ea19b4ff87a2 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -62,7 +62,6 @@ MODULE_LICENSE("GPL"); static int dasd_alloc_queue(struct dasd_block *); static void dasd_setup_queue(struct dasd_block *); static void dasd_free_queue(struct dasd_block *); -static void dasd_flush_request_queue(struct dasd_block *); static int dasd_flush_block_queue(struct dasd_block *); static void dasd_device_tasklet(struct dasd_device *); static void dasd_block_tasklet(struct dasd_block *); @@ -158,7 +157,6 @@ struct dasd_block *dasd_alloc_block(void) /* open_count = 0 means device online but not in use */ atomic_set(&block->open_count, -1); - spin_lock_init(&block->request_queue_lock); atomic_set(&block->tasklet_scheduled, 0); tasklet_init(&block->tasklet, 
(void (*)(unsigned long)) dasd_block_tasklet, @@ -391,7 +389,6 @@ static int dasd_state_ready_to_basic(struct dasd_device *device) device->state = DASD_STATE_READY; return rc; } - dasd_flush_request_queue(block); dasd_destroy_partitions(block); block->blocks = 0; block->bp_block = 0; @@ -1645,8 +1642,10 @@ void dasd_generic_handle_state_change(struct dasd_device *device) dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING); dasd_schedule_device_bh(device); - if (device->block) + if (device->block) { dasd_schedule_block_bh(device->block); + blk_mq_run_hw_queues(device->block->request_queue, true); + } } EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change); @@ -2638,6 +2637,7 @@ static void dasd_block_timeout(unsigned long ptr) dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING); spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags); dasd_schedule_block_bh(block); + blk_mq_run_hw_queues(block->request_queue, true); } /* @@ -2677,115 +2677,11 @@ static void __dasd_process_erp(struct dasd_device *device, erp_fn(cqr); } -/* - * Fetch requests from the block device queue. - */ -static void __dasd_process_request_queue(struct dasd_block *block) -{ - struct request_queue *queue; - struct request *req; - struct dasd_ccw_req *cqr; - struct dasd_device *basedev; - unsigned long flags; - queue = block->request_queue; - basedev = block->base; - /* No queue ? Then there is nothing to do. */ - if (queue == NULL) - return; - - /* - * We requeue request from the block device queue to the ccw - * queue only in two states. In state DASD_STATE_READY the - * partition detection is done and we need to requeue requests - * for that. State DASD_STATE_ONLINE is normal block device - * operation. - */ - if (basedev->state < DASD_STATE_READY) { - while ((req = blk_fetch_request(block->request_queue))) - __blk_end_request_all(req, BLK_STS_IOERR); - return; - } - - /* - * if device is stopped do not fetch new requests - * except failfast is active which will let requests fail - * immediately in __dasd_block_start_head() - */ - if (basedev->stopped && !(basedev->features & DASD_FEATURE_FAILFAST)) - return; - - /* Now we try to fetch requests from the request queue */ - while ((req = blk_peek_request(queue))) { - if (basedev->features & DASD_FEATURE_READONLY && - rq_data_dir(req) == WRITE) { - DBF_DEV_EVENT(DBF_ERR, basedev, - "Rejecting write request %p", - req); - blk_start_request(req); - __blk_end_request_all(req, BLK_STS_IOERR); - continue; - } - if (test_bit(DASD_FLAG_ABORTALL, &basedev->flags) && - (basedev->features & DASD_FEATURE_FAILFAST || - blk_noretry_request(req))) { - DBF_DEV_EVENT(DBF_ERR, basedev, - "Rejecting failfast request %p", - req); - blk_start_request(req); - __blk_end_request_all(req, BLK_STS_TIMEOUT); - continue; - } - cqr = basedev->discipline->build_cp(basedev, block, req); - if (IS_ERR(cqr)) { - if (PTR_ERR(cqr) == -EBUSY) - break; /* normal end condition */ - if (PTR_ERR(cqr) == -ENOMEM) - break; /* terminate request queue loop */ - if (PTR_ERR(cqr) == -EAGAIN) { - /* - * The current request cannot be build right - * now, we have to try later. If this request - * is the head-of-queue we stop the device - * for 1/2 second. 
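The fetch loop being deleted here had to park the device itself (the PENDING stop bit plus a half-second timer) whenever a request could not be built. Under blk-mq that responsibility moves to the block layer: the queueing hook reports BLK_STS_RESOURCE and the driver re-runs the hardware queues once the stall clears, which is why blk_mq_run_hw_queues() now accompanies dasd_schedule_block_bh() in the hunks above. A compressed sketch of the pattern; device_stalled() is a hypothetical predicate standing in for the driver's stop checks:

#include <linux/blk-mq.h>

static bool device_stalled(void *driver_data); /* hypothetical */

static blk_status_t sketch_queue_rq(struct blk_mq_hw_ctx *hctx,
                                    const struct blk_mq_queue_data *qd)
{
        if (device_stalled(hctx->queue->queuedata))
                return BLK_STS_RESOURCE;        /* block layer retries later */

        blk_mq_start_request(qd->rq);
        /* ... build and start the channel program ... */

        return BLK_STS_OK;
}

/* When the stall clears, kick the queues so parked requests are reissued: */
/* blk_mq_run_hw_queues(queue, true); */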
- */ - if (!list_empty(&block->ccw_queue)) - break; - spin_lock_irqsave( - get_ccwdev_lock(basedev->cdev), flags); - dasd_device_set_stop_bits(basedev, - DASD_STOPPED_PENDING); - spin_unlock_irqrestore( - get_ccwdev_lock(basedev->cdev), flags); - dasd_block_set_timer(block, HZ/2); - break; - } - DBF_DEV_EVENT(DBF_ERR, basedev, - "CCW creation failed (rc=%ld) " - "on request %p", - PTR_ERR(cqr), req); - blk_start_request(req); - __blk_end_request_all(req, BLK_STS_IOERR); - continue; - } - /* - * Note: callback is set to dasd_return_cqr_cb in - * __dasd_block_start_head to cover erp requests as well - */ - cqr->callback_data = (void *) req; - cqr->status = DASD_CQR_FILLED; - req->completion_data = cqr; - blk_start_request(req); - list_add_tail(&cqr->blocklist, &block->ccw_queue); - INIT_LIST_HEAD(&cqr->devlist); - dasd_profile_start(block, cqr, req); - } -} - static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) { struct request *req; - int status; blk_status_t error = BLK_STS_OK; + int status; req = (struct request *) cqr->callback_data; dasd_profile_end(cqr->block, cqr, req); @@ -2809,7 +2705,19 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) break; } } - __blk_end_request_all(req, error); + + /* + * We need to take care for ETIMEDOUT errors here since the + * complete callback does not get called in this case. + * Take care of all errors here and avoid additional code to + * transfer the error value to the complete callback. + */ + if (error) { + blk_mq_end_request(req, error); + blk_mq_run_hw_queues(req->q, true); + } else { + blk_mq_complete_request(req); + } } /* @@ -2938,27 +2846,30 @@ static void dasd_block_tasklet(struct dasd_block *block) struct list_head final_queue; struct list_head *l, *n; struct dasd_ccw_req *cqr; + struct dasd_queue *dq; atomic_set(&block->tasklet_scheduled, 0); INIT_LIST_HEAD(&final_queue); - spin_lock(&block->queue_lock); + spin_lock_irq(&block->queue_lock); /* Finish off requests on ccw queue */ __dasd_process_block_ccw_queue(block, &final_queue); - spin_unlock(&block->queue_lock); + spin_unlock_irq(&block->queue_lock); + /* Now call the callback function of requests with final status */ - spin_lock_irq(&block->request_queue_lock); list_for_each_safe(l, n, &final_queue) { cqr = list_entry(l, struct dasd_ccw_req, blocklist); + dq = cqr->dq; + spin_lock_irq(&dq->lock); list_del_init(&cqr->blocklist); __dasd_cleanup_cqr(cqr); + spin_unlock_irq(&dq->lock); } - spin_lock(&block->queue_lock); - /* Get new request from the block device request queue */ - __dasd_process_request_queue(block); + + spin_lock_irq(&block->queue_lock); /* Now check if the head of the ccw queue needs to be started. 
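One detail of the __dasd_cleanup_cqr() hunk above is worth restating on its own: failed requests are finished inline with blk_mq_end_request(), because the complete callback is not invoked for them, while successful ones go through blk_mq_complete_request() so that the driver's done handler runs. The same calls, isolated from the surrounding context:

#include <linux/blk-mq.h>

static void sketch_finish_request(struct request *req, blk_status_t error)
{
        if (error) {
                /* the complete callback is bypassed on error, end it here */
                blk_mq_end_request(req, error);
                /* and give queued requests another shot at the hardware */
                blk_mq_run_hw_queues(req->q, true);
        } else {
                /* success: defer to the complete callback */
                blk_mq_complete_request(req);
        }
}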
*/ __dasd_block_start_head(block); - spin_unlock(&block->queue_lock); - spin_unlock_irq(&block->request_queue_lock); + spin_unlock_irq(&block->queue_lock); + if (waitqueue_active(&shutdown_waitq)) wake_up(&shutdown_waitq); dasd_put_device(block->base); @@ -2977,14 +2888,13 @@ static int _dasd_requeue_request(struct dasd_ccw_req *cqr) { struct dasd_block *block = cqr->block; struct request *req; - unsigned long flags; if (!block) return -EINVAL; - spin_lock_irqsave(&block->request_queue_lock, flags); + spin_lock_irq(&cqr->dq->lock); req = (struct request *) cqr->callback_data; - blk_requeue_request(block->request_queue, req); - spin_unlock_irqrestore(&block->request_queue_lock, flags); + blk_mq_requeue_request(req, false); + spin_unlock_irq(&cqr->dq->lock); return 0; } @@ -2999,6 +2909,7 @@ static int dasd_flush_block_queue(struct dasd_block *block) struct dasd_ccw_req *cqr, *n; int rc, i; struct list_head flush_queue; + unsigned long flags; INIT_LIST_HEAD(&flush_queue); spin_lock_bh(&block->queue_lock); @@ -3037,11 +2948,11 @@ restart_cb: goto restart_cb; } /* call the callback function */ - spin_lock_irq(&block->request_queue_lock); + spin_lock_irqsave(&cqr->dq->lock, flags); cqr->endclk = get_tod_clock(); list_del_init(&cqr->blocklist); __dasd_cleanup_cqr(cqr); - spin_unlock_irq(&block->request_queue_lock); + spin_unlock_irqrestore(&cqr->dq->lock, flags); } return rc; } @@ -3069,42 +2980,114 @@ EXPORT_SYMBOL(dasd_schedule_block_bh); /* * Dasd request queue function. Called from ll_rw_blk.c */ -static void do_dasd_request(struct request_queue *queue) +static blk_status_t do_dasd_request(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *qd) { - struct dasd_block *block; + struct dasd_block *block = hctx->queue->queuedata; + struct dasd_queue *dq = hctx->driver_data; + struct request *req = qd->rq; + struct dasd_device *basedev; + struct dasd_ccw_req *cqr; + blk_status_t rc = BLK_STS_OK; + + basedev = block->base; + spin_lock_irq(&dq->lock); + if (basedev->state < DASD_STATE_READY) { + DBF_DEV_EVENT(DBF_ERR, basedev, + "device not ready for request %p", req); + rc = BLK_STS_IOERR; + goto out; + } + + /* + * if device is stopped do not fetch new requests + * except failfast is active which will let requests fail + * immediately in __dasd_block_start_head() + */ + if (basedev->stopped && !(basedev->features & DASD_FEATURE_FAILFAST)) { + DBF_DEV_EVENT(DBF_ERR, basedev, + "device stopped request %p", req); + rc = BLK_STS_RESOURCE; + goto out; + } + + if (basedev->features & DASD_FEATURE_READONLY && + rq_data_dir(req) == WRITE) { + DBF_DEV_EVENT(DBF_ERR, basedev, + "Rejecting write request %p", req); + rc = BLK_STS_IOERR; + goto out; + } - block = queue->queuedata; + if (test_bit(DASD_FLAG_ABORTALL, &basedev->flags) && + (basedev->features & DASD_FEATURE_FAILFAST || + blk_noretry_request(req))) { + DBF_DEV_EVENT(DBF_ERR, basedev, + "Rejecting failfast request %p", req); + rc = BLK_STS_IOERR; + goto out; + } + + cqr = basedev->discipline->build_cp(basedev, block, req); + if (IS_ERR(cqr)) { + if (PTR_ERR(cqr) == -EBUSY || + PTR_ERR(cqr) == -ENOMEM || + PTR_ERR(cqr) == -EAGAIN) { + rc = BLK_STS_RESOURCE; + goto out; + } + DBF_DEV_EVENT(DBF_ERR, basedev, + "CCW creation failed (rc=%ld) on request %p", + PTR_ERR(cqr), req); + rc = BLK_STS_IOERR; + goto out; + } + /* + * Note: callback is set to dasd_return_cqr_cb in + * __dasd_block_start_head to cover erp requests as well + */ + cqr->callback_data = req; + cqr->status = DASD_CQR_FILLED; + cqr->dq = dq; + req->completion_data = cqr; + 
blk_mq_start_request(req); spin_lock(&block->queue_lock); - /* Get new request from the block device request queue */ - __dasd_process_request_queue(block); - /* Now check if the head of the ccw queue needs to be started. */ - __dasd_block_start_head(block); + list_add_tail(&cqr->blocklist, &block->ccw_queue); + INIT_LIST_HEAD(&cqr->devlist); + dasd_profile_start(block, cqr, req); + dasd_schedule_block_bh(block); spin_unlock(&block->queue_lock); + +out: + spin_unlock_irq(&dq->lock); + return rc; } /* * Block timeout callback, called from the block layer * - * request_queue lock is held on entry. - * * Return values: * BLK_EH_RESET_TIMER if the request should be left running * BLK_EH_NOT_HANDLED if the request is handled or terminated * by the driver. */ -enum blk_eh_timer_return dasd_times_out(struct request *req) +enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved) { struct dasd_ccw_req *cqr = req->completion_data; struct dasd_block *block = req->q->queuedata; struct dasd_device *device; + unsigned long flags; int rc = 0; if (!cqr) return BLK_EH_NOT_HANDLED; + spin_lock_irqsave(&cqr->dq->lock, flags); device = cqr->startdev ? cqr->startdev : block->base; - if (!device->blk_timeout) + if (!device->blk_timeout) { + spin_unlock_irqrestore(&cqr->dq->lock, flags); return BLK_EH_RESET_TIMER; + } DBF_DEV_EVENT(DBF_WARNING, device, " dasd_times_out cqr %p status %x", cqr, cqr->status); @@ -3154,19 +3137,64 @@ enum blk_eh_timer_return dasd_times_out(struct request *req) } dasd_schedule_block_bh(block); spin_unlock(&block->queue_lock); + spin_unlock_irqrestore(&cqr->dq->lock, flags); return rc ? BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED; } +static int dasd_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int idx) +{ + struct dasd_queue *dq = kzalloc(sizeof(*dq), GFP_KERNEL); + + if (!dq) + return -ENOMEM; + + spin_lock_init(&dq->lock); + hctx->driver_data = dq; + + return 0; +} + +static void dasd_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int idx) +{ + kfree(hctx->driver_data); + hctx->driver_data = NULL; +} + +static void dasd_request_done(struct request *req) +{ + blk_mq_end_request(req, 0); + blk_mq_run_hw_queues(req->q, true); +} + +static struct blk_mq_ops dasd_mq_ops = { + .queue_rq = do_dasd_request, + .complete = dasd_request_done, + .timeout = dasd_times_out, + .init_hctx = dasd_init_hctx, + .exit_hctx = dasd_exit_hctx, +}; + /* * Allocate and initialize request queue and default I/O scheduler. */ static int dasd_alloc_queue(struct dasd_block *block) { - block->request_queue = blk_init_queue(do_dasd_request, - &block->request_queue_lock); - if (block->request_queue == NULL) - return -ENOMEM; + int rc; + + block->tag_set.ops = &dasd_mq_ops; + block->tag_set.nr_hw_queues = DASD_NR_HW_QUEUES; + block->tag_set.queue_depth = DASD_MAX_LCU_DEV * DASD_REQ_PER_DEV; + block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + + rc = blk_mq_alloc_tag_set(&block->tag_set); + if (rc) + return rc; + + block->request_queue = blk_mq_init_queue(&block->tag_set); + if (IS_ERR(block->request_queue)) + return PTR_ERR(block->request_queue); block->request_queue->queuedata = block; @@ -3229,26 +3257,11 @@ static void dasd_free_queue(struct dasd_block *block) { if (block->request_queue) { blk_cleanup_queue(block->request_queue); + blk_mq_free_tag_set(&block->tag_set); block->request_queue = NULL; } } -/* - * Flush request on the request queue. 
- */ -static void dasd_flush_request_queue(struct dasd_block *block) -{ - struct request *req; - - if (!block->request_queue) - return; - - spin_lock_irq(&block->request_queue_lock); - while ((req = blk_fetch_request(block->request_queue))) - __blk_end_request_all(req, BLK_STS_IOERR); - spin_unlock_irq(&block->request_queue_lock); -} - static int dasd_open(struct block_device *bdev, fmode_t mode) { struct dasd_device *base; @@ -3744,8 +3757,10 @@ int dasd_generic_path_operational(struct dasd_device *device) return 1; } dasd_schedule_device_bh(device); - if (device->block) + if (device->block) { dasd_schedule_block_bh(device->block); + blk_mq_run_hw_queues(device->block->request_queue, true); + } if (!device->stopped) wake_up(&generic_waitq); @@ -4008,8 +4023,10 @@ int dasd_generic_restore_device(struct ccw_device *cdev) */ device->stopped |= DASD_UNRESUMED_PM; - if (device->block) + if (device->block) { dasd_schedule_block_bh(device->block); + blk_mq_run_hw_queues(device->block->request_queue, true); + } clear_bit(DASD_FLAG_SUSPENDED, &device->flags); dasd_put_device(device); diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index e38042ce94e6..c95a4784c191 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -1326,7 +1326,7 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr, { struct dasd_device *device; struct request_queue *q; - unsigned long val, flags; + unsigned long val; device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device) || !device->block) @@ -1342,16 +1342,10 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr, dasd_put_device(device); return -ENODEV; } - spin_lock_irqsave(&device->block->request_queue_lock, flags); - if (!val) - blk_queue_rq_timed_out(q, NULL); - else - blk_queue_rq_timed_out(q, dasd_times_out); device->blk_timeout = val; blk_queue_rq_timeout(q, device->blk_timeout * HZ); - spin_unlock_irqrestore(&device->block->request_queue_lock, flags); dasd_put_device(device); return count; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index f9e25fc03d6b..db470bd10175 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -56,6 +56,7 @@ #include <asm/dasd.h> #include <asm/idals.h> #include <linux/bitops.h> +#include <linux/blk-mq.h> /* DASD discipline magic */ #define DASD_ECKD_MAGIC 0xC5C3D2C4 @@ -185,6 +186,7 @@ struct dasd_ccw_req { char status; /* status of this request */ short retries; /* A retry counter */ unsigned long flags; /* flags of this request */ + struct dasd_queue *dq; /* ... and how */ unsigned long starttime; /* jiffies time of request start */ @@ -248,6 +250,16 @@ struct dasd_ccw_req { #define DASD_CQR_SUPPRESS_IL 6 /* Suppress 'Incorrect Length' error */ #define DASD_CQR_SUPPRESS_CR 7 /* Suppress 'Command Reject' error */ +/* + * There is no reliable way to determine the number of available CPUs on + * LPAR but there is no big performance difference between 1 and the + * maximum CPU number. + * 64 is a good trade off performance wise. + */ +#define DASD_NR_HW_QUEUES 64 +#define DASD_MAX_LCU_DEV 256 +#define DASD_REQ_PER_DEV 4 + /* Signature for error recovery functions. 
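The sizing constants introduced just above are worth spelling out: each DASD block device gets a tag set with 64 hardware queues and a shared depth of DASD_MAX_LCU_DEV * DASD_REQ_PER_DEV = 256 * 4 = 1024 tags, which is how dasd_alloc_queue() earlier in this patch fills in the set. A condensed restatement of that setup:

#include <linux/blk-mq.h>

#define SKETCH_NR_HW_QUEUES     64              /* DASD_NR_HW_QUEUES */
#define SKETCH_QUEUE_DEPTH      (256 * 4)       /* 256 devs/LCU * 4 reqs/dev */

static int sketch_alloc_tags(struct blk_mq_tag_set *set,
                             const struct blk_mq_ops *ops)
{
        set->ops = ops;
        set->nr_hw_queues = SKETCH_NR_HW_QUEUES;
        set->queue_depth = SKETCH_QUEUE_DEPTH;
        set->flags = BLK_MQ_F_SHOULD_MERGE;

        return blk_mq_alloc_tag_set(set);
}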
*/ typedef struct dasd_ccw_req *(*dasd_erp_fn_t) (struct dasd_ccw_req *); @@ -539,6 +551,7 @@ struct dasd_block { struct gendisk *gdp; struct request_queue *request_queue; spinlock_t request_queue_lock; + struct blk_mq_tag_set tag_set; struct block_device *bdev; atomic_t open_count; @@ -563,6 +576,10 @@ struct dasd_attention_data { __u8 lpum; }; +struct dasd_queue { + spinlock_t lock; +}; + /* reasons why device (ccw_device_start) was stopped */ #define DASD_STOPPED_NOT_ACC 1 /* not accessible */ #define DASD_STOPPED_QUIESCE 2 /* Quiesced */ @@ -731,7 +748,7 @@ void dasd_free_device(struct dasd_device *); struct dasd_block *dasd_alloc_block(void); void dasd_free_block(struct dasd_block *); -enum blk_eh_timer_return dasd_times_out(struct request *req); +enum blk_eh_timer_return dasd_times_out(struct request *req, bool reserved); void dasd_enable_device(struct dasd_device *); void dasd_set_target_state(struct dasd_device *, int); diff --git a/drivers/s390/crypto/ap_asm.h b/drivers/s390/crypto/ap_asm.h index 287b4ad0999e..cd350345b3d2 100644 --- a/drivers/s390/crypto/ap_asm.h +++ b/drivers/s390/crypto/ap_asm.h @@ -69,16 +69,19 @@ static inline struct ap_queue_status ap_rapq(ap_qid_t qid) } /** - * ap_aqic(): Enable interruption for a specific AP. + * ap_aqic(): Control interruption for a specific AP. * @qid: The AP queue number + * @qirqctrl: struct ap_qirq_ctrl (64 bit value) * @ind: The notification indicator byte * * Returns AP queue status. */ -static inline struct ap_queue_status ap_aqic(ap_qid_t qid, void *ind) +static inline struct ap_queue_status ap_aqic(ap_qid_t qid, + struct ap_qirq_ctrl qirqctrl, + void *ind) { register unsigned long reg0 asm ("0") = qid | (3UL << 24); - register unsigned long reg1_in asm ("1") = (8UL << 44) | AP_ISC; + register struct ap_qirq_ctrl reg1_in asm ("1") = qirqctrl; register struct ap_queue_status reg1_out asm ("1"); register void *reg2 asm ("2") = ind; diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index 6dee598979e7..5f0be2040272 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -166,26 +166,51 @@ static int ap_configuration_available(void) } /** + * ap_apft_available(): Test if AP facilities test (APFT) + * facility is available. + * + * Returns 1 if APFT is available. + */ +static int ap_apft_available(void) +{ + return test_facility(15); +} + +/** * ap_test_queue(): Test adjunct processor queue. * @qid: The AP queue number + * @tbit: Test facilities bit * @info: Pointer to queue descriptor * * Returns AP queue status structure. */ -static inline struct ap_queue_status -ap_test_queue(ap_qid_t qid, unsigned long *info) +struct ap_queue_status ap_test_queue(ap_qid_t qid, + int tbit, + unsigned long *info) { - if (test_facility(15)) - qid |= 1UL << 23; /* set APFT T bit*/ + if (tbit) + qid |= 1UL << 23; /* set T bit*/ return ap_tapq(qid, info); } +EXPORT_SYMBOL(ap_test_queue); -static inline int ap_query_configuration(void) +/* + * ap_query_configuration(): Fetch cryptographic config info + * + * Returns the ap configuration info fetched via PQAP(QCI). + * On success 0 is returned, on failure a negative errno + * is returned, e.g. if the PQAP(QCI) instruction is not + * available, the return value will be -EOPNOTSUPP. 
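Since ap_query_configuration() is now exported with an explicit buffer argument (its definition closes just below), external modules can query the masks themselves. A hypothetical caller, assuming struct ap_config_info as moved into asm/ap.h by this series:

#include <asm/ap.h>

static int sketch_probe_ap_config(void)
{
        struct ap_config_info info;
        int rc;

        rc = ap_query_configuration(&info);
        if (rc) /* -EOPNOTSUPP without PQAP(QCI), -EINVAL for a NULL buffer */
                return rc;

        /* info.apm, info.aqm and info.adm now hold the AP/queue/domain masks */
        return 0;
}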
+ */ +int ap_query_configuration(struct ap_config_info *info) { - if (!ap_configuration) + if (!ap_configuration_available()) return -EOPNOTSUPP; - return ap_qci(ap_configuration); + if (!info) + return -EINVAL; + return ap_qci(info); } +EXPORT_SYMBOL(ap_query_configuration); /** * ap_init_configuration(): Allocate and query configuration array. @@ -198,7 +223,7 @@ static void ap_init_configuration(void) ap_configuration = kzalloc(sizeof(*ap_configuration), GFP_KERNEL); if (!ap_configuration) return; - if (ap_query_configuration() != 0) { + if (ap_query_configuration(ap_configuration) != 0) { kfree(ap_configuration); ap_configuration = NULL; return; @@ -261,7 +286,7 @@ static int ap_query_queue(ap_qid_t qid, int *queue_depth, int *device_type, if (!ap_test_config_card_id(AP_QID_CARD(qid))) return -ENODEV; - status = ap_test_queue(qid, &info); + status = ap_test_queue(qid, ap_apft_available(), &info); switch (status.response_code) { case AP_RESPONSE_NORMAL: *queue_depth = (int)(info & 0xff); @@ -940,7 +965,9 @@ static int ap_select_domain(void) for (j = 0; j < AP_DEVICES; j++) { if (!ap_test_config_card_id(j)) continue; - status = ap_test_queue(AP_MKQID(j, i), NULL); + status = ap_test_queue(AP_MKQID(j, i), + ap_apft_available(), + NULL); if (status.response_code != AP_RESPONSE_NORMAL) continue; count++; @@ -993,7 +1020,7 @@ static void ap_scan_bus(struct work_struct *unused) AP_DBF(DBF_DEBUG, "ap_scan_bus running\n"); - ap_query_configuration(); + ap_query_configuration(ap_configuration); if (ap_select_domain() != 0) goto out; diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h index 4dc7c88fb054..754cf2223cfb 100644 --- a/drivers/s390/crypto/ap_bus.h +++ b/drivers/s390/crypto/ap_bus.h @@ -28,6 +28,7 @@ #include <linux/device.h> #include <linux/types.h> +#include <asm/ap.h> #define AP_DEVICES 64 /* Number of AP devices. */ #define AP_DOMAINS 256 /* Number of AP domains. */ @@ -40,41 +41,6 @@ extern int ap_domain_index; extern spinlock_t ap_list_lock; extern struct list_head ap_card_list; -/** - * The ap_qid_t identifier of an ap queue. It contains a - * 6 bit card index and a 4 bit queue index (domain). - */ -typedef unsigned int ap_qid_t; - -#define AP_MKQID(_card, _queue) (((_card) & 63) << 8 | ((_queue) & 255)) -#define AP_QID_CARD(_qid) (((_qid) >> 8) & 63) -#define AP_QID_QUEUE(_qid) ((_qid) & 255) - -/** - * structy ap_queue_status - Holds the AP queue status. - * @queue_empty: Shows if queue is empty - * @replies_waiting: Waiting replies - * @queue_full: Is 1 if the queue is full - * @pad: A 4 bit pad - * @int_enabled: Shows if interrupts are enabled for the AP - * @response_code: Holds the 8 bit response code - * @pad2: A 16 bit pad - * - * The ap queue status word is returned by all three AP functions - * (PQAP, NQAP and DQAP). There's a set of flags in the first - * byte, followed by a 1 byte response code. 
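A note on the mask helper kept just below: ap_test_bit() numbers bits from the most significant end, which matches how the apm/aqm/adm mask words are laid out. A worked example of that MSB-first convention, using a standalone copy of the helper:

static inline int sketch_msb_test_bit(unsigned int *ptr, unsigned int nr)
{
        return (*ptr & (0x80000000u >> nr)) != 0;
}

/*
 * With *ptr == 0x10000000:
 *   sketch_msb_test_bit(ptr, 3)  == 1  (bit 3 counted from the MSB)
 *   sketch_msb_test_bit(ptr, 28) == 0  (the usual LSB numbering would say 1)
 */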
- */ -struct ap_queue_status { - unsigned int queue_empty : 1; - unsigned int replies_waiting : 1; - unsigned int queue_full : 1; - unsigned int pad1 : 4; - unsigned int int_enabled : 1; - unsigned int response_code : 8; - unsigned int pad2 : 16; -} __packed; - - static inline int ap_test_bit(unsigned int *ptr, unsigned int nr) { return (*ptr & (0x80000000u >> nr)) != 0; } @@ -238,17 +204,6 @@ struct ap_message { struct ap_message *); }; -struct ap_config_info { - unsigned int special_command:1; - unsigned int ap_extended:1; - unsigned char reserved1:6; - unsigned char reserved2[15]; - unsigned int apm[8]; /* AP ID mask */ - unsigned int aqm[8]; /* AP queue mask */ - unsigned int adm[8]; /* AP domain mask */ - unsigned char reserved4[16]; -} __packed; - /** * ap_init_message() - Initialize ap_message. * Initialize a message before using. Otherwise this might result in diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c index 0f1a5d02acb0..56b96edffd5b 100644 --- a/drivers/s390/crypto/ap_queue.c +++ b/drivers/s390/crypto/ap_queue.c @@ -16,6 +16,25 @@ #include "ap_asm.h" /** + * ap_queue_irq_ctrl(): Control interruption on an AP queue. + * @qirqctrl: struct ap_qirq_ctrl (64 bit value) + * @ind: The notification indicator byte + * + * Returns AP queue status. + * + * Control interruption on the given AP queue. + * Just a simple wrapper function for the low level PQAP(AQIC) + * instruction available for other kernel modules. + */ +struct ap_queue_status ap_queue_irq_ctrl(ap_qid_t qid, + struct ap_qirq_ctrl qirqctrl, + void *ind) +{ + return ap_aqic(qid, qirqctrl, ind); +} +EXPORT_SYMBOL(ap_queue_irq_ctrl); + +/** * ap_queue_enable_interruption(): Enable interruption on an AP queue. * @qid: The AP queue number * @ind: the notification indicator byte @@ -27,8 +46,11 @@ static int ap_queue_enable_interruption(struct ap_queue *aq, void *ind) { struct ap_queue_status status; + struct ap_qirq_ctrl qirqctrl = { 0 }; - status = ap_aqic(aq->qid, ind); + qirqctrl.ir = 1; + qirqctrl.isc = AP_ISC; + status = ap_aqic(aq->qid, qirqctrl, ind); switch (status.response_code) { case AP_RESPONSE_NORMAL: case AP_RESPONSE_OTHERWISE_CHANGED: @@ -362,7 +384,7 @@ static enum ap_wait ap_sm_setirq_wait(struct ap_queue *aq) /* Get the status with TAPQ */ status = ap_tapq(aq->qid, NULL); - if (status.int_enabled == 1) { + if (status.irq_enabled == 1) { /* Irqs are now enabled */ aq->interrupt = AP_INTR_ENABLED; aq->state = (aq->queue_count > 0) ? 
diff --git a/drivers/scsi/NCR_Q720.c b/drivers/scsi/NCR_Q720.c index 05835bf1bf9c..54e7d26908ee 100644 --- a/drivers/scsi/NCR_Q720.c +++ b/drivers/scsi/NCR_Q720.c @@ -217,8 +217,7 @@ NCR_Q720_probe(struct device *dev) } if (dma_declare_coherent_memory(dev, base_addr, base_addr, - mem_size, DMA_MEMORY_MAP) - != DMA_MEMORY_MAP) { + mem_size, 0)) { printk(KERN_ERR "NCR_Q720: DMA declare memory failed\n"); goto out_release_region; } diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index b5b5facb8747..07002df4f83a 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -342,7 +342,7 @@ config X86_PKG_TEMP_THERMAL config INTEL_SOC_DTS_IOSF_CORE tristate - depends on X86 + depends on X86 && PCI select IOSF_MBI help This is becoming a common feature for Intel SoCs to expose the additional @@ -352,7 +352,7 @@ config INTEL_SOC_DTS_IOSF_CORE config INTEL_SOC_DTS_THERMAL tristate "Intel SoCs DTS thermal driver" - depends on X86 + depends on X86 && PCI select INTEL_SOC_DTS_IOSF_CORE select THERMAL_WRITABLE_TRIPS help @@ -473,4 +473,12 @@ config ZX2967_THERMAL the primitive temperature sensor embedded in zx2967 SoCs. This sensor generates the real time die temperature. +config UNIPHIER_THERMAL + tristate "Socionext UniPhier thermal driver" + depends on ARCH_UNIPHIER || COMPILE_TEST + depends on THERMAL_OF && MFD_SYSCON + help + Enable this to plug in UniPhier on-chip PVT thermal driver into the + thermal framework. The driver supports CPU thermal zone temperature + reporting and a couple of trip points. endif diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 094d7039981c..8b79bca23536 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -59,3 +59,4 @@ obj-$(CONFIG_HISI_THERMAL) += hisi_thermal.o obj-$(CONFIG_MTK_THERMAL) += mtk_thermal.o obj-$(CONFIG_GENERIC_ADC_THERMAL) += thermal-generic-adc.o obj-$(CONFIG_ZX2967_THERMAL) += zx2967_thermal.o +obj-$(CONFIG_UNIPHIER_THERMAL) += uniphier_thermal.o diff --git a/drivers/thermal/broadcom/bcm2835_thermal.c b/drivers/thermal/broadcom/bcm2835_thermal.c index e6863c841662..a4d6a0e2e993 100644 --- a/drivers/thermal/broadcom/bcm2835_thermal.c +++ b/drivers/thermal/broadcom/bcm2835_thermal.c @@ -145,7 +145,7 @@ static void bcm2835_thermal_debugfs(struct platform_device *pdev) debugfs_create_regset32("regset", 0444, data->debugfsdir, regset); } -static struct thermal_zone_of_device_ops bcm2835_thermal_ops = { +static const struct thermal_zone_of_device_ops bcm2835_thermal_ops = { .get_temp = bcm2835_thermal_get_temp, }; diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c index 9c3ce341eb97..bd3572c41585 100644 --- a/drivers/thermal/hisi_thermal.c +++ b/drivers/thermal/hisi_thermal.c @@ -206,7 +206,7 @@ static int hisi_thermal_get_temp(void *_sensor, int *temp) return 0; } -static struct thermal_zone_of_device_ops hisi_of_thermal_ops = { +static const struct thermal_zone_of_device_ops hisi_of_thermal_ops = { .get_temp = hisi_thermal_get_temp, }; diff --git a/drivers/thermal/int340x_thermal/acpi_thermal_rel.c b/drivers/thermal/int340x_thermal/acpi_thermal_rel.c index 51ceb80212a7..c719167e9f28 100644 --- a/drivers/thermal/int340x_thermal/acpi_thermal_rel.c +++ b/drivers/thermal/int340x_thermal/acpi_thermal_rel.c @@ -228,7 +228,7 @@ static void get_single_name(acpi_handle handle, char *name) struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER}; if (ACPI_FAILURE(acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer))) - pr_warn("Failed get name from handle\n"); + pr_warn("Failed to get 
device name from acpi handle\n"); else { memcpy(name, buffer.pointer, ACPI_NAME_SIZE); kfree(buffer.pointer); diff --git a/drivers/thermal/int340x_thermal/acpi_thermal_rel.h b/drivers/thermal/int340x_thermal/acpi_thermal_rel.h index f00700bc9d79..65075b174329 100644 --- a/drivers/thermal/int340x_thermal/acpi_thermal_rel.h +++ b/drivers/thermal/int340x_thermal/acpi_thermal_rel.h @@ -34,10 +34,10 @@ struct trt { acpi_handle target; u64 influence; u64 sample_period; - u64 reverved1; - u64 reverved2; - u64 reverved3; - u64 reverved4; + u64 reserved1; + u64 reserved2; + u64 reserved3; + u64 reserved4; } __packed; #define ACPI_NR_ART_ELEMENTS 13 diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c index a9ec94ed7a42..8ee38f55c7f3 100644 --- a/drivers/thermal/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/int340x_thermal/int3400_thermal.c @@ -16,6 +16,8 @@ #include <linux/thermal.h> #include "acpi_thermal_rel.h" +#define INT3400_THERMAL_TABLE_CHANGED 0x83 + enum int3400_thermal_uuid { INT3400_THERMAL_PASSIVE_1, INT3400_THERMAL_ACTIVE, @@ -104,7 +106,7 @@ static struct attribute *uuid_attrs[] = { NULL }; -static struct attribute_group uuid_attribute_group = { +static const struct attribute_group uuid_attribute_group = { .attrs = uuid_attrs, .name = "uuids" }; @@ -185,6 +187,35 @@ static int int3400_thermal_run_osc(acpi_handle handle, return result; } +static void int3400_notify(acpi_handle handle, + u32 event, + void *data) +{ + struct int3400_thermal_priv *priv = data; + char *thermal_prop[5]; + + if (!priv) + return; + + switch (event) { + case INT3400_THERMAL_TABLE_CHANGED: + thermal_prop[0] = kasprintf(GFP_KERNEL, "NAME=%s", + priv->thermal->type); + thermal_prop[1] = kasprintf(GFP_KERNEL, "TEMP=%d", + priv->thermal->temperature); + thermal_prop[2] = kasprintf(GFP_KERNEL, "TRIP="); + thermal_prop[3] = kasprintf(GFP_KERNEL, "EVENT=%d", + THERMAL_TABLE_CHANGED); + thermal_prop[4] = NULL; + kobject_uevent_env(&priv->thermal->device.kobj, KOBJ_CHANGE, + thermal_prop); + break; + default: + dev_err(&priv->adev->dev, "Unsupported event [0x%x]\n", event); + break; + } +} + static int int3400_thermal_get_temp(struct thermal_zone_device *thermal, int *temp) { @@ -290,6 +321,12 @@ static int int3400_thermal_probe(struct platform_device *pdev) if (result) goto free_zone; + result = acpi_install_notify_handler( + priv->adev->handle, ACPI_DEVICE_NOTIFY, int3400_notify, + (void *)priv); + if (result) + goto free_zone; + return 0; free_zone: @@ -306,6 +343,10 @@ static int int3400_thermal_remove(struct platform_device *pdev) { struct int3400_thermal_priv *priv = platform_get_drvdata(pdev); + acpi_remove_notify_handler( + priv->adev->handle, ACPI_DEVICE_NOTIFY, + int3400_notify); + if (!priv->rel_misc_dev_res) acpi_thermal_rel_misc_device_remove(priv->adev->handle); diff --git a/drivers/thermal/int340x_thermal/int3406_thermal.c b/drivers/thermal/int340x_thermal/int3406_thermal.c index 1891f34ab7fc..f69ab026ba24 100644 --- a/drivers/thermal/int340x_thermal/int3406_thermal.c +++ b/drivers/thermal/int340x_thermal/int3406_thermal.c @@ -21,39 +21,33 @@ struct int3406_thermal_data { int upper_limit; - int upper_limit_index; int lower_limit; - int lower_limit_index; acpi_handle handle; struct acpi_video_device_brightness *br; struct backlight_device *raw_bd; struct thermal_cooling_device *cooling_dev; }; -static int int3406_thermal_to_raw(int level, struct int3406_thermal_data *d) -{ - int max_level = d->br->levels[d->br->count - 1]; - int 
raw_max = d->raw_bd->props.max_brightness; - - return level * raw_max / max_level; -} - -static int int3406_thermal_to_acpi(int level, struct int3406_thermal_data *d) -{ - int raw_max = d->raw_bd->props.max_brightness; - int max_level = d->br->levels[d->br->count - 1]; - - return level * max_level / raw_max; -} +/* + * According to the ACPI spec, + * "Each brightness level is represented by a number between 0 and 100, + * and can be thought of as a percentage. For example, 50 can be 50% + * power consumption or 50% brightness, as defined by the OEM." + * + * As int3406 device uses this value to communicate with the native + * graphics driver, we make the assumption that it represents + * the percentage of brightness only + */ +#define ACPI_TO_RAW(v, d) (d->raw_bd->props.max_brightness * v / 100) +#define RAW_TO_ACPI(v, d) (v * 100 / d->raw_bd->props.max_brightness) static int int3406_thermal_get_max_state(struct thermal_cooling_device *cooling_dev, unsigned long *state) { struct int3406_thermal_data *d = cooling_dev->devdata; - int index = d->lower_limit_index ? d->lower_limit_index : 2; - *state = d->br->count - 1 - index; + *state = d->upper_limit - d->lower_limit; return 0; } @@ -62,19 +56,15 @@ int3406_thermal_set_cur_state(struct thermal_cooling_device *cooling_dev, unsigned long state) { struct int3406_thermal_data *d = cooling_dev->devdata; - int level, raw_level; + int acpi_level, raw_level; - if (state > d->br->count - 3) + if (state > d->upper_limit - d->lower_limit) return -EINVAL; - state = d->br->count - 1 - state; - level = d->br->levels[state]; + acpi_level = d->br->levels[d->upper_limit - state]; - if ((d->upper_limit && level > d->upper_limit) || - (d->lower_limit && level < d->lower_limit)) - return -EINVAL; + raw_level = ACPI_TO_RAW(acpi_level, d); - raw_level = int3406_thermal_to_raw(level, d); return backlight_device_set_brightness(d->raw_bd, raw_level); } @@ -83,27 +73,22 @@ int3406_thermal_get_cur_state(struct thermal_cooling_device *cooling_dev, unsigned long *state) { struct int3406_thermal_data *d = cooling_dev->devdata; - int raw_level, level, i; - int *levels = d->br->levels; + int acpi_level; + int index; - raw_level = d->raw_bd->props.brightness; - level = int3406_thermal_to_acpi(raw_level, d); + acpi_level = RAW_TO_ACPI(d->raw_bd->props.brightness, d); /* - * There is no 1:1 mapping between the firmware interface level with the - * raw interface level, we will have to find one that is close enough. + * There is no 1:1 mapping between the firmware interface level + * with the raw interface level, we will have to find one that is + * right above it. 
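The ACPI_TO_RAW()/RAW_TO_ACPI() macros above map the firmware's 0..100 percentage onto the backlight's raw range with plain integer division, so round trips can lose a step. A worked example assuming a max_brightness of 255 (an illustrative value, not taken from any particular panel):

#define SKETCH_MAX_BRIGHTNESS   255

#define SKETCH_ACPI_TO_RAW(v)   (SKETCH_MAX_BRIGHTNESS * (v) / 100)
#define SKETCH_RAW_TO_ACPI(v)   ((v) * 100 / SKETCH_MAX_BRIGHTNESS)

/*
 * SKETCH_ACPI_TO_RAW(50)  == 127  (50% of 0..255, truncated)
 * SKETCH_RAW_TO_ACPI(127) == 49   (truncation makes the round trip lossy)
 */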
*/ - for (i = 2; i < d->br->count; i++) { - if (level < levels[i]) { - if (i == 2) - break; - if ((level - levels[i - 1]) < (levels[i] - level)) - i--; + for (index = d->lower_limit; index < d->upper_limit; index++) { + if (acpi_level <= d->br->levels[index]) break; - } } - *state = d->br->count - 1 - i; + *state = d->upper_limit - index; return 0; } @@ -117,7 +102,7 @@ static int int3406_thermal_get_index(int *array, int nr, int value) { int i; - for (i = 0; i < nr; i++) { + for (i = 2; i < nr; i++) { if (array[i] == value) break; } @@ -128,27 +113,20 @@ static void int3406_thermal_get_limit(struct int3406_thermal_data *d) { acpi_status status; unsigned long long lower_limit, upper_limit; - int index; status = acpi_evaluate_integer(d->handle, "DDDL", NULL, &lower_limit); - if (ACPI_SUCCESS(status)) { - index = int3406_thermal_get_index(d->br->levels, d->br->count, - lower_limit); - if (index > 0) { - d->lower_limit = (int)lower_limit; - d->lower_limit_index = index; - } - } + if (ACPI_SUCCESS(status)) + d->lower_limit = int3406_thermal_get_index(d->br->levels, + d->br->count, lower_limit); status = acpi_evaluate_integer(d->handle, "DDPC", NULL, &upper_limit); - if (ACPI_SUCCESS(status)) { - index = int3406_thermal_get_index(d->br->levels, d->br->count, - upper_limit); - if (index > 0) { - d->upper_limit = (int)upper_limit; - d->upper_limit_index = index; - } - } + if (ACPI_SUCCESS(status)) + d->upper_limit = int3406_thermal_get_index(d->br->levels, + d->br->count, upper_limit); + + /* lower_limit and upper_limit should be always set */ + d->lower_limit = d->lower_limit > 0 ? d->lower_limit : 2; + d->upper_limit = d->upper_limit > 0 ? d->upper_limit : d->br->count - 1; } static void int3406_notify(acpi_handle handle, u32 event, void *data) diff --git a/drivers/thermal/int340x_thermal/processor_thermal_device.c b/drivers/thermal/int340x_thermal/processor_thermal_device.c index ff3b36f339e3..f02341f7134d 100644 --- a/drivers/thermal/int340x_thermal/processor_thermal_device.c +++ b/drivers/thermal/int340x_thermal/processor_thermal_device.c @@ -127,7 +127,7 @@ static struct attribute *power_limit_attrs[] = { NULL }; -static struct attribute_group power_limit_attribute_group = { +static const struct attribute_group power_limit_attribute_group = { .attrs = power_limit_attrs, .name = "power_limits" }; diff --git a/drivers/thermal/intel_pch_thermal.c b/drivers/thermal/intel_pch_thermal.c index 2b49e8d0fe9e..c60b1cfcc64e 100644 --- a/drivers/thermal/intel_pch_thermal.c +++ b/drivers/thermal/intel_pch_thermal.c @@ -49,7 +49,7 @@ #define WPT_TSGPEN 0x84 /* General Purpose Event Enables */ /* Wildcat Point-LP PCH Thermal Register bit definitions */ -#define WPT_TEMP_TSR 0x00ff /* Temp TS Reading */ +#define WPT_TEMP_TSR 0x01ff /* Temp TS Reading */ #define WPT_TSC_CPDE 0x01 /* Catastrophic Power-Down Enable */ #define WPT_TSS_TSDSS 0x10 /* Thermal Sensor Dynamic Shutdown Status */ #define WPT_TSS_GPES 0x08 /* GPE status */ @@ -125,7 +125,7 @@ static int pch_wpt_init(struct pch_thermal_device *ptd, int *nr_trips) *nr_trips = 0; /* Check if BIOS has already enabled thermal sensor */ - if (WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS)) { + if (WPT_TSEL_ETS & readb(ptd->hw_base + WPT_TSEL)) { ptd->bios_enabled = true; goto read_trips; } @@ -141,7 +141,7 @@ static int pch_wpt_init(struct pch_thermal_device *ptd, int *nr_trips) } writeb(tsel|WPT_TSEL_ETS, ptd->hw_base + WPT_TSEL); - if (!(WPT_TSS_TSDSS & readb(ptd->hw_base + WPT_TSS))) { + if (!(WPT_TSEL_ETS & readb(ptd->hw_base + WPT_TSEL))) { 
dev_err(&ptd->pdev->dev, "Sensor can't be enabled\n"); return -ENODEV; } @@ -174,9 +174,9 @@ read_trips: static int pch_wpt_get_temp(struct pch_thermal_device *ptd, int *temp) { - u8 wpt_temp; + u16 wpt_temp; - wpt_temp = WPT_TEMP_TSR & readl(ptd->hw_base + WPT_TEMP); + wpt_temp = WPT_TEMP_TSR & readw(ptd->hw_base + WPT_TEMP); /* Resolution of 1/2 degree C and an offset of -50C */ *temp = (wpt_temp * 1000 / 2 - 50000); @@ -387,7 +387,7 @@ static int intel_pch_thermal_resume(struct device *device) return ptd->ops->resume(ptd); } -static struct pci_device_id intel_pch_thermal_id[] = { +static const struct pci_device_id intel_pch_thermal_id[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCH_THERMAL_DID_HSW_1), .driver_data = board_hsw, }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCH_THERMAL_DID_HSW_2), diff --git a/drivers/thermal/mtk_thermal.c b/drivers/thermal/mtk_thermal.c index 7737f14846f9..1e61c09153c9 100644 --- a/drivers/thermal/mtk_thermal.c +++ b/drivers/thermal/mtk_thermal.c @@ -3,6 +3,7 @@ * Author: Hanyi Wu <hanyi.wu@mediatek.com> * Sascha Hauer <s.hauer@pengutronix.de> * Dawei Chien <dawei.chien@mediatek.com> + * Louis Yu <louis.yu@mediatek.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -111,9 +112,10 @@ /* * Layout of the fuses providing the calibration data - * These macros could be used for both MT8173 and MT2701. - * MT8173 has five sensors and need five VTS calibration data, - * and MT2701 has three sensors and need three VTS calibration data. + * These macros could be used for MT8173, MT2701, and MT2712. + * MT8173 has 5 sensors and needs 5 VTS calibration data. + * MT2701 has 3 sensors and needs 3 VTS calibration data. + * MT2712 has 4 sensors and needs 4 VTS calibration data. 
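+ *
+ * As a sketch of how a fuse word is unpacked (bit positions taken
+ * from the macros below, the values are not real calibration
+ * data): with MT8173_CALIB_BUF0_VALID indicating valid fuse data,
+ * MT8173_CALIB_BUF0_DEGC_CALI(buf[0]) extracts bits 6:1 and
+ * MT8173_CALIB_BUF0_O_SLOPE(buf[0]) extracts bits 31:26; the new
+ * MT8173_CALIB_BUF0_O_SLOPE_SIGN() and MT8173_CALIB_BUF1_ID()
+ * bits together decide whether that slope is applied as a
+ * negative value (see mtk_thermal_get_calibration_data()).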
*/ #define MT8173_CALIB_BUF0_VALID BIT(0) #define MT8173_CALIB_BUF1_ADC_GE(x) (((x) >> 22) & 0x3ff) @@ -124,6 +126,8 @@ #define MT8173_CALIB_BUF2_VTS_TSABB(x) (((x) >> 14) & 0x1ff) #define MT8173_CALIB_BUF0_DEGC_CALI(x) (((x) >> 1) & 0x3f) #define MT8173_CALIB_BUF0_O_SLOPE(x) (((x) >> 26) & 0x3f) +#define MT8173_CALIB_BUF0_O_SLOPE_SIGN(x) (((x) >> 7) & 0x1) +#define MT8173_CALIB_BUF1_ID(x) (((x) >> 9) & 0x1) /* MT2701 thermal sensors */ #define MT2701_TS1 0 @@ -136,11 +140,26 @@ /* The total number of temperature sensors in the MT2701 */ #define MT2701_NUM_SENSORS 3 -#define THERMAL_NAME "mtk-thermal" - /* The number of sensing points per bank */ #define MT2701_NUM_SENSORS_PER_ZONE 3 +/* MT2712 thermal sensors */ +#define MT2712_TS1 0 +#define MT2712_TS2 1 +#define MT2712_TS3 2 +#define MT2712_TS4 3 + +/* AUXADC channel 11 is used for the temperature sensors */ +#define MT2712_TEMP_AUXADC_CHANNEL 11 + +/* The total number of temperature sensors in the MT2712 */ +#define MT2712_NUM_SENSORS 4 + +/* The number of sensing points per bank */ +#define MT2712_NUM_SENSORS_PER_ZONE 4 + +#define THERMAL_NAME "mtk-thermal" + struct mtk_thermal; struct thermal_bank_cfg { @@ -215,6 +234,21 @@ static const int mt2701_adcpnp[MT2701_NUM_SENSORS_PER_ZONE] = { static const int mt2701_mux_values[MT2701_NUM_SENSORS] = { 0, 1, 16 }; +/* MT2712 thermal sensor data */ +static const int mt2712_bank_data[MT2712_NUM_SENSORS] = { + MT2712_TS1, MT2712_TS2, MT2712_TS3, MT2712_TS4 +}; + +static const int mt2712_msr[MT2712_NUM_SENSORS_PER_ZONE] = { + TEMP_MSR0, TEMP_MSR1, TEMP_MSR2, TEMP_MSR3 +}; + +static const int mt2712_adcpnp[MT2712_NUM_SENSORS_PER_ZONE] = { + TEMP_ADCPNP0, TEMP_ADCPNP1, TEMP_ADCPNP2, TEMP_ADCPNP3 +}; + +static const int mt2712_mux_values[MT2712_NUM_SENSORS] = { 0, 1, 2, 3 }; + /** * The MT8173 thermal controller has four banks. Each bank can read up to * four temperature sensors simultaneously. The MT8173 has a total of 5 @@ -278,6 +312,31 @@ static const struct mtk_thermal_data mt2701_thermal_data = { }; /** + * The MT2712 thermal controller has one bank, which can read up to + * four temperature sensors simultaneously. The MT2712 has a total of 4 + * temperature sensors. + * + * The thermal core only gets the maximum temperature of this one bank, + * so the bank concept wouldn't be necessary here. However, the SVS (Smart + * Voltage Scaling) unit makes its decisions based on the same bank + * data. 
+ */ +static const struct mtk_thermal_data mt2712_thermal_data = { + .auxadc_channel = MT2712_TEMP_AUXADC_CHANNEL, + .num_banks = 1, + .num_sensors = MT2712_NUM_SENSORS, + .bank_data = { + { + .num_sensors = 4, + .sensors = mt2712_bank_data, + }, + }, + .msr = mt2712_msr, + .adcpnp = mt2712_adcpnp, + .sensor_mux_values = mt2712_mux_values, +}; + +/** * raw_to_mcelsius - convert a raw ADC value to mcelsius * @mt: The thermal controller * @raw: raw ADC value @@ -552,7 +611,11 @@ static int mtk_thermal_get_calibration_data(struct device *dev, mt->vts[MT8173_TS4] = MT8173_CALIB_BUF2_VTS_TS4(buf[2]); mt->vts[MT8173_TSABB] = MT8173_CALIB_BUF2_VTS_TSABB(buf[2]); mt->degc_cali = MT8173_CALIB_BUF0_DEGC_CALI(buf[0]); - mt->o_slope = MT8173_CALIB_BUF0_O_SLOPE(buf[0]); + if (MT8173_CALIB_BUF1_ID(buf[1]) & + MT8173_CALIB_BUF0_O_SLOPE_SIGN(buf[0])) + mt->o_slope = -MT8173_CALIB_BUF0_O_SLOPE(buf[0]); + else + mt->o_slope = MT8173_CALIB_BUF0_O_SLOPE(buf[0]); } else { dev_info(dev, "Device not calibrated, using default calibration values\n"); } @@ -571,6 +634,10 @@ static const struct of_device_id mtk_thermal_of_match[] = { { .compatible = "mediatek,mt2701-thermal", .data = (void *)&mt2701_thermal_data, + }, + { + .compatible = "mediatek,mt2712-thermal", + .data = (void *)&mt2712_thermal_data, }, { }, }; @@ -645,16 +712,16 @@ static int mtk_thermal_probe(struct platform_device *pdev) return -EINVAL; } + ret = device_reset(&pdev->dev); + if (ret) + return ret; + ret = clk_prepare_enable(mt->clk_auxadc); if (ret) { dev_err(&pdev->dev, "Can't enable auxadc clk: %d\n", ret); return ret; } - ret = device_reset(&pdev->dev); - if (ret) - goto err_disable_clk_auxadc; - ret = clk_prepare_enable(mt->clk_peri_therm); if (ret) { dev_err(&pdev->dev, "Can't enable peri clk: %d\n", ret); @@ -705,6 +772,7 @@ static struct platform_driver mtk_thermal_driver = { module_platform_driver(mtk_thermal_driver); +MODULE_AUTHOR("Louis Yu <louis.yu@mediatek.com>"); MODULE_AUTHOR("Dawei Chien <dawei.chien@mediatek.com>"); MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>"); MODULE_AUTHOR("Hanyi Wu <hanyi.wu@mediatek.com>"); diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c index 4362a69ac88d..c866cc165960 100644 --- a/drivers/thermal/qoriq_thermal.c +++ b/drivers/thermal/qoriq_thermal.c @@ -188,7 +188,7 @@ static void qoriq_tmu_init_device(struct qoriq_tmu_data *data) tmu_write(data, TMR_DISABLE, &data->regs->tmr); } -static struct thermal_zone_of_device_ops tmu_tz_ops = { +static const struct thermal_zone_of_device_ops tmu_tz_ops = { .get_temp = tmu_get_temp, }; diff --git a/drivers/thermal/rcar_gen3_thermal.c b/drivers/thermal/rcar_gen3_thermal.c index 37fcefd06d9f..203aca44a2bb 100644 --- a/drivers/thermal/rcar_gen3_thermal.c +++ b/drivers/thermal/rcar_gen3_thermal.c @@ -225,7 +225,7 @@ static int rcar_gen3_thermal_set_trips(void *devdata, int low, int high) return 0; } -static struct thermal_zone_of_device_ops rcar_gen3_tz_of_ops = { +static const struct thermal_zone_of_device_ops rcar_gen3_tz_of_ops = { .get_temp = rcar_gen3_thermal_get_temp, .set_trips = rcar_gen3_thermal_set_trips, }; diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index 4c7796512453..206035139110 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -320,6 +320,44 @@ static const struct tsadc_table rk3288_code_table[] = { {0, 125000}, }; +static const struct tsadc_table rk3328_code_table[] = { + {0, -40000}, + {296, -40000}, + {304, -35000}, + {313, 
-30000}, + {322, -25000}, + {331, -20000}, + {340, -15000}, + {349, -10000}, + {359, -5000}, + {368, 0}, + {378, 5000}, + {388, 10000}, + {398, 15000}, + {408, 20000}, + {418, 25000}, + {429, 30000}, + {440, 35000}, + {451, 40000}, + {462, 45000}, + {473, 50000}, + {485, 55000}, + {496, 60000}, + {508, 65000}, + {521, 70000}, + {533, 75000}, + {546, 80000}, + {559, 85000}, + {572, 90000}, + {586, 95000}, + {600, 100000}, + {614, 105000}, + {629, 110000}, + {644, 115000}, + {659, 120000}, + {675, 125000}, + {TSADCV2_DATA_MASK, 125000}, +}; + static const struct tsadc_table rk3368_code_table[] = { {0, -40000}, {106, -40000}, @@ -790,6 +828,29 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = { }, }; +static const struct rockchip_tsadc_chip rk3328_tsadc_data = { + .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */ + .chn_num = 1, /* one channel for tsadc */ + + .tshut_mode = TSHUT_MODE_CRU, /* default TSHUT via CRU */ + .tshut_temp = 95000, + + .initialize = rk_tsadcv2_initialize, + .irq_ack = rk_tsadcv3_irq_ack, + .control = rk_tsadcv3_control, + .get_temp = rk_tsadcv2_get_temp, + .set_alarm_temp = rk_tsadcv2_alarm_temp, + .set_tshut_temp = rk_tsadcv2_tshut_temp, + .set_tshut_mode = rk_tsadcv2_tshut_mode, + + .table = { + .id = rk3328_code_table, + .length = ARRAY_SIZE(rk3328_code_table), + .data_mask = TSADCV2_DATA_MASK, + .mode = ADC_INCREMENT, + }, +}; + static const struct rockchip_tsadc_chip rk3366_tsadc_data = { .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */ .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */ @@ -875,6 +936,10 @@ static const struct of_device_id of_rockchip_thermal_match[] = { .data = (void *)&rk3288_tsadc_data, }, { + .compatible = "rockchip,rk3328-tsadc", + .data = (void *)&rk3328_tsadc_data, + }, + { .compatible = "rockchip,rk3366-tsadc", .data = (void *)&rk3366_tsadc_data, }, diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c index 7b8ef09d2b3c..ed805c7c5ace 100644 --- a/drivers/thermal/samsung/exynos_tmu.c +++ b/drivers/thermal/samsung/exynos_tmu.c @@ -1286,7 +1286,7 @@ static int exynos_map_dt_data(struct platform_device *pdev) return 0; } -static struct thermal_zone_of_device_ops exynos_sensor_ops = { +static const struct thermal_zone_of_device_ops exynos_sensor_ops = { .get_temp = exynos_get_temp, .set_emul_temp = exynos_tmu_set_emulation, }; diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 5a51c740e372..2b1b0ba393a4 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -390,7 +390,7 @@ static void handle_critical_trips(struct thermal_zone_device *tz, if (trip_type == THERMAL_TRIP_CRITICAL) { dev_emerg(&tz->device, - "critical temperature reached(%d C),shutting down\n", + "critical temperature reached (%d C), shutting down\n", tz->temperature / 1000); mutex_lock(&poweroff_lock); if (!power_off_triggered) { @@ -836,11 +836,7 @@ static void thermal_release(struct device *dev) if (!strncmp(dev_name(dev), "thermal_zone", sizeof("thermal_zone") - 1)) { tz = to_thermal_zone(dev); - kfree(tz->trip_type_attrs); - kfree(tz->trip_temp_attrs); - kfree(tz->trip_hyst_attrs); - kfree(tz->trips_attribute_group.attrs); - kfree(tz->device.groups); + thermal_zone_destroy_device_groups(tz); kfree(tz); } else if (!strncmp(dev_name(dev), "cooling_device", sizeof("cooling_device") - 1)) { @@ -1213,10 +1209,8 @@ thermal_zone_device_register(const char *type, int trips, int mask, ida_init(&tz->ida); mutex_init(&tz->lock); result = ida_simple_get(&thermal_tz_ida, 0, 0,
GFP_KERNEL); - if (result < 0) { - kfree(tz); - return ERR_PTR(result); - } + if (result < 0) + goto free_tz; tz->id = result; strlcpy(tz->type, type, sizeof(tz->type)); @@ -1232,18 +1226,15 @@ thermal_zone_device_register(const char *type, int trips, int mask, /* Add nodes that are always present via .groups */ result = thermal_zone_create_device_groups(tz, mask); if (result) - goto unregister; + goto remove_id; /* A new thermal zone needs to be updated anyway. */ atomic_set(&tz->need_update, 1); dev_set_name(&tz->device, "thermal_zone%d", tz->id); result = device_register(&tz->device); - if (result) { - ida_simple_remove(&thermal_tz_ida, tz->id); - kfree(tz); - return ERR_PTR(result); - } + if (result) + goto remove_device_groups; for (count = 0; count < trips; count++) { if (tz->ops->get_trip_type(tz, count, &trip_type)) @@ -1297,6 +1288,14 @@ unregister: ida_simple_remove(&thermal_tz_ida, tz->id); device_unregister(&tz->device); return ERR_PTR(result); + +remove_device_groups: + thermal_zone_destroy_device_groups(tz); +remove_id: + ida_simple_remove(&thermal_tz_ida, tz->id); +free_tz: + kfree(tz); + return ERR_PTR(result); } EXPORT_SYMBOL_GPL(thermal_zone_device_register); diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 2412b3759e16..27e3b1df7360 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -71,6 +71,7 @@ int thermal_build_list_of_policies(char *buf); /* sysfs I/F */ int thermal_zone_create_device_groups(struct thermal_zone_device *, int); +void thermal_zone_destroy_device_groups(struct thermal_zone_device *); void thermal_cooling_device_setup_sysfs(struct thermal_cooling_device *); /* used only at binding time */ ssize_t diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c index a694de907a26..fb80c96d8f73 100644 --- a/drivers/thermal/thermal_sysfs.c +++ b/drivers/thermal/thermal_sysfs.c @@ -605,6 +605,24 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask) return 0; } +/** + * destroy_trip_attrs() - destroy attributes for trip points + * @tz: the thermal zone device + * + * helper function to free resources allocated by create_trip_attrs() + */ +static void destroy_trip_attrs(struct thermal_zone_device *tz) +{ + if (!tz) + return; + + kfree(tz->trip_type_attrs); + kfree(tz->trip_temp_attrs); + if (tz->ops->get_trip_hyst) + kfree(tz->trip_hyst_attrs); + kfree(tz->trips_attribute_group.attrs); +} + int thermal_zone_create_device_groups(struct thermal_zone_device *tz, int mask) { @@ -637,6 +655,17 @@ int thermal_zone_create_device_groups(struct thermal_zone_device *tz, return 0; } +void thermal_zone_destroy_device_groups(struct thermal_zone_device *tz) +{ + if (!tz) + return; + + if (tz->trips) + destroy_trip_attrs(tz); + + kfree(tz->device.groups); +} + /* sys I/F for cooling device */ static ssize_t thermal_cooling_device_type_show(struct device *dev, diff --git a/drivers/thermal/uniphier_thermal.c b/drivers/thermal/uniphier_thermal.c new file mode 100644 index 000000000000..95704732f760 --- /dev/null +++ b/drivers/thermal/uniphier_thermal.c @@ -0,0 +1,384 @@ +/** + * uniphier_thermal.c - Socionext UniPhier thermal driver + * + * Copyright 2014 Panasonic Corporation + * Copyright 2016-2017 Socionext Inc. + * All rights reserved. 
+ * + * Author: + * Kunihiko Hayashi <hayashi.kunihiko@socionext.com> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/bitops.h> +#include <linux/interrupt.h> +#include <linux/mfd/syscon.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_device.h> +#include <linux/platform_device.h> +#include <linux/regmap.h> +#include <linux/thermal.h> + +#include "thermal_core.h" + +/* + * block registers + * addresses are the offset from .block_base + */ +#define PVTCTLEN 0x0000 +#define PVTCTLEN_EN BIT(0) + +#define PVTCTLMODE 0x0004 +#define PVTCTLMODE_MASK 0xf +#define PVTCTLMODE_TEMPMON 0x5 + +#define EMONREPEAT 0x0040 +#define EMONREPEAT_ENDLESS BIT(24) +#define EMONREPEAT_PERIOD GENMASK(3, 0) +#define EMONREPEAT_PERIOD_1000000 0x9 + +/* + * common registers + * addresses are the offset from .map_base + */ +#define PVTCTLSEL 0x0900 +#define PVTCTLSEL_MASK GENMASK(2, 0) +#define PVTCTLSEL_MONITOR 0 + +#define SETALERT0 0x0910 +#define SETALERT1 0x0914 +#define SETALERT2 0x0918 +#define SETALERT_TEMP_OVF (GENMASK(7, 0) << 16) +#define SETALERT_TEMP_OVF_VALUE(val) (((val) & GENMASK(7, 0)) << 16) +#define SETALERT_EN BIT(0) + +#define PMALERTINTCTL 0x0920 +#define PMALERTINTCTL_CLR(ch) BIT(4 * (ch) + 2) +#define PMALERTINTCTL_SET(ch) BIT(4 * (ch) + 1) +#define PMALERTINTCTL_EN(ch) BIT(4 * (ch) + 0) +#define PMALERTINTCTL_MASK (GENMASK(10, 8) | GENMASK(6, 4) | \ + GENMASK(2, 0)) + +#define TMOD 0x0928 +#define TMOD_WIDTH 9 + +#define TMODCOEF 0x0e5c + +#define TMODSETUP0_EN BIT(30) +#define TMODSETUP0_VAL(val) (((val) & GENMASK(13, 0)) << 16) +#define TMODSETUP1_EN BIT(15) +#define TMODSETUP1_VAL(val) ((val) & GENMASK(14, 0)) + +/* SoC critical temperature */ +#define CRITICAL_TEMP_LIMIT (120 * 1000) + +/* Max # of alert channels */ +#define ALERT_CH_NUM 3 + +/* SoC specific thermal sensor data */ +struct uniphier_tm_soc_data { + u32 map_base; + u32 block_base; + u32 tmod_setup_addr; +}; + +struct uniphier_tm_dev { + struct regmap *regmap; + struct device *dev; + bool alert_en[ALERT_CH_NUM]; + struct thermal_zone_device *tz_dev; + const struct uniphier_tm_soc_data *data; +}; + +static int uniphier_tm_initialize_sensor(struct uniphier_tm_dev *tdev) +{ + struct regmap *map = tdev->regmap; + u32 val; + u32 tmod_calib[2]; + int ret; + + /* stop PVT */ + regmap_write_bits(map, tdev->data->block_base + PVTCTLEN, + PVTCTLEN_EN, 0); + + /* + * Since SoC has a calibrated value that was set in advance, + * TMODCOEF shows non-zero and PVT refers the value internally. + * + * If TMODCOEF shows zero, the boards don't have the calibrated + * value, and the driver has to set default value from DT. 
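+ *
+ * A fallback entry might look like this (the two cells are purely
+ * illustrative values, not real calibration data):
+ *
+ *	socionext,tmod-calibration = <0x0f22 0x68ee>;
+ *
+ * The first cell is fed through TMODSETUP0_VAL(), the second
+ * through TMODSETUP1_VAL(), and both words are written to
+ * tmod_setup_addr together with the enable bits.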
+ */ + ret = regmap_read(map, tdev->data->map_base + TMODCOEF, &val); + if (ret) + return ret; + if (!val) { + /* look for the default values in DT */ + ret = of_property_read_u32_array(tdev->dev->of_node, + "socionext,tmod-calibration", + tmod_calib, + ARRAY_SIZE(tmod_calib)); + if (ret) + return ret; + + regmap_write(map, tdev->data->tmod_setup_addr, + TMODSETUP0_EN | TMODSETUP0_VAL(tmod_calib[0]) | + TMODSETUP1_EN | TMODSETUP1_VAL(tmod_calib[1])); + } + + /* select temperature mode */ + regmap_write_bits(map, tdev->data->block_base + PVTCTLMODE, + PVTCTLMODE_MASK, PVTCTLMODE_TEMPMON); + + /* set monitoring period */ + regmap_write_bits(map, tdev->data->block_base + EMONREPEAT, + EMONREPEAT_ENDLESS | EMONREPEAT_PERIOD, + EMONREPEAT_ENDLESS | EMONREPEAT_PERIOD_1000000); + + /* set monitor mode */ + regmap_write_bits(map, tdev->data->map_base + PVTCTLSEL, + PVTCTLSEL_MASK, PVTCTLSEL_MONITOR); + + return 0; +} + +static void uniphier_tm_set_alert(struct uniphier_tm_dev *tdev, u32 ch, + u32 temp) +{ + struct regmap *map = tdev->regmap; + + /* set alert temperature */ + regmap_write_bits(map, tdev->data->map_base + SETALERT0 + (ch << 2), + SETALERT_EN | SETALERT_TEMP_OVF, + SETALERT_EN | + SETALERT_TEMP_OVF_VALUE(temp / 1000)); +} + +static void uniphier_tm_enable_sensor(struct uniphier_tm_dev *tdev) +{ + struct regmap *map = tdev->regmap; + int i; + u32 bits = 0; + + for (i = 0; i < ALERT_CH_NUM; i++) + if (tdev->alert_en[i]) + bits |= PMALERTINTCTL_EN(i); + + /* enable alert interrupt */ + regmap_write_bits(map, tdev->data->map_base + PMALERTINTCTL, + PMALERTINTCTL_MASK, bits); + + /* start PVT */ + regmap_write_bits(map, tdev->data->block_base + PVTCTLEN, + PVTCTLEN_EN, PVTCTLEN_EN); + + usleep_range(700, 1500); /* The spec note says at least 700us */ +} + +static void uniphier_tm_disable_sensor(struct uniphier_tm_dev *tdev) +{ + struct regmap *map = tdev->regmap; + + /* disable alert interrupt */ + regmap_write_bits(map, tdev->data->map_base + PMALERTINTCTL, + PMALERTINTCTL_MASK, 0); + + /* stop PVT */ + regmap_write_bits(map, tdev->data->block_base + PVTCTLEN, + PVTCTLEN_EN, 0); + + usleep_range(1000, 2000); /* The spec note says at least 1ms */ +} + +static int uniphier_tm_get_temp(void *data, int *out_temp) +{ + struct uniphier_tm_dev *tdev = data; + struct regmap *map = tdev->regmap; + int ret; + u32 temp; + + ret = regmap_read(map, tdev->data->map_base + TMOD, &temp); + if (ret) + return ret; + + /* MSB of the TMOD field is a sign bit */ + *out_temp = sign_extend32(temp, TMOD_WIDTH - 1) * 1000; + + return 0; +} + +static const struct thermal_zone_of_device_ops uniphier_of_thermal_ops = { + .get_temp = uniphier_tm_get_temp, +}; + +static void uniphier_tm_irq_clear(struct uniphier_tm_dev *tdev) +{ + u32 mask = 0, bits = 0; + int i; + + for (i = 0; i < ALERT_CH_NUM; i++) { + mask |= (PMALERTINTCTL_CLR(i) | PMALERTINTCTL_SET(i)); + bits |= PMALERTINTCTL_CLR(i); + } + + /* clear alert interrupt */ + regmap_write_bits(tdev->regmap, + tdev->data->map_base + PMALERTINTCTL, mask, bits); +} + +static irqreturn_t uniphier_tm_alarm_irq(int irq, void *_tdev) +{ + struct uniphier_tm_dev *tdev = _tdev; + + disable_irq_nosync(irq); + uniphier_tm_irq_clear(tdev); + + return IRQ_WAKE_THREAD; +} + +static irqreturn_t uniphier_tm_alarm_irq_thread(int irq, void *_tdev) +{ + struct uniphier_tm_dev *tdev = _tdev; + + thermal_zone_device_update(tdev->tz_dev, THERMAL_EVENT_UNSPECIFIED); + + return IRQ_HANDLED; +} + +static int uniphier_tm_probe(struct platform_device *pdev) +{ + struct device *dev = 
&pdev->dev; + struct regmap *regmap; + struct device_node *parent; + struct uniphier_tm_dev *tdev; + const struct thermal_trip *trips; + int i, ret, irq, ntrips, crit_temp = INT_MAX; + + tdev = devm_kzalloc(dev, sizeof(*tdev), GFP_KERNEL); + if (!tdev) + return -ENOMEM; + tdev->dev = dev; + + tdev->data = of_device_get_match_data(dev); + if (WARN_ON(!tdev->data)) + return -EINVAL; + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + /* get regmap from syscon node */ + parent = of_get_parent(dev->of_node); /* parent should be syscon node */ + regmap = syscon_node_to_regmap(parent); + of_node_put(parent); + if (IS_ERR(regmap)) { + dev_err(dev, "failed to get regmap (error %ld)\n", + PTR_ERR(regmap)); + return PTR_ERR(regmap); + } + tdev->regmap = regmap; + + ret = uniphier_tm_initialize_sensor(tdev); + if (ret) { + dev_err(dev, "failed to initialize sensor\n"); + return ret; + } + + ret = devm_request_threaded_irq(dev, irq, uniphier_tm_alarm_irq, + uniphier_tm_alarm_irq_thread, + 0, "thermal", tdev); + if (ret) + return ret; + + platform_set_drvdata(pdev, tdev); + + tdev->tz_dev = devm_thermal_zone_of_sensor_register(dev, 0, tdev, + &uniphier_of_thermal_ops); + if (IS_ERR(tdev->tz_dev)) { + dev_err(dev, "failed to register sensor device\n"); + return PTR_ERR(tdev->tz_dev); + } + + /* get trip points */ + trips = of_thermal_get_trip_points(tdev->tz_dev); + ntrips = of_thermal_get_ntrips(tdev->tz_dev); + if (ntrips > ALERT_CH_NUM) { + dev_err(dev, "thermal zone has too many trips\n"); + return -E2BIG; + } + + /* set alert temperatures */ + for (i = 0; i < ntrips; i++) { + if (trips[i].type == THERMAL_TRIP_CRITICAL && + trips[i].temperature < crit_temp) + crit_temp = trips[i].temperature; + uniphier_tm_set_alert(tdev, i, trips[i].temperature); + tdev->alert_en[i] = true; + } + if (crit_temp > CRITICAL_TEMP_LIMIT) { + dev_err(dev, "critical trip is over limit(>%d), or not set\n", + CRITICAL_TEMP_LIMIT); + return -EINVAL; + } + + uniphier_tm_enable_sensor(tdev); + + return 0; +} + +static int uniphier_tm_remove(struct platform_device *pdev) +{ + struct uniphier_tm_dev *tdev = platform_get_drvdata(pdev); + + /* disable sensor */ + uniphier_tm_disable_sensor(tdev); + + return 0; +} + +static const struct uniphier_tm_soc_data uniphier_pxs2_tm_data = { + .map_base = 0xe000, + .block_base = 0xe000, + .tmod_setup_addr = 0xe904, +}; + +static const struct uniphier_tm_soc_data uniphier_ld20_tm_data = { + .map_base = 0xe000, + .block_base = 0xe800, + .tmod_setup_addr = 0xe938, +}; + +static const struct of_device_id uniphier_tm_dt_ids[] = { + { + .compatible = "socionext,uniphier-pxs2-thermal", + .data = &uniphier_pxs2_tm_data, + }, + { + .compatible = "socionext,uniphier-ld20-thermal", + .data = &uniphier_ld20_tm_data, + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, uniphier_tm_dt_ids); + +static struct platform_driver uniphier_tm_driver = { + .probe = uniphier_tm_probe, + .remove = uniphier_tm_remove, + .driver = { + .name = "uniphier-thermal", + .of_match_table = uniphier_tm_dt_ids, + }, +}; +module_platform_driver(uniphier_tm_driver); + +MODULE_AUTHOR("Kunihiko Hayashi <hayashi.kunihiko@socionext.com>"); +MODULE_DESCRIPTION("UniPhier thermal driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/thermal/zx2967_thermal.c b/drivers/thermal/zx2967_thermal.c index a5670ad2cfc8..6acce0bce7c0 100644 --- a/drivers/thermal/zx2967_thermal.c +++ b/drivers/thermal/zx2967_thermal.c @@ -111,7 +111,7 @@ unlock: return ret; } -static struct thermal_zone_of_device_ops zx2967_of_thermal_ops 
= { +static const struct thermal_zone_of_device_ops zx2967_of_thermal_ops = { .get_temp = zx2967_thermal_get_temp, }; diff --git a/drivers/usb/host/ohci-sm501.c b/drivers/usb/host/ohci-sm501.c index a8b8d8b8d9f3..d4e0f7cd96fa 100644 --- a/drivers/usb/host/ohci-sm501.c +++ b/drivers/usb/host/ohci-sm501.c @@ -123,13 +123,12 @@ static int ohci_hcd_sm501_drv_probe(struct platform_device *pdev) * regular memory. The HCD_LOCAL_MEM flag does just that. */ - if (!dma_declare_coherent_memory(dev, mem->start, + retval = dma_declare_coherent_memory(dev, mem->start, mem->start - mem->parent->start, resource_size(mem), - DMA_MEMORY_MAP | - DMA_MEMORY_EXCLUSIVE)) { + DMA_MEMORY_EXCLUSIVE); + if (retval) { dev_err(dev, "cannot declare coherent memory\n"); - retval = -ENXIO; goto err1; } diff --git a/drivers/usb/host/ohci-tmio.c b/drivers/usb/host/ohci-tmio.c index cfcfadfc94fc..16d081a093bb 100644 --- a/drivers/usb/host/ohci-tmio.c +++ b/drivers/usb/host/ohci-tmio.c @@ -227,13 +227,10 @@ static int ohci_hcd_tmio_drv_probe(struct platform_device *dev) goto err_ioremap_regs; } - if (!dma_declare_coherent_memory(&dev->dev, sram->start, - sram->start, - resource_size(sram), - DMA_MEMORY_MAP | DMA_MEMORY_EXCLUSIVE)) { - ret = -EBUSY; + ret = dma_declare_coherent_memory(&dev->dev, sram->start, sram->start, + resource_size(sram), DMA_MEMORY_EXCLUSIVE); + if (ret) goto err_dma_declare; - } if (cell->enable) { ret = cell->enable(dev); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1bc709fe330a..b3e3edc09d80 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -152,17 +152,10 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, ceph_invalidate_fscache_page(inode, page); + WARN_ON(!PageLocked(page)); if (!PagePrivate(page)) return; - /* - * We can get non-dirty pages here due to races between - * set_page_dirty and truncate_complete_page; just spit out a - * warning, in case we end up with accounting problems later. - */ - if (!PageDirty(page)) - pr_err("%p invalidatepage %p page not dirty\n", inode, page); - ClearPageChecked(page); dout("%p invalidatepage %p idx %lu full dirty page\n", @@ -455,13 +448,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc == 0) goto out; - if (fsc->mount_options->rsize >= PAGE_SIZE) - max = (fsc->mount_options->rsize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - - dout("readpages %p file %p nr_pages %d max %d\n", inode, - file, nr_pages, - max); + max = fsc->mount_options->rsize >> PAGE_SHIFT; + dout("readpages %p file %p nr_pages %d max %d\n", + inode, file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); if (rc < 0) @@ -474,14 +463,22 @@ out: return rc; } +struct ceph_writeback_ctl +{ + loff_t i_size; + u64 truncate_size; + u32 truncate_seq; + bool size_stable; + bool head_snapc; +}; + /* * Get ref for the oldest snapc for an inode with dirty data... that is, the * only snap context we are allowed to write back. 
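*
* For example (sequence numbers assumed for illustration): if cap
* snaps with snapc seq 3 and seq 5 both hold dirty pages, pages
* tagged with seq 5 are not writeable until everything dirtied
* under seq 3 has been flushed; otherwise snapshot ordering would
* be violated. Callers compare a page's page_snap_context()
* against the snapc returned here before writing the page back.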
*/ -static struct ceph_snap_context *get_oldest_context(struct inode *inode, - loff_t *snap_size, - u64 *truncate_size, - u32 *truncate_seq) +static struct ceph_snap_context * +get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, + struct ceph_snap_context *page_snapc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -491,30 +488,78 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); - if (capsnap->dirty_pages) { - snapc = ceph_get_snap_context(capsnap->context); - if (snap_size) - *snap_size = capsnap->size; - if (truncate_size) - *truncate_size = capsnap->truncate_size; - if (truncate_seq) - *truncate_seq = capsnap->truncate_seq; - break; + if (!capsnap->dirty_pages) + continue; + + /* get i_size, truncate_{seq,size} for page_snapc? */ + if (snapc && capsnap->context != page_snapc) + continue; + + if (ctl) { + if (capsnap->writing) { + ctl->i_size = i_size_read(inode); + ctl->size_stable = false; + } else { + ctl->i_size = capsnap->size; + ctl->size_stable = true; + } + ctl->truncate_size = capsnap->truncate_size; + ctl->truncate_seq = capsnap->truncate_seq; + ctl->head_snapc = false; } + + if (snapc) + break; + + snapc = ceph_get_snap_context(capsnap->context); + if (!page_snapc || + page_snapc == snapc || + page_snapc->seq > snapc->seq) + break; } if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); - if (truncate_size) - *truncate_size = ci->i_truncate_size; - if (truncate_seq) - *truncate_seq = ci->i_truncate_seq; + if (ctl) { + ctl->i_size = i_size_read(inode); + ctl->truncate_size = ci->i_truncate_size; + ctl->truncate_seq = ci->i_truncate_seq; + ctl->size_stable = false; + ctl->head_snapc = true; + } } spin_unlock(&ci->i_ceph_lock); return snapc; } +static u64 get_writepages_data_length(struct inode *inode, + struct page *page, u64 start) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_cap_snap *capsnap = NULL; + u64 end = i_size_read(inode); + + if (snapc != ci->i_head_snapc) { + bool found = false; + spin_lock(&ci->i_ceph_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + if (!capsnap->writing) + end = capsnap->size; + found = true; + break; + } + } + spin_unlock(&ci->i_ceph_lock); + WARN_ON(!found); + } + if (end > page_offset(page) + PAGE_SIZE) + end = page_offset(page) + PAGE_SIZE; + return end > start ? end - start : 0; +} + /* * Write a single page, but leave the page locked. 
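*
* Illustrative numbers (assuming a 4k PAGE_SIZE, not part of the
* patch): for a page at offset 8k with ceph_wbc.i_size of 10k, len
* is clamped from 4096 down to 2048; a page lying entirely beyond
* i_size is no longer written at all but invalidated instead.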
* @@ -526,30 +571,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct inode *inode; struct ceph_inode_info *ci; struct ceph_fs_client *fsc; - struct ceph_osd_client *osdc; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); - loff_t snap_size = -1; long writeback_stat; - u64 truncate_size; - u32 truncate_seq; int err, len = PAGE_SIZE; + struct ceph_writeback_ctl ceph_wbc; dout("writepage %p idx %lu\n", page, page->index); inode = page->mapping->host; ci = ceph_inode(inode); fsc = ceph_inode_to_client(inode); - osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ snapc = page_snap_context(page); - if (snapc == NULL) { + if (!snapc) { dout("writepage %p page %p not dirty?\n", inode, page); return 0; } - oldest = get_oldest_context(inode, &snap_size, - &truncate_size, &truncate_seq); + oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, snapc); @@ -561,20 +601,18 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); - if (snap_size == -1) - snap_size = i_size_read(inode); - /* is this a partial page at end of file? */ - if (page_off >= snap_size) { - dout("%p page eof %llu\n", page, snap_size); + if (page_off >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", page, ceph_wbc.i_size); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); return 0; } - if (snap_size < page_off + len) - len = snap_size - page_off; + if (ceph_wbc.i_size < page_off + len) + len = ceph_wbc.i_size - page_off; - dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", - inode, page, page->index, page_off, len, snapc); + dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", + inode, page, page->index, page_off, len, snapc, snapc->seq); writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > @@ -582,10 +620,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); - err = ceph_osdc_writepages(osdc, ceph_vino(inode), - &ci->i_layout, snapc, - page_off, len, - truncate_seq, truncate_size, + err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, snapc, page_off, len, + ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { struct writeback_control tmp_wbc; @@ -746,31 +784,17 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_vino vino = ceph_vino(inode); - pgoff_t index, start, end; - int range_whole = 0; - int should_loop = 1; - pgoff_t max_pages = 0, max_pages_ever = 0; + pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; struct pagevec pvec; - int done = 0; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; - int do_sync = 0; - loff_t snap_size, i_size; - u64 truncate_size; - u32 truncate_seq; + struct ceph_writeback_ctl ceph_wbc; + bool should_loop, range_whole = false; + bool stop, done = false; - /* - * Include a 'sync' in the OSD request if this is a data - * integrity write (e.g., O_SYNC write or fsync()), or if our - * cap is being revoked. 
- */ - if ((wbc->sync_mode == WB_SYNC_ALL) || - ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) - do_sync = 1; - dout("writepages_start %p dosync=%d (mode=%s)\n", - inode, do_sync, + dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); @@ -783,35 +807,17 @@ static int ceph_writepages_start(struct address_space *mapping, mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } - if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - if (wsize < PAGE_SIZE) - wsize = PAGE_SIZE; - max_pages_ever = wsize >> PAGE_SHIFT; pagevec_init(&pvec, 0); - /* where to start/end? */ - if (wbc->range_cyclic) { - start = mapping->writeback_index; /* Start from prev offset */ - end = -1; - dout(" cyclic, start at %lu\n", start); - } else { - start = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - should_loop = 0; - dout(" not cyclic, %lu to %lu\n", start, end); - } - index = start; + start_index = wbc->range_cyclic ? mapping->writeback_index : 0; + index = start_index; retry: /* find oldest snap context with dirty data */ - ceph_put_snap_context(snapc); - snap_size = -1; - snapc = get_oldest_context(inode, &snap_size, - &truncate_size, &truncate_seq); + snapc = get_oldest_context(inode, &ceph_wbc, NULL); if (!snapc) { /* hmm, why does writepages get called when there is no dirty data? */ @@ -821,40 +827,56 @@ retry: dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); - i_size = i_size_read(inode); - - if (last_snapc && snapc != last_snapc) { - /* if we switched to a newer snapc, restart our scan at the - * start of the original file range. */ - dout(" snapc differs from last pass, restarting at %lu\n", - index); - index = start; + should_loop = false; + if (ceph_wbc.head_snapc && snapc != last_snapc) { + /* where to start/end? */ + if (wbc->range_cyclic) { + index = start_index; + end = -1; + if (index > 0) + should_loop = true; + dout(" cyclic, start at %lu\n", index); + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = true; + dout(" not cyclic, %lu to %lu\n", index, end); + } + } else if (!ceph_wbc.head_snapc) { + /* Do not respect wbc->range_{start,end}. Dirty pages + * in that range can be associated with newer snapc. 
+ * They are not writeable until we write all dirty pages + * associated with 'snapc' get written */ + if (index > 0 || wbc->sync_mode != WB_SYNC_NONE) + should_loop = true; + dout(" non-head snapc, range whole\n"); } + + ceph_put_snap_context(last_snapc); last_snapc = snapc; - while (!done && index <= end) { - unsigned i; - int first; - pgoff_t strip_unit_end = 0; + stop = false; + while (!stop && index <= end) { int num_ops = 0, op_idx; - int pvec_pages, locked_pages = 0; + unsigned i, pvec_pages, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; - int want; + pgoff_t strip_unit_end = 0; u64 offset = 0, len = 0; - max_pages = max_pages_ever; + max_pages = wsize >> PAGE_SHIFT; get_more_pages: - first = -1; - want = min(end - index, - min((pgoff_t)PAGEVEC_SIZE, - max_pages - (pgoff_t)locked_pages) - 1) - + 1; + pvec_pages = min_t(unsigned, PAGEVEC_SIZE, + max_pages - locked_pages); + if (end - index < (u64)(pvec_pages - 1)) + pvec_pages = (unsigned)(end - index) + 1; + pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - want); + pvec_pages); dout("pagevec_lookup_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; @@ -871,11 +893,15 @@ get_more_pages: unlikely(page->mapping != mapping)) { dout("!dirty or !mapping %p\n", page); unlock_page(page); - break; + continue; } - if (!wbc->range_cyclic && page->index > end) { + if (page->index > end) { dout("end of range %p\n", page); - done = 1; + /* can't be range_cyclic (1st pass) because + * end == -1 in that case. */ + stop = true; + if (ceph_wbc.head_snapc) + done = true; unlock_page(page); break; } @@ -884,39 +910,37 @@ get_more_pages: unlock_page(page); break; } - if (wbc->sync_mode != WB_SYNC_NONE) { - dout("waiting on writeback %p\n", page); - wait_on_page_writeback(page); - } - if (page_offset(page) >= - (snap_size == -1 ? i_size : snap_size)) { - dout("%p page eof %llu\n", page, - (snap_size == -1 ? 
i_size : snap_size)); - done = 1; + if (page_offset(page) >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", + page, ceph_wbc.i_size); + /* not done if range_cyclic */ + stop = true; unlock_page(page); break; } if (PageWriteback(page)) { - dout("%p under writeback\n", page); - unlock_page(page); - break; + if (wbc->sync_mode == WB_SYNC_NONE) { + dout("%p under writeback\n", page); + unlock_page(page); + continue; + } + dout("waiting on writeback %p\n", page); + wait_on_page_writeback(page); } /* only if matching snap context */ pgsnapc = page_snap_context(page); - if (pgsnapc->seq > snapc->seq) { - dout("page snapc %p %lld > oldest %p %lld\n", + if (pgsnapc != snapc) { + dout("page snapc %p %lld != oldest %p %lld\n", pgsnapc, pgsnapc->seq, snapc, snapc->seq); unlock_page(page); - if (!locked_pages) - continue; /* keep looking for snap */ - break; + continue; } if (!clear_page_dirty_for_io(page)) { dout("%p !clear_page_dirty_for_io\n", page); unlock_page(page); - break; + continue; } /* @@ -942,7 +966,7 @@ get_more_pages: break; } - num_ops = 1 + do_sync; + num_ops = 1; strip_unit_end = page->index + ((len - 1) >> PAGE_SHIFT); @@ -972,8 +996,6 @@ get_more_pages: } /* note position of first page in pvec */ - if (first < 0) - first = i; dout("%p will write page %p idx %lu\n", inode, page, page->index); @@ -984,8 +1006,10 @@ get_more_pages: BLK_RW_ASYNC); } - pages[locked_pages] = page; - locked_pages++; + + pages[locked_pages++] = page; + pvec.pages[i] = NULL; + len += PAGE_SIZE; } @@ -993,23 +1017,23 @@ get_more_pages: if (!locked_pages) goto release_pvec_pages; if (i) { - int j; - BUG_ON(!locked_pages || first < 0); + unsigned j, n = 0; + /* shift unused page to beginning of pvec */ + for (j = 0; j < pvec_pages; j++) { + if (!pvec.pages[j]) + continue; + if (n < j) + pvec.pages[n] = pvec.pages[j]; + n++; + } + pvec.nr = n; if (pvec_pages && i == pvec_pages && locked_pages < max_pages) { dout("reached end pvec, trying for more\n"); - pagevec_reinit(&pvec); + pagevec_release(&pvec); goto get_more_pages; } - - /* shift unused pages over in the pvec... we - * will need to release them below. 
*/ - for (j = i; j < pvec_pages; j++) { - dout(" pvec leftover page %p\n", pvec.pages[j]); - pvec.pages[j-i+first] = pvec.pages[j]; - } - pvec.nr -= i-first; } new_request: @@ -1019,10 +1043,9 @@ new_request: req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, offset, &len, 0, num_ops, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, - truncate_size, false); + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, false); if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, @@ -1031,8 +1054,8 @@ new_request: CEPH_OSD_SLAB_OPS), CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, - truncate_size, true); + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } BUG_ON(len < page_offset(pages[locked_pages - 1]) + @@ -1048,7 +1071,7 @@ new_request: for (i = 0; i < locked_pages; i++) { u64 cur_offset = page_offset(pages[i]); if (offset + len != cur_offset) { - if (op_idx + do_sync + 1 == req->r_num_ops) + if (op_idx + 1 == req->r_num_ops) break; osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); @@ -1069,14 +1092,15 @@ new_request: len += PAGE_SIZE; } - if (snap_size != -1) { - len = min(len, snap_size - offset); + if (ceph_wbc.size_stable) { + len = min(len, ceph_wbc.i_size - offset); } else if (i == locked_pages) { /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ u64 min_len = len + 1 - PAGE_SIZE; - len = min(len, (u64)i_size_read(inode) - offset); + len = get_writepages_data_length(inode, pages[i - 1], + offset); len = max(len, min_len); } dout("writepages got pages at %llu~%llu\n", offset, len); @@ -1085,17 +1109,12 @@ new_request: 0, !!pool, false); osd_req_op_extent_update(req, op_idx, len); - if (do_sync) { - op_idx++; - osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0); - } BUG_ON(op_idx + 1 != req->r_num_ops); pool = NULL; if (i < locked_pages) { BUG_ON(num_ops <= req->r_num_ops); num_ops -= req->r_num_ops; - num_ops += do_sync; locked_pages -= i; /* allocate new pages array for next request */ @@ -1127,22 +1146,50 @@ new_request: if (pages) goto new_request; - if (wbc->nr_to_write <= 0) - done = 1; + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) + done = stop = true; release_pvec_pages: dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, pvec.nr ? 
pvec.pages[0] : NULL); pagevec_release(&pvec); - - if (locked_pages && !done) - goto retry; } if (should_loop && !done) { /* more to do; loop back to beginning of file */ dout("writepages looping back to beginning of file\n"); - should_loop = 0; + end = start_index - 1; /* OK even when start_index == 0 */ + + /* to write dirty pages associated with next snapc, + * we need to wait until current writes complete */ + if (wbc->sync_mode != WB_SYNC_NONE && + start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc.head_snapc) { + struct page *page; + unsigned i, nr; + index = 0; + while ((index <= end) && + (nr = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + PAGEVEC_SIZE))) { + for (i = 0; i < nr; i++) { + page = pvec.pages[i]; + if (page_snap_context(page) != snapc) + continue; + wait_on_page_writeback(page); + } + pagevec_release(&pvec); + cond_resched(); + } + } + + start_index = 0; index = 0; goto retry; } @@ -1152,8 +1199,8 @@ release_pvec_pages: out: ceph_osdc_put_request(req); - ceph_put_snap_context(snapc); - dout("writepages done, rc = %d\n", rc); + ceph_put_snap_context(last_snapc); + dout("writepages dend - startone, rc = %d\n", rc); return rc; } @@ -1165,8 +1212,7 @@ out: static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { - struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, - NULL, NULL); + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL); int ret = !oldest || snapc->seq <= oldest->seq; ceph_put_snap_context(oldest); @@ -1211,8 +1257,7 @@ retry_locked: * this page is already dirty in another (older) snap * context! is it writeable now? */ - oldest = get_oldest_context(inode, NULL, NULL, NULL); - + oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); dout(" page %p snapc %p not current or oldest\n", diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 174d6e6569a8..a3ab265d3215 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -209,7 +209,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); /* No caching for filesystem */ - if (fsc->fscache == NULL) + if (!fsc->fscache) return; /* Only cache for regular files that are read only */ diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7007ae2a5ad2..157fe59fbabe 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -490,13 +490,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, } /* - * if we are newly issued FILE_SHARED, mark dir not complete; we - * don't know what happened to this directory while we didn't - * have the cap. + * If FILE_SHARED is newly issued, mark dir not complete. We don't + * know what happened to this directory while we didn't have the cap. + * If FILE_SHARED is being revoked, also mark dir not complete. It + * stops on-going cached readdir. 
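+ *
+ * A compact way to read the check below (equivalent logic, shown
+ * only as illustration):
+ *
+ *	if ((issued ^ had) & CEPH_CAP_FILE_SHARED)
+ *
+ * i.e. any edge on FILE_SHARED invalidates the cached directory
+ * contents, while i_shared_gen is bumped only when the cap is
+ * newly granted, not when it is being revoked.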
*/ - if ((issued & CEPH_CAP_FILE_SHARED) && - (had & CEPH_CAP_FILE_SHARED) == 0) { - ci->i_shared_gen++; + if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { + if (issued & CEPH_CAP_FILE_SHARED) + ci->i_shared_gen++; if (S_ISDIR(ci->vfs_inode.i_mode)) { dout(" marking %p NOT complete\n", &ci->vfs_inode); __ceph_dir_clear_complete(ci); @@ -611,7 +612,7 @@ void ceph_add_cap(struct inode *inode, } if (flags & CEPH_CAP_FLAG_AUTH) { - if (ci->i_auth_cap == NULL || + if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { ci->i_auth_cap = cap; cap->mds_wanted = wanted; @@ -728,7 +729,7 @@ static void __touch_cap(struct ceph_cap *cap) struct ceph_mds_session *s = cap->session; spin_lock(&s->s_cap_lock); - if (s->s_cap_iterator == NULL) { + if (!s->s_cap_iterator) { dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); @@ -1248,7 +1249,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.mode = inode->i_mode; arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - arg.flags = 0; + if (list_empty(&ci->i_cap_snaps)) + arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; + else + arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; if (sync) arg.flags |= CEPH_CLIENT_CAPS_SYNC; @@ -1454,13 +1458,19 @@ retry: goto retry; } + // make sure flushsnap messages are sent in proper order. + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + __kick_flushing_caps(mdsc, session, ci, 0); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + } + __ceph_flush_snaps(ci, session); out: spin_unlock(&ci->i_ceph_lock); if (psession) { *psession = session; - } else { + } else if (session) { mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); } @@ -1901,11 +1911,7 @@ ack: (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { - spin_lock(&mdsc->cap_dirty_lock); - oldest_flush_tid = __get_oldest_flush_tid(mdsc); - spin_unlock(&mdsc->cap_dirty_lock); - __kick_flushing_caps(mdsc, session, ci, - oldest_flush_tid); + __kick_flushing_caps(mdsc, session, ci, 0); ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) @@ -2110,7 +2116,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dout("fsync %p%s\n", inode, datasync ? 
" datasync" : ""); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret < 0) goto out; @@ -3422,7 +3428,7 @@ retry: tcap = __get_cap_for_mds(ci, target); if (tcap) { /* already have caps from the target */ - if (tcap->cap_id != t_cap_id || + if (tcap->cap_id == t_cap_id && ceph_seq_cmp(tcap->seq, t_seq) < 0) { dout(" updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 4e2d112c982f..d635496ea189 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -24,7 +24,7 @@ static int mdsmap_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mdsmap *mdsmap; - if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) + if (!fsc->mdsc || !fsc->mdsc->mdsmap) return 0; mdsmap = fsc->mdsc->mdsmap; seq_printf(s, "epoch %d\n", mdsmap->m_epoch); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ef7240ace576..019c2036d36f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -377,8 +377,10 @@ more: } /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; - req->r_direct_hash = ceph_frag_value(frag); - __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + if (op == CEPH_MDS_OP_READDIR) { + req->r_direct_hash = ceph_frag_value(frag); + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + } if (fi->last_name) { req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3d48c415f3cb..65a6fa12c857 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -175,7 +175,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); - if (cf == NULL) { + if (!cf) { ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ return -ENOMEM; } @@ -562,8 +562,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ssize_t ret; size_t len = iov_iter_count(to); - dout("sync_read on file %p %llu~%u %s\n", file, off, - (unsigned)len, + dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); if (!len) @@ -788,7 +787,7 @@ static void ceph_aio_retry_work(struct work_struct *work) goto out; } - req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); @@ -800,7 +799,6 @@ static void ceph_aio_retry_work(struct work_struct *work) } req->r_ops[0] = orig_req->r_ops[0]; - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = aio_req->mtime; req->r_data_offset = req->r_ops[0].extent.offset; @@ -847,8 +845,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_read_write (%s) on file %p %lld~%u\n", - (write ? "write" : "read"), file, pos, (unsigned)count); + dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", + (write ? 
"write" : "read"), file, pos, (unsigned)count, + snapc, snapc->seq); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) @@ -861,7 +860,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); - flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; } else { flags = CEPH_OSD_FLAG_READ; } @@ -874,8 +873,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, - /*include a 'startsync' command*/ - write ? 2 : 1, + 1, write ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ, flags, snapc, @@ -887,6 +885,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + if (write) + size = min_t(u64, size, fsc->mount_options->wsize); + else + size = min_t(u64, size, fsc->mount_options->rsize); + len = size; pages = dio_get_pages_alloc(iter, len, &start, &num_pages); if (IS_ERR(pages)) { @@ -922,7 +925,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_SIZE - 1)); - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = mtime; } @@ -1048,7 +1050,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", + file, pos, (unsigned)count, snapc, snapc->seq); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) @@ -1060,7 +1063,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; while ((len = iov_iter_count(from)) > 0) { size_t left; @@ -1307,6 +1310,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; +retry_snap: inode_lock(inode); /* We can write back this queue in page reclaim */ @@ -1338,7 +1342,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } -retry_snap: /* FIXME: not complete since it doesn't account for being at quota */ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; @@ -1387,14 +1390,6 @@ retry_snap: &prealloc_cf); else written = ceph_sync_write(iocb, &data, pos, snapc); - if (written == -EOLDSNAPC) { - dout("aio_write %p %llx.%llx %llu~%u" - "got EOLDSNAPC, retrying\n", - inode, ceph_vinop(inode), - pos, (unsigned)count); - inode_lock(inode); - goto retry_snap; - } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1428,10 +1423,15 @@ retry_snap: ceph_cap_string(got)); ceph_put_cap_refs(ci, got); + if (written == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)count); + goto retry_snap; + } + if (written >= 0) { if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; - written = generic_write_sync(iocb, written); } @@ -1481,13 +1481,13 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) offset += file->f_pos; break; case SEEK_DATA: - if (offset >= i_size) { + if (offset 
< 0 || offset >= i_size) { ret = -ENXIO; goto out; } break; case SEEK_HOLE: - if (offset >= i_size) { + if (offset < 0 || offset >= i_size) { ret = -ENXIO; goto out; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 220dfd87cbfa..373dab5173ca 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -52,7 +52,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) ino_t t = ceph_vino_to_ino(vino); inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); - if (inode == NULL) + if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { dout("get_inode created new inode %p %llx.%llx ino %llx\n", @@ -133,12 +133,9 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, } frag = kmalloc(sizeof(*frag), GFP_NOFS); - if (!frag) { - pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx " - "frag %x\n", &ci->vfs_inode, - ceph_vinop(&ci->vfs_inode), f); + if (!frag) return ERR_PTR(-ENOMEM); - } + frag->frag = f; frag->split_by = 0; frag->mds = -1; @@ -1070,7 +1067,6 @@ out_unlock: spin_unlock(&dentry->d_lock); if (old_lease_session) ceph_put_mds_session(old_lease_session); - return; } /* @@ -1177,7 +1173,7 @@ retry_lookup: dn = d_alloc(parent, &dname); dout("d_alloc %p '%.*s' = %p\n", parent, dname.len, dname.name, dn); - if (dn == NULL) { + if (!dn) { dput(parent); err = -ENOMEM; goto done; @@ -1477,7 +1473,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct dentry *dn; struct inode *in; int err = 0, skipped = 0, ret, i; - struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; u32 frag = le32_to_cpu(rhead->args.readdir.frag); u32 last_hash = 0; @@ -1510,8 +1505,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { - snapdir = ceph_get_snapdir(d_inode(parent)); - parent = d_find_alias(snapdir); dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", rinfo->dir_nr, parent); } else { @@ -1519,15 +1512,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, rinfo->dir_nr, parent); if (rinfo->dir_dir) ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); - } - if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && - !(rinfo->hash_order && last_hash)) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); - req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); - req->r_readdir_cache_idx = 0; + if (ceph_frag_is_leftmost(frag) && + req->r_readdir_offset == 2 && + !(rinfo->hash_order && last_hash)) { + /* note dir version at start of readdir so we can + * tell if any dentries get dropped */ + req->r_dir_release_cnt = + atomic64_read(&ci->i_release_count); + req->r_dir_ordered_cnt = + atomic64_read(&ci->i_ordered_count); + req->r_readdir_cache_idx = 0; + } } cache_ctl.index = req->r_readdir_cache_idx; @@ -1566,7 +1562,7 @@ retry_lookup: dn = d_alloc(parent, &dname); dout("d_alloc %p '%.*s' = %p\n", parent, dname.len, dname.name, dn); - if (dn == NULL) { + if (!dn) { dout("d_alloc badness\n"); err = -ENOMEM; goto out; @@ -1650,10 +1646,6 @@ out: req->r_readdir_cache_idx = cache_ctl.index; } ceph_readdir_cache_release(&cache_ctl); - if (snapdir) { - iput(snapdir); - dput(parent); - } dout("readdir_prepopulate done\n"); return err; } @@ -1841,9 +1833,20 @@ retry: * possibly truncate them.. so write AND block! 
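 * (the change below truncates the pagecache only down to the larger of the truncate size and any dirty capsnap size, then flushes and retries)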
*/ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { + struct ceph_cap_snap *capsnap; + to = ci->i_truncate_size; + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + // MDS should have revoked Frw caps + WARN_ON_ONCE(capsnap->writing); + if (capsnap->dirty_pages && capsnap->size > to) + to = capsnap->size; + } + spin_unlock(&ci->i_ceph_lock); dout("__do_pending_vmtruncate %p flushing snaps first\n", inode); - spin_unlock(&ci->i_ceph_lock); + + truncate_pagecache(inode, to); + filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 666a9f274832..9dd6b836ac9e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -408,7 +408,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *session; - if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) + if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return NULL; session = mdsc->sessions[mds]; dout("lookup_mds_session %p %d\n", session, @@ -483,7 +483,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, dout("register_session realloc to %d\n", newmax); sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); - if (sa == NULL) + if (!sa) goto fail_realloc; if (mdsc->sessions) { memcpy(sa, mdsc->sessions, @@ -731,9 +731,16 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = NULL; if (req->r_inode) { - inode = req->r_inode; - ihold(inode); - } else if (req->r_dentry) { + if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { + inode = req->r_inode; + ihold(inode); + } else { + /* req->r_dentry is non-null for LSSNAP request. + * fall-thru */ + WARN_ON_ONCE(!req->r_dentry); + } + } + if (!inode && req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent; struct inode *dir; @@ -886,7 +893,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 /* Calculate serialized length of metadata */ metadata_bytes = 4; /* map length */ - for (i = 0; metadata[i][0] != NULL; ++i) { + for (i = 0; metadata[i][0]; ++i) { metadata_bytes += 8 + strlen(metadata[i][0]) + strlen(metadata[i][1]); metadata_key_count++; @@ -919,7 +926,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 ceph_encode_32(&p, metadata_key_count); /* Two length-prefixed strings for each entry in the map */ - for (i = 0; metadata[i][0] != NULL; ++i) { + for (i = 0; metadata[i][0]; ++i) { size_t const key_len = strlen(metadata[i][0]); size_t const val_len = strlen(metadata[i][1]); @@ -1122,7 +1129,7 @@ static int iterate_session_caps(struct ceph_mds_session *session, spin_lock(&session->s_cap_lock); p = p->next; - if (cap->ci == NULL) { + if (!cap->ci) { dout("iterate_session_caps finishing cap %p removal\n", cap); BUG_ON(cap->session != session); @@ -1748,7 +1755,7 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, int len, pos; unsigned seq; - if (dentry == NULL) + if (!dentry) return ERR_PTR(-EINVAL); retry: @@ -1771,7 +1778,7 @@ retry: len--; /* no leading '/' */ path = kmalloc(len+1, GFP_NOFS); - if (path == NULL) + if (!path) return ERR_PTR(-ENOMEM); pos = len; path[pos] = 0; /* trailing null */ @@ -2875,7 +2882,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } if (list_empty(&ci->i_cap_snaps)) { - snap_follows = 0; + snap_follows = ci->i_head_snapc ? 
ci->i_head_snapc->seq : 0; } else { struct ceph_cap_snap *capsnap = list_first_entry(&ci->i_cap_snaps, @@ -3133,7 +3140,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, newmap->m_epoch, oldmap->m_epoch); for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { - if (mdsc->sessions[i] == NULL) + if (!mdsc->sessions[i]) continue; s = mdsc->sessions[i]; oldstate = ceph_mdsmap_get_state(oldmap, i); @@ -3280,7 +3287,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, mutex_lock(&session->s_mutex); session->s_seq++; - if (inode == NULL) { + if (!inode) { dout("handle_lease no inode %llx\n", vino.ino); goto release; } @@ -3438,7 +3445,7 @@ static void delayed_work(struct work_struct *work) for (i = 0; i < mdsc->max_sessions; i++) { struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); - if (s == NULL) + if (!s) continue; if (s->s_state == CEPH_MDS_SESSION_CLOSING) { dout("resending session close request for mds%d\n", @@ -3490,7 +3497,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) { + if (!mdsc->mdsmap) { kfree(mdsc); return -ENOMEM; } diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 1a748cf88535..33ced4c22732 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -112,7 +112,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u16 mdsmap_ev; m = kzalloc(sizeof(*m), GFP_NOFS); - if (m == NULL) + if (!m) return ERR_PTR(-ENOMEM); ceph_decode_need(p, end, 1 + 1, bad); @@ -138,7 +138,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_num_mds = m->m_max_mds; m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); - if (m->m_info == NULL) + if (!m->m_info) goto nomem; /* pick out active nodes from mds_info (state > 0) */ @@ -232,7 +232,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS); - if (info->export_targets == NULL) + if (!info->export_targets) goto nomem; for (j = 0; j < num_export_targets; j++) info->export_targets[j] = diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index dab5d6732345..1ffc8b426c1c 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -299,7 +299,8 @@ static int cmpu64_rev(const void *a, const void *b) /* * build the snap context for a given realm. 
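 * (realms whose cached context actually changes are queued on the caller's dirty_realms list for later cap_snap creation)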
*/ -static int build_snap_context(struct ceph_snap_realm *realm) +static int build_snap_context(struct ceph_snap_realm *realm, + struct list_head* dirty_realms) { struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; @@ -313,7 +314,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) */ if (parent) { if (!parent->cached_context) { - err = build_snap_context(parent); + err = build_snap_context(parent, dirty_realms); if (err) goto fail; } @@ -332,7 +333,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) " (unchanged)\n", realm->ino, realm, realm->cached_context, realm->cached_context->seq, - (unsigned int) realm->cached_context->num_snaps); + (unsigned int)realm->cached_context->num_snaps); return 0; } @@ -373,7 +374,11 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - ceph_put_snap_context(realm->cached_context); + if (realm->cached_context) { + ceph_put_snap_context(realm->cached_context); + /* queue realm for cap_snap creation */ + list_add_tail(&realm->dirty_item, dirty_realms); + } realm->cached_context = snapc; return 0; @@ -394,15 +399,16 @@ fail: /* * rebuild snap context for the given realm and all of its children. */ -static void rebuild_snap_realms(struct ceph_snap_realm *realm) +static void rebuild_snap_realms(struct ceph_snap_realm *realm, + struct list_head *dirty_realms) { struct ceph_snap_realm *child; dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); - build_snap_context(realm); + build_snap_context(realm, dirty_realms); list_for_each_entry(child, &realm->children, child_item) - rebuild_snap_realms(child); + rebuild_snap_realms(child, dirty_realms); } @@ -624,13 +630,11 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) { struct ceph_inode_info *ci; struct inode *lastinode = NULL; - struct ceph_snap_realm *child; dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); - list_for_each_entry(ci, &realm->inodes_with_caps, - i_snap_realm_item) { + list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { struct inode *inode = igrab(&ci->vfs_inode); if (!inode) continue; @@ -643,14 +647,6 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); - list_for_each_entry(child, &realm->children, child_item) { - dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", - realm, realm->ino, child, child->ino); - list_del_init(&child->dirty_item); - list_add(&child->dirty_item, &realm->dirty_item); - } - - list_del_init(&realm->dirty_item); dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); } @@ -721,8 +717,6 @@ more: if (err < 0) goto fail; - /* queue realm for cap_snap creation */ - list_add(&realm->dirty_item, &dirty_realms); if (realm->seq > mdsc->last_snap_seq) mdsc->last_snap_seq = realm->seq; @@ -741,7 +735,7 @@ more: /* invalidate when we reach the _end_ (root) of the trace */ if (invalidate && p >= e) - rebuild_snap_realms(realm); + rebuild_snap_realms(realm, &dirty_realms); if (!first_realm) first_realm = realm; @@ -758,6 +752,7 @@ more: while (!list_empty(&dirty_realms)) { realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, dirty_item); + list_del_init(&realm->dirty_item); queue_realm_cap_snaps(realm); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index aa06a8c24792..e4082afedcb1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -49,9 +49,16 
@@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) struct ceph_statfs st; u64 fsid; int err; + u64 data_pool; + + if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { + data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0]; + } else { + data_pool = CEPH_NOPOOL; + } dout("statfs\n"); - err = ceph_monc_do_statfs(&fsc->client->monc, &st); + err = ceph_monc_do_statfs(&fsc->client->monc, data_pool, &st); if (err < 0) return err; @@ -113,7 +120,6 @@ enum { Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, - Opt_cap_release_safety, Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, @@ -152,7 +158,6 @@ static match_table_t fsopt_tokens = { {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, - {Opt_cap_release_safety, "cap_release_safety=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, @@ -235,27 +240,43 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ case Opt_wsize: - fsopt->wsize = intval; + if (intval < PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) + return -EINVAL; + fsopt->wsize = ALIGN(intval, PAGE_SIZE); break; case Opt_rsize: - fsopt->rsize = intval; + if (intval < PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) + return -EINVAL; + fsopt->rsize = ALIGN(intval, PAGE_SIZE); break; case Opt_rasize: - fsopt->rasize = intval; + if (intval < 0) + return -EINVAL; + fsopt->rasize = ALIGN(intval + PAGE_SIZE - 1, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: + if (intval < 1) + return -EINVAL; fsopt->caps_wanted_delay_min = intval; break; case Opt_caps_wanted_delay_max: + if (intval < 1) + return -EINVAL; fsopt->caps_wanted_delay_max = intval; break; case Opt_readdir_max_entries: + if (intval < 1) + return -EINVAL; fsopt->max_readdir = intval; break; case Opt_readdir_max_bytes: + if (intval < PAGE_SIZE && intval != 0) + return -EINVAL; fsopt->max_readdir_bytes = intval; break; case Opt_congestion_kb: + if (intval < 1024) /* at least 1M */ + return -EINVAL; fsopt->congestion_kb = intval; break; case Opt_dirstat: @@ -392,7 +413,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->sb_flags = flags; fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->wsize = CEPH_MAX_WRITE_SIZE; + fsopt->rsize = CEPH_MAX_READ_SIZE; fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); if (!fsopt->snapdir_name) { @@ -402,7 +424,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); @@ -508,7 +529,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); - if (fsopt->rsize != CEPH_RSIZE_DEFAULT) + if (fsopt->rsize != CEPH_MAX_READ_SIZE) seq_printf(m, ",rsize=%d", fsopt->rsize); if (fsopt->rasize != CEPH_RASIZE_DEFAULT) seq_printf(m, ",rasize=%d", fsopt->rasize); @@ -520,9 +541,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if 
(fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) seq_printf(m, ",caps_wanted_delay_max=%d", fsopt->caps_wanted_delay_max); - if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) - seq_printf(m, ",cap_release_safety=%d", - fsopt->cap_release_safety); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) @@ -576,7 +594,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - if (fsopt->mds_namespace == NULL) { + if (!fsopt->mds_namespace) { ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); } else { @@ -597,13 +615,13 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, * to be processed in parallel, limit concurrency. */ fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); - if (fsc->wb_wq == NULL) + if (!fsc->wb_wq) goto fail_client; fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); - if (fsc->pg_inv_wq == NULL) + if (!fsc->pg_inv_wq) goto fail_wb_wq; fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); - if (fsc->trunc_wq == NULL) + if (!fsc->trunc_wq) goto fail_pg_inv_wq; /* set up mempools */ @@ -674,26 +692,26 @@ static int __init init_caches(void) __alignof__(struct ceph_inode_info), SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| SLAB_ACCOUNT, ceph_inode_init_once); - if (ceph_inode_cachep == NULL) + if (!ceph_inode_cachep) return -ENOMEM; ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_cachep == NULL) + if (!ceph_cap_cachep) goto bad_cap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_flush_cachep == NULL) + if (!ceph_cap_flush_cachep) goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_dentry_cachep == NULL) + if (!ceph_dentry_cachep) goto bad_dentry; ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); - if (ceph_file_cachep == NULL) + if (!ceph_file_cachep) goto bad_file; if ((error = ceph_fscache_register())) @@ -947,20 +965,10 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) return err; /* set ra_pages based on rasize mount option? */ - if (fsc->mount_options->rasize >= PAGE_SIZE) - sb->s_bdi->ra_pages = - (fsc->mount_options->rasize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - else - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - - if (fsc->mount_options->rsize > fsc->mount_options->rasize && - fsc->mount_options->rsize >= PAGE_SIZE) - sb->s_bdi->io_pages = - (fsc->mount_options->rsize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - else if (fsc->mount_options->rsize == 0) - sb->s_bdi->io_pages = ULONG_MAX; + sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT; + + /* set io_pages based on max osd read size */ + sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT; return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f02a2225fe42..279a2f401cf5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -46,12 +46,25 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */ +/* max size of osd read request, limited by libceph */ +#define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN +/* osd has a configurable limitation of max write size.
+ * CEPH_MSG_MAX_DATA_LEN should be small enough. */ +#define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the opportunity to + * update the mds to avoid sending another message later. + */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + struct ceph_mount_options { int flags; int sb_flags; @@ -61,7 +74,6 @@ struct ceph_mount_options { int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; - int cap_release_safety; int max_readdir; /* max readdir result (entries) */ int max_readdir_bytes; /* max readdir result (bytes) */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 11263f102e4c..3542b2c364cf 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -777,7 +777,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, spin_unlock(&ci->i_ceph_lock); /* security module gets xattr while filling trace */ - if (current->journal_info != NULL) { + if (current->journal_info) { pr_warn_ratelimited("sync getxattr %p " "during filling trace\n", inode); return -EBUSY; } @@ -809,7 +809,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, memcpy(value, xattr->val, xattr->val_len); - if (current->journal_info != NULL && + if (current->journal_info && !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: @@ -1058,7 +1058,7 @@ do_sync_unlocked: up_read(&mdsc->snap_rwsem); /* security module set xattr while filling trace */ - if (current->journal_info != NULL) { + if (current->journal_info) { pr_warn_ratelimited("sync setxattr %p " "during filling trace\n", inode); err = -EBUSY; } @@ -1108,7 +1108,7 @@ bool ceph_security_xattr_deadlock(struct inode *in) { struct ceph_inode_info *ci; bool ret; - if (in->i_security == NULL) + if (!in->i_security) return false; ci = ceph_inode(in); spin_lock(&ci->i_ceph_lock); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 23ebb92484c6..28de3edd4f4d 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -114,6 +114,7 @@ struct ext2_sb_info { */ spinlock_t s_lock; struct mb_cache *s_ea_block_cache; + struct dax_device *s_daxdev; }; static inline spinlock_t * diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 30163d007b2f..4dca6f348714 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -800,10 +800,10 @@ int ext2_get_block(struct inode *inode, sector_t iblock, static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { - struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; + struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb); bool new = false, boundary = false; u32 bno; int ret; @@ -814,13 +814,9 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->flags = 0; - bdev = inode->i_sb->s_bdev; - iomap->bdev = bdev; + iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64)first_block << blkbits; - if
(blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); - else - iomap->dax_dev = NULL; + iomap->dax_dev = sbi->s_daxdev; if (ret == 0) { iomap->type = IOMAP_HOLE; @@ -842,7 +838,6 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { - fs_put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7b1bc9059863..fc18edd81815 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -171,6 +171,7 @@ static void ext2_put_super (struct super_block * sb) brelse (sbi->s_sbh); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); + fs_put_dax(sbi->s_daxdev); kfree(sbi); } @@ -813,6 +814,7 @@ static unsigned long descriptor_loc(struct super_block *sb, static int ext2_fill_super(struct super_block *sb, void *data, int silent) { + struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); struct buffer_head * bh; struct ext2_sb_info * sbi; struct ext2_super_block * es; @@ -842,6 +844,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; + sbi->s_daxdev = dax_dev; spin_lock_init(&sbi->s_lock); @@ -1200,6 +1203,7 @@ failed_sbi: kfree(sbi->s_blockgroup_lock); kfree(sbi); failed: + fs_put_dax(dax_dev); return ret; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 84b9da192238..e2abe01c8c6b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1526,6 +1526,7 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. */ struct percpu_rw_semaphore s_journal_flag_rwsem; + struct dax_device *s_daxdev; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e963508ea35f..31db875bc7a1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3397,7 +3397,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { - struct block_device *bdev; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; @@ -3466,12 +3466,8 @@ retry: } iomap->flags = 0; - bdev = inode->i_sb->s_bdev; - iomap->bdev = bdev; - if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); - else - iomap->dax_dev = NULL; + iomap->bdev = inode->i_sb->s_bdev; + iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; if (ret == 0) { @@ -3504,7 +3500,6 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; - fs_put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 93aece6891f2..71b9a667e1bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -951,6 +951,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); + fs_put_dax(sbi->s_daxdev); kfree(sbi); } @@ -3398,6 +3399,7 @@ static void ext4_set_resv_clusters(struct super_block *sb) static int ext4_fill_super(struct super_block *sb, void *data, int silent) { + struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); char 
*orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; @@ -3423,6 +3425,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if ((data && !orig_data) || !sbi) goto out_free_base; + sbi->s_daxdev = dax_dev; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) @@ -4399,6 +4402,7 @@ out_fail: out_free_base: kfree(sbi); kfree(orig_data); + fs_put_dax(dax_dev); return err ? err : ret; } diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index b4b8438c42ef..436b3a1464d9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -207,15 +207,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5b876f6d3f6b..04fe1df052b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -230,8 +230,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -244,7 +245,7 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) @@ -263,6 +264,12 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -283,7 +290,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -295,7 +302,7 @@ skip_write: } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -346,7 +353,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -581,11 +588,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - int err; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + 
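/* orphan disposal may write and iput(): lift MS_RDONLY for the duration; the saved s_flags are restored at 'out' below */ +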
sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -601,14 +621,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); - return err; + goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); - return 0; +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + + return err; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) @@ -904,7 +931,14 @@ retry: if (inode) { unsigned long cur_ino = inode->i_ino; + if (is_dir) + F2FS_I(inode)->cp_task = current; + filemap_fdatawrite(inode->i_mapping); + + if (is_dir) + F2FS_I(inode)->cp_task = NULL; + iput(inode); /* We need to give cpu to another writers. */ if (ino == cur_ino) { @@ -1017,7 +1051,7 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1115,7 +1149,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1194,7 +1228,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT BITS pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1249,7 +1283,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fb96bb71da00..36b535207c88 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -457,14 +457,65 @@ out_fail: return err; } +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) 
+{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -508,8 +559,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -570,16 +621,6 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct page *page; struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -620,9 +661,7 @@ got_it: return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -756,7 +795,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; @@ -782,7 +822,7 @@ alloc: static inline bool __force_buffered_io(struct inode *inode, int rw) { - return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + return (f2fs_encrypted_file(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } @@ -814,7 +854,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -903,7 +943,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { @@ -1040,7 +1080,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1146,35 +1186,6 @@ out: return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, 
block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct bio *bio; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -1240,7 +1251,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: @@ -1271,12 +1282,11 @@ submit_and_realloc: bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1341,11 +1351,11 @@ static int encrypt_one_page(struct f2fs_io_info *fio) struct inode *inode = fio->page->mapping->host; gfp_t gfp_flags = GFP_NOFS; - if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + if (!f2fs_encrypted_file(inode)) return 0; /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, @@ -1471,7 +1481,8 @@ out: } static int __write_data_page(struct page *page, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1492,6 +1503,7 @@ static int __write_data_page(struct page *page, bool *submitted, .encrypted_page = NULL, .submitted = false, .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); @@ -1598,7 +1610,7 @@ redirty_out: static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, NULL, wbc); + return __write_data_page(page, NULL, wbc, FS_DATA_IO); } /* @@ -1607,7 +1619,8 @@ static int f2fs_write_data_page(struct page *page, * warm/hot data page. 
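 * (writeback entry points now pass an iostat_type down, so checkpoint and regular data IO can be accounted separately)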
*/ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1697,7 +1710,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_data_page(page, &submitted, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1752,8 +1765,9 @@ continue_unlock: return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1790,7 +1804,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, goto skip_write; blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); if (wbc->sync_mode == WB_SYNC_ALL) @@ -1809,6 +1823,16 @@ skip_write: return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; @@ -1858,7 +1882,7 @@ restart: set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) @@ -1956,8 +1980,8 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -1971,21 +1995,9 @@ repeat: zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); - goto fail; - } - bio->bi_opf = REQ_OP_READ; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -2075,10 +2087,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { f2fs_write_failed(mapping, offset + count); + } } trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 37f9c7f55605..c0c933ad43c8 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -705,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long 
flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -735,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 94a88b233e98..9a7c90386947 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -91,6 +91,8 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_LFS 0x00040000 #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -110,8 +112,12 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -142,6 +148,8 @@ enum { (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -190,11 +198,18 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : (blk_num - 1)) +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + enum { D_PREP, D_SUBMIT, @@ -230,11 +245,14 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. 
discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ @@ -308,6 +326,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, struct f2fs_flush_device) #define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -332,6 +351,9 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +#define F2FS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define F2FS_IOC_FSSETXATTR FS_IOC_FSSETXATTR + struct f2fs_gc_range { u32 sync; u64 start; @@ -355,16 +377,36 @@ struct f2fs_flush_device { u32 segments; /* # of segments to flush */ }; +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; static inline void make_dentry_ptr_block(struct inode *inode, @@ -372,19 +414,26 @@ static inline void make_dentry_ptr_block(struct inode *inode, { d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } static inline void make_dentry_ptr_inline(struct inode *inode, - struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) + struct f2fs_dentry_ptr *d, void *t) { + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + d->inode = inode; - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -473,12 +522,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. 
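The reworked make_dentry_ptr_inline() above carves one flat inline area into bitmap, reserved bytes, dentry slots, and filename slots, all sized per-inode by the macros above. A sketch of the layout it computes (the diagram is an annotation, not part of the patch):

 /*
  * t --> +----------------------------------------------+
  *       | dentry bitmap  (INLINE_DENTRY_BITMAP_SIZE()) |
  *       +----------------------------------------------+
  *       | reserved       (INLINE_RESERVED_SIZE())      |
  *       +----------------------------------------------+
  *       | struct f2fs_dir_entry dentry[max]            |
  *       +----------------------------------------------+
  *       | __u8 filename[max][F2FS_SLOT_LEN]            |
  *       +----------------------------------------------+
  * where max = NR_INLINE_DENTRY(inode)
  */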
@@ -521,6 +571,7 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats */ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ @@ -533,10 +584,15 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -823,6 +879,23 @@ enum need_lock_type { LOCK_RETRY, }; +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from foreground gc */ + FS_GC_NODE_IO, /* node IOs from foreground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ @@ -837,6 +910,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; #define is_read_io(rw) ((rw) == READ) @@ -1028,6 +1102,11 @@ struct f2fs_sb_info { #endif spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs support */ struct kobject s_kobj; struct completion s_kobj_unregister; @@ -1046,10 +1125,19 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1137,6 +1225,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); +
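+ /* for crc32, the 4-byte shash state is the running checksum itself: seed it with 'crc', update over the buffer, then read ctx back */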
+ desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -1760,20 +1869,38 @@ static inline bool IS_INODE(struct page *page) return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1836,6 +1963,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1864,6 +2005,8 @@ enum { FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1983,6 +2126,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -1999,6 +2144,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -2009,8 +2161,8 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return 
DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) @@ -2069,11 +2221,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + int extra_size = get_extra_isize(inode); - return (void *)&(ri->i_addr[1]); + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2164,10 +2317,50 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } +static inline int get_extra_isize(struct inode *inode) +{ + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} + /* * file.c */ @@ -2187,6 +2380,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); * inode.c */ void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); @@ -2255,6 +2450,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) @@ -2285,15 +2482,15 @@ int truncate_xattr_node(struct inode *inode, struct page *page); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); 
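The iostat bookkeeping introduced above only ever bumps the raw counters; APP_BUFFERED_IO is re-derived on every update as total app writes minus direct writes. A minimal userspace model of that accounting, with the sbi and spinlock plumbing stripped out (only a subset of the enum is kept; the helper and main() are an illustrative sketch, not kernel API)::

	#include <stdio.h>

	enum iostat_type { APP_DIRECT_IO, APP_BUFFERED_IO, APP_WRITE_IO, NR_IO_TYPE };

	static unsigned long long write_iostat[NR_IO_TYPE];

	/* Mirrors f2fs_update_iostat(): buffered IO is never counted directly,
	 * it is always total app writes minus the direct-IO share. */
	static void update_iostat(enum iostat_type type, unsigned long long bytes)
	{
		write_iostat[type] += bytes;
		if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
			write_iostat[APP_BUFFERED_IO] =
				write_iostat[APP_WRITE_IO] -
				write_iostat[APP_DIRECT_IO];
	}

	int main(void)
	{
		update_iostat(APP_WRITE_IO, 8192);	/* any app write path */
		update_iostat(APP_DIRECT_IO, 4096);	/* the O_DIRECT share */
		printf("buffered bytes: %llu\n", write_iostat[APP_BUFFERED_IO]);
		return 0;
	}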
struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *get_node_page_ra(struct page *parent, int start); void move_node_page(struct page *node_page, int gc_type); int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); @@ -2314,6 +2511,7 @@ void destroy_node_manager_caches(void); /* * segment.c */ +bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); @@ -2336,7 +2534,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); int rewrite_data_page(struct f2fs_io_info *fio); @@ -2353,8 +2552,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, @@ -2377,7 +2575,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write); + long nr_to_write, enum iostat_type io_type); void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); @@ -2430,6 +2628,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void f2fs_set_page_dirty_nobuffers(struct page *page); +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -2726,10 +2927,10 @@ void destroy_extent_cache(void); /* * sysfs.c */ -int __init f2fs_register_sysfs(void); -void f2fs_unregister_sysfs(void); -int f2fs_init_sysfs(struct f2fs_sb_info *sbi); -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi); +int __init 
f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); /* * crypto support */ @@ -2739,6 +2940,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -2761,6 +2967,21 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 843a0d99f7ea..517e112c8a9a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -98,14 +98,16 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); @@ -274,9 +276,19 @@ sync_nodes: goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, ino); - if (ret) - goto out; + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * a node chain, which serializes node blocks. If one of the node writes is + * reordered, we simply see a broken chain, which stops + * roll-forward recovery. It means we'll recover all or none of the node blocks + * given the fsync mark. 
+ */ + if (!atomic) { + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + } /* once recovery info is written, don't need to track this */ remove_ino_entry(sbi, ino, APPEND_INO); @@ -382,7 +394,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -467,9 +480,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -647,7 +664,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags; - flags = fi->i_flags & FS_FL_USER_VISIBLE; + flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); if (flags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & FS_COMPR_FL) @@ -927,7 +944,8 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -1003,8 +1021,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { @@ -1173,7 +1191,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1184,8 +1203,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not guarantee entire block * allocation. @@ -1495,33 +1514,67 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) return 0; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) - -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +static int f2fs_file_flush(struct file *file, fl_owner_t id) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; - else - return flags & F2FS_OTHER_FLMASK; + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction crashes, we should do + * a roll-back. 
Otherwise, other readers/writers can see a corrupted database + * until all the writers close the file. Since this should be done + * before dropping the file lock, it needs to be done in ->flush. + */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; } static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags = fi->i_flags & + (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); return put_user(flags, (int __user *)arg); } +static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int oldflags; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return -EPERM; + + flags = f2fs_mask_flags(inode->i_mode, flags); + + oldflags = fi->i_flags; + + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + flags = flags & (FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL); + flags |= oldflags & ~(FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL); + fi->i_flags = flags; + + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + else + clear_inode_flag(inode, FI_PROJ_INHERIT); + + inode->i_ctime = current_time(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return 0; +} + static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags; - unsigned int oldflags; int ret; if (!inode_owner_or_capable(inode)) @@ -1536,31 +1589,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode_lock(inode); - /* Is it quota file? 
Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - ret = -EPERM; - goto unlock_out; - } - - flags = f2fs_mask_flags(inode->i_mode, flags); - - oldflags = fi->i_flags; - - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - ret = -EPERM; - goto unlock_out; - } - } - - flags = flags & FS_FL_USER_MODIFIABLE; - flags |= oldflags & ~FS_FL_USER_MODIFIABLE; - fi->i_flags = flags; + ret = __f2fs_ioc_setflags(inode, flags); - inode->i_ctime = current_time(inode); - f2fs_set_inode_flags(inode); - f2fs_mark_inode_dirty_sync(inode, false); -unlock_out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1610,10 +1640,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); goto out; } inc_stat: + F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: @@ -1647,10 +1679,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: inode_unlock(inode); @@ -1786,7 +1819,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -2043,7 +2076,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -2085,7 +2118,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -2384,6 +2417,210 @@ out: return ret; } +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + +#ifdef CONFIG_QUOTA +static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct page *ipage; + kprojid_t kprojid; + int err; + + if (!f2fs_sb_has_project_quota(sb)) { + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (!f2fs_has_extra_attr(inode)) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + return 0; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = -EPERM; + inode_lock(inode); + + /* Is it quota file? 
Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto out_unlock; + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out_unlock; + } + + if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, + i_projid)) { + err = -EOVERFLOW; + f2fs_put_page(ipage, 1); + goto out_unlock; + } + f2fs_put_page(ipage, 1); + + dquot_initialize(inode); + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + + F2FS_I(inode)->i_projid = kprojid; + inode->i_ctime = current_time(inode); +out_dirty: + f2fs_mark_inode_dirty_sync(inode, true); +out_unlock: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; +} +#else +static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +{ + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 f2fs_iflags_to_xflags(unsigned long iflags) +{ + __u32 xflags = 0; + + if (iflags & FS_SYNC_FL) + xflags |= FS_XFLAG_SYNC; + if (iflags & FS_IMMUTABLE_FL) + xflags |= FS_XFLAG_IMMUTABLE; + if (iflags & FS_APPEND_FL) + xflags |= FS_XFLAG_APPEND; + if (iflags & FS_NODUMP_FL) + xflags |= FS_XFLAG_NODUMP; + if (iflags & FS_NOATIME_FL) + xflags |= FS_XFLAG_NOATIME; + if (iflags & FS_PROJINHERIT_FL) + xflags |= FS_XFLAG_PROJINHERIT; + return xflags; +} + +#define F2FS_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ + FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + +/* Flags we can manipulate through F2FS_IOC_FSSETXATTR */ +#define F2FS_FL_XFLAG_VISIBLE (FS_SYNC_FL | \ + FS_IMMUTABLE_FL | \ + FS_APPEND_FL | \ + FS_NODUMP_FL | \ + FS_NOATIME_FL | \ + FS_PROJINHERIT_FL) + +/* Transfer xflags to internal flags */ +static inline unsigned long f2fs_xflags_to_iflags(__u32 xflags) +{ + unsigned long iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= FS_SYNC_FL; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (xflags & FS_XFLAG_APPEND) + iflags |= FS_APPEND_FL; + if (xflags & FS_XFLAG_NODUMP) + iflags |= FS_NODUMP_FL; + if (xflags & FS_XFLAG_NOATIME) + iflags |= FS_NOATIME_FL; + if (xflags & FS_XFLAG_PROJINHERIT) + iflags |= FS_PROJINHERIT_FL; + + return iflags; +} + +static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags & + (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL)); + + if (f2fs_sb_has_project_quota(inode->i_sb)) + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, + fi->i_projid); + + if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} + +static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct fsxattr fa; + unsigned int flags; + int err; + + if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa))) + return -EFAULT; + + /* Make sure caller has proper permission */ + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (fa.fsx_xflags & ~F2FS_SUPPORTED_FS_XFLAGS) + return -EOPNOTSUPP; + + flags = f2fs_xflags_to_iflags(fa.fsx_xflags); + if (f2fs_mask_flags(inode->i_mode, flags) != flags) + return 
-EOPNOTSUPP; + + err = mnt_want_write_file(filp); + if (err) + return err; + + inode_lock(inode); + flags = (fi->i_flags & ~F2FS_FL_XFLAG_VISIBLE) | + (flags & F2FS_FL_XFLAG_VISIBLE); + err = __f2fs_ioc_setflags(inode, flags); + inode_unlock(inode); + mnt_drop_write_file(filp); + if (err) + return err; + + err = f2fs_ioc_setproject(filp, fa.fsx_projid); + if (err) + return err; + + return 0; +} long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -2426,6 +2663,12 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_move_range(filp, arg); case F2FS_IOC_FLUSH_DEVICE: return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); + case F2FS_IOC_FSGETXATTR: + return f2fs_ioc_fsgetxattr(filp, arg); + case F2FS_IOC_FSSETXATTR: + return f2fs_ioc_fssetxattr(filp, arg); default: return -ENOTTY; } @@ -2455,6 +2698,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); @@ -2491,6 +2737,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DEFRAGMENT: case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: + case F2FS_IOC_FSGETXATTR: + case F2FS_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; @@ -2506,6 +2755,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fa3d2e2df8e7..bfe6a8ccc3a0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,16 +28,21 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; set_freezable(); do { wait_event_interruptible_timeout(*wq, - kthread_should_stop() || freezing(current), + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; if (kthread_should_stop()) @@ -55,6 +60,9 @@ static int gc_thread_func(void *data) } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -69,19 +77,24 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. 
*/ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; + + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ @@ -93,6 +106,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; @@ -110,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake = 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -259,20 +277,11 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, valid_blocks * 2 : valid_blocks; } -static unsigned int get_ssr_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - struct seg_entry *se = get_seg_entry(sbi, segno); - - return se->ckpt_valid_blocks > se->valid_blocks ? - se->ckpt_valid_blocks : se->valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) - return get_ssr_cost(sbi, segno); + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) @@ -582,7 +591,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) @@ -590,8 +599,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) +/* + * Move a data block via META_MAPPING while keeping the data page locked. + * This can be used to move blocks, aka LBAs, directly on disk. 
+ */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -684,6 +697,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = newaddr; f2fs_submit_page_write(&fio); + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) @@ -731,6 +746,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .page = page, .encrypted_page = NULL, .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; @@ -819,8 +835,7 @@ next_step: continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -854,14 +869,18 @@ next_step: continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx, segno, off); + if (f2fs_encrypted_file(inode)) + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type, segno, off); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ -898,7 +917,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -944,6 +963,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } @@ -954,21 +977,17 @@ next: blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, true) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno) { int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; unsigned int init_segno = segno; struct gc_inode_list gc_list = { @@ -976,6 +995,15 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { @@ -1002,17 +1030,20 @@ gc_more: gc_type = FG_GC; } - ret = -EINVAL; /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) + if (gc_type == BG_GC && !background) { + ret = -EINVAL; goto stop; - if (!__get_victim(sbi, &segno, gc_type)) + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; goto stop; - ret = 0; + } - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; @@ -1029,6 +1060,16 @@ gc_more: stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { @@ -65,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait = gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e0fd4376e6fb..8322e4e7bb3f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -22,10 +22,10 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; @@ -44,6 +44,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; 
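The reworked sleep-time helpers in gc.h above switch the wait value to unsigned int and do the bounds check in a widened type, so the adjustment can neither wrap on increase nor underflow on decrease. A standalone model of the increase side, using the default bounds from this patch (the no_gc_sleep_time special case is omitted; the helper and main() are an illustrative sketch)::

	#include <stdio.h>

	#define MIN_SLEEP_MS 30000u	/* DEF_GC_THREAD_MIN_SLEEP_TIME */
	#define MAX_SLEEP_MS 60000u	/* DEF_GC_THREAD_MAX_SLEEP_TIME */

	/* Widen before adding, as increase_sleep_time() now does, so the
	 * unsigned wait value can never wrap past the cap. */
	static void increase_sleep_time(unsigned int *wait)
	{
		if ((long long)*wait + MIN_SLEEP_MS > MAX_SLEEP_MS)
			*wait = MAX_SLEEP_MS;
		else
			*wait += MIN_SLEEP_MS;
	}

	int main(void)
	{
		unsigned int wait = 45000;

		increase_sleep_time(&wait);
		printf("%u\n", wait);	/* clamped to 60000 */
		return 0;
	}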
if (PageUptodate(page)) @@ -51,12 +52,12 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) @@ -67,13 +68,13 @@ void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) + if (from >= MAX_INLINE_DATA(inode)) return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); if (from == 0) @@ -116,6 +117,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .op_flags = REQ_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; @@ -200,6 +202,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -216,11 +220,16 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -255,9 +264,9 @@ process_inline: f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,11 +294,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -300,9 +309,9 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -316,19 +325,19 @@ struct f2fs_dir_entry 
*find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; + void *inline_dentry; - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -337,11 +346,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -356,25 +366,24 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out the remainder of the dentry and filename * fields, since we have used the bitmap to mark their usage status; * besides, we can also ignore copying/zeroing the reserved space * of the dentry block, because it hasn't been used so far. 
*/ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -395,14 +404,13 @@ out: return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -444,19 +452,19 @@ punch_dentry_pages: } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -473,9 +481,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -484,7 +492,7 @@ recover: } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -500,7 +508,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *inline_dentry = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -510,10 +518,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); - bit_pos = room_for_filename(&inline_dentry->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; @@ -534,7 +543,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name, NULL); - make_dentry_ptr_inline(NULL, &d, inline_dentry); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -557,7 +565,8 @@ out: void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry 
*inline_dentry; + struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -565,11 +574,12 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 1); @@ -586,20 +596,21 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *inline_dentry; + void *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - inline_dentry = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&inline_dentry->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -609,25 +620,27 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) - ctx->pos = NR_INLINE_DENTRY; + ctx->pos = d.max; f2fs_put_page(ipage, 1); return err < 0 ? 
err : 0; @@ -652,7 +665,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -661,7 +674,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6cd312a17c69..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -49,20 +49,22 @@ void f2fs_set_inode_flags(struct inode *inode) static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,25 +73,27 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { @@ -104,12 +108,84 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + 
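+	/*
+	 * Checksum the inode in three hops so the stored value never covers
+	 * itself: everything up to i_inode_checksum, then a zeroed stand-in
+	 * (dummy_cs) for the field itself, then the rest of the block.
+	 */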
unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -153,6 +229,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -166,6 +245,16 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -292,6 +381,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -416,6 +519,9 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + /* ino == 0 if f2fs_new_inode() failed */ if (inode->i_ino) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 760d85223c81..a4dab98c4b7b 100644 --- a/fs/f2fs/namei.c +++ 
b/fs/f2fs/namei.c @@ -58,6 +58,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto fail_drop; @@ -72,6 +79,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -85,6 +97,15 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -204,6 +225,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -261,6 +287,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -724,6 +754,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; @@ -912,6 +947,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d53fe620939e..fca87835a1da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,6 +19,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include <trace/events/f2fs.h> @@ -554,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -578,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -613,7 +616,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { 
alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -654,7 +657,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -876,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -1022,11 +1028,10 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info new_ni; @@ -1170,6 +1175,11 @@ repeat: err = -EIO; goto out_err; } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " @@ -1177,9 +1187,9 @@ page_hit: nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - ClearPageUptodate(page); err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); return ERR_PTR(err); } @@ -1326,7 +1336,8 @@ continue_unlock: } static int __write_node_page(struct page *page, bool atomic, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1339,6 +1350,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .io_type = io_type, }; trace_f2fs_writepage(page, NODE); @@ -1395,6 +1407,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (submitted) *submitted = fio.submitted; + if (do_balance) + f2fs_balance_fs(sbi, false); return 0; redirty_out: @@ -1405,7 +1419,7 @@ redirty_out: static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc); + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1493,7 +1507,8 @@ continue_unlock: ret = __write_node_page(page, atomic && page == last_page, - &submitted, wbc); + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1530,7 +1545,8 @@ out: return ret ? 
-EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1608,7 +1624,8 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - ret = __write_node_page(page, false, &submitted, wbc); + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type); if (ret) unlock_page(page); else if (submitted) @@ -1697,7 +1714,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -2191,7 +2208,8 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; struct page *xpage; @@ -2207,22 +2225,22 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) recover_xnid: /* 2: update xattr nid in inode */ - remove_free_nid(sbi, new_xnid); - f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(inc_valid_node_count(sbi, inode, false))) - f2fs_bug_on(sbi, 1); + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; + + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } + + alloc_nid_done(sbi, new_xnid); update_inode_page(inode); /* 3: update and set xattr node page dirty */ - xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid); - if (!xpage) - return -ENOMEM; - - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE); + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); set_page_dirty(xpage); f2fs_put_page(xpage, 1); @@ -2262,7 +2280,14 @@ retry: dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 907d6b7dde6a..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, 
head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ retry: err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -226,18 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { + bool quota_inode = false; + if (!check_only && IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -291,7 +317,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; /* Get the previous summary */ - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; @@ -328,10 +354,18 @@ got_it: f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -361,7 +395,8 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -414,8 +449,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) @@ -557,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct list_head dir_list; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -573,11 +623,11 @@ int 
recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -586,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages to be used by the recovery */ @@ -599,8 +649,6 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ @@ -614,5 +662,12 @@ out: } kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6f8fc4a6e701..621b9b3d320b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -17,10 +17,12 @@ #include <linux/swap.h> #include <linux/timer.h> #include <linux/freezer.h> +#include <linux/sched/signal.h> #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include <trace/events/f2fs.h> @@ -167,6 +169,21 @@ found: return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -213,9 +230,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } @@ -248,6 +271,7 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } @@ -292,6 +316,7 @@ static int __commit_inmem_pages(struct inode *inode, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -309,17 +334,21 @@ static int __commit_inmem_pages(struct inode *inode, inode_dec_dirty_pages(inode); remove_dirty_inode(inode); } - +retry: fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; last_idx = page->index; @@ -481,6 +510,8 @@ repeat: if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if 
(!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -499,6 +530,8 @@ repeat: fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -519,8 +552,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return ret; } - if (!atomic_read(&fcc->issing_flush)) { - atomic_inc(&fcc->issing_flush); + if (atomic_inc_return(&fcc->issing_flush) == 1) { ret = submit_flush_wait(sbi); atomic_dec(&fcc->issing_flush); @@ -530,18 +562,39 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) init_completion(&cmd.wait); - atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->issing_flush); } else { - llist_del_all(&fcc->issue_list); - atomic_set(&fcc->issing_flush, 0); + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } } return cmd.ret; @@ -778,11 +831,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, sentry = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blk); - size = min((unsigned long)(end - blk), max_blocks); + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); - blk += size; + blk = START_BLOCK(sbi, segno + 1); } #endif } @@ -815,6 +871,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, submit_bio(bio); list_move_tail(&dc->list, &dcc->wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); } } else { __remove_discard_cmd(sbi, dc); @@ -996,32 +1054,81 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int i, iter = 0; + int iter = 0, issued = 0; + int i; + bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - if (!issue_cond || is_idle(sbi)) + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { + __submit_discard_cmd(sbi, dc); + issued++; + + if (fatal_signal_pending(current)) + break; + continue; + } + + if (!issue_cond) { __submit_discard_cmd(sbi, dc); - if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + issued++; + continue; + } + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; 
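+			/* under issue_cond, discards are submitted only
+			 * while the device is idle; a busy device marks
+			 * this pass as interrupted instead
+			 */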
+ } else { + io_interrupted = true; + } + + if (++iter >= DISCARD_ISSUE_RATE) goto out; } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); } out: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); } static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, @@ -1102,34 +1209,63 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super */ +/* This comes from f2fs_put_super and f2fs_trim_fs */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); __wait_discard_cmd(sbi, false); } +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; set_freezable(); do { - wait_event_interruptible(*q, kthread_should_stop() || - freezing(current) || - atomic_read(&dcc->discard_cmd_cnt)); + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); if (try_to_freeze()) continue; if (kthread_should_stop()) return 0; - __issue_discard_cmd(sbi, true); - __wait_discard_cmd(sbi, true); + if (dcc->discard_wake) { + dcc->discard_wake = 0; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } + + sb_end_intwrite(sbi->sb); - congestion_wait(BLK_RW_SYNC, HZ/50); } while (!kthread_should_stop()); return 0; } @@ -1320,7 +1456,8 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; @@ -1402,11 +1539,11 @@ skip: goto find_next; list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= total_len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); + wake_up_discard_thread(sbi, false); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) @@ -1424,9 +1561,13 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; + 
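+	/* pending lists whose discard length is at least the default
+	 * granularity start out tagged P_ACTIVE in the loop below
+	 */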
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) + for (i = 0; i < MAX_PLIST_NUM; i++) { INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); @@ -1491,6 +1632,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -1507,17 +1652,25 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } #endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; @@ -1528,17 +1681,25 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se->ckpt_valid_blocks++; } } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } #endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -1900,7 +2061,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1921,12 +2082,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + 
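+	/* the old reuse flag is gone: the summary block of the target
+	 * segment is now copied in unconditionally
+	 */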
f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) @@ -1990,7 +2149,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -2083,6 +2242,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -2202,9 +2364,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + if (add_list) { struct f2fs_bio_info *io; @@ -2236,7 +2401,8 @@ reallocate: } } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, @@ -2255,6 +2421,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) set_page_writeback(page); f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -2263,6 +2431,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -2276,13 +2446,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - return f2fs_submit_page_bio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -2324,7 +2503,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -2343,7 +2522,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } @@ -2382,8 +2561,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 6b871b492fd5..e0a6cc23ace3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -492,29 +492,11 @@ static inline int 
overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) -{ - return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - - if (test_opt(sbi, LFS)) - return false; - - return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); -} - static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { @@ -577,6 +559,10 @@ static inline bool need_inplace_update_policy(struct inode *inode, if (test_opt(sbi, LFS)) return false; + /* if this is a cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -799,3 +785,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 32e4c025e97e..89f61eb3d167 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -25,6 +25,7 @@ #include <linux/quotaops.h> #include <linux/f2fs_fs.h> #include <linux/sysfs.h> +#include <linux/quota.h> #include "f2fs.h" #include "node.h" @@ -107,8 +108,20 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, Opt_usrquota, Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -144,8 +157,20 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; @@ -157,7 +182,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...)
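	/* ratelimited below so repeated messages cannot flood the kernel log */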
va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -168,6 +193,104 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. 
" + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; +} +#endif + static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -175,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -386,15 +512,76 @@ static int parse_options(struct super_block *sb, char *options) sb->s_flags &= ~MS_LAZYTIME; break; #ifdef CONFIG_QUOTA + case Opt_quota: case Opt_usrquota: set_opt(sbi, USRQUOTA); break; case Opt_grpquota: set_opt(sbi, GRPQUOTA); break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; #else + case Opt_quota: case Opt_usrquota: case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: f2fs_msg(sb, KERN_INFO, "quota operations not supported"); break; @@ -406,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_msg(sb, KERN_ERR, @@ -439,6 +630,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); #ifdef CONFIG_QUOTA memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); @@ -446,6 +638,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) #endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } @@ -584,7 +777,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) 
kfree(sbi->devs); } -static void f2fs_quota_off_umount(struct super_block *sb); static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -642,7 +834,7 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->ckpt); - f2fs_exit_sysfs(sbi); + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -651,6 +843,10 @@ static void f2fs_put_super(struct super_block *sb) destroy_device_list(sbi); mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); @@ -664,6 +860,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -698,6 +897,48 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -733,9 +974,49 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -809,11 +1090,16 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) sbi->fault_info.inject_rate); #endif #ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); if (test_opt(sbi, USRQUOTA)) seq_puts(seq, ",usrquota"); if (test_opt(sbi, GRPQUOTA)) seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); #endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } @@ -862,6 +1148,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we @@ -871,6 +1162,23 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -952,6 +1260,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
MS_POSIXACL : 0); @@ -966,6 +1279,13 @@ restore_gc: stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; @@ -1065,7 +1385,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, } if (len == towrite) - return err; + return 0; inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -1082,6 +1402,27 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) return &F2FS_I(inode)->i_reserved_quota; } +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + static int f2fs_quota_sync(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); @@ -1119,7 +1460,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, struct inode *inode; int err; - err = f2fs_quota_sync(sb, -1); + err = f2fs_quota_sync(sb, type); if (err) return err; @@ -1147,7 +1488,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - f2fs_quota_sync(sb, -1); + f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); if (err) @@ -1163,7 +1504,7 @@ out_put: return err; } -static void f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { int type; @@ -1171,6 +1512,12 @@ static void f2fs_quota_off_umount(struct super_block *sb) f2fs_quota_off(sb, type); } +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} + static const struct dquot_operations f2fs_quota_operations = { .get_reserved_space = f2fs_get_reserved_space, .write_dquot = dquot_commit, @@ -1180,6 +1527,7 @@ static const struct dquot_operations f2fs_quota_operations = { .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = f2fs_get_projid, .get_next_id = dquot_get_next_id, }; @@ -1194,12 +1542,12 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .get_nextdqblk = dquot_get_next_dqblk, }; #else -static inline void f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { } #endif -static struct super_operations f2fs_sops = { +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, @@ -1303,9 +1651,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result was equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs tries to reserve more + * space in inode.i_addr, so it is safer to reset + * result to zero. 
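+ * (the new extra attribute area may claim part of i_addr, so the old
+ * constant could overstate the number of usable address slots)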
+ */ + /* two direct node blocks */ result += (leaf_count * 2); @@ -1922,6 +2277,11 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware @@ -1956,7 +2316,7 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; sb->s_qcop = &f2fs_quotactl_ops; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif sb->s_op = &f2fs_sops; @@ -1980,6 +2340,10 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1: NR_TEMP_TYPE; int j; @@ -2098,11 +2462,6 @@ try_onemore: if (err) goto free_nm; - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); - if (err) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { @@ -2122,10 +2481,15 @@ try_onemore: goto free_root_inode; } - err = f2fs_init_sysfs(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; + /* if there are any orphan nodes, free them */ + err = recover_orphan_inodes(sbi); + if (err) + goto free_sysfs; + /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { /* @@ -2135,7 +2499,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_sysfs; + goto free_meta; } if (need_fsck) @@ -2149,7 +2513,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_sysfs; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -2173,7 +2537,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_sysfs; + goto free_meta; } kfree(options); @@ -2191,9 +2555,17 @@ skip_recovery: f2fs_update_time(sbi, REQ_TIME); return 0; -free_sysfs: +free_meta: f2fs_sync_inode_meta(sbi); - f2fs_exit_sysfs(sbi); + /* + * Some dirty meta pages can be produced when recover_orphan_inodes() + * fails with EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2202,13 +2574,6 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); - /* - * Some dirty meta pages can be produced by recover_orphan_inodes() - * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). 
- */ - truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); f2fs_destroy_stats(sbi); @@ -2228,6 +2593,10 @@ free_options: for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); @@ -2311,7 +2680,7 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - err = f2fs_register_sysfs(); + err = f2fs_init_sysfs(); if (err) goto free_extent_cache; err = register_shrinker(&f2fs_shrinker_info); @@ -2330,7 +2699,7 @@ free_filesystem: free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_sysfs: - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2350,7 +2719,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 71191d89917d..e2c258f717cd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -18,7 +18,6 @@ #include "gc.h" static struct proc_dir_entry *f2fs_proc_root; -static struct kset *f2fs_kset; /* Sysfs support for f2fs */ enum { @@ -41,6 +40,7 @@ struct f2fs_attr { const char *, size_t); int struct_type; int offset; + int id; }; static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) @@ -76,6 +76,34 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, BD_PART_WRITTEN(sbi))); } +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -124,7 +152,39 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, spin_unlock(&sbi->stat_lock); return count; } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + + *ui = t; + return count; + } + *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + return count; } @@ -155,6 +215,30 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -172,12 +256,23 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); @@ -191,20 +286,36 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); #ifdef 
CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), ATTR_LIST(gc_min_sleep_time), ATTR_LIST(gc_max_sleep_time), ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), @@ -217,26 +328,59 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), ATTR_LIST(reserved_blocks), NULL, }; +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, }; -static struct kobj_type f2fs_ktype = { +static struct kobj_type f2fs_sb_ktype = { .default_attrs = f2fs_attrs, .sysfs_ops = &f2fs_attr_ops, .release = f2fs_sb_release, }; +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + static int segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; @@ -288,6 +432,48 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) return 0; } +static int iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, 
"fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + #define F2FS_PROC_FILE_DEF(_name) \ static int _name##_open_fs(struct inode *inode, struct file *file) \ { \ @@ -303,28 +489,47 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); -int __init f2fs_register_sysfs(void) +int __init f2fs_init_sysfs(void) { - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + int ret; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) - return -ENOMEM; - return 0; + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; } -void f2fs_unregister_sysfs(void) +void f2fs_exit_sysfs(void) { - kset_unregister(f2fs_kset); + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; } -int f2fs_init_sysfs(struct f2fs_sb_info *sbi) +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) { struct super_block *sb = sbi->sb; int err; + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -333,33 +538,19 @@ int f2fs_init_sysfs(struct f2fs_sb_info *sbi) &f2fs_seq_segment_info_fops, sb); proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); - if (err) - goto err_out; return 0; -err_out: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - return err; } -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + kobject_del(&sbi->s_kobj); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 832c5110abab..7c65540148f8 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -442,7 +442,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); @@ -473,8 +473,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return 
-ERANGE; + down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -503,7 +505,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -686,7 +690,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); diff --git a/fs/fcntl.c b/fs/fcntl.c index 3b01b646e528..0491da3b28c3 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -741,10 +741,21 @@ static void send_sigio_to_task(struct task_struct *p, si.si_signo = signum; si.si_errno = 0; si.si_code = reason; + /* + * POSIX defines POLL_IN and friends to be signal + * specific si_codes for SIGPOLL. Linux extended + * these si_codes to other signals in a way that is + * ambiguous if other signals also have signal + * specific si_codes. In that case use SI_SIGIO instead + * to remove the ambiguity. + */ + if (sig_specific_sicodes(signum)) + si.si_code = SI_SIGIO; + /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ - BUG_ON((reason & __SI_MASK) != __SI_POLL); + BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL)); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index c5b6b7165489..e9e97803442a 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt) static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb); loff_t pos = 0; return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); @@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb); loff_t pos = 0; /* * No locking or generic_write_checks(), the server is diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c16d00e53264..13c65dd2d37d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1222,9 +1222,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_in *in; unsigned reqsize; - if (task_active_pid_ns(current) != fc->pid_ns) - return -EIO; - restart: spin_lock(&fiq->waitq.lock); err = -EAGAIN; @@ -1262,6 +1259,13 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, in = &req->in; reqsize = in->h.len; + + if (task_active_pid_ns(current) != fc->pid_ns) { + rcu_read_lock(); + in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns)); + rcu_read_unlock(); + } + /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; @@ -1823,9 +1827,6 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_req *req; struct fuse_out_header oh; - if (task_active_pid_ns(current) != fc->pid_ns) - return -EIO; - if
(nbytes < sizeof(struct fuse_out_header)) return -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 00800c07ba1c..622081b97426 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -923,33 +923,29 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, return err; } -int fuse_update_attributes(struct inode *inode, struct kstat *stat, - struct file *file, bool *refreshed) +static int fuse_update_get_attr(struct inode *inode, struct file *file, + struct kstat *stat) { struct fuse_inode *fi = get_fuse_inode(inode); - int err; - bool r; + int err = 0; if (time_before64(fi->i_time, get_jiffies_64())) { - r = true; forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); - } else { - r = false; - err = 0; - if (stat) { - generic_fillattr(inode, stat); - stat->mode = fi->orig_i_mode; - stat->ino = fi->orig_ino; - } + } else if (stat) { + generic_fillattr(inode, stat); + stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; } - if (refreshed != NULL) - *refreshed = r; - return err; } +int fuse_update_attributes(struct inode *inode, struct file *file) +{ + return fuse_update_get_attr(inode, file, NULL); +} + int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, u64 child_nodeid, struct qstr *name) { @@ -1786,7 +1782,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_attributes(inode, stat, NULL, NULL); + return fuse_update_get_attr(inode, NULL, stat); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d66789804287..cb7dff5c45d7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -645,7 +645,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { - struct file *file = io->file; + struct file *file = io->iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -707,7 +707,8 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, static int fuse_do_readpage(struct file *file, struct page *page) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); + struct kiocb iocb; + struct fuse_io_priv io; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; @@ -735,6 +736,8 @@ static int fuse_do_readpage(struct file *file, struct page *page) req->num_pages = 1; req->pages[0] = page; req->page_descs[0].length = count; + init_sync_kiocb(&iocb, file); + io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb); num_read = fuse_send_read(req, &io, pos, count, NULL); err = req->out.h.error; @@ -923,7 +926,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (fc->auto_inval_data || (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; - err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); + err = fuse_update_attributes(inode, iocb->ki_filp); if (err) return err; } @@ -957,13 +960,18 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { - struct file *file = io->file; + struct kiocb *iocb = io->iocb; + struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; struct fuse_write_in *inarg = 
&req->misc.write.in; fuse_write_fill(req, ff, pos, count); inarg->flags = file->f_flags; + if (iocb->ki_flags & IOCB_DSYNC) + inarg->flags |= O_DSYNC; + if (iocb->ki_flags & IOCB_SYNC) + inarg->flags |= O_SYNC; if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); @@ -993,14 +1001,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) return ret; } -static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, +static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, struct inode *inode, loff_t pos, size_t count) { size_t res; unsigned offset; unsigned i; - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); for (i = 0; i < req->num_pages; i++) fuse_wait_on_page_writeback(inode, req->pages[i]->index); @@ -1100,7 +1108,7 @@ static inline unsigned fuse_wr_pages(loff_t pos, size_t len) FUSE_MAX_PAGES_PER_REQ); } -static ssize_t fuse_perform_write(struct file *file, +static ssize_t fuse_perform_write(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *ii, loff_t pos) { @@ -1133,7 +1141,7 @@ static ssize_t fuse_perform_write(struct file *file, } else { size_t num_written; - num_written = fuse_send_write_pages(req, file, inode, + num_written = fuse_send_write_pages(req, iocb, inode, pos, count); err = req->out.h.error; if (!err) { @@ -1169,7 +1177,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (get_fuse_conn(inode)->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ - err = fuse_update_attributes(mapping->host, NULL, file, NULL); + err = fuse_update_attributes(mapping->host, file); if (err) return err; @@ -1201,7 +1209,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) pos += written; - written_buffered = fuse_perform_write(file, mapping, from, pos); + written_buffered = fuse_perform_write(iocb, mapping, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1220,13 +1228,15 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) written += written_buffered; iocb->ki_pos = pos + written_buffered; } else { - written = fuse_perform_write(file, mapping, from, iocb->ki_pos); + written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); if (written >= 0) iocb->ki_pos += written; } out: current->backing_dev_info = NULL; inode_unlock(inode); + if (written > 0) + written = generic_write_sync(iocb, written); return written ? 
written : err; } @@ -1317,7 +1327,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, { int write = flags & FUSE_DIO_WRITE; int cuse = flags & FUSE_DIO_CUSE; - struct file *file = io->file; + struct file *file = io->iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -1399,8 +1409,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, loff_t *ppos) { ssize_t res; - struct file *file = io->file; - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(io->iocb->ki_filp); if (is_bad_inode(inode)) return -EIO; @@ -1414,15 +1423,14 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); return __fuse_direct_read(&io, to, &iocb->ki_pos); } static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; if (is_bad_inode(inode)) @@ -2181,9 +2189,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; - if (pid && pid_nr == 0) - return -EOVERFLOW; - fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fc, &args); @@ -2303,7 +2308,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); fallback: - err = fuse_update_attributes(inode, NULL, file, NULL); + err = fuse_update_attributes(inode, file); if (!err) return generic_file_llseek(file, offset, whence); else @@ -2323,7 +2328,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) break; case SEEK_END: inode_lock(inode); - retval = fuse_update_attributes(inode, NULL, file, NULL); + retval = fuse_update_attributes(inode, file); if (!retval) retval = generic_file_llseek(file, offset, whence); inode_unlock(inode); @@ -2874,7 +2879,6 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) io->offset = offset; io->write = (iov_iter_rw(iter) == WRITE); io->err = 0; - io->file = file; /* * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. 
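The fuse hunks above replace the struct file pointer in fuse_io_priv with the originating kiocb, so per-I/O state (IOCB_DSYNC/IOCB_SYNC, and the file itself via ki_filp) travels with the request. For a caller that only has a struct file, the resulting pattern mirrors what fuse_do_readpage() does above with init_sync_kiocb(); a minimal sketch, with the function name being illustrative rather than from the patch:

	static struct file *example_io_file(struct file *file)
	{
		struct kiocb iocb;
		struct fuse_io_priv io;

		/* wrap the file in a synchronous kiocb, as fuse_do_readpage() does */
		init_sync_kiocb(&iocb, file);
		io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb);

		/* the file is still reachable where the I/O helpers need it */
		return io.iocb->ki_filp;
	}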
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bd4d2a3e1ec1..d5773ca67ad2 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -252,16 +252,15 @@ struct fuse_io_priv { bool should_dirty; int err; struct kiocb *iocb; - struct file *file; struct completion *done; bool blocking; }; -#define FUSE_IO_PRIV_SYNC(f) \ +#define FUSE_IO_PRIV_SYNC(i) \ { \ .refcnt = KREF_INIT(1), \ .async = 0, \ - .file = f, \ + .iocb = i, \ } /** @@ -905,8 +904,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); void fuse_update_ctime(struct inode *inode); -int fuse_update_attributes(struct inode *inode, struct kstat *stat, - struct file *file, bool *refreshed); +int fuse_update_attributes(struct inode *inode, struct file *file); void fuse_flush_writepages(struct inode *inode); diff --git a/fs/inode.c b/fs/inode.c index 210054157a49..d1e35b53bb23 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1570,11 +1570,24 @@ EXPORT_SYMBOL(bmap); static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, bool rcu) { - if (!rcu) { - struct inode *realinode = d_real_inode(dentry); + struct dentry *upperdentry; - if (unlikely(inode != realinode) && - (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || + /* + * Nothing to do if in rcu or if non-overlayfs + */ + if (rcu || likely(!(dentry->d_flags & DCACHE_OP_REAL))) + return; + + upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); + + /* + * If file is on lower then we can't update atime, so no worries about + * stale mtime/ctime. + */ + if (upperdentry) { + struct inode *realinode = d_inode(upperdentry); + + if ((!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { inode->i_mtime = realinode->i_mtime; inode->i_ctime = realinode->i_ctime; diff --git a/fs/internal.h b/fs/internal.h index fedfe94d84ba..48cee21b4f14 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -71,8 +71,10 @@ extern void __init mnt_init(void); extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); +extern int mnt_want_write_file_path(struct file *); extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); +extern void mnt_drop_write_file_path(struct file *); /* * fs_struct.c diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 27d577dbe51a..96c1d14c18f1 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -235,12 +235,8 @@ reclaimer(void *ptr) struct net *net = host->net; req = kmalloc(sizeof(*req), GFP_KERNEL); - if (!req) { - printk(KERN_ERR "lockd: reclaimer unable to alloc memory." - " Locks for %s won't be reclaimed!\n", - host->h_name); + if (!req) return 0; - } allow_signal(SIGKILL); diff --git a/fs/namespace.c b/fs/namespace.c index f8893dc6a989..df0f7521979a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -431,13 +431,18 @@ int __mnt_want_write_file(struct file *file) } /** - * mnt_want_write_file - get write access to a file's mount + * mnt_want_write_file_path - get write access to a file's mount * @file: the file whose mount is to be written to * * This is like mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already + * + * Called by the vfs for cases when we have an open file at hand, but will do an + * inode operation on it (important distinction for files opened on overlayfs, + * since the file operations will come from the real underlying file, while + * inode operations come from the overlay).
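To make the intended split concrete: the path-based variants above are for VFS callers about to perform an inode operation, while plain mnt_want_write_file()/mnt_drop_write_file() (reworked just below) are for ioctl-style callers that will write through the open file. A hypothetical ioctl handler, purely illustrative and not part of the patch, would pair them like this:

	static long example_ioctl_setflags(struct file *filp, unsigned long arg)
	{
		struct inode *inode = file_inode(filp);
		int err;

		/* on overlayfs this now fails with -EPERM if the file still
		 * refers to a read-only lower layer */
		err = mnt_want_write_file(filp);
		if (err)
			return err;

		inode_lock(inode);
		/* ... update the inode's flags from arg ... */
		inode_unlock(inode);

		mnt_drop_write_file(filp);
		return 0;
	}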
*/ -int mnt_want_write_file(struct file *file) +int mnt_want_write_file_path(struct file *file) { int ret; @@ -447,6 +452,53 @@ int mnt_want_write_file(struct file *file) sb_end_write(file->f_path.mnt->mnt_sb); return ret; } + +static inline int may_write_real(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct dentry *upperdentry; + + /* Writable file? */ + if (file->f_mode & FMODE_WRITER) + return 0; + + /* Not overlayfs? */ + if (likely(!(dentry->d_flags & DCACHE_OP_REAL))) + return 0; + + /* File refers to upper, writable layer? */ + upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); + if (upperdentry && file_inode(file) == d_inode(upperdentry)) + return 0; + + /* Lower layer: can't write to real file, sorry... */ + return -EPERM; +} + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file whose mount is to be written to + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + * + * Mostly called by filesystems from their ioctl operation before performing + * a modification. On overlayfs this needs to check if the file is on a read-only + * lower layer and deny access in that case. + */ +int mnt_want_write_file(struct file *file) +{ + int ret; + + ret = may_write_real(file); + if (!ret) { + sb_start_write(file_inode(file)->i_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file_inode(file)->i_sb); + } + return ret; +} EXPORT_SYMBOL_GPL(mnt_want_write_file); /** @@ -484,10 +536,16 @@ void __mnt_drop_write_file(struct file *file) __mnt_drop_write(file->f_path.mnt); } -void mnt_drop_write_file(struct file *file) +void mnt_drop_write_file_path(struct file *file) { mnt_drop_write(file->f_path.mnt); } + +void mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); + sb_end_write(file_inode(file)->i_sb); +} EXPORT_SYMBOL(mnt_drop_write_file); static int mnt_make_readonly(struct mount *mnt) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 5427cdf04c5a..14358de173fb 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -51,7 +51,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, goto out_iput; res->size = i_size_read(inode); res->change_attr = delegation->change_attr; - if (nfsi->nrequests != 0) + if (nfs_have_writebacks(inode)) res->change_attr++; res->ctime = inode->i_ctime; res->mtime = inode->i_mtime; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d7df5e67b0c1..606dd3871f66 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1089,7 +1089,7 @@ bool nfs4_delegation_flush_on_close(const struct inode *inode) delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL || !(delegation->type & FMODE_WRITE)) goto out; - if (nfsi->nrequests < delegation->pagemod_limit) + if (atomic_long_read(&nfsi->nrequests) < delegation->pagemod_limit) ret = false; out: rcu_read_unlock(); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3522b1249019..5ceaeb1f6fb6 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2260,7 +2260,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str spin_lock(&inode->i_lock); retry = false; } - res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); @@ -2296,7 +2295,6 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto
out; - res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; err = 0; @@ -2344,7 +2342,6 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); - cache->jiffies = set->jiffies; cache->cred = get_rpccred(set->cred); cache->mask = set->mask; @@ -2432,7 +2429,6 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE | NFS_MAY_WRITE | NFS_MAY_READ; cache.cred = cred; - cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); if (status != 0) { if (status == -ESTALE) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6fb9fad2d1e6..d2972d537469 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -616,13 +616,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); #ifdef CONFIG_NFS_V4_1 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); #endif nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index af330c31f627..a385d1c3f146 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -631,11 +631,11 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result <= 0) goto out; - result = generic_write_sync(iocb, result); - if (result < 0) - goto out; written = result; iocb->ki_pos += written; + result = generic_write_sync(iocb, written); + if (result < 0) + goto out; /* Return error values */ if (nfs_need_check_write(file, inode)) { @@ -744,15 +744,18 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) goto out; /* - * Revalidate the cache if the server has time stamps granular - * enough to detect subsecond changes. Otherwise, clear the - * cache to prevent missing any changes. + * Invalidate cache to prevent missing any changes. If + * the file is mapped, clear the page cache as well so + * those mappings will be loaded. * * This makes locking act as a cache coherency point. 
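From userspace, the coherency point described above gives POSIX locks the usual NFS guarantee: data read while holding a lock is at least as fresh as the moment the lock was granted. A sketch of the client-side contract, using ordinary libc calls (not part of the patch):

	#include <fcntl.h>
	#include <unistd.h>

	static ssize_t locked_read(int fd, void *buf, size_t len)
	{
		struct flock fl = {
			.l_type   = F_RDLCK,
			.l_whence = SEEK_SET,
			.l_start  = 0,
			.l_len    = 0,	/* whole file */
		};

		/* blocking lock: on success the client has flushed and
		 * invalidated its cached pages (and, with this change,
		 * revalidated mapped pages), so the read is server-fresh */
		if (fcntl(fd, F_SETLKW, &fl) == -1)
			return -1;

		return read(fd, buf, len);
	}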
*/ nfs_sync_mapping(filp->f_mapping); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { nfs_zap_caches(inode); + if (mapping_mapped(filp->f_mapping)) + nfs_revalidate_mapping(inode, filp->f_mapping); + } out: return status; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 109279d6d91b..134d9f560240 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1285,7 +1285,6 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); unsigned long ret = 0; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) @@ -1315,7 +1314,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->nrequests == 0) { + && !nfs_have_writebacks(inode)) { i_size_write(inode, nfs_size_to_loff_t(fattr->size)); ret |= NFS_INO_INVALID_ATTR; } @@ -1823,7 +1822,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if (nfsi->nrequests == 0 || new_isize > cur_isize) { + if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; @@ -2012,10 +2011,11 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_LIST_HEAD(&nfsi->commit_info.list); - nfsi->nrequests = 0; - nfsi->commit_info.ncommit = 0; + atomic_long_set(&nfsi->nrequests, 0); + atomic_long_set(&nfsi->commit_info.ncommit, 0); atomic_set(&nfsi->commit_info.rpcs_out, 0); init_rwsem(&nfsi->rmdir_sem); + mutex_init(&nfsi->commit_mutex); nfs4_init_once(nfsi); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index dc456416d2be..68cc22083639 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -251,7 +251,6 @@ int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); -void nfs_pgio_data_destroy(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 40bd05f05e74..ac4f10b7f6c1 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -303,6 +303,17 @@ _nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, struct rpc_cred *newcred = NULL; rpc_authflavor_t flavor; + if (sp4_mode == NFS_SP4_MACH_CRED_CLEANUP || + sp4_mode == NFS_SP4_MACH_CRED_PNFS_CLEANUP) { + /* Using machine creds for cleanup operations + * is only relevant if the client credentials + * might expire. So don't bother for + * RPC_AUTH_UNIX. If the file was only exported to + * sec=sys, the PUTFH would fail anyway.
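Stepping back to the nfs/inode.c hunks above: open-coded tests of nfsi->nrequests give way to nfs_have_writebacks() and the counter becomes an atomic_long_t. Judging from the call sites shown here, the helper is presumably along these lines (a sketch inferred from usage, not quoted from the patch):

	static inline bool nfs_have_writebacks(const struct inode *inode)
	{
		/* lock-free now that nrequests is an atomic_long_t */
		return atomic_long_read(&NFS_I(inode)->nrequests) != 0;
	}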
+ */ + if ((*clntp)->cl_auth->au_flavor == RPC_AUTH_UNIX) + return false; + } if (test_bit(sp4_mode, &clp->cl_sp4_flags)) { spin_lock(&clp->cl_lock); if (clp->cl_machine_cred != NULL) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d90132642340..6c61e2b99635 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1659,12 +1659,52 @@ update: return state; } +static struct inode * +nfs4_opendata_get_inode(struct nfs4_opendata *data) +{ + struct inode *inode; + + switch (data->o_arg.claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + if (!(data->f_attr.valid & NFS_ATTR_FATTR)) + return ERR_PTR(-EAGAIN); + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, + &data->f_attr, data->f_label); + break; + default: + inode = d_inode(data->dentry); + ihold(inode); + nfs_refresh_inode(inode, &data->f_attr); + } + return inode; +} + static struct nfs4_state * -_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data) { + struct nfs4_state *state; struct inode *inode; - struct nfs4_state *state = NULL; - int ret; + + inode = nfs4_opendata_get_inode(data); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (data->state != NULL && data->state->inode == inode) { + state = data->state; + atomic_inc(&state->count); + } else + state = nfs4_get_open_state(inode, data->owner); + iput(inode); + if (state == NULL) + state = ERR_PTR(-ENOMEM); + return state; +} + +static struct nfs4_state * +_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + struct nfs4_state *state; if (!data->rpc_done) { state = nfs4_try_open_cached(data); @@ -1672,29 +1712,17 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) goto out; } - ret = -EAGAIN; - if (!(data->f_attr.valid & NFS_ATTR_FATTR)) - goto err; - inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label); - ret = PTR_ERR(inode); - if (IS_ERR(inode)) - goto err; - ret = -ENOMEM; - state = nfs4_get_open_state(inode, data->owner); - if (state == NULL) - goto err_put_inode; + state = nfs4_opendata_find_nfs4_state(data); + if (IS_ERR(state)) + goto out; + if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); - iput(inode); out: nfs_release_seqid(data->o_arg.seqid); return state; -err_put_inode: - iput(inode); -err: - return ERR_PTR(ret); } static struct nfs4_state * @@ -2071,7 +2099,6 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; case NFS4_OPEN_CLAIM_FH: task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; - nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; if (nfs4_setup_sequence(data->o_arg.server->nfs_client, @@ -2258,7 +2285,6 @@ static int nfs4_opendata_access(struct rpc_cred *cred, mask = NFS4_ACCESS_READ; cache.cred = cred; - cache.jiffies = jiffies; nfs_access_set_mask(&cache, opendata->o_res.access_result); nfs_access_add_cache(state->inode, &cache); @@ -7318,7 +7344,9 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, 1 << (OP_DESTROY_SESSION - 32) | 1 << (OP_DESTROY_CLIENTID - 32) }; + unsigned long flags = 0; unsigned int i; + int ret = 0; if (sp->how == SP4_MACH_CRED) { /* Print state protect result */ @@ -7334,7 +7362,8 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) { if (sp->enforce.u.words[i] & 
~supported_enforce[i]) { dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } } @@ -7353,10 +7382,11 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) { dfprintk(MOUNT, "sp4_mach_cred:\n"); dfprintk(MOUNT, " minimal mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_MINIMAL, &flags); } else { dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (test_bit(OP_CLOSE, sp->allow.u.longs) && @@ -7364,110 +7394,46 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, test_bit(OP_DELEGRETURN, sp->allow.u.longs) && test_bit(OP_LOCKU, sp->allow.u.longs)) { dfprintk(MOUNT, " cleanup mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_CLEANUP, &flags); } if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) { dfprintk(MOUNT, " pnfs cleanup mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, - &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, &flags); } if (test_bit(OP_SECINFO, sp->allow.u.longs) && test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { dfprintk(MOUNT, " secinfo mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_SECINFO, &flags); } if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) && test_bit(OP_FREE_STATEID, sp->allow.u.longs)) { dfprintk(MOUNT, " stateid mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_STATEID, &flags); } if (test_bit(OP_WRITE, sp->allow.u.longs)) { dfprintk(MOUNT, " write mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_WRITE, &flags); } if (test_bit(OP_COMMIT, sp->allow.u.longs)) { dfprintk(MOUNT, " commit mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_COMMIT, &flags); } } - +out: + clp->cl_sp4_flags = flags; return 0; } struct nfs41_exchange_id_data { struct nfs41_exchange_id_res res; struct nfs41_exchange_id_args args; - struct rpc_xprt *xprt; - int rpc_status; }; -static void nfs4_exchange_id_done(struct rpc_task *task, void *data) -{ - struct nfs41_exchange_id_data *cdata = - (struct nfs41_exchange_id_data *)data; - struct nfs_client *clp = cdata->args.client; - int status = task->tk_status; - - trace_nfs4_exchange_id(clp, status); - - if (status == 0) - status = nfs4_check_cl_exchange_flags(cdata->res.flags); - - if (cdata->xprt && status == 0) { - status = nfs4_detect_session_trunking(clp, &cdata->res, - cdata->xprt); - goto out; - } - - if (status == 0) - status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect); - - if (status == 0) { - clp->cl_clientid = cdata->res.clientid; - clp->cl_exchange_flags = cdata->res.flags; - clp->cl_seqid = cdata->res.seqid; - /* Client ID is not confirmed */ - if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) - clear_bit(NFS4_SESSION_ESTABLISHED, - &clp->cl_session->session_state); - - kfree(clp->cl_serverowner); - clp->cl_serverowner = cdata->res.server_owner; - cdata->res.server_owner = NULL; - - /* use the most recent implementation id */ - kfree(clp->cl_implid); - clp->cl_implid = cdata->res.impl_id; - cdata->res.impl_id = NULL; - - if (clp->cl_serverscope != NULL && - !nfs41_same_server_scope(clp->cl_serverscope, - cdata->res.server_scope)) { - dprintk("%s: server_scope mismatch detected\n", - __func__); - 
set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); - kfree(clp->cl_serverscope); - clp->cl_serverscope = NULL; - } - - if (clp->cl_serverscope == NULL) { - clp->cl_serverscope = cdata->res.server_scope; - cdata->res.server_scope = NULL; - } - /* Save the EXCHANGE_ID verifier session trunk tests */ - memcpy(clp->cl_confirm.data, cdata->args.verifier.data, - sizeof(clp->cl_confirm.data)); - } -out: - cdata->rpc_status = status; - return; -} - static void nfs4_exchange_id_release(void *data) { struct nfs41_exchange_id_data *cdata = @@ -7481,7 +7447,6 @@ static void nfs4_exchange_id_release(void *data) } static const struct rpc_call_ops nfs4_exchange_id_call_ops = { - .rpc_call_done = nfs4_exchange_id_done, .rpc_release = nfs4_exchange_id_release, }; @@ -7490,7 +7455,8 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = { * * Wrapper for EXCHANGE_ID operation. */ -static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, +static struct rpc_task * +nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, u32 sp4_how, struct rpc_xprt *xprt) { struct rpc_message msg = { @@ -7504,17 +7470,15 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, .flags = RPC_TASK_TIMEOUT, }; struct nfs41_exchange_id_data *calldata; - struct rpc_task *task; int status; if (!atomic_inc_not_zero(&clp->cl_count)) - return -EIO; + return ERR_PTR(-EIO); + status = -ENOMEM; calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (!calldata) { - nfs_put_client(clp); - return -ENOMEM; - } + if (!calldata) + goto out; nfs4_init_boot_verifier(clp, &calldata->args.verifier); @@ -7553,34 +7517,22 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, goto out_impl_id; } if (xprt) { - calldata->xprt = xprt; task_setup_data.rpc_xprt = xprt; task_setup_data.flags |= RPC_TASK_SOFTCONN; memcpy(calldata->args.verifier.data, clp->cl_confirm.data, sizeof(calldata->args.verifier.data)); } calldata->args.client = clp; -#ifdef CONFIG_NFS_V4_1_MIGRATION calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID | - EXCHGID4_FLAG_SUPP_MOVED_MIGR, -#else - calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, + EXCHGID4_FLAG_BIND_PRINC_STATEID; +#ifdef CONFIG_NFS_V4_1_MIGRATION + calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR; #endif msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - - status = calldata->rpc_status; - - rpc_put_task(task); -out: - return status; + return rpc_run_task(&task_setup_data); out_impl_id: kfree(calldata->res.impl_id); @@ -7590,8 +7542,69 @@ out_server_owner: kfree(calldata->res.server_owner); out_calldata: kfree(calldata); +out: nfs_put_client(clp); - goto out; + return ERR_PTR(status); +} + +/* + * _nfs4_proc_exchange_id() + * + * Wrapper for EXCHANGE_ID operation. 
+ */ +static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, + u32 sp4_how) +{ + struct rpc_task *task; + struct nfs41_exchange_id_args *argp; + struct nfs41_exchange_id_res *resp; + int status; + + task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL); + if (IS_ERR(task)) + return PTR_ERR(task); + + argp = task->tk_msg.rpc_argp; + resp = task->tk_msg.rpc_resp; + status = task->tk_status; + if (status != 0) + goto out; + + status = nfs4_check_cl_exchange_flags(resp->flags); + if (status != 0) + goto out; + + status = nfs4_sp4_select_mode(clp, &resp->state_protect); + if (status != 0) + goto out; + + clp->cl_clientid = resp->clientid; + clp->cl_exchange_flags = resp->flags; + clp->cl_seqid = resp->seqid; + /* Client ID is not confirmed */ + if (!(resp->flags & EXCHGID4_FLAG_CONFIRMED_R)) + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); + + if (clp->cl_serverscope != NULL && + !nfs41_same_server_scope(clp->cl_serverscope, + resp->server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + } + + swap(clp->cl_serverowner, resp->server_owner); + swap(clp->cl_serverscope, resp->server_scope); + swap(clp->cl_implid, resp->impl_id); + + /* Save the EXCHANGE_ID verifier session trunk tests */ + memcpy(clp->cl_confirm.data, argp->verifier.data, + sizeof(clp->cl_confirm.data)); +out: + trace_nfs4_exchange_id(clp, status); + rpc_put_task(task); + return status; } /* @@ -7614,13 +7627,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) /* try SP4_MACH_CRED if krb5i/p */ if (authflavor == RPC_AUTH_GSS_KRB5I || authflavor == RPC_AUTH_GSS_KRB5P) { - status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL); + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); if (!status) return 0; } /* try SP4_NONE */ - return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL); + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); } /** @@ -7642,6 +7655,9 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + struct rpc_task *task; + int status; + u32 sp4_how; dprintk("--> %s try %s\n", __func__, @@ -7650,7 +7666,17 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); /* Test connection for session trunking. 
Async exchange_id call */ - return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt); + task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt); + if (IS_ERR(task)) + return PTR_ERR(task); + + status = task->tk_status; + if (status == 0) + status = nfs4_detect_session_trunking(adata->clp, + task->tk_msg.rpc_resp, xprt); + + rpc_put_task(task); + return status; } EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index de9066a92c0d..bec120ec1967 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -134,19 +134,14 @@ EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked - * @nonblock - if true don't block waiting for lock * - * this lock must be held if modifying the page group list + * this lock must be held when traversing or modifying the page + * group list * - * return 0 on success, < 0 on error: -EDELAY if nonblocking or the - * result from wait_on_bit_lock - * - * NOTE: calling with nonblock=false should always have set the - * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock - * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. + * return 0 on success, < 0 on error */ int -nfs_page_group_lock(struct nfs_page *req, bool nonblock) +nfs_page_group_lock(struct nfs_page *req) { struct nfs_page *head = req->wb_head; @@ -155,35 +150,10 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock) if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) return 0; - if (!nonblock) { - set_bit(PG_CONTENDED1, &head->wb_flags); - smp_mb__after_atomic(); - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, - TASK_UNINTERRUPTIBLE); - } - - return -EAGAIN; -} - -/* - * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it - * @req - a request in the group - * - * This is a blocking call to wait for the group lock to be cleared. 
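With the nonblocking mode and the separate _wait variant removed in the pagelist.c hunks above, every caller of the group lock follows a single pattern; a sketch (the traversal body is illustrative):

	/* always blocking now; with TASK_UNINTERRUPTIBLE the wait cannot
	 * fail, which is why the callers below ignore the return value */
	nfs_page_group_lock(req);

	/* ... traverse or modify the req->wb_head->wb_this_page ring ... */

	nfs_page_group_unlock(req);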
- */ -void -nfs_page_group_lock_wait(struct nfs_page *req) -{ - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - - if (!test_bit(PG_HEADLOCK, &head->wb_flags)) - return; set_bit(PG_CONTENDED1, &head->wb_flags); smp_mb__after_atomic(); - wait_on_bit(&head->wb_flags, PG_HEADLOCK, - TASK_UNINTERRUPTIBLE); + return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + TASK_UNINTERRUPTIBLE); } /* @@ -246,7 +216,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) { bool ret; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); ret = nfs_page_group_sync_on_bit_locked(req, bit); nfs_page_group_unlock(req); @@ -288,9 +258,7 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) inode = page_file_mapping(req->wb_page)->host; set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests++; - spin_unlock(&inode->i_lock); + atomic_long_inc(&NFS_I(inode)->nrequests); } } } @@ -306,14 +274,11 @@ static void nfs_page_group_destroy(struct kref *kref) { struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + struct nfs_page *head = req->wb_head; struct nfs_page *tmp, *next; - /* subrequests must release the ref on the head request */ - if (req->wb_head != req) - nfs_release_request(req->wb_head); - if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) - return; + goto out; tmp = req; do { @@ -324,6 +289,10 @@ nfs_page_group_destroy(struct kref *kref) nfs_free_request(tmp); tmp = next; } while (tmp != req); +out: + /* subrequests must release the ref on the head request */ + if (head != req) + nfs_release_request(head); } /** @@ -465,6 +434,7 @@ void nfs_release_request(struct nfs_page *req) { kref_put(&req->wb_kref, nfs_page_group_destroy); } +EXPORT_SYMBOL_GPL(nfs_release_request); /** * nfs_wait_on_request - Wait for a request to complete. 
@@ -483,6 +453,7 @@ nfs_wait_on_request(struct nfs_page *req) return wait_on_bit_io(&req->wb_flags, PG_BUSY, TASK_UNINTERRUPTIBLE); } +EXPORT_SYMBOL_GPL(nfs_wait_on_request); /* * nfs_generic_pg_test - determine if requests can be coalesced @@ -530,16 +501,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) } EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc); -/* - * nfs_pgio_header_free - Free a read or write header - * @hdr: The header to free - */ -void nfs_pgio_header_free(struct nfs_pgio_header *hdr) -{ - hdr->rw_ops->rw_free_header(hdr); -} -EXPORT_SYMBOL_GPL(nfs_pgio_header_free); - /** * nfs_pgio_data_destroy - make @hdr suitable for reuse * @@ -548,14 +509,24 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * * @hdr: A header that has had nfs_generic_pgio called */ -void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) +static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) { if (hdr->args.context) put_nfs_open_context(hdr->args.context); if (hdr->page_array.pagevec != hdr->page_array.page_array) kfree(hdr->page_array.pagevec); } -EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy); + +/* + * nfs_pgio_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_pgio_header_free(struct nfs_pgio_header *hdr) +{ + nfs_pgio_data_destroy(hdr); + hdr->rw_ops->rw_free_header(hdr); +} +EXPORT_SYMBOL_GPL(nfs_pgio_header_free); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call @@ -669,7 +640,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); static void nfs_pgio_error(struct nfs_pgio_header *hdr) { set_bit(NFS_IOHDR_REDO, &hdr->flags); - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -680,7 +650,6 @@ static void nfs_pgio_error(struct nfs_pgio_header *hdr) static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -711,12 +680,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags, - gfp_t gfp_flags) + int io_flags) { - struct nfs_pgio_mirror *new; - int i; - desc->pg_moreio = 0; desc->pg_inode = inode; desc->pg_ops = pg_ops; @@ -732,23 +697,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_mirror_count = 1; desc->pg_mirror_idx = 0; - if (pg_ops->pg_get_mirror_count) { - /* until we have a request, we don't have an lseg and no - * idea how many mirrors there will be */ - new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, - sizeof(struct nfs_pgio_mirror), gfp_flags); - desc->pg_mirrors_dynamic = new; - desc->pg_mirrors = new; - - for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++) - nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize); - } else { - desc->pg_mirrors_dynamic = NULL; - desc->pg_mirrors = desc->pg_mirrors_static; - nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); - } + desc->pg_mirrors_dynamic = NULL; + desc->pg_mirrors = desc->pg_mirrors_static; + nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); } -EXPORT_SYMBOL_GPL(nfs_pageio_init); /** * nfs_pgio_result - Basic pageio error handling @@ -865,32 +817,52 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) return ret; } +static struct nfs_pgio_mirror * +nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc, + unsigned int mirror_count) +{ + struct nfs_pgio_mirror *ret; + unsigned int i; + + kfree(desc->pg_mirrors_dynamic); + desc->pg_mirrors_dynamic = NULL; + if (mirror_count == 1) + return 
desc->pg_mirrors_static; + ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_NOFS); + if (ret != NULL) { + for (i = 0; i < mirror_count; i++) + nfs_pageio_mirror_init(&ret[i], desc->pg_bsize); + desc->pg_mirrors_dynamic = ret; + } + return ret; +} + /* * nfs_pageio_setup_mirroring - determine if mirroring is to be used * by calling the pg_get_mirror_count op */ -static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, +static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - int mirror_count = 1; - - if (!pgio->pg_ops->pg_get_mirror_count) - return 0; + unsigned int mirror_count = 1; - mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); - - if (pgio->pg_error < 0) - return pgio->pg_error; - - if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) - return -EINVAL; + if (pgio->pg_ops->pg_get_mirror_count) + mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + if (mirror_count == pgio->pg_mirror_count || pgio->pg_error < 0) + return; - if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic)) - return -EINVAL; + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) { + pgio->pg_error = -EINVAL; + return; + } + pgio->pg_mirrors = nfs_pageio_alloc_mirrors(pgio, mirror_count); + if (pgio->pg_mirrors == NULL) { + pgio->pg_error = -ENOMEM; + pgio->pg_mirrors = pgio->pg_mirrors_static; + mirror_count = 1; + } pgio->pg_mirror_count = mirror_count; - - return 0; } /* @@ -1036,7 +1008,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, unsigned int bytes_left = 0; unsigned int offset, pgbase; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); subreq = req; bytes_left = subreq->wb_bytes; @@ -1058,7 +1030,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (mirror->pg_recoalesce) return 0; /* retry add_request for this subreq */ - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); continue; } @@ -1155,7 +1127,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, for (midx = 0; midx < desc->pg_mirror_count; midx++) { if (midx) { - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); /* find the last request */ for (lastreq = req->wb_head; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c383d0913b54..7879ed8ceb76 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -529,47 +529,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } EXPORT_SYMBOL_GPL(pnfs_put_lseg); -static void pnfs_free_lseg_async_work(struct work_struct *work) -{ - struct pnfs_layout_segment *lseg; - struct pnfs_layout_hdr *lo; - - lseg = container_of(work, struct pnfs_layout_segment, pls_work); - lo = lseg->pls_layout; - - pnfs_free_lseg(lseg); - pnfs_put_layout_hdr(lo); -} - -static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg) -{ - INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work); - schedule_work(&lseg->pls_work); -} - -void -pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) -{ - if (!lseg) - return; - - assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock); - - dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount), - test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - struct pnfs_layout_hdr *lo = lseg->pls_layout; - if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) - return; - pnfs_layout_remove_lseg(lo, lseg); - if (!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) { - pnfs_get_layout_hdr(lo); - pnfs_free_lseg_async(lseg); - } - } -} - /* 
* is l2 fully contained in l1? * start1 end1 @@ -2274,7 +2233,6 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } @@ -2398,7 +2356,6 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 99731e3e332f..87f144f14d1e 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -67,7 +67,6 @@ struct pnfs_layout_segment { u32 pls_seq; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; - struct work_struct pls_work; }; enum pnfs_try_status { @@ -230,7 +229,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); -void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 25f28fa64c57..60da59be83b6 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -83,34 +83,11 @@ pnfs_generic_clear_request_commit(struct nfs_page *req, } out: nfs_request_remove_commit_list(req, cinfo); - pnfs_put_lseg_locked(freeme); + pnfs_put_lseg(freeme); } EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); static int -pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, - struct nfs_commit_info *cinfo, int max) -{ - struct nfs_page *req, *tmp; - int ret = 0; - - list_for_each_entry_safe(req, tmp, src, wb_list) { - if (!nfs_lock_request(req)) - continue; - kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); - nfs_request_remove_commit_list(req, cinfo); - clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); - nfs_list_add_request(req, dst); - ret++; - if ((ret == max) && !cinfo->dreq) - break; - } - return ret; -} - -static int pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo, int max) @@ -119,15 +96,15 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(&cinfo->inode->i_lock); - ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); + ret = nfs_scan_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; if (bucket->clseg == NULL) bucket->clseg = pnfs_get_lseg(bucket->wlseg); if (list_empty(src)) { - pnfs_put_lseg_locked(bucket->wlseg); + pnfs_put_lseg(bucket->wlseg); bucket->wlseg = NULL; } } @@ -142,7 +119,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, { int i, rv = 0, cnt; - lockdep_assert_held(&cinfo->inode->i_lock); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], cinfo, max); @@ -162,11 +139,10 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, int nwritten; int i; - lockdep_assert_held(&cinfo->inode->i_lock); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - nwritten = 
pnfs_generic_transfer_commit_list(&b->written, - dst, cinfo, 0); + nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0); if (!nwritten) continue; cinfo->ds->nwritten -= nwritten; @@ -953,12 +929,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, struct list_head *list; struct pnfs_commit_bucket *buckets; - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); cinfo->completion_ops->resched_write(cinfo, req); return; } @@ -975,7 +951,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a8421d9dab6a..0d42573d423d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -68,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0, GFP_KERNEL); + server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d828ef88e7db..6b179af59b92 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1691,8 +1691,8 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, rpc_authflavor_t *server_authlist, unsigned int count) { rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; + bool found_auth_null = false; unsigned int i; - int use_auth_null = false; /* * If the sec= mount option is used, the specified flavor or AUTH_NULL @@ -1701,6 +1701,10 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, * AUTH_NULL has a special meaning when it's in the server list - it * means that the server will ignore the rpc creds, so any flavor * can be used but still use the sec= that was specified. + * + * Note also that the MNT procedure in MNTv1 does not return a list + * of supported security flavors. In this case, nfs_mount() fabricates + * a security flavor list containing just AUTH_NULL. 
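Condensed into a standalone function, the selection rule that the fs/nfs/super.c change above implements reads roughly as follows (names are illustrative, not the kernel's; "wanted" stands for the first sec= flavor):

	static rpc_authflavor_t pick_flavor(rpc_authflavor_t wanted,
					    const rpc_authflavor_t *server_list,
					    unsigned int count)
	{
		bool found_auth_null = false;
		unsigned int i;

		for (i = 0; i < count; i++) {
			if (server_list[i] == wanted)
				return wanted;		/* exact match wins */
			if (server_list[i] == RPC_AUTH_NULL)
				found_auth_null = true;
		}

		/* AUTH_NULL in the server's list means it ignores rpc creds,
		 * so honour the requested flavor instead of forcing AUTH_NULL
		 * (previously a sec= mount of an MNTv1 server, whose fabricated
		 * list is just AUTH_NULL, always ended up on AUTH_NULL) */
		if (found_auth_null)
			return wanted;

		return RPC_AUTH_MAXFLAVOR;	/* no usable flavor found */
	}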
*/ for (i = 0; i < count; i++) { flavor = server_authlist[i]; @@ -1709,11 +1713,11 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, goto out; if (flavor == RPC_AUTH_NULL) - use_auth_null = true; + found_auth_null = true; } - if (use_auth_null) { - flavor = RPC_AUTH_NULL; + if (found_auth_null) { + flavor = args->auth_info.flavors[0]; goto out; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b1af5dee5e0a..f68083db63c8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -102,10 +102,8 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) { - memset(p, 0, sizeof(*p)); - p->rw_mode = FMODE_WRITE; - } + memset(p, 0, sizeof(*p)); + p->rw_mode = FMODE_WRITE; return p; } @@ -154,6 +152,14 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } +static struct nfs_page * +nfs_page_private_request(struct page *page) +{ + if (!PagePrivate(page)) + return NULL; + return (struct nfs_page *)page_private(page); +} + /* * nfs_page_find_head_request_locked - find head request associated with @page * @@ -162,21 +168,41 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) * returns matching head request with reference held, or NULL if not found. */ static struct nfs_page * -nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) +nfs_page_find_private_request(struct page *page) { - struct nfs_page *req = NULL; - - if (PagePrivate(page)) - req = (struct nfs_page *)page_private(page); - else if (unlikely(PageSwapCache(page))) - req = nfs_page_search_commits_for_head_request_locked(nfsi, - page); + struct address_space *mapping = page_file_mapping(page); + struct nfs_page *req; + if (!PagePrivate(page)) + return NULL; + spin_lock(&mapping->private_lock); + req = nfs_page_private_request(page); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } + spin_unlock(&mapping->private_lock); + return req; +} +static struct nfs_page * +nfs_page_find_swap_request(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_page *req = NULL; + if (!PageSwapCache(page)) + return NULL; + mutex_lock(&nfsi->commit_mutex); + if (PageSwapCache(page)) { + req = nfs_page_search_commits_for_head_request_locked(nfsi, + page); + if (req) { + WARN_ON_ONCE(req->wb_head != req); + kref_get(&req->wb_kref); + } + } + mutex_unlock(&nfsi->commit_mutex); return req; } @@ -187,12 +213,11 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) */ static struct nfs_page *nfs_page_find_head_request(struct page *page) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req = NULL; + struct nfs_page *req; - spin_lock(&inode->i_lock); - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - spin_unlock(&inode->i_lock); + req = nfs_page_find_private_request(page); + if (!req) + req = nfs_page_find_swap_request(page); return req; } @@ -241,9 +266,6 @@ nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) { struct nfs_page *req; - WARN_ON_ONCE(head != head->wb_head); - WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); - req = head; do { if (page_offset >= req->wb_pgbase && @@ -269,20 +291,17 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) unsigned int pos = 0; unsigned int len = nfs_page_length(req->wb_page); - 
nfs_page_group_lock(req, false); + nfs_page_group_lock(req); - do { + for (;;) { tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (tmp) { - /* no way this should happen */ - WARN_ON_ONCE(tmp->wb_pgbase != pos); - pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); - } - } while (tmp && pos < len); + if (!tmp) + break; + pos = tmp->wb_pgbase + tmp->wb_bytes; + } nfs_page_group_unlock(req); - WARN_ON_ONCE(pos > len); - return pos == len; + return pos >= len; } /* We can set the PG_uptodate flag if we see that a write request @@ -333,8 +352,11 @@ static void nfs_end_page_writeback(struct nfs_page *req) { struct inode *inode = page_file_mapping(req->wb_page)->host; struct nfs_server *nfss = NFS_SERVER(inode); + bool is_done; - if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) + is_done = nfs_page_group_sync_on_bit(req, PG_WB_END); + nfs_unlock_request(req); + if (!is_done) return; end_page_writeback(req->wb_page); @@ -342,22 +364,6 @@ static void nfs_end_page_writeback(struct nfs_page *req) clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } - -/* nfs_page_group_clear_bits - * @req - an nfs request - * clears all page group related bits from @req - */ -static void -nfs_page_group_clear_bits(struct nfs_page *req) -{ - clear_bit(PG_TEARDOWN, &req->wb_flags); - clear_bit(PG_UNLOCKPAGE, &req->wb_flags); - clear_bit(PG_UPTODATE, &req->wb_flags); - clear_bit(PG_WB_END, &req->wb_flags); - clear_bit(PG_REMOVE, &req->wb_flags); -} - - /* * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req * @@ -366,43 +372,24 @@ nfs_page_group_clear_bits(struct nfs_page *req) * @inode - inode associated with request page group, must be holding inode lock * @head - head request of page group, must be holding head lock * @req - request that couldn't lock and needs to wait on the req bit lock - * @nonblock - if true, don't actually wait * - * NOTE: this must be called holding page_group bit lock and inode spin lock - * and BOTH will be released before returning. + * NOTE: this must be called holding page_group bit lock + * which will be released before returning. * * returns 0 on success, < 0 on error. 
*/ -static int -nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, - struct nfs_page *req, bool nonblock) - __releases(&inode->i_lock) +static void +nfs_unroll_locks(struct inode *inode, struct nfs_page *head, + struct nfs_page *req) { struct nfs_page *tmp; - int ret; /* relinquish all the locks successfully grabbed this run */ - for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) - nfs_unlock_request(tmp); - - WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); - - /* grab a ref on the request that will be waited on */ - kref_get(&req->wb_kref); - - nfs_page_group_unlock(head); - spin_unlock(&inode->i_lock); - - /* release ref from nfs_page_find_head_request_locked */ - nfs_release_request(head); - - if (!nonblock) - ret = nfs_wait_on_request(req); - else - ret = -EAGAIN; - nfs_release_request(req); - - return ret; + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { + if (!kref_read(&tmp->wb_kref)) + continue; + nfs_unlock_and_release_request(tmp); + } } /* @@ -417,7 +404,8 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, */ static void nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, - struct nfs_page *old_head) + struct nfs_page *old_head, + struct inode *inode) { while (destroy_list) { struct nfs_page *subreq = destroy_list; @@ -428,33 +416,28 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, WARN_ON_ONCE(old_head != subreq->wb_head); /* make sure old group is not used */ - subreq->wb_head = subreq; subreq->wb_this_page = subreq; - /* subreq is now totally disconnected from page group or any - * write / commit lists. last chance to wake any waiters */ - nfs_unlock_request(subreq); + clear_bit(PG_REMOVE, &subreq->wb_flags); - if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { - /* release ref on old head request */ - nfs_release_request(old_head); + /* Note: races with nfs_page_group_destroy() */ + if (!kref_read(&subreq->wb_kref)) { + /* Check if we raced with nfs_page_group_destroy() */ + if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) + nfs_free_request(subreq); + continue; + } - nfs_page_group_clear_bits(subreq); + subreq->wb_head = subreq; - /* release the PG_INODE_REF reference */ - if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) - nfs_release_request(subreq); - else - WARN_ON_ONCE(1); - } else { - WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); - /* zombie requests have already released the last - * reference and were waiting on the rest of the - * group to complete. Since it's no longer part of a - * group, simply free the request */ - nfs_page_group_clear_bits(subreq); - nfs_free_request(subreq); + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { + nfs_release_request(subreq); + atomic_long_dec(&NFS_I(inode)->nrequests); } + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_and_release_request(subreq); } } @@ -464,7 +447,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * operations for this page. * * @page - the page used to lookup the "page group" of nfs_page structures - * @nonblock - if true, don't block waiting for request locks * * This function joins all sub requests to the head request by first * locking all requests in the group, cancelling any pending operations @@ -478,7 +460,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * error was encountered. 
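The retry discipline spelled out in this comment can be modelled outside the kernel: take the head lock first, then the group lock, then each subrequest lock, and never sleep while the group lock is held; on contention, drop it, wait for the contended request, retake it, and retry. A compilable pthread analogue of that shape (names invented for the sketch; this is the pattern only, not NFS code):

#include <pthread.h>

struct req {
        pthread_mutex_t lock;
        struct req *next;               /* circular page-group list */
};

/* Lock the head, then every subrequest. The group mutex stands in for
 * the page-group bit lock and is released before any blocking wait,
 * mirroring the unroll-and-wait logic described above. */
void lock_whole_group(pthread_mutex_t *group, struct req *head)
{
        struct req *sub;

        pthread_mutex_lock(&head->lock);
        pthread_mutex_lock(group);
        for (sub = head->next; sub != head; sub = sub->next) {
                while (pthread_mutex_trylock(&sub->lock) != 0) {
                        pthread_mutex_unlock(group);      /* never sleep holding it */
                        pthread_mutex_lock(&sub->lock);   /* block until free... */
                        pthread_mutex_unlock(&sub->lock); /* ...then race to relock */
                        pthread_mutex_lock(group);
                }
        }
}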
*/ static struct nfs_page * -nfs_lock_and_join_requests(struct page *page, bool nonblock) +nfs_lock_and_join_requests(struct page *page) { struct inode *inode = page_file_mapping(page)->host; struct nfs_page *head, *subreq; @@ -487,43 +469,63 @@ nfs_lock_and_join_requests(struct page *page, bool nonblock) int ret; try_again: - total_bytes = 0; - - WARN_ON_ONCE(destroy_list); - - spin_lock(&inode->i_lock); - /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed * until the head reference is released. */ - head = nfs_page_find_head_request_locked(NFS_I(inode), page); - - if (!head) { - spin_unlock(&inode->i_lock); + head = nfs_page_find_head_request(page); + if (!head) return NULL; - } - /* holding inode lock, so always make a non-blocking call to try the - * page group lock */ - ret = nfs_page_group_lock(head, true); - if (ret < 0) { - spin_unlock(&inode->i_lock); + /* lock the page head first in order to avoid an ABBA inefficiency */ + if (!nfs_lock_request(head)) { + ret = nfs_wait_on_request(head); + nfs_release_request(head); + if (ret < 0) + return ERR_PTR(ret); + goto try_again; + } - if (!nonblock && ret == -EAGAIN) { - nfs_page_group_lock_wait(head); - nfs_release_request(head); - goto try_again; - } + /* Ensure that nobody removed the request before we locked it */ + if (head != nfs_page_private_request(page) && !PageSwapCache(page)) { + nfs_unlock_and_release_request(head); + goto try_again; + } - nfs_release_request(head); + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unlock_and_release_request(head); return ERR_PTR(ret); } /* lock each request in the page group */ - subreq = head; - do { + total_bytes = head->wb_bytes; + for (subreq = head->wb_this_page; subreq != head; + subreq = subreq->wb_this_page) { + + if (!kref_get_unless_zero(&subreq->wb_kref)) { + if (subreq->wb_offset == head->wb_offset + total_bytes) + total_bytes += subreq->wb_bytes; + continue; + } + + while (!nfs_lock_request(subreq)) { + /* + * Unlock page to allow nfs_page_group_sync_on_bit() + * to succeed + */ + nfs_page_group_unlock(head); + ret = nfs_wait_on_request(subreq); + if (!ret) + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unroll_locks(inode, head, subreq); + nfs_release_request(subreq); + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); + } + } /* * Subrequests are always contiguous, non overlapping * and in order - but may be repeated (mirrored writes). @@ -535,24 +537,12 @@ try_again: ((subreq->wb_offset + subreq->wb_bytes) > (head->wb_offset + total_bytes)))) { nfs_page_group_unlock(head); - spin_unlock(&inode->i_lock); + nfs_unroll_locks(inode, head, subreq); + nfs_unlock_and_release_request(subreq); + nfs_unlock_and_release_request(head); return ERR_PTR(-EIO); } - - if (!nfs_lock_request(subreq)) { - /* releases page group bit lock and - * inode spin lock and all references */ - ret = nfs_unroll_locks_and_wait(inode, head, - subreq, nonblock); - - if (ret == 0) - goto try_again; - - return ERR_PTR(ret); - } - - subreq = subreq->wb_this_page; - } while (subreq != head); + } /* Now that all requests are locked, make sure they aren't on any list. 
* Commit list removal accounting is done after locks are dropped */ @@ -573,34 +563,30 @@ try_again: head->wb_bytes = total_bytes; } - /* - * prepare head request to be added to new pgio descriptor - */ - nfs_page_group_clear_bits(head); - - /* - * some part of the group was still on the inode list - otherwise - * the group wouldn't be involved in async write. - * grab a reference for the head request, iff it needs one. - */ - if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) + /* Postpone destruction of this request */ + if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { + set_bit(PG_INODE_REF, &head->wb_flags); kref_get(&head->wb_kref); + atomic_long_inc(&NFS_I(inode)->nrequests); + } nfs_page_group_unlock(head); - /* drop lock to clean uprequests on destroy list */ - spin_unlock(&inode->i_lock); + nfs_destroy_unlinked_subrequests(destroy_list, head, inode); - nfs_destroy_unlinked_subrequests(destroy_list, head); + /* Did we lose a race with nfs_inode_remove_request()? */ + if (!(PagePrivate(page) || PageSwapCache(page))) { + nfs_unlock_and_release_request(head); + return NULL; + } - /* still holds ref on head from nfs_page_find_head_request_locked + /* still holds ref on head from nfs_page_find_head_request * and still has lock on head from lock loop */ return head; } static void nfs_write_error_remove_page(struct nfs_page *req) { - nfs_unlock_request(req); nfs_end_page_writeback(req); generic_error_remove_page(page_file_mapping(req->wb_page), req->wb_page); @@ -624,12 +610,12 @@ nfs_error_is_fatal_on_server(int err) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock) + struct page *page) { struct nfs_page *req; int ret = 0; - req = nfs_lock_and_join_requests(page, nonblock); + req = nfs_lock_and_join_requests(page); if (!req) goto out; ret = PTR_ERR(req); @@ -672,7 +658,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, int ret; nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + ret = nfs_page_async_flush(pgio, page); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -759,6 +745,7 @@ out_err: */ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { + struct address_space *mapping = page_file_mapping(req->wb_page); struct nfs_inode *nfsi = NFS_I(inode); WARN_ON_ONCE(req->wb_this_page != req); @@ -766,27 +753,30 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) /* Lock the request! */ nfs_lock_request(req); - spin_lock(&inode->i_lock); - if (!nfsi->nrequests && - NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) - inode->i_version++; /* * Swap-space should not get truncated. Hence no need to plug the race * with invalidate/truncate. */ + spin_lock(&mapping->private_lock); + if (!nfs_have_writebacks(inode) && + NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) { + spin_lock(&inode->i_lock); + inode->i_version++; + spin_unlock(&inode->i_lock); + } if (likely(!PageSwapCache(req->wb_page))) { set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); } - nfsi->nrequests++; + spin_unlock(&mapping->private_lock); + atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. 
* This flag also informs pgio layer when to bump nrequests when * adding subrequests. */ WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); - spin_unlock(&inode->i_lock); } /* @@ -794,25 +784,22 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = d_inode(req->wb_context->dentry); + struct address_space *mapping = page_file_mapping(req->wb_page); + struct inode *inode = mapping->host; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; + atomic_long_dec(&nfsi->nrequests); if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { head = req->wb_head; - spin_lock(&inode->i_lock); + spin_lock(&mapping->private_lock); if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); clear_bit(PG_MAPPED, &head->wb_flags); } - nfsi->nrequests--; - spin_unlock(&inode->i_lock); - } else { - spin_lock(&inode->i_lock); - nfsi->nrequests--; - spin_unlock(&inode->i_lock); + spin_unlock(&mapping->private_lock); } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) @@ -868,7 +855,8 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. + * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the + * nfs_page lock. */ void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -876,7 +864,7 @@ nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, { set_bit(PG_CLEAN, &req->wb_flags); nfs_list_add_request(req, dst); - cinfo->mds->ncommit++; + atomic_long_inc(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); @@ -896,9 +884,9 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); if (req->wb_page) nfs_mark_page_unstable(req->wb_page, cinfo); } @@ -922,7 +910,7 @@ nfs_request_remove_commit_list(struct nfs_page *req, if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) return; nfs_list_remove_request(req); - cinfo->mds->ncommit--; + atomic_long_dec(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); @@ -967,7 +955,7 @@ nfs_clear_page_commit(struct page *page) WB_RECLAIMABLE); } -/* Called holding inode (/cinfo) lock */ +/* Called holding the request lock on @req */ static void nfs_clear_request_commit(struct nfs_page *req) { @@ -976,9 +964,11 @@ nfs_clear_request_commit(struct nfs_page *req) struct nfs_commit_info cinfo; nfs_init_cinfo_from_inode(&cinfo, inode); + mutex_lock(&NFS_I(inode)->commit_mutex); if (!pnfs_clear_request_commit(req, &cinfo)) { nfs_request_remove_commit_list(req, &cinfo); } + mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_clear_page_commit(req->wb_page); } } @@ -1023,7 +1013,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) remove_req: nfs_inode_remove_request(req); next: - nfs_unlock_request(req); nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1035,10 +1024,10 @@ out: unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { 
- return cinfo->mds->ncommit; + return atomic_long_read(&cinfo->mds->ncommit); } -/* cinfo->inode->i_lock held by caller */ +/* NFS_I(cinfo->inode)->commit_mutex held by caller */ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) @@ -1046,20 +1035,37 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_page *req, *tmp; int ret = 0; +restart: list_for_each_entry_safe(req, tmp, src, wb_list) { - if (!nfs_lock_request(req)) - continue; kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); + if (!nfs_lock_request(req)) { + int status; + + /* Prevent deadlock with nfs_lock_and_join_requests */ + if (!list_empty(dst)) { + nfs_release_request(req); + continue; + } + /* Ensure we make progress to prevent livelock */ + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + status = nfs_wait_on_request(req); + nfs_release_request(req); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (status < 0) + break; + goto restart; + } nfs_request_remove_commit_list(req, cinfo); + clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); nfs_list_add_request(req, dst); ret++; if ((ret == max) && !cinfo->dreq) break; + cond_resched(); } return ret; } +EXPORT_SYMBOL_GPL(nfs_scan_commit_list); /* * nfs_scan_commit - Scan an inode for commit requests @@ -1076,15 +1082,17 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; - spin_lock(&cinfo->inode->i_lock); - if (cinfo->mds->ncommit > 0) { + if (!atomic_long_read(&cinfo->mds->ncommit)) + return 0; + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (atomic_long_read(&cinfo->mds->ncommit) > 0) { const int max = INT_MAX; ret = nfs_scan_commit_list(&cinfo->mds->list, dst, cinfo, max); ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); return ret; } @@ -1105,43 +1113,21 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, unsigned int end; int error; - if (!PagePrivate(page)) - return NULL; - end = offset + bytes; - spin_lock(&inode->i_lock); - - for (;;) { - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - if (req == NULL) - goto out_unlock; - /* should be handled by nfs_flush_incompatible */ - WARN_ON_ONCE(req->wb_head != req); - WARN_ON_ONCE(req->wb_this_page != req); - - rqend = req->wb_offset + req->wb_bytes; - /* - * Tell the caller to flush out the request if - * the offsets are non-contiguous. - * Note: nfs_flush_incompatible() will already - * have flushed out requests having wrong owners. - */ - if (offset > rqend - || end < req->wb_offset) - goto out_flushme; - - if (nfs_lock_request(req)) - break; + req = nfs_lock_and_join_requests(page); + if (IS_ERR_OR_NULL(req)) + return req; - /* The request is locked, so wait and then retry */ - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - if (error != 0) - goto out_err; - spin_lock(&inode->i_lock); - } + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ + if (offset > rqend || end < req->wb_offset) + goto out_flushme; /* Okay, the request matches. 
Update the region */ if (offset < req->wb_offset) { @@ -1152,17 +1138,17 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, req->wb_bytes = end - req->wb_offset; else req->wb_bytes = rqend - req->wb_offset; -out_unlock: - if (req) - nfs_clear_request_commit(req); - spin_unlock(&inode->i_lock); return req; out_flushme: - spin_unlock(&inode->i_lock); - nfs_release_request(req); + /* + * Note: we mark the request dirty here because + * nfs_lock_and_join_requests() cannot preserve + * commit flags, so we have to replay the write. + */ + nfs_mark_request_dirty(req); + nfs_unlock_and_release_request(req); error = nfs_wb_page(inode, page); -out_err: - return ERR_PTR(error); + return (error < 0) ? ERR_PTR(error) : NULL; } /* @@ -1227,8 +1213,6 @@ int nfs_flush_incompatible(struct file *file, struct page *page) l_ctx = req->wb_lock_context; do_flush = req->wb_page != page || !nfs_match_open_context(req->wb_context, ctx); - /* for now, flush if more than 1 request in page_group */ - do_flush |= req->wb_this_page != req; if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { @@ -1412,7 +1396,6 @@ static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); - nfs_unlock_request(req); nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1452,7 +1435,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags, GFP_NOIO); + server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -1934,7 +1917,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int ret = 0; /* no commits means nothing needs to be done */ - if (!nfsi->commit_info.ncommit) + if (!atomic_long_read(&nfsi->commit_info.ncommit)) return ret; if (wbc->sync_mode == WB_SYNC_NONE) { @@ -2015,7 +1998,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* blocking call to cancel all requests and join to a single (head) * request */ - req = nfs_lock_and_join_requests(page, false); + req = nfs_lock_and_join_requests(page); if (IS_ERR(req)) { ret = PTR_ERR(req); diff --git a/fs/open.c b/fs/open.c index 35bb784763a4..7ea118471dce 100644 --- a/fs/open.c +++ b/fs/open.c @@ -96,7 +96,7 @@ long vfs_truncate(const struct path *path, loff_t length) * write access on the upper inode, not on the overlay inode. For * non-overlay filesystems d_real() is an identity function. 
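In the hunks that follow, d_real() grows a fourth flags argument: existing callers pass 0 to keep the old behaviour, while D_REAL_UPPER (handled by ovl_d_real() later in this patch) asks for the upper dentry only. A hypothetical caller, for illustration:

/* Hypothetical use of the extended interface: flags == 0 preserves the
 * old semantics; D_REAL_UPPER returns the upper dentry, which may be
 * NULL if the file was never copied up. */
struct dentry *real  = d_real(dentry, NULL, 0, 0);
struct dentry *upper = d_real(dentry, NULL, 0, D_REAL_UPPER);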
*/ - upperdentry = d_real(path->dentry, NULL, O_WRONLY); + upperdentry = d_real(path->dentry, NULL, O_WRONLY, 0); error = PTR_ERR(upperdentry); if (IS_ERR(upperdentry)) goto mnt_drop_write_and_out; @@ -670,12 +670,12 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) if (!f.file) goto out; - error = mnt_want_write_file(f.file); + error = mnt_want_write_file_path(f.file); if (error) goto out_fput; audit_file(f.file); error = chown_common(&f.file->f_path, user, group); - mnt_drop_write_file(f.file); + mnt_drop_write_file_path(f.file); out_fput: fdput(f); out: @@ -857,7 +857,7 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags); + struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 48b70e6490f3..9cb0c80e5967 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -155,7 +155,7 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) static void ovl_instantiate(struct dentry *dentry, struct inode *inode, struct dentry *newdentry, bool hardlink) { - ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_version_inc(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); if (!hardlink) { ovl_inode_update(inode, newdentry); @@ -692,7 +692,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) if (flags) ovl_cleanup(wdir, upper); - ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_version_inc(dentry->d_parent, true); out_d_drop: d_drop(dentry); dput(whiteout); @@ -742,7 +742,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) err = vfs_rmdir(dir, upper); else err = vfs_unlink(dir, upper, NULL); - ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_version_inc(dentry->d_parent, ovl_type_origin(dentry)); /* * Keeping this dentry hashed would mean having to release @@ -1089,8 +1089,9 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, drop_nlink(d_inode(new)); } - ovl_dentry_version_inc(old->d_parent); - ovl_dentry_version_inc(new->d_parent); + ovl_dentry_version_inc(old->d_parent, + !overwrite && ovl_type_origin(new)); + ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old)); out_dput: dput(newdentry); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 5bc71642b226..a619addecafc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -498,6 +498,9 @@ static int ovl_set_nlink_common(struct dentry *dentry, len = snprintf(buf, sizeof(buf), format, (int) (inode->i_nlink - realinode->i_nlink)); + if (WARN_ON(len >= sizeof(buf))) + return -EIO; + return ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_NLINK, buf, len, 0); } @@ -576,10 +579,13 @@ static int ovl_inode_set(struct inode *inode, void *data) static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, struct dentry *upperdentry) { - struct inode *lowerinode = lowerdentry ? d_inode(lowerdentry) : NULL; - - /* Lower (origin) inode must match, even if NULL */ - if (ovl_inode_lower(inode) != lowerinode) + /* + * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. + * This happens when finding a copied up overlay inode for a renamed + * or hardlinked overlay dentry and lower dentry cannot be followed + * by origin because lower fs does not support file handles. 
+ */ + if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry)) return false; /* diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e927a62c97ae..d4e8c1a08fb0 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -204,8 +204,8 @@ struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); struct inode *ovl_inode_lower(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); -struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); -void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); @@ -217,7 +217,7 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, struct dentry *lowerdentry); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); -void ovl_dentry_version_inc(struct dentry *dentry); +void ovl_dentry_version_inc(struct dentry *dentry, bool impurity); u64 ovl_dentry_version_get(struct dentry *dentry); bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); @@ -229,6 +229,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, int xerr); int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); void ovl_set_flag(unsigned long flag, struct inode *inode); +void ovl_clear_flag(unsigned long flag, struct inode *inode); bool ovl_test_flag(unsigned long flag, struct inode *inode); bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); @@ -256,6 +257,7 @@ extern const struct file_operations ovl_dir_operations; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); +void ovl_dir_cache_free(struct inode *inode); int ovl_check_d_type_supported(struct path *realpath); void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index f0fd3adb1693..62e9b22a2077 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -15,11 +15,13 @@ #include <linux/rbtree.h> #include <linux/security.h> #include <linux/cred.h> +#include <linux/ratelimit.h> #include "overlayfs.h" struct ovl_cache_entry { unsigned int len; unsigned int type; + u64 real_ino; u64 ino; struct list_head l_node; struct rb_node node; @@ -32,18 +34,20 @@ struct ovl_dir_cache { long refcount; u64 version; struct list_head entries; + struct rb_root root; }; struct ovl_readdir_data { struct dir_context ctx; struct dentry *dentry; bool is_lowest; - struct rb_root root; + struct rb_root *root; struct list_head *list; struct list_head middle; struct ovl_cache_entry *first_maybe_whiteout; int count; int err; + bool is_upper; bool d_type_supported; }; @@ -58,7 +62,33 @@ struct ovl_dir_file { static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) { - return container_of(n, struct ovl_cache_entry, node); + return rb_entry(n, struct ovl_cache_entry, node); +} + +static bool ovl_cache_entry_find_link(const char *name, int len, + struct rb_node 
***link, + struct rb_node **parent) +{ + bool found = false; + struct rb_node **newp = *link; + + while (!found && *newp) { + int cmp; + struct ovl_cache_entry *tmp; + + *parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + found = true; + } + *link = newp; + + return found; } static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, @@ -82,6 +112,32 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, return NULL; } +static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, + struct ovl_cache_entry *p) +{ + /* Don't care if not doing ovl_iter() */ + if (!rdd->dentry) + return false; + + /* Always recalc d_ino for parent */ + if (strcmp(p->name, "..") == 0) + return true; + + /* If this is lower, then native d_ino will do */ + if (!rdd->is_upper) + return false; + + /* + * Recalc d_ino for '.' and for all entries if dir is impure (contains + * copied up entries) + */ + if ((p->name[0] == '.' && p->len == 1) || + ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry))) + return true; + + return false; +} + static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) @@ -97,7 +153,11 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, p->name[len] = '\0'; p->len = len; p->type = d_type; + p->real_ino = ino; p->ino = ino; + /* Defer setting d_ino for upper entry to ovl_iterate() */ + if (ovl_calc_d_ino(rdd, p)) + p->ino = 0; p->is_whiteout = false; if (d_type == DT_CHR) { @@ -111,32 +171,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { - struct rb_node **newp = &rdd->root.rb_node; + struct rb_node **newp = &rdd->root->rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; - while (*newp) { - int cmp; - struct ovl_cache_entry *tmp; - - parent = *newp; - tmp = ovl_cache_entry_from_node(*newp); - cmp = strncmp(name, tmp->name, len); - if (cmp > 0) - newp = &tmp->node.rb_right; - else if (cmp < 0 || len < tmp->len) - newp = &tmp->node.rb_left; - else - return 0; - } + if (ovl_cache_entry_find_link(name, len, &newp, &parent)) + return 0; p = ovl_cache_entry_new(rdd, name, len, ino, d_type); - if (p == NULL) + if (p == NULL) { + rdd->err = -ENOMEM; return -ENOMEM; + } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); - rb_insert_color(&p->node, &rdd->root); + rb_insert_color(&p->node, rdd->root); return 0; } @@ -147,7 +197,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd, { struct ovl_cache_entry *p; - p = ovl_cache_entry_find(&rdd->root, name, namelen); + p = ovl_cache_entry_find(rdd->root, name, namelen); if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { @@ -172,6 +222,16 @@ void ovl_cache_free(struct list_head *list) INIT_LIST_HEAD(list); } +void ovl_dir_cache_free(struct inode *inode) +{ + struct ovl_dir_cache *cache = ovl_dir_cache(inode); + + if (cache) { + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) { struct ovl_dir_cache *cache = od->cache; @@ -179,8 +239,8 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { - if (ovl_dir_cache(dentry) == cache) - 
ovl_set_dir_cache(dentry, NULL); + if (ovl_dir_cache(d_inode(dentry)) == cache) + ovl_set_dir_cache(d_inode(dentry), NULL); ovl_cache_free(&cache->entries); kfree(cache); @@ -273,7 +333,8 @@ static void ovl_dir_reset(struct file *file) od->is_real = false; } -static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) +static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, + struct rb_root *root) { int err; struct path realpath; @@ -281,13 +342,14 @@ .ctx.actor = ovl_fill_merge, .dentry = dentry, .list = list, - .root = RB_ROOT, + .root = root, .is_lowest = false, }; int idx, next; for (idx = 0; idx != -1; idx = next) { next = ovl_path_next(idx, dentry, &realpath); + rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; if (next != -1) { err = ovl_dir_read(&realpath, &rdd); @@ -326,12 +388,13 @@ int res; struct ovl_dir_cache *cache; - cache = ovl_dir_cache(dentry); + cache = ovl_dir_cache(d_inode(dentry)); if (cache && ovl_dentry_version_get(dentry) == cache->version) { + WARN_ON(!cache->refcount); cache->refcount++; return cache; } - ovl_set_dir_cache(dentry, NULL); + ovl_set_dir_cache(d_inode(dentry), NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) @@ -339,8 +402,9 @@ cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); + cache->root = RB_ROOT; - res = ovl_dir_read_merged(dentry, &cache->entries); + res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); @@ -348,22 +412,266 @@ } cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(dentry, cache); + ovl_set_dir_cache(d_inode(dentry), cache); return cache; } +/* + * Set d_ino for upper entries. Non-upper entries should always report + * the uppermost real inode ino and should not call this function. + * + * When not all layers are on the same fs, report the real ino also for upper. + * + * When all layers are on the same fs, and upper has a reference to + * copy up origin, call vfs_getattr() on the overlay entry to make + * sure that d_ino will be consistent with st_ino from stat(2). + */
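The guarantee being set up here is observable from userspace: on an overlay whose layers all live on one filesystem, the d_ino that readdir(3) reports should agree with st_ino from stat(2) even after an entry has been copied up. A small check program under that assumption (the mount path is a placeholder):

#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        DIR *d = opendir("/mnt/overlay/dir");   /* hypothetical mount */
        struct dirent *de;
        struct stat st;

        if (!d)
                return 1;
        while ((de = readdir(d)) != NULL) {
                if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW))
                        continue;
                if ((unsigned long long)de->d_ino != (unsigned long long)st.st_ino)
                        printf("mismatch: %s d_ino=%llu st_ino=%llu\n",
                               de->d_name,
                               (unsigned long long)de->d_ino,
                               (unsigned long long)st.st_ino);
        }
        closedir(d);
        return 0;
}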
+static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) +{ + struct dentry *dir = path->dentry; + struct dentry *this = NULL; + enum ovl_path_type type; + u64 ino = p->real_ino; + int err = 0; + + if (!ovl_same_sb(dir->d_sb)) + goto out; + + if (p->name[0] == '.') { + if (p->len == 1) { + this = dget(dir); + goto get; + } + if (p->len == 2 && p->name[1] == '.') { + /* we shall not be moved */ + this = dget(dir->d_parent); + goto get; + } + } + this = lookup_one_len(p->name, dir, p->len); + if (IS_ERR_OR_NULL(this) || !this->d_inode) { + if (IS_ERR(this)) { + err = PTR_ERR(this); + this = NULL; + goto fail; + } + goto out; + } + +get: + type = ovl_path_type(this); + if (OVL_TYPE_ORIGIN(type)) { + struct kstat stat; + struct path statpath = *path; + + statpath.dentry = this; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + goto fail; + + WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + ino = stat.ino; + } + +out: + p->ino = ino; + dput(this); + return err; + +fail: + pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n", + p->name, err); + goto out; +} + +static int ovl_fill_plain(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_cache_entry *p; + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + rdd->count++; + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + if (p == NULL) { + rdd->err = -ENOMEM; + return -ENOMEM; + } + list_add_tail(&p->l_node, rdd->list); + + return 0; +} + +static int ovl_dir_read_impure(struct path *path, struct list_head *list, + struct rb_root *root) +{ + int err; + struct path realpath; + struct ovl_cache_entry *p, *n; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_plain, + .list = list, + .root = root, + }; + + INIT_LIST_HEAD(list); + *root = RB_ROOT; + ovl_path_upper(path->dentry, &realpath); + + err = ovl_dir_read(&realpath, &rdd); + if (err) + return err; + + list_for_each_entry_safe(p, n, list, l_node) { + if (strcmp(p->name, ".") != 0 && + strcmp(p->name, "..") != 0) { + err = ovl_cache_update_ino(path, p); + if (err) + return err; + } + if (p->ino == p->real_ino) { + list_del(&p->l_node); + kfree(p); + } else { + struct rb_node **newp = &root->rb_node; + struct rb_node *parent = NULL; + + if (WARN_ON(ovl_cache_entry_find_link(p->name, p->len, + &newp, &parent))) + return -EIO; + + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, root); + } + } + return 0; +} + +static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) +{ + int res; + struct dentry *dentry = path->dentry; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(d_inode(dentry)); + if (cache && ovl_dentry_version_get(dentry) == cache->version) + return cache; + + /* Impure cache is not refcounted, free it here */ + ovl_dir_cache_free(d_inode(dentry)); + ovl_set_dir_cache(d_inode(dentry), NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + res = ovl_dir_read_impure(path, &cache->entries, &cache->root); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + if (list_empty(&cache->entries)) { + /* Good opportunity to get rid of an unnecessary "impure" flag */ + ovl_do_removexattr(ovl_dentry_upper(dentry), OVL_XATTR_IMPURE); + ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); + kfree(cache); + return NULL; + } + + cache->version = ovl_dentry_version_get(dentry); + 
ovl_set_dir_cache(d_inode(dentry), cache); + + return cache; +} + +struct ovl_readdir_translate { + struct dir_context *orig_ctx; + struct ovl_dir_cache *cache; + struct dir_context ctx; + u64 parent_ino; +}; + +static int ovl_fill_real(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_translate *rdt = + container_of(ctx, struct ovl_readdir_translate, ctx); + struct dir_context *orig_ctx = rdt->orig_ctx; + + if (rdt->parent_ino && strcmp(name, "..") == 0) + ino = rdt->parent_ino; + else if (rdt->cache) { + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); + if (p) + ino = p->ino; + } + + return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); +} + +static int ovl_iterate_real(struct file *file, struct dir_context *ctx) +{ + int err; + struct ovl_dir_file *od = file->private_data; + struct dentry *dir = file->f_path.dentry; + struct ovl_readdir_translate rdt = { + .ctx.actor = ovl_fill_real, + .orig_ctx = ctx, + }; + + if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { + struct kstat stat; + struct path statpath = file->f_path; + + statpath.dentry = dir->d_parent; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + return err; + + WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + rdt.parent_ino = stat.ino; + } + + if (ovl_test_flag(OVL_IMPURE, d_inode(dir))) { + rdt.cache = ovl_cache_get_impure(&file->f_path); + if (IS_ERR(rdt.cache)) + return PTR_ERR(rdt.cache); + } + + return iterate_dir(od->realfile, &rdt.ctx); +} + + static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct ovl_cache_entry *p; + int err; if (!ctx->pos) ovl_dir_reset(file); - if (od->is_real) + if (od->is_real) { + /* + * If parent is merge, then need to adjust d_ino for '..', if + * dir is impure then need to adjust d_ino for copied up + * entries. 
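ovl_iterate_real() above does this with a translating dir_context: a private context whose actor rewrites d_ino and then forwards the entry to the caller's context. The shape of that pattern in isolation (kernel-internal types; remap_ino() is a placeholder for the rb-tree lookup, not a real helper):

struct xlate_ctx {
        struct dir_context ctx;         /* embedded; actor = xlate_fill() */
        struct dir_context *orig;       /* the caller's context */
};

static int xlate_fill(struct dir_context *ctx, const char *name, int len,
                      loff_t off, u64 ino, unsigned int d_type)
{
        struct xlate_ctx *x = container_of(ctx, struct xlate_ctx, ctx);

        ino = remap_ino(name, len, ino);        /* placeholder lookup */
        return x->orig->actor(x->orig, name, len, off, ino, d_type);
}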
+ */ + if (ovl_same_sb(dentry->d_sb) && + (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))) { + return ovl_iterate_real(file, ctx); + } return iterate_dir(od->realfile, ctx); + } if (!od->cache) { struct ovl_dir_cache *cache; @@ -378,9 +686,15 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) while (od->cursor != &od->cache->entries) { p = list_entry(od->cursor, struct ovl_cache_entry, l_node); - if (!p->is_whiteout) + if (!p->is_whiteout) { + if (!p->ino) { + err = ovl_cache_update_ino(&file->f_path, p); + if (err) + return err; + } if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; + } od->cursor = p->l_node.next; ctx->pos++; } @@ -522,8 +836,9 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; struct ovl_cache_entry *p; + struct rb_root root = RB_ROOT; - err = ovl_dir_read_merged(dentry, list); + err = ovl_dir_read_merged(dentry, list, &root); if (err) return err; @@ -612,12 +927,13 @@ static void ovl_workdir_cleanup_recurse(struct path *path, int level) int err; struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); + struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .dentry = NULL, .list = &list, - .root = RB_ROOT, + .root = &root, .is_lowest = false, }; @@ -675,12 +991,13 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, struct inode *dir = dentry->d_inode; struct path path = { .mnt = mnt, .dentry = dentry }; LIST_HEAD(list); + struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .dentry = NULL, .list = &list, - .root = RB_ROOT, + .root = &root, .is_lowest = false, }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d86e89f97201..cd49c0298ddf 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -70,20 +70,20 @@ static int ovl_check_append_only(struct inode *inode, int flag) static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode, - unsigned int open_flags) + unsigned int open_flags, unsigned int flags) { struct dentry *real; int err; + if (flags & D_REAL_UPPER) + return ovl_dentry_upper(dentry); + if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) return dentry; goto bug; } - if (d_is_negative(dentry)) - return dentry; - if (open_flags) { err = ovl_open_maybe_copy_up(dentry, open_flags); if (err) @@ -105,7 +105,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry, goto bug; /* Handle recursion */ - real = d_real(real, inode, open_flags); + real = d_real(real, inode, open_flags, 0); if (!inode || inode == d_inode(real)) return real; @@ -198,6 +198,7 @@ static void ovl_destroy_inode(struct inode *inode) dput(oi->__upperdentry); kfree(oi->redirect); + ovl_dir_cache_free(inode); mutex_destroy(&oi->lock); call_rcu(&inode->i_rcu, ovl_i_callback); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f46ad75dc96a..117794582f9f 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -180,14 +180,14 @@ struct inode *ovl_inode_real(struct inode *inode) } -struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode) { - return OVL_I(d_inode(dentry))->cache; + return OVL_I(inode)->cache; } -void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache) { - OVL_I(d_inode(dentry))->cache = cache; + 
OVL_I(inode)->cache = cache; } bool ovl_dentry_is_opaque(struct dentry *dentry) @@ -275,12 +275,19 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) } } -void ovl_dentry_version_inc(struct dentry *dentry) +void ovl_dentry_version_inc(struct dentry *dentry, bool impurity) { struct inode *inode = d_inode(dentry); WARN_ON(!inode_is_locked(inode)); - OVL_I(inode)->version++; + /* + * Version is used by readdir code to keep cache consistent. For merge + * dirs all changes need to be noted. For non-merge dirs, cache only + * contains impure (ones which have been copied up and have origins) + * entries, so only need to note changes to impure entries. + */ + if (OVL_TYPE_MERGE(ovl_path_type(dentry)) || impurity) + OVL_I(inode)->version++; } u64 ovl_dentry_version_get(struct dentry *dentry) @@ -382,6 +389,11 @@ void ovl_set_flag(unsigned long flag, struct inode *inode) set_bit(flag, &OVL_I(inode)->flags); } +void ovl_clear_flag(unsigned long flag, struct inode *inode) +{ + clear_bit(flag, &OVL_I(inode)->flags); +} + bool ovl_test_flag(unsigned long flag, struct inode *inode) { return test_bit(flag, &OVL_I(inode)->flags); diff --git a/fs/signalfd.c b/fs/signalfd.c index 593b022ac11b..d2c434112f42 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -95,23 +95,23 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, */ err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo); err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno); - err |= __put_user((short) kinfo->si_code, &uinfo->ssi_code); - switch (kinfo->si_code & __SI_MASK) { - case __SI_KILL: + err |= __put_user(kinfo->si_code, &uinfo->ssi_code); + switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) { + case SIL_KILL: err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); break; - case __SI_TIMER: + case SIL_TIMER: err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); err |= __put_user(kinfo->si_int, &uinfo->ssi_int); break; - case __SI_POLL: + case SIL_POLL: err |= __put_user(kinfo->si_band, &uinfo->ssi_band); err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd); break; - case __SI_FAULT: + case SIL_FAULT: err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr); #ifdef __ARCH_SI_TRAPNO err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); @@ -128,20 +128,14 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, &uinfo->ssi_addr_lsb); #endif break; - case __SI_CHLD: + case SIL_CHLD: err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); err |= __put_user(kinfo->si_status, &uinfo->ssi_status); err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime); err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime); break; - case __SI_RT: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ: /* But this is */ - err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); - err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); - err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); - err |= __put_user(kinfo->si_int, &uinfo->ssi_int); - break; + case SIL_RT: default: /* * This case catches also the signals queued by sigqueue(). 
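The signalfd conversion just above replaces tests on the high bits of si_code (the old __SI_* masks) with siginfo_layout(), which classifies a (signal, si_code) pair into one of the SIL_* layouts. A rough userspace approximation of that classification, deliberately simpler than the kernel's table (which also covers SIGSYS, SIGEMT and more):

#include <signal.h>

enum sil_sketch { SKETCH_KILL, SKETCH_TIMER, SKETCH_POLL,
                  SKETCH_FAULT, SKETCH_CHLD, SKETCH_RT };

/* Approximation only: non-positive si_code values identify who queued
 * the signal; positive values mean the layout follows the signal
 * number itself. */
enum sil_sketch layout_of(int signo, int si_code)
{
        if (si_code <= 0) {
                if (si_code == SI_TIMER)
                        return SKETCH_TIMER;
                if (si_code == SI_USER)
                        return SKETCH_KILL;
                return SKETCH_RT;       /* sigqueue() and friends carry a value */
        }
        switch (signo) {
        case SIGCHLD:
                return SKETCH_CHLD;
        case SIGILL:
        case SIGFPE:
        case SIGSEGV:
        case SIGBUS:
        case SIGTRAP:
                return SKETCH_FAULT;
        case SIGIO:                     /* a.k.a. SIGPOLL */
                return SKETCH_POLL;
        default:
                return SKETCH_KILL;
        }
}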
diff --git a/fs/xattr.c b/fs/xattr.c index 464c94bf65f9..4424f7fecf14 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -23,6 +23,7 @@ #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> +#include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) @@ -441,6 +442,12 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_from_user(kvalue, size); + else if (strcmp(kname, XATTR_NAME_CAPS) == 0) { + error = cap_convert_nscap(d, &kvalue, size); + if (error < 0) + goto out; + size = error; + } } error = vfs_setxattr(d, kname, kvalue, size, flags); @@ -496,10 +503,10 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file(f.file); + error = mnt_want_write_file_path(f.file); if (!error) { error = setxattr(f.file->f_path.dentry, name, value, size, flags); - mnt_drop_write_file(f.file); + mnt_drop_write_file_path(f.file); } fdput(f); return error; @@ -728,10 +735,10 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file(f.file); + error = mnt_want_write_file_path(f.file); if (!error) { error = removexattr(f.file->f_path.dentry, name); - mnt_drop_write_file(f.file); + mnt_drop_write_file_path(f.file); } fdput(f); return error; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index fffae1390d7f..29172609f2a3 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -80,6 +80,19 @@ xfs_find_bdev_for_inode( return mp->m_ddev_targp->bt_bdev; } +struct dax_device * +xfs_find_daxdev_for_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) + return mp->m_rtdev_targp->bt_daxdev; + else + return mp->m_ddev_targp->bt_daxdev; +} + /* * We're now finished for good with this page. 
Update the page state via the * associated buffer_heads, paying attention to the start and end offsets that diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index cc174ec6c2fd..88c85ea63da0 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -59,5 +59,6 @@ int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); extern struct block_device *xfs_find_bdev_for_inode(struct inode *); +extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b1c9711e79a4..da14658da310 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1802,7 +1802,8 @@ xfs_setsize_buftarg_early( xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev) + struct block_device *bdev, + struct dax_device *dax_dev) { xfs_buftarg_t *btp; @@ -1811,6 +1812,7 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; + btp->bt_daxdev = dax_dev; if (xfs_setsize_buftarg_early(btp, bdev)) goto error; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 20721261dae5..bf71507ddb16 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -108,6 +108,7 @@ typedef unsigned int xfs_buf_flags_t; typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; + struct dax_device *bt_daxdev; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; @@ -385,7 +386,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *); + struct block_device *, struct dax_device *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 79cb5b3d140c..a1909bc064e9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -69,6 +69,7 @@ xfs_bmbt_to_iomap( iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); + iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); } xfs_extlen_t @@ -975,7 +976,6 @@ xfs_file_iomap_begin( int nimaps = 1, error = 0; bool shared = false, trimmed = false; unsigned lockmode; - struct block_device *bdev; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1085,13 +1085,6 @@ xfs_file_iomap_begin( xfs_bmbt_to_iomap(ip, iomap, &imap); - /* optionally associate a dax device with the iomap bdev */ - bdev = iomap->bdev; - if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); - else - iomap->dax_dev = NULL; - if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; @@ -1169,7 +1162,6 @@ xfs_file_iomap_end( unsigned flags, struct iomap *iomap) { - fs_put_dax(iomap->dax_dev); if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length, written, iomap); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9301c5a6060b..dcd1292664b3 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -270,7 +270,14 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) #endif /* DEBUG */ #ifdef CONFIG_XFS_RT -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) + +/* + * make sure we ignore the inode flag if the filesystem 
doesn't have a + * configured realtime device. + */ +#define XFS_IS_REALTIME_INODE(ip) \ + (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \ + (ip)->i_mount->m_rtdev_targp) #else #define XFS_IS_REALTIME_INODE(ip) (0) #endif diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c1c4c2ea1014..3008f31753df 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -714,17 +714,26 @@ STATIC void xfs_close_devices( struct xfs_mount *mp) { + struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev; + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { struct block_device *logdev = mp->m_logdev_targp->bt_bdev; + struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev; + xfs_free_buftarg(mp, mp->m_logdev_targp); xfs_blkdev_put(logdev); + fs_put_dax(dax_logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; + struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev; + xfs_free_buftarg(mp, mp->m_rtdev_targp); xfs_blkdev_put(rtdev); + fs_put_dax(dax_rtdev); } xfs_free_buftarg(mp, mp->m_ddev_targp); + fs_put_dax(dax_ddev); } /* @@ -742,6 +751,8 @@ xfs_open_devices( struct xfs_mount *mp) { struct block_device *ddev = mp->m_super->s_bdev; + struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev); + struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL; struct block_device *logdev = NULL, *rtdev = NULL; int error; @@ -752,6 +763,7 @@ xfs_open_devices( error = xfs_blkdev_get(mp, mp->m_logname, &logdev); if (error) goto out; + dax_logdev = fs_dax_get_by_bdev(logdev); } if (mp->m_rtname) { @@ -765,24 +777,25 @@ xfs_open_devices( error = -EINVAL; goto out_close_rtdev; } + dax_rtdev = fs_dax_get_by_bdev(rtdev); } /* * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -798,10 +811,14 @@ xfs_open_devices( xfs_free_buftarg(mp, mp->m_ddev_targp); out_close_rtdev: xfs_blkdev_put(rtdev); + fs_put_dax(dax_rtdev); out_close_logdev: - if (logdev && logdev != ddev) + if (logdev && logdev != ddev) { xfs_blkdev_put(logdev); + fs_put_dax(dax_logdev); + } out: + fs_put_dax(dax_ddev); return error; } diff --git a/include/linux/capability.h b/include/linux/capability.h index 6ffb67e10c06..b52e278e4744 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -248,4 +248,6 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); + #endif /* !_LINUX_CAPABILITY_H */ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index edf5b04b918a..b422170b791a 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -167,6 +167,8 @@ struct ceph_mon_request_header { struct ceph_mon_statfs { struct ceph_mon_request_header monhdr; struct ceph_fsid fsid; + __u8 contains_data_pool; + __le64 data_pool; } 
__attribute__ ((packed)); struct ceph_statfs { @@ -669,7 +671,9 @@ enum { extern const char *ceph_cap_op_name(int op); /* flags field in client cap messages (version >= 10) */ -#define CEPH_CLIENT_CAPS_SYNC (0x1) +#define CEPH_CLIENT_CAPS_SYNC (1<<0) +#define CEPH_CLIENT_CAPS_NO_CAPSNAP (1<<1) +#define CEPH_CLIENT_CAPS_PENDING_CAPSNAP (1<<2) /* * caps message, used for capability callbacks, acks, requests, etc. diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 8a79587e1317..4c846aabd9f6 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -84,17 +84,6 @@ struct ceph_options { #define CEPH_AUTH_NAME_DEFAULT "guest" -/* - * Delay telling the MDS we no longer want caps, in case we reopen - * the file. Delay a minimum amount of time, even if we send a cap - * message for some other reason. Otherwise, take the oppotunity to - * update the mds to avoid sending another message later. - */ -#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ -#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ - -#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4) - /* mount state */ enum { CEPH_MOUNT_MOUNTING, diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index d5a3ecea578d..0fa990bf867a 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -133,8 +133,8 @@ void ceph_monc_renew_subs(struct ceph_mon_client *monc); extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, unsigned long timeout); -extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, - struct ceph_statfs *buf); +int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool, + struct ceph_statfs *buf); int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, u64 *newest); diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index b8281feda9c7..01408841c9c4 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -230,7 +230,6 @@ extern const char *ceph_osd_state_name(int s); \ /* fancy write */ \ f(APPEND, __CEPH_OSD_OP(WR, DATA, 6), "append") \ - f(STARTSYNC, __CEPH_OSD_OP(WR, DATA, 7), "startsync") \ f(SETTRUNC, __CEPH_OSD_OP(WR, DATA, 8), "settrunc") \ f(TRIMTRUNC, __CEPH_OSD_OP(WR, DATA, 9), "trimtrunc") \ \ diff --git a/include/linux/cper.h b/include/linux/cper.h index 4c671fc2081e..723e952fde0d 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -74,36 +74,36 @@ enum { * Corrected Machine Check */ #define CPER_NOTIFY_CMC \ - UUID_LE(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \ - 0xEB, 0xD4, 0xF8, 0x90) + GUID_INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \ + 0xEB, 0xD4, 0xF8, 0x90) /* Corrected Platform Error */ #define CPER_NOTIFY_CPE \ - UUID_LE(0x4E292F96, 0xD843, 0x4a55, 0xA8, 0xC2, 0xD4, 0x81, \ - 0xF2, 0x7E, 0xBE, 0xEE) + GUID_INIT(0x4E292F96, 0xD843, 0x4a55, 0xA8, 0xC2, 0xD4, 0x81, \ + 0xF2, 0x7E, 0xBE, 0xEE) /* Machine Check Exception */ #define CPER_NOTIFY_MCE \ - UUID_LE(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \ - 0xE1, 0x49, 0x13, 0xBB) + GUID_INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \ + 0xE1, 0x49, 0x13, 0xBB) /* PCI Express Error */ #define CPER_NOTIFY_PCIE \ - UUID_LE(0xCF93C01F, 0x1A16, 0x4dfc, 0xB8, 0xBC, 0x9C, 0x4D, \ - 0xAF, 0x67, 0xC1, 0x04) + GUID_INIT(0xCF93C01F, 0x1A16, 0x4dfc, 0xB8, 0xBC, 0x9C, 0x4D, \ + 0xAF, 0x67, 0xC1, 0x04) /* INIT Record (for IPF) */ #define CPER_NOTIFY_INIT \ - UUID_LE(0xCC5263E8, 0x9308, 
0x454a, 0x89, 0xD0, 0x34, 0x0B, \ - 0xD3, 0x9B, 0xC9, 0x8E) + GUID_INIT(0xCC5263E8, 0x9308, 0x454a, 0x89, 0xD0, 0x34, 0x0B, \ + 0xD3, 0x9B, 0xC9, 0x8E) /* Non-Maskable Interrupt */ #define CPER_NOTIFY_NMI \ - UUID_LE(0x5BAD89FF, 0xB7E6, 0x42c9, 0x81, 0x4A, 0xCF, 0x24, \ - 0x85, 0xD6, 0xE9, 0x8A) + GUID_INIT(0x5BAD89FF, 0xB7E6, 0x42c9, 0x81, 0x4A, 0xCF, 0x24, \ + 0x85, 0xD6, 0xE9, 0x8A) /* BOOT Error Record */ #define CPER_NOTIFY_BOOT \ - UUID_LE(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \ - 0xD4, 0x64, 0xB3, 0x8F) + GUID_INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \ + 0xD4, 0x64, 0xB3, 0x8F) /* DMA Remapping Error */ #define CPER_NOTIFY_DMAR \ - UUID_LE(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ - 0x72, 0x2D, 0xEB, 0x41) + GUID_INIT(0x667DD791, 0xC6B3, 0x4c27, 0x8A, 0x6B, 0x0F, 0x8E, \ + 0x72, 0x2D, 0xEB, 0x41) /* * Flags bits definitions for flags in struct cper_record_header @@ -170,50 +170,50 @@ enum { * Processor Generic */ #define CPER_SEC_PROC_GENERIC \ - UUID_LE(0x9876CCAD, 0x47B4, 0x4bdb, 0xB6, 0x5E, 0x16, 0xF1, \ - 0x93, 0xC4, 0xF3, 0xDB) + GUID_INIT(0x9876CCAD, 0x47B4, 0x4bdb, 0xB6, 0x5E, 0x16, 0xF1, \ + 0x93, 0xC4, 0xF3, 0xDB) /* Processor Specific: X86/X86_64 */ #define CPER_SEC_PROC_IA \ - UUID_LE(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \ - 0x24, 0x2B, 0x6E, 0x1D) + GUID_INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \ + 0x24, 0x2B, 0x6E, 0x1D) /* Processor Specific: IA64 */ #define CPER_SEC_PROC_IPF \ - UUID_LE(0xE429FAF1, 0x3CB7, 0x11D4, 0x0B, 0xCA, 0x07, 0x00, \ - 0x80, 0xC7, 0x3C, 0x88, 0x81) + GUID_INIT(0xE429FAF1, 0x3CB7, 0x11D4, 0x0B, 0xCA, 0x07, 0x00, \ + 0x80, 0xC7, 0x3C, 0x88, 0x81) /* Processor Specific: ARM */ #define CPER_SEC_PROC_ARM \ - UUID_LE(0xE19E3D16, 0xBC11, 0x11E4, 0x9C, 0xAA, 0xC2, 0x05, \ - 0x1D, 0x5D, 0x46, 0xB0) + GUID_INIT(0xE19E3D16, 0xBC11, 0x11E4, 0x9C, 0xAA, 0xC2, 0x05, \ + 0x1D, 0x5D, 0x46, 0xB0) /* Platform Memory */ #define CPER_SEC_PLATFORM_MEM \ - UUID_LE(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \ - 0xED, 0x7C, 0x83, 0xB1) + GUID_INIT(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \ + 0xED, 0x7C, 0x83, 0xB1) #define CPER_SEC_PCIE \ - UUID_LE(0xD995E954, 0xBBC1, 0x430F, 0xAD, 0x91, 0xB4, 0x4D, \ - 0xCB, 0x3C, 0x6F, 0x35) + GUID_INIT(0xD995E954, 0xBBC1, 0x430F, 0xAD, 0x91, 0xB4, 0x4D, \ + 0xCB, 0x3C, 0x6F, 0x35) /* Firmware Error Record Reference */ #define CPER_SEC_FW_ERR_REC_REF \ - UUID_LE(0x81212A96, 0x09ED, 0x4996, 0x94, 0x71, 0x8D, 0x72, \ - 0x9C, 0x8E, 0x69, 0xED) + GUID_INIT(0x81212A96, 0x09ED, 0x4996, 0x94, 0x71, 0x8D, 0x72, \ + 0x9C, 0x8E, 0x69, 0xED) /* PCI/PCI-X Bus */ #define CPER_SEC_PCI_X_BUS \ - UUID_LE(0xC5753963, 0x3B84, 0x4095, 0xBF, 0x78, 0xED, 0xDA, \ - 0xD3, 0xF9, 0xC9, 0xDD) + GUID_INIT(0xC5753963, 0x3B84, 0x4095, 0xBF, 0x78, 0xED, 0xDA, \ + 0xD3, 0xF9, 0xC9, 0xDD) /* PCI Component/Device */ #define CPER_SEC_PCI_DEV \ - UUID_LE(0xEB5E4685, 0xCA66, 0x4769, 0xB6, 0xA2, 0x26, 0x06, \ - 0x8B, 0x00, 0x13, 0x26) + GUID_INIT(0xEB5E4685, 0xCA66, 0x4769, 0xB6, 0xA2, 0x26, 0x06, \ + 0x8B, 0x00, 0x13, 0x26) #define CPER_SEC_DMAR_GENERIC \ - UUID_LE(0x5B51FEF7, 0xC79D, 0x4434, 0x8F, 0x1B, 0xAA, 0x62, \ - 0xDE, 0x3E, 0x2C, 0x64) + GUID_INIT(0x5B51FEF7, 0xC79D, 0x4434, 0x8F, 0x1B, 0xAA, 0x62, \ + 0xDE, 0x3E, 0x2C, 0x64) /* Intel VT for Directed I/O specific DMAr */ #define CPER_SEC_DMAR_VT \ - UUID_LE(0x71761D37, 0x32B2, 0x45cd, 0xA7, 0xD0, 0xB0, 0xFE, \ - 0xDD, 0x93, 0xE8, 0xCF) + GUID_INIT(0x71761D37, 0x32B2, 0x45cd, 0xA7, 0xD0, 0xB0, 0xFE, \ + 0xDD, 0x93, 0xE8, 
0xCF) /* IOMMU specific DMAr */ #define CPER_SEC_DMAR_IOMMU \ - UUID_LE(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \ - 0xDF, 0xAA, 0x84, 0xEC) + GUID_INIT(0x036F84E1, 0x7F37, 0x428c, 0xA7, 0x9E, 0x57, 0x5F, \ + 0xDF, 0xAA, 0x84, 0xEC) #define CPER_PROC_VALID_TYPE 0x0001 #define CPER_PROC_VALID_ISA 0x0002 @@ -290,10 +290,10 @@ struct cper_record_header { __u32 validation_bits; __u32 record_length; __u64 timestamp; - uuid_le platform_id; - uuid_le partition_id; - uuid_le creator_id; - uuid_le notification_type; + guid_t platform_id; + guid_t partition_id; + guid_t creator_id; + guid_t notification_type; __u64 record_id; __u32 flags; __u64 persistence_information; @@ -309,8 +309,8 @@ struct cper_section_descriptor { __u8 validation_bits; __u8 reserved; /* must be zero */ __u32 flags; - uuid_le section_type; - uuid_le fru_id; + guid_t section_type; + guid_t fru_id; __u32 section_severity; __u8 fru_text[20]; }; @@ -343,7 +343,7 @@ struct cper_sec_proc_ia { /* IA32/X64 Processor Error Information Structure */ struct cper_ia_err_info { - uuid_le err_type; + guid_t err_type; __u64 validation_bits; __u64 check_info; __u64 target_id; diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e74655d941b7..a1e6a33a4b03 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -51,7 +51,9 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); +extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); +extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -164,11 +166,15 @@ static inline bool cpusets_enabled(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} +static inline void cpuset_force_rebuild(void) { } + static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_wait_for_hotplug(void) { } + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/include/linux/dax.h b/include/linux/dax.h index eb0bff6f1eab..46cad1d0f129 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -57,6 +57,7 @@ static inline void fs_put_dax(struct dax_device *dax_dev) put_dax(dax_dev); } +struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); #else static inline int bdev_dax_supported(struct super_block *sb, int blocksize) { @@ -71,6 +72,11 @@ static inline struct dax_device *fs_dax_get_by_host(const char *host) static inline void fs_put_dax(struct dax_device *dax_dev) { } + +static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) +{ + return NULL; +} #endif int dax_read_lock(void); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index aae1cdb76851..ed1a7cf6923a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -147,7 +147,7 @@ struct dentry_operations { struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, - unsigned int); + unsigned int, unsigned int); } ____cacheline_aligned; /* @@ -562,11 +562,15 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) return upper; } +/* d_real() flags */ +#define D_REAL_UPPER 0x2 /* return upper dentry or NULL if non-upper */ + /** 
* d_real - Return the real dentry * @dentry: the dentry to query * @inode: inode to select the dentry from multiple layers (can be NULL) - * @flags: open flags to control copy-up behavior + * @open_flags: open flags to control copy-up behavior + * @flags: flags to control what is returned by this function * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. @@ -575,10 +579,10 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper) */ static inline struct dentry *d_real(struct dentry *dentry, const struct inode *inode, - unsigned int flags) + unsigned int open_flags, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) - return dentry->d_op->d_real(dentry, inode, flags); + return dentry->d_op->d_real(dentry, inode, open_flags, flags); else return dentry; } @@ -593,7 +597,7 @@ static inline struct dentry *d_real(struct dentry *dentry, static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ - return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0)); + return d_backing_inode(d_real((struct dentry *) dentry, NULL, 0, 0)); } struct name_snapshot { diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2189c79cde5d..29ce9815da87 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -550,26 +550,13 @@ static inline void dma_free_coherent(struct device *dev, size_t size, return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0); } -static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp) -{ - return dma_alloc_attrs(dev, size, dma_handle, gfp, - DMA_ATTR_NON_CONSISTENT); -} - -static inline void dma_free_noncoherent(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_handle) -{ - dma_free_attrs(dev, size, cpu_addr, dma_handle, - DMA_ATTR_NON_CONSISTENT); -} - static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { - debug_dma_mapping_error(dev, dma_addr); + const struct dma_map_ops *ops = get_dma_ops(dev); - if (get_dma_ops(dev)->mapping_error) - return get_dma_ops(dev)->mapping_error(dev, dma_addr); + debug_dma_mapping_error(dev, dma_addr); + if (ops->mapping_error) + return ops->mapping_error(dev, dma_addr); return 0; } @@ -720,10 +707,7 @@ static inline int dma_get_cache_alignment(void) #endif /* flags for the coherent memory api */ -#define DMA_MEMORY_MAP 0x01 -#define DMA_MEMORY_IO 0x02 -#define DMA_MEMORY_INCLUDES_CHILDREN 0x04 -#define DMA_MEMORY_EXCLUSIVE 0x08 +#define DMA_MEMORY_EXCLUSIVE 0x01 #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, @@ -736,7 +720,7 @@ static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags) { - return 0; + return -ENOSYS; } static inline void diff --git a/include/linux/efi.h b/include/linux/efi.h index 65905c3cb655..66f4a4e79f4b 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -47,10 +47,10 @@ typedef u16 efi_char16_t; /* UNICODE character */ typedef u64 efi_physical_addr_t; typedef void *efi_handle_t; -typedef uuid_le efi_guid_t; +typedef guid_t efi_guid_t; #define EFI_GUID(a,b,c,d0,d1,d2,d3,d4,d5,d6,d7) \ - UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) + GUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) /* * Generic EFI table header diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h 
index b6feed6547ce..2a0c453d7235 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -186,6 +186,8 @@ struct f2fs_extent { #define F2FS_NAME_LEN 255 #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ +#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ + get_extra_isize(inode)) #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ #define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ @@ -205,9 +207,7 @@ struct f2fs_extent { #define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */ #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ - -#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ - F2FS_INLINE_XATTR_ADDRS - 1)) +#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -235,8 +235,16 @@ struct f2fs_inode { struct f2fs_extent i_ext; /* caching a largest extent */ - __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ - + union { + struct { + __le16 i_extra_isize; /* extra inode attribute size */ + __le16 i_padding; /* padding */ + __le32 i_projid; /* project id */ + __le32 i_inode_checksum;/* inode meta checksum */ + __le32 i_extra_end[0]; /* for attribute size calculation */ + }; + __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ + }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), double_indirect(1) node id */ } __packed; @@ -465,7 +473,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) /* - * space utilization of regular dentry and inline dentry + * space utilization of regular dentry and inline dentry (w/o extra reservation) * regular dentry inline dentry * bitmap 1 * 27 = 27 1 * 23 = 23 * reserved 1 * 3 = 3 1 * 7 = 7 @@ -501,24 +509,6 @@ struct f2fs_dentry_block { __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; -/* for inline dir */ -#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - BITS_PER_BYTE + 1)) -#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \ - BITS_PER_BYTE - 1) / BITS_PER_BYTE) -#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE)) - -/* inline directory entry structure */ -struct f2fs_inline_dentry { - __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE]; - __u8 reserved[INLINE_RESERVED_SIZE]; - struct f2fs_dir_entry dentry[NR_INLINE_DENTRY]; - __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN]; -} __packed; - /* file types used in inode_info->flags */ enum { F2FS_FT_UNKNOWN, @@ -534,4 +524,6 @@ enum { #define S_SHIFT 12 +#define F2FS_DEF_PROJID 0 /* default project ID */ + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 2d0e6748e46e..33d8e45cd874 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1235,7 +1235,7 @@ static inline struct inode *file_inode(const struct file *f) static inline struct dentry *file_dentry(const struct file *file) { - return d_real(file->f_path.dentry, file_inode(file), 0); + return d_real(file->f_path.dentry, file_inode(file), 0, 0); } static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index f3d3e6af8838..3eaad2fbf284 
100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -87,6 +87,7 @@ struct nd_mapping_desc { struct nvdimm *nvdimm; u64 start; u64 size; + int position; }; struct nd_region_desc { @@ -173,4 +174,19 @@ u64 nd_fletcher64(void *addr, size_t len, bool le); void nvdimm_flush(struct nd_region *nd_region); int nvdimm_has_flush(struct nd_region *nd_region); int nvdimm_has_cache(struct nd_region *nd_region); + +#ifdef CONFIG_ARCH_HAS_PMEM_API +#define ARCH_MEMREMAP_PMEM MEMREMAP_WB +void arch_wb_cache_pmem(void *addr, size_t size); +void arch_invalidate_pmem(void *addr, size_t size); +#else +#define ARCH_MEMREMAP_PMEM MEMREMAP_WT +static inline void arch_wb_cache_pmem(void *addr, size_t size) +{ +} +static inline void arch_invalidate_pmem(void *addr, size_t size) +{ +} +#endif + #endif /* __LIBNVDIMM_H__ */ diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h index 22b5d4e687ce..d1c2901f1542 100644 --- a/include/linux/lsm_audit.h +++ b/include/linux/lsm_audit.h @@ -4,7 +4,7 @@ * * Author : Etienne BASSET <etienne.basset@ensta.org> * - * All credits to : Stephen Smalley, <sds@epoch.ncsc.mil> + * All credits to : Stephen Smalley, <sds@tycho.nsa.gov> * All BUGS to : Etienne BASSET <etienne.basset@ensta.org> */ #ifndef _LSM_COMMON_LOGGING_ diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d1c7bef25691..c9258124e417 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -528,11 +528,6 @@ * * Security hooks for task operations. * - * @task_create: - * Check permission before creating a child process. See the clone(2) - * manual page for definitions of the @clone_flags. - * @clone_flags contains the flags indicating what should be shared. - * Return 0 if permission is granted. * @task_alloc: * @task task being allocated. * @clone_flags contains the flags indicating what should be shared. @@ -1505,7 +1500,6 @@ union security_list_options { int (*file_receive)(struct file *file); int (*file_open)(struct file *file, const struct cred *cred); - int (*task_create)(unsigned long clone_flags); int (*task_alloc)(struct task_struct *task, unsigned long clone_flags); void (*task_free)(struct task_struct *task); int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp); @@ -1779,7 +1773,6 @@ struct security_hook_heads { struct list_head file_send_sigiotask; struct list_head file_receive; struct list_head file_open; - struct list_head task_create; struct list_head task_alloc; struct list_head task_free; struct list_head cred_alloc_blank; diff --git a/include/linux/mem_encrypt.h b/include/linux/mem_encrypt.h index 1255f09f5e42..265a9cd21cb4 100644 --- a/include/linux/mem_encrypt.h +++ b/include/linux/mem_encrypt.h @@ -21,7 +21,7 @@ #else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ -#define sme_me_mask 0UL +#define sme_me_mask 0ULL #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ @@ -30,18 +30,23 @@ static inline bool sme_active(void) return !!sme_me_mask; } -static inline unsigned long sme_get_me_mask(void) +static inline u64 sme_get_me_mask(void) { return sme_me_mask; } +#ifdef CONFIG_AMD_MEM_ENCRYPT /* * The __sme_set() and __sme_clr() macros are useful for adding or removing * the encryption mask from a value (e.g. when dealing with pagetable * entries). 
*/ -#define __sme_set(x) ((unsigned long)(x) | sme_me_mask) -#define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask) +#define __sme_set(x) ((x) | sme_me_mask) +#define __sme_clr(x) ((x) & ~sme_me_mask) +#else +#define __sme_set(x) (x) +#define __sme_clr(x) (x) +#endif #endif /* __ASSEMBLY__ */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 5cc91d6381a3..a0282ceaa48b 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -49,7 +49,6 @@ struct nfs_access_entry { struct rb_node rb_node; struct list_head lru; - unsigned long jiffies; struct rpc_cred * cred; __u32 mask; struct rcu_head rcu_head; @@ -154,7 +153,7 @@ struct nfs_inode { */ __be32 cookieverf[2]; - unsigned long nrequests; + atomic_long_t nrequests; struct nfs_mds_commit_info commit_info; /* Open contexts for shared mmap writes */ @@ -163,6 +162,7 @@ struct nfs_inode { /* Readers: in-flight sillydelete RPC calls */ /* Writers: rmdir */ struct rw_semaphore rmdir_sem; + struct mutex commit_mutex; #if IS_ENABLED(CONFIG_NFS_V4) struct nfs4_cached_acl *nfs4_acl; @@ -510,7 +510,7 @@ extern void nfs_commit_free(struct nfs_commit_data *data); static inline int nfs_have_writebacks(struct inode *inode) { - return NFS_I(inode)->nrequests != 0; + return atomic_long_read(&NFS_I(inode)->nrequests) != 0; } /* diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index d67b67ae6c8b..d117120c9b6e 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -125,8 +125,7 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int how, - gfp_t gfp_flags); + int how); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern int nfs_pageio_resend(struct nfs_pageio_descriptor *, @@ -139,8 +138,7 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, extern int nfs_wait_on_request(struct nfs_page *); extern void nfs_unlock_request(struct nfs_page *req); extern void nfs_unlock_and_release_request(struct nfs_page *); -extern int nfs_page_group_lock(struct nfs_page *, bool); -extern void nfs_page_group_lock_wait(struct nfs_page *); +extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 62cbcb842f99..164d5359d4ab 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1476,7 +1476,7 @@ struct nfs_pgio_header { struct nfs_mds_commit_info { atomic_t rpcs_out; - unsigned long ncommit; + atomic_long_t ncommit; struct list_head list; }; diff --git a/include/linux/security.h b/include/linux/security.h index 974bb9b0996c..ce6265960d6c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -90,6 +90,8 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name, extern int cap_inode_removexattr(struct dentry *dentry, const char *name); extern int cap_inode_need_killpriv(struct dentry *dentry); extern int cap_inode_killpriv(struct dentry *dentry); +extern int cap_inode_getsecurity(struct inode *inode, const char *name, + void **buffer, bool alloc); extern int cap_mmap_addr(unsigned long addr); extern int cap_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags); @@ -316,7 +318,6 @@ int 
security_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int sig); int security_file_receive(struct file *file); int security_file_open(struct file *file, const struct cred *cred); -int security_task_create(unsigned long clone_flags); int security_task_alloc(struct task_struct *task, unsigned long clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); @@ -878,11 +879,6 @@ static inline int security_file_open(struct file *file, return 0; } -static inline int security_task_create(unsigned long clone_flags) -{ - return 0; -} - static inline int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { diff --git a/include/linux/signal.h b/include/linux/signal.h index e2678b5dbb21..38564e3e54c7 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -21,6 +21,20 @@ static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from); +enum siginfo_layout { + SIL_KILL, + SIL_TIMER, + SIL_POLL, + SIL_FAULT, + SIL_CHLD, + SIL_RT, +#ifdef __ARCH_SIGSYS + SIL_SYS, +#endif +}; + +enum siginfo_layout siginfo_layout(int sig, int si_code); + /* * Define some primitives to manipulate sigset_t. */ @@ -380,10 +394,18 @@ int unhandled_signal(struct task_struct *tsk, int sig); rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) ) +#define SIG_SPECIFIC_SICODES_MASK (\ + rt_sigmask(SIGILL) | rt_sigmask(SIGFPE) | \ + rt_sigmask(SIGSEGV) | rt_sigmask(SIGBUS) | \ + rt_sigmask(SIGTRAP) | rt_sigmask(SIGCHLD) | \ + rt_sigmask(SIGPOLL) | rt_sigmask(SIGSYS) | \ + SIGEMT_MASK ) + #define sig_kernel_only(sig) siginmask(sig, SIG_KERNEL_ONLY_MASK) #define sig_kernel_coredump(sig) siginmask(sig, SIG_KERNEL_COREDUMP_MASK) #define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) #define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) +#define sig_specific_sicodes(sig) siginmask(sig, SIG_SPECIFIC_SICODES_MASK) #define sig_fatal(t, signr) \ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 50a99a117da7..c1768f9d993b 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -139,6 +139,8 @@ struct rpc_task_setup { #define RPC_TASK_RUNNING 0 #define RPC_TASK_QUEUED 1 #define RPC_TASK_ACTIVE 2 +#define RPC_TASK_MSG_RECV 3 +#define RPC_TASK_MSG_RECV_WAIT 4 #define RPC_IS_RUNNING(t) test_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) #define rpc_set_running(t) set_bit(RPC_TASK_RUNNING, &(t)->tk_runstate) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 261b48a2701d..86b59e3525a5 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -239,6 +239,19 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +/** + * xdr_stream_remaining - Return the number of bytes remaining in the stream + * @xdr: pointer to struct xdr_stream + * + * Return value: + * Number of bytes remaining in @xdr before xdr->end + */ +static inline size_t +xdr_stream_remaining(const struct xdr_stream *xdr) +{ + return xdr->nwords << 2; +} + ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char 
**str, size_t maxlen, gfp_t gfp_flags); /** diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index eab1c749e192..5a7bff41f6b7 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -174,7 +174,7 @@ enum xprt_transports { struct rpc_xprt { struct kref kref; /* Reference count */ - struct rpc_xprt_ops * ops; /* transport methods */ + const struct rpc_xprt_ops *ops; /* transport methods */ const struct rpc_timeout *timeout; /* timeout parms */ struct sockaddr_storage addr; /* server address */ @@ -232,6 +232,7 @@ struct rpc_xprt { */ spinlock_t transport_lock; /* lock transport info */ spinlock_t reserve_lock; /* lock slot table */ + spinlock_t recv_lock; /* lock receive list */ u32 xid; /* Next XID value to use */ struct rpc_task * snd_task; /* Task blocked in send */ struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ @@ -372,6 +373,8 @@ void xprt_write_space(struct rpc_xprt *xprt); void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); void xprt_complete_rqst(struct rpc_task *task, int copied); +void xprt_pin_rqst(struct rpc_rqst *req); +void xprt_unpin_rqst(struct rpc_rqst *req); void xprt_release_rqst_cong(struct rpc_task *task); void xprt_disconnect_done(struct rpc_xprt *xprt); void xprt_force_disconnect(struct rpc_xprt *xprt); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index dab11f97e1c6..fd5b959c753c 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -102,6 +102,7 @@ enum thermal_notify_event { THERMAL_DEVICE_DOWN, /* Thermal device is down */ THERMAL_DEVICE_UP, /* Thermal device is up after a down event */ THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */ + THERMAL_TABLE_CHANGED, /* Thermal table(s) changed */ }; struct thermal_zone_device_ops { diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index b3575ce29148..c18e01252346 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -112,8 +112,9 @@ extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); +extern bool in_userns(const struct user_namespace *ancestor, + const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); - struct ns_common *ns_get_owner(struct ns_common *ns); #else @@ -144,6 +145,12 @@ static inline bool userns_may_setgroups(const struct user_namespace *ns) return true; } +static inline bool in_userns(const struct user_namespace *ancestor, + const struct user_namespace *child) +{ + return true; +} + static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index bc4dd7837e4c..5d216f7fb05a 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -543,14 +543,14 @@ TRACE_EVENT(f2fs_map_blocks, TRACE_EVENT(f2fs_background_gc, - TP_PROTO(struct super_block *sb, long wait_ms, + TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) - __field(long, wait_ms) + __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, 
free) ), @@ -562,13 +562,120 @@ TRACE_EVENT(f2fs_background_gc, __entry->free = free; ), - TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", + TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, __entry->free) ); +TRACE_EVENT(f2fs_gc_begin, + + TP_PROTO(struct super_block *sb, bool sync, bool background, + long long dirty_nodes, long long dirty_dents, + long long dirty_imeta, unsigned int free_sec, + unsigned int free_seg, int reserved_seg, + unsigned int prefree_seg), + + TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(bool, sync) + __field(bool, background) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->sync = sync; + __entry->background = background; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + "rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->sync, + __entry->background, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + +TRACE_EVENT(f2fs_gc_end, + + TP_PROTO(struct super_block *sb, int ret, int seg_freed, + int sec_freed, long long dirty_nodes, + long long dirty_dents, long long dirty_imeta, + unsigned int free_sec, unsigned int free_seg, + int reserved_seg, unsigned int prefree_seg), + + TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, + dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ret) + __field(int, seg_freed) + __field(int, sec_freed) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->ret = ret; + __entry->seg_freed = seg_freed; + __entry->sec_freed = sec_freed; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, " + "free_seg:%u, rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->ret, + __entry->seg_freed, + __entry->sec_freed, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h 
index 9c4eca6b374a..e5aa6794cea4 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -151,29 +151,6 @@ typedef struct siginfo { #define si_arch _sifields._sigsys._arch #endif -#ifdef __KERNEL__ -#define __SI_MASK 0xffff0000u -#define __SI_KILL (0 << 16) -#define __SI_TIMER (1 << 16) -#define __SI_POLL (2 << 16) -#define __SI_FAULT (3 << 16) -#define __SI_CHLD (4 << 16) -#define __SI_RT (5 << 16) -#define __SI_MESGQ (6 << 16) -#define __SI_SYS (7 << 16) -#define __SI_CODE(T,N) ((T) | ((N) & 0xffff)) -#else /* __KERNEL__ */ -#define __SI_KILL 0 -#define __SI_TIMER 0 -#define __SI_POLL 0 -#define __SI_FAULT 0 -#define __SI_CHLD 0 -#define __SI_RT 0 -#define __SI_MESGQ 0 -#define __SI_SYS 0 -#define __SI_CODE(T,N) (N) -#endif /* __KERNEL__ */ - /* * si_code values * Digital reserves positive values for kernel-generated signals. @@ -181,8 +158,8 @@ typedef struct siginfo { #define SI_USER 0 /* sent by kill, sigsend, raise */ #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */ #define SI_QUEUE -1 /* sent by sigqueue */ -#define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */ -#define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */ +#define SI_TIMER -2 /* sent by timer expiration */ +#define SI_MESGQ -3 /* sent by real time mesq state change */ #define SI_ASYNCIO -4 /* sent by AIO completion */ #define SI_SIGIO -5 /* sent by queued SIGIO */ #define SI_TKILL -6 /* sent by tkill system call */ @@ -194,86 +171,86 @@ typedef struct siginfo { /* * SIGILL si_codes */ -#define ILL_ILLOPC (__SI_FAULT|1) /* illegal opcode */ -#define ILL_ILLOPN (__SI_FAULT|2) /* illegal operand */ -#define ILL_ILLADR (__SI_FAULT|3) /* illegal addressing mode */ -#define ILL_ILLTRP (__SI_FAULT|4) /* illegal trap */ -#define ILL_PRVOPC (__SI_FAULT|5) /* privileged opcode */ -#define ILL_PRVREG (__SI_FAULT|6) /* privileged register */ -#define ILL_COPROC (__SI_FAULT|7) /* coprocessor error */ -#define ILL_BADSTK (__SI_FAULT|8) /* internal stack error */ +#define ILL_ILLOPC 1 /* illegal opcode */ +#define ILL_ILLOPN 2 /* illegal operand */ +#define ILL_ILLADR 3 /* illegal addressing mode */ +#define ILL_ILLTRP 4 /* illegal trap */ +#define ILL_PRVOPC 5 /* privileged opcode */ +#define ILL_PRVREG 6 /* privileged register */ +#define ILL_COPROC 7 /* coprocessor error */ +#define ILL_BADSTK 8 /* internal stack error */ #define NSIGILL 8 /* * SIGFPE si_codes */ -#define FPE_INTDIV (__SI_FAULT|1) /* integer divide by zero */ -#define FPE_INTOVF (__SI_FAULT|2) /* integer overflow */ -#define FPE_FLTDIV (__SI_FAULT|3) /* floating point divide by zero */ -#define FPE_FLTOVF (__SI_FAULT|4) /* floating point overflow */ -#define FPE_FLTUND (__SI_FAULT|5) /* floating point underflow */ -#define FPE_FLTRES (__SI_FAULT|6) /* floating point inexact result */ -#define FPE_FLTINV (__SI_FAULT|7) /* floating point invalid operation */ -#define FPE_FLTSUB (__SI_FAULT|8) /* subscript out of range */ +#define FPE_INTDIV 1 /* integer divide by zero */ +#define FPE_INTOVF 2 /* integer overflow */ +#define FPE_FLTDIV 3 /* floating point divide by zero */ +#define FPE_FLTOVF 4 /* floating point overflow */ +#define FPE_FLTUND 5 /* floating point underflow */ +#define FPE_FLTRES 6 /* floating point inexact result */ +#define FPE_FLTINV 7 /* floating point invalid operation */ +#define FPE_FLTSUB 8 /* subscript out of range */ #define NSIGFPE 8 /* * SIGSEGV si_codes */ -#define SEGV_MAPERR (__SI_FAULT|1) /* address not mapped to object */ -#define 
SEGV_ACCERR (__SI_FAULT|2) /* invalid permissions for mapped object */ -#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ -#define SEGV_PKUERR (__SI_FAULT|4) /* failed protection key checks */ +#define SEGV_MAPERR 1 /* address not mapped to object */ +#define SEGV_ACCERR 2 /* invalid permissions for mapped object */ +#define SEGV_BNDERR 3 /* failed address bound checks */ +#define SEGV_PKUERR 4 /* failed protection key checks */ #define NSIGSEGV 4 /* * SIGBUS si_codes */ -#define BUS_ADRALN (__SI_FAULT|1) /* invalid address alignment */ -#define BUS_ADRERR (__SI_FAULT|2) /* non-existent physical address */ -#define BUS_OBJERR (__SI_FAULT|3) /* object specific hardware error */ +#define BUS_ADRALN 1 /* invalid address alignment */ +#define BUS_ADRERR 2 /* non-existent physical address */ +#define BUS_OBJERR 3 /* object specific hardware error */ /* hardware memory error consumed on a machine check: action required */ -#define BUS_MCEERR_AR (__SI_FAULT|4) +#define BUS_MCEERR_AR 4 /* hardware memory error detected in process but not consumed: action optional*/ -#define BUS_MCEERR_AO (__SI_FAULT|5) +#define BUS_MCEERR_AO 5 #define NSIGBUS 5 /* * SIGTRAP si_codes */ -#define TRAP_BRKPT (__SI_FAULT|1) /* process breakpoint */ -#define TRAP_TRACE (__SI_FAULT|2) /* process trace trap */ -#define TRAP_BRANCH (__SI_FAULT|3) /* process taken branch trap */ -#define TRAP_HWBKPT (__SI_FAULT|4) /* hardware breakpoint/watchpoint */ +#define TRAP_BRKPT 1 /* process breakpoint */ +#define TRAP_TRACE 2 /* process trace trap */ +#define TRAP_BRANCH 3 /* process taken branch trap */ +#define TRAP_HWBKPT 4 /* hardware breakpoint/watchpoint */ #define NSIGTRAP 4 /* * SIGCHLD si_codes */ -#define CLD_EXITED (__SI_CHLD|1) /* child has exited */ -#define CLD_KILLED (__SI_CHLD|2) /* child was killed */ -#define CLD_DUMPED (__SI_CHLD|3) /* child terminated abnormally */ -#define CLD_TRAPPED (__SI_CHLD|4) /* traced child has trapped */ -#define CLD_STOPPED (__SI_CHLD|5) /* child has stopped */ -#define CLD_CONTINUED (__SI_CHLD|6) /* stopped child has continued */ +#define CLD_EXITED 1 /* child has exited */ +#define CLD_KILLED 2 /* child was killed */ +#define CLD_DUMPED 3 /* child terminated abnormally */ +#define CLD_TRAPPED 4 /* traced child has trapped */ +#define CLD_STOPPED 5 /* child has stopped */ +#define CLD_CONTINUED 6 /* stopped child has continued */ #define NSIGCHLD 6 /* - * SIGPOLL si_codes + * SIGPOLL (or any other signal without signal specific si_codes) si_codes */ -#define POLL_IN (__SI_POLL|1) /* data input available */ -#define POLL_OUT (__SI_POLL|2) /* output buffers available */ -#define POLL_MSG (__SI_POLL|3) /* input message available */ -#define POLL_ERR (__SI_POLL|4) /* i/o error */ -#define POLL_PRI (__SI_POLL|5) /* high priority input available */ -#define POLL_HUP (__SI_POLL|6) /* device disconnected */ +#define POLL_IN 1 /* data input available */ +#define POLL_OUT 2 /* output buffers available */ +#define POLL_MSG 3 /* input message available */ +#define POLL_ERR 4 /* i/o error */ +#define POLL_PRI 5 /* high priority input available */ +#define POLL_HUP 6 /* device disconnected */ #define NSIGPOLL 6 /* * SIGSYS si_codes */ -#define SYS_SECCOMP (__SI_SYS|1) /* seccomp triggered */ -#define NSIGSYS 1 +#define SYS_SECCOMP 1 /* seccomp triggered */ +#define NSIGSYS 1 /* * sigevent definitions diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 6fe14d001f68..230e05d35191 100644 --- a/include/uapi/linux/capability.h +++ 
b/include/uapi/linux/capability.h @@ -60,9 +60,13 @@ typedef struct __user_cap_data_struct { #define VFS_CAP_U32_2 2 #define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2)) -#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2 -#define VFS_CAP_U32 VFS_CAP_U32_2 -#define VFS_CAP_REVISION VFS_CAP_REVISION_2 +#define VFS_CAP_REVISION_3 0x03000000 +#define VFS_CAP_U32_3 2 +#define XATTR_CAPS_SZ_3 (sizeof(__le32)*(2 + 2*VFS_CAP_U32_3)) + +#define XATTR_CAPS_SZ XATTR_CAPS_SZ_3 +#define VFS_CAP_U32 VFS_CAP_U32_3 +#define VFS_CAP_REVISION VFS_CAP_REVISION_3 struct vfs_cap_data { __le32 magic_etc; /* Little endian */ @@ -72,6 +76,18 @@ struct vfs_cap_data { } data[VFS_CAP_U32]; }; +/* + * same as vfs_cap_data but with a rootid at the end + */ +struct vfs_ns_cap_data { + __le32 magic_etc; + struct { + __le32 permitted; /* Little endian */ + __le32 inheritable; /* Little endian */ + } data[VFS_CAP_U32]; + __le32 rootid; +}; + #ifndef __KERNEL__ /* diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 67230ecf2ce1..4657e2924ecb 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2275,6 +2275,13 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2349,8 +2356,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) + if (cpus_updated || force_rebuild) { + force_rebuild = false; rebuild_sched_domains(); + } } void cpuset_update_active_cpus(void) @@ -2363,6 +2372,11 @@ void cpuset_update_active_cpus(void) schedule_work(&cpuset_hotplug_work); } +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); +} + /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. 
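The vfs_ns_cap_data layout introduced in the uapi capability.h hunk above is just the v2 payload with a rootid appended, which is why XATTR_CAPS_SZ_3 grows by exactly one __le32. As a rough userspace sketch (not part of this series) of how a v3 security.capability xattr could be assembled, assuming a little-endian host and using a placeholder path, capability bit, and uid:

	#include <stdint.h>
	#include <string.h>
	#include <sys/xattr.h>

	#define VFS_CAP_REVISION_3	0x03000000
	#define VFS_CAP_FLAGS_EFFECTIVE	0x000001

	/* mirrors struct vfs_ns_cap_data: the v2 data plus a trailing rootid */
	struct ns_cap_data {
		uint32_t magic_etc;		/* revision | flags, little endian */
		struct {
			uint32_t permitted;	/* little endian */
			uint32_t inheritable;	/* little endian */
		} data[2];
		uint32_t rootid;		/* uid of the target namespace's root */
	};

	/* hypothetical helper: grant CAP_NET_RAW (bit 13), effective on exec,
	 * honored only where the user namespace root maps to kuid 1000 */
	static int write_ns_cap(const char *path)
	{
		struct ns_cap_data cap;

		memset(&cap, 0, sizeof(cap));	/* sizeof(cap) == XATTR_CAPS_SZ_3 == 24 */
		cap.magic_etc = VFS_CAP_REVISION_3 | VFS_CAP_FLAGS_EFFECTIVE;
		cap.data[0].permitted = 1u << 13;
		cap.rootid = 1000;
		return setxattr(path, "security.capability", &cap,
				sizeof(cap), 0);
	}

On the kernel side, the newly exported cap_convert_nscap() (declared in include/linux/capability.h earlier in this section) is what validates and rewrites that rootid when such an xattr is written from inside a user namespace.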
diff --git a/kernel/exit.c b/kernel/exit.c index a35d8a17e01f..3481ababd06a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1615,7 +1615,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); - unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); @@ -1741,7 +1741,7 @@ COMPAT_SYSCALL_DEFINE5(waitid, user_access_begin(); unsafe_put_user(signo, &infop->si_signo, Efault); unsafe_put_user(0, &infop->si_errno, Efault); - unsafe_put_user((short)info.cause, &infop->si_code, Efault); + unsafe_put_user(info.cause, &infop->si_code, Efault); unsafe_put_user(info.pid, &infop->si_pid, Efault); unsafe_put_user(info.uid, &infop->si_uid, Efault); unsafe_put_user(info.status, &infop->si_status, Efault); diff --git a/kernel/fork.c b/kernel/fork.c index 6f1b0af00bda..10646182440f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1569,10 +1569,6 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } - retval = security_task_create(clone_flags); - if (retval) - goto fork_out; - retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 73be2b3909bd..82afb7ed369f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -421,10 +421,8 @@ static void free_desc(unsigned int irq) * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ - mutex_lock(&sparse_irq_lock); kobject_del(&desc->kobj); delete_irq_desc(irq); - mutex_unlock(&sparse_irq_lock); /* * We free the descriptor, masks and stat fields via RCU. 
That @@ -462,20 +460,15 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node, desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; - mutex_lock(&sparse_irq_lock); irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); - mutex_unlock(&sparse_irq_lock); } + bitmap_set(allocated_irqs, start, cnt); return start; err: for (i--; i >= 0; i--) free_desc(start + i); - - mutex_lock(&sparse_irq_lock); - bitmap_clear(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); return -ENOMEM; } @@ -575,6 +568,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, desc->owner = owner; } + bitmap_set(allocated_irqs, start, cnt); return start; } @@ -670,10 +664,10 @@ void irq_free_descs(unsigned int from, unsigned int cnt) if (from >= nr_irqs || (from + cnt) > nr_irqs) return; + mutex_lock(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); - mutex_lock(&sparse_irq_lock); bitmap_clear(allocated_irqs, from, cnt); mutex_unlock(&sparse_irq_lock); } @@ -720,19 +714,15 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, from, cnt, 0); ret = -EEXIST; if (irq >=0 && start != irq) - goto err; + goto unlock; if (start + cnt > nr_irqs) { ret = irq_expand_nr_irqs(start + cnt); if (ret) - goto err; + goto unlock; } - - bitmap_set(allocated_irqs, start, cnt); - mutex_unlock(&sparse_irq_lock); - return alloc_descs(start, cnt, node, affinity, owner); - -err: + ret = alloc_descs(start, cnt, node, affinity, owner); +unlock: mutex_unlock(&sparse_irq_lock); return ret; } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 48eadf416c24..3fa4bd59f569 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -315,11 +315,12 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, ops->set_desc(arg, desc); /* Assumes the domain mutex is held! 
*/ - ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); + ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1, + arg); if (ret) break; - irq_set_msi_desc_off(virq, 0, desc); + irq_set_msi_desc_off(desc->irq, 0, desc); } if (ret) { diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 74a5a7255b4d..4918314893bc 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -101,6 +101,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns int i; int err; + err = -EINVAL; + if (!in_userns(parent_pid_ns->user_ns, user_ns)) + goto out; + err = -ENOSPC; if (level > MAX_PID_NS_LEVEL) goto out; diff --git a/kernel/power/process.c b/kernel/power/process.c index 78672d324a6e..50f25cb370c6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -20,8 +20,9 @@ #include <linux/workqueue.h> #include <linux/kmod.h> #include <trace/events/power.h> +#include <linux/cpuset.h> -/* +/* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; @@ -202,6 +203,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 60f356d91060..84b1367935e4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -728,8 +728,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); - if (copy_siginfo_to_user32(uinfo, &info) || - __put_user(info.si_code, &uinfo->si_code)) { + if (copy_siginfo_to_user32(uinfo, &info)) { ret = -EFAULT; break; } @@ -739,8 +738,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, { siginfo_t __user *uinfo = (siginfo_t __user *) data; - if (copy_siginfo_to_user(uinfo, &info) || - __put_user(info.si_code, &uinfo->si_code)) { + if (copy_siginfo_to_user(uinfo, &info)) { ret = -EFAULT; break; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6d2c7ff9ba98..136a76d80dbf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5556,16 +5556,15 @@ static void cpuset_cpu_active(void) * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); + partition_sched_domains(1, NULL, NULL); + if (--num_cpus_frozen) return; - } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); } cpuset_update_active_cpus(); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a5d83ed8dd82..0a85641e62ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5424,7 +5424,7 @@ wake_affine_llc(struct sched_domain *sd, struct task_struct *p, return false; /* if this cache has capacity, come here */ - if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1) + if (this_stats.has_capacity && this_stats.nr_running+1 < prev_stats.nr_running) return true; /* @@ -7708,7 +7708,7 @@ next_group: * number. * * Return: 1 when packing is required and a task should be moved to - * this CPU. The amount of the imbalance is returned in *imbalance. + * this CPU. The amount of the imbalance is returned in env->imbalance. * * @env: The load balancing environment. 
* @sds: Statistics of the sched_domain which is to be packed diff --git a/kernel/signal.c b/kernel/signal.c index ed804a470dcd..800a18f77732 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2686,6 +2686,51 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset, } #endif +enum siginfo_layout siginfo_layout(int sig, int si_code) +{ + enum siginfo_layout layout = SIL_KILL; + if ((si_code > SI_USER) && (si_code < SI_KERNEL)) { + static const struct { + unsigned char limit, layout; + } filter[] = { + [SIGILL] = { NSIGILL, SIL_FAULT }, + [SIGFPE] = { NSIGFPE, SIL_FAULT }, + [SIGSEGV] = { NSIGSEGV, SIL_FAULT }, + [SIGBUS] = { NSIGBUS, SIL_FAULT }, + [SIGTRAP] = { NSIGTRAP, SIL_FAULT }, +#if defined(SIGEMT) && defined(NSIGEMT) + [SIGEMT] = { NSIGEMT, SIL_FAULT }, +#endif + [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, + [SIGPOLL] = { NSIGPOLL, SIL_POLL }, +#ifdef __ARCH_SIGSYS + [SIGSYS] = { NSIGSYS, SIL_SYS }, +#endif + }; + if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) + layout = filter[sig].layout; + else if (si_code <= NSIGPOLL) + layout = SIL_POLL; + } else { + if (si_code == SI_TIMER) + layout = SIL_TIMER; + else if (si_code == SI_SIGIO) + layout = SIL_POLL; + else if (si_code < 0) + layout = SIL_RT; + /* Tests to support buggy kernel ABIs */ +#ifdef TRAP_FIXME + if ((sig == SIGTRAP) && (si_code == TRAP_FIXME)) + layout = SIL_FAULT; +#endif +#ifdef FPE_FIXME + if ((sig == SIGFPE) && (si_code == FPE_FIXME)) + layout = SIL_FAULT; +#endif + } + return layout; +} + #ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) @@ -2708,22 +2753,20 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); - switch (from->si_code & __SI_MASK) { - case __SI_KILL: + err |= __put_user(from->si_code, &to->si_code); + switch (siginfo_layout(from->si_signo, from->si_code)) { + case SIL_KILL: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); break; - case __SI_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_ptr, &to->si_ptr); + case SIL_TIMER: + /* Unreached SI_TIMER is negative */ break; - case __SI_POLL: + case SIL_POLL: err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - case __SI_FAULT: + case SIL_FAULT: err |= __put_user(from->si_addr, &to->si_addr); #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); @@ -2748,30 +2791,25 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_pkey, &to->si_pkey); #endif break; - case __SI_CHLD: + case SIL_CHLD: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_status, &to->si_status); err |= __put_user(from->si_utime, &to->si_utime); err |= __put_user(from->si_stime, &to->si_stime); break; - case __SI_RT: /* This is not generated by the kernel as of now.
*/ - case __SI_MESGQ: /* But this is */ + case SIL_RT: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; #ifdef __ARCH_SIGSYS - case __SI_SYS: + case SIL_SYS: err |= __put_user(from->si_call_addr, &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); break; #endif - default: /* this is just in case for now ... */ - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; } return err; } diff --git a/kernel/sys.c b/kernel/sys.c index 2855ee73acd0..9aebc2935013 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1896,15 +1896,11 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) /* * Finally, make sure the caller has the rights to - * change /proc/pid/exe link: only local root should + * change /proc/pid/exe link: only local sys admin should * be allowed to. */ if (prctl_map->exe_fd != (u32)-1) { - struct user_namespace *ns = current_user_ns(); - const struct cred *cred = current_cred(); - - if (!uid_eq(cred->uid, make_kuid(ns, 0)) || - !gid_eq(cred->gid, make_kgid(ns, 0))) + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) goto out; } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 2f735cbe05e8..c490f1e4313b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -986,17 +986,21 @@ bool userns_may_setgroups(const struct user_namespace *ns) } /* - * Returns true if @ns is the same namespace as or a descendant of - * @target_ns. + * Returns true if @child is the same namespace or a descendant of + * @ancestor. */ +bool in_userns(const struct user_namespace *ancestor, + const struct user_namespace *child) +{ + const struct user_namespace *ns; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return (ns == ancestor); +} + bool current_in_userns(const struct user_namespace *target_ns) { - struct user_namespace *ns; - for (ns = current_user_ns(); ns; ns = ns->parent) { - if (ns == target_ns) - return true; - } - return false; + return in_userns(target_ns, current_user_ns()); } static inline struct user_namespace *to_user_ns(struct ns_common *ns) diff --git a/lib/Kconfig b/lib/Kconfig index 40b114a11d7c..a85e6f76add5 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -559,9 +559,6 @@ config ARCH_HAS_PMEM_API config ARCH_HAS_UACCESS_FLUSHCACHE bool -config ARCH_HAS_MMIO_FLUSH - bool - config STACKDEPOT bool select STACKTRACE diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 303c779bfe38..43ba91c440bc 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -58,7 +58,7 @@ static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, u8 ident, u16 dlen, void *data); static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data); -static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data); +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size); static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err); static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control, @@ -1473,7 +1473,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -2987,12 +2987,15 @@ static 
inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen, return len; } -static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) +static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val, size_t size) { struct l2cap_conf_opt *opt = *ptr; BT_DBG("type 0x%2.2x len %u val 0x%lx", type, len, val); + if (size < L2CAP_CONF_OPT_SIZE + len) + return; + opt->type = type; opt->len = len; @@ -3017,7 +3020,7 @@ static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) *ptr += L2CAP_CONF_OPT_SIZE + len; } -static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan) +static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan, size_t size) { struct l2cap_conf_efs efs; @@ -3045,7 +3048,7 @@ static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan) } l2cap_add_conf_opt(ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, size); } static void l2cap_ack_timeout(struct work_struct *work) @@ -3191,11 +3194,12 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan) chan->ack_win = chan->tx_win; } -static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_req *req = data; struct l2cap_conf_rfc rfc = { .mode = chan->mode }; void *ptr = req->data; + void *endptr = data + data_size; u16 size; BT_DBG("chan %p", chan); @@ -3220,7 +3224,7 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) done: if (chan->imtu != L2CAP_DEFAULT_MTU) - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); switch (chan->mode) { case L2CAP_MODE_BASIC: @@ -3239,7 +3243,7 @@ done: rfc.max_pdu_size = 0; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); break; case L2CAP_MODE_ERTM: @@ -3259,21 +3263,21 @@ done: L2CAP_DEFAULT_TX_WINDOW); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) - l2cap_add_opt_efs(&ptr, chan); + l2cap_add_opt_efs(&ptr, chan, endptr - ptr); if (test_bit(FLAG_EXT_CTRL, &chan->flags)) l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2, - chan->tx_win); + chan->tx_win, endptr - ptr); if (chan->conn->feat_mask & L2CAP_FEAT_FCS) if (chan->fcs == L2CAP_FCS_NONE || test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) { chan->fcs = L2CAP_FCS_NONE; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, - chan->fcs); + chan->fcs, endptr - ptr); } break; @@ -3291,17 +3295,17 @@ done: rfc.max_pdu_size = cpu_to_le16(size); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) - l2cap_add_opt_efs(&ptr, chan); + l2cap_add_opt_efs(&ptr, chan, endptr - ptr); if (chan->conn->feat_mask & L2CAP_FEAT_FCS) if (chan->fcs == L2CAP_FCS_NONE || test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) { chan->fcs = L2CAP_FCS_NONE; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, - chan->fcs); + chan->fcs, endptr - ptr); } break; } @@ -3312,10 +3316,11 @@ done: return ptr - data; } -static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) +static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_rsp *rsp = data; void *ptr = rsp->data; + void *endptr = data + data_size; void *req = chan->conf_req; 
int len = chan->conf_len; int type, hint, olen; @@ -3417,7 +3422,7 @@ done: return -ECONNREFUSED; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); } if (result == L2CAP_CONF_SUCCESS) { @@ -3430,7 +3435,7 @@ done: chan->omtu = mtu; set_bit(CONF_MTU_DONE, &chan->conf_state); } - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu, endptr - ptr); if (remote_efs) { if (chan->local_stype != L2CAP_SERV_NOTRAFIC && @@ -3444,7 +3449,7 @@ done: l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); } else { /* Send PENDING Conf Rsp */ result = L2CAP_CONF_PENDING; @@ -3477,7 +3482,7 @@ done: set_bit(CONF_MODE_DONE, &chan->conf_state); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, - sizeof(rfc), (unsigned long) &rfc); + sizeof(rfc), (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) { chan->remote_id = efs.id; @@ -3491,7 +3496,7 @@ done: le32_to_cpu(efs.sdu_itime); l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); } break; @@ -3505,7 +3510,7 @@ done: set_bit(CONF_MODE_DONE, &chan->conf_state); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); break; @@ -3527,10 +3532,11 @@ done: } static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, - void *data, u16 *result) + void *data, size_t size, u16 *result) { struct l2cap_conf_req *req = data; void *ptr = req->data; + void *endptr = data + size; int type, olen; unsigned long val; struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; @@ -3548,13 +3554,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, chan->imtu = L2CAP_DEFAULT_MIN_MTU; } else chan->imtu = val; - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); break; case L2CAP_CONF_FLUSH_TO: chan->flush_to = val; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, - 2, chan->flush_to); + 2, chan->flush_to, endptr - ptr); break; case L2CAP_CONF_RFC: @@ -3568,13 +3574,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, chan->fcs = 0; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, - sizeof(rfc), (unsigned long) &rfc); + sizeof(rfc), (unsigned long) &rfc, endptr - ptr); break; case L2CAP_CONF_EWS: chan->ack_win = min_t(u16, val, chan->ack_win); l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2, - chan->tx_win); + chan->tx_win, endptr - ptr); break; case L2CAP_CONF_EFS: @@ -3587,7 +3593,7 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, return -ECONNREFUSED; l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); break; case L2CAP_CONF_FCS: @@ -3692,7 +3698,7 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) return; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -3900,7 +3906,7 @@ sendresp: u8 buf[128]; set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -3978,7 +3984,7 @@ static int 
l2cap_connect_create_rsp(struct l2cap_conn *conn, break; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, req), req); + l2cap_build_conf_req(chan, req, sizeof(req)), req); chan->num_conf_req++; break; @@ -4090,7 +4096,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, } /* Complete config. */ - len = l2cap_parse_conf_req(chan, rsp); + len = l2cap_parse_conf_req(chan, rsp, sizeof(rsp)); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto unlock; @@ -4124,7 +4130,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) { u8 buf[64]; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -4184,7 +4190,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, char buf[64]; len = l2cap_parse_conf_rsp(chan, rsp->data, len, - buf, &result); + buf, sizeof(buf), &result); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto done; @@ -4214,7 +4220,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, /* throw out any old stored conf requests */ result = L2CAP_CONF_SUCCESS; len = l2cap_parse_conf_rsp(chan, rsp->data, len, - req, &result); + req, sizeof(req), &result); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto done; @@ -4791,7 +4797,7 @@ static void l2cap_do_create(struct l2cap_chan *chan, int result, set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } } @@ -7465,7 +7471,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 875675765531..63edc6e5f026 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -676,7 +676,8 @@ bad: /* * Do a synchronous statfs(). 
*/ -int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) +int ceph_monc_do_statfs(struct ceph_mon_client *monc, u64 data_pool, + struct ceph_statfs *buf) { struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; @@ -696,6 +697,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) goto out; req->u.st = buf; + req->request->hdr.version = cpu_to_le16(2); mutex_lock(&monc->mutex); register_generic_request(req); @@ -705,6 +707,8 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; + h->contains_data_pool = (data_pool != CEPH_NOPOOL); + h->data_pool = cpu_to_le64(data_pool); send_generic_request(monc, req); mutex_unlock(&monc->mutex); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index dcfbdd74dfd1..e02f01f534e2 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -863,8 +863,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, dst->cls.method_len = src->cls.method_len; dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); break; - case CEPH_OSD_OP_STARTSYNC: - break; case CEPH_OSD_OP_WATCH: dst->watch.cookie = cpu_to_le64(src->watch.cookie); dst->watch.ver = cpu_to_le64(0); @@ -916,9 +914,6 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, * if the file was recently truncated, we include information about its * old and new size so that the object can be updated appropriately. (we * avoid synchronously deleting truncated objects because it's slow.) - * - * if @do_sync, include a 'startsync' command so that the osd will flush - * data quickly. */ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index ac701c28f44f..c2c68a15b59d 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -171,10 +171,10 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) /* * Add the temporary list to the backchannel preallocation list */ - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_splice(&tmp_list, &xprt->bc_pa_list); xprt_inc_alloc_count(xprt, min_reqs); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); dprintk("RPC: setup backchannel transport done\n"); return 0; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2e49d1f892b7..2ad827db2704 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1903,6 +1903,14 @@ call_connect_status(struct rpc_task *task) task->tk_status = 0; switch (status) { case -ECONNREFUSED: + /* A positive refusal suggests a rebind is needed. 
*/ + if (RPC_IS_SOFTCONN(task)) + break; + if (clnt->cl_autobind) { + rpc_force_rebind(clnt); + task->tk_action = call_bind; + return; + } case -ECONNRESET: case -ECONNABORTED: case -ENETUNREACH: @@ -2139,10 +2147,6 @@ call_status(struct rpc_task *task) rpc_delay(task, 3*HZ); case -ETIMEDOUT: task->tk_action = call_timeout; - if (!(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) - && task->tk_client->cl_discrtry) - xprt_conditional_disconnect(req->rq_xprt, - req->rq_connect_cookie); break; case -ECONNREFUSED: case -ECONNRESET: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 399fab5d1936..ff8e06cd067e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1013,7 +1013,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) if (!bc_xprt) return -EAGAIN; - spin_lock_bh(&bc_xprt->transport_lock); + spin_lock(&bc_xprt->recv_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) goto unlock_notfound; @@ -1031,7 +1031,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); rqstp->rq_arg.len = 0; - spin_unlock_bh(&bc_xprt->transport_lock); + spin_unlock(&bc_xprt->recv_lock); return 0; unlock_notfound: printk(KERN_NOTICE @@ -1040,7 +1040,7 @@ unlock_notfound: __func__, ntohl(calldir), bc_xprt, ntohl(xid)); unlock_eagain: - spin_unlock_bh(&bc_xprt->transport_lock); + spin_unlock(&bc_xprt->recv_lock); return -EAGAIN; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4654a9934269..e741ec2b4d8e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -844,6 +844,50 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) } EXPORT_SYMBOL_GPL(xprt_lookup_rqst); +/** + * xprt_pin_rqst - Pin a request on the transport receive list + * @req: Request to pin + * + * Caller must ensure this is atomic with the call to xprt_lookup_rqst() + * so should be holding the xprt recv_lock. + */ +void xprt_pin_rqst(struct rpc_rqst *req) +{ + set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate); +} +EXPORT_SYMBOL_GPL(xprt_pin_rqst); + +/** + * xprt_unpin_rqst - Unpin a request on the transport receive list + * @req: Request to unpin + * + * Caller should be holding the xprt recv_lock.
+ */ +void xprt_unpin_rqst(struct rpc_rqst *req) +{ + struct rpc_task *task = req->rq_task; + + clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); + if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) + wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); +} +EXPORT_SYMBOL_GPL(xprt_unpin_rqst); + +static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) +__must_hold(&req->rq_xprt->recv_lock) +{ + struct rpc_task *task = req->rq_task; + + if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { + spin_unlock(&req->rq_xprt->recv_lock); + set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); + wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, + TASK_UNINTERRUPTIBLE); + clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); + spin_lock(&req->rq_xprt->recv_lock); + } +} + static void xprt_update_rtt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; @@ -966,13 +1010,13 @@ void xprt_transmit(struct rpc_task *task) /* * Add to the list only if we're expecting a reply */ - spin_lock_bh(&xprt->transport_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ + spin_lock(&xprt->recv_lock); list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); @@ -1287,12 +1331,16 @@ void xprt_release(struct rpc_task *task) task->tk_ops->rpc_count_stats(task, task->tk_calldata); else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); + spin_lock(&xprt->recv_lock); + if (!list_empty(&req->rq_list)) { + list_del(&req->rq_list); + xprt_wait_on_pinned_rqst(req); + } + spin_unlock(&xprt->recv_lock); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) xprt->ops->release_request(task); - if (!list_empty(&req->rq_list)) - list_del(&req->rq_list); xprt->last_used = jiffies; xprt_schedule_autodisconnect(xprt); spin_unlock_bh(&xprt->transport_lock); @@ -1318,6 +1366,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); + spin_lock_init(&xprt->recv_lock); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 03f6b5840764..d31d0ac5ada9 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -49,6 +49,7 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, if (IS_ERR(rb)) goto out_fail; req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); @@ -202,20 +203,24 @@ size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) */ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) { - struct rpc_xprt *xprt = rqst->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp; - - headerp = rdmab_to_msg(req->rl_rdmabuf); - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = - cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); - headerp->rm_type = rdma_msg; - headerp->rm_body.rm_chunks[0] = xdr_zero; - headerp->rm_body.rm_chunks[1] = xdr_zero; - 
headerp->rm_body.rm_chunks[2] = xdr_zero; + __be32 *p; + + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(&req->rl_stream, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + p = xdr_reserve_space(&req->rl_stream, 28); + if (unlikely(!p)) + return -EIO; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); + *p++ = rdma_msg; + *p++ = xdr_zero; + *p++ = xdr_zero; + *p = xdr_zero; if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN, &rqst->rq_snd_buf, rpcrdma_noch)) @@ -271,9 +276,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) * @xprt: transport receiving the call * @rep: receive buffer containing the call * - * Called in the RPC reply handler, which runs in a tasklet. - * Be quick about it. - * * Operational assumptions: * o Backchannel credits are ignored, just as the NFS server * forechannel currently does @@ -284,7 +286,6 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_msg *headerp; struct svc_serv *bc_serv; struct rpcrdma_req *req; struct rpc_rqst *rqst; @@ -292,24 +293,15 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, size_t size; __be32 *p; - headerp = rdmab_to_msg(rep->rr_rdmabuf); + p = xdr_inline_decode(&rep->rr_stream, 0); + size = xdr_stream_remaining(&rep->rr_stream); + #ifdef RPCRDMA_BACKCHANNEL_DEBUG pr_info("RPC: %s: callback XID %08x, length=%u\n", - __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); - pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp); + __func__, be32_to_cpup(p), size); + pr_info("RPC: %s: %*ph\n", __func__, size, p); #endif - /* Sanity check: - * Need at least enough bytes for RPC/RDMA header, as code - * here references the header fields by array offset. Also, - * backward calls are always inline, so ensure there - * are some bytes beyond the RPC/RDMA header. - */ - if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24) - goto out_short; - p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); - size = rep->rr_len - RPCRDMA_HDRLEN_MIN; - /* Grab a free bc rqst */ spin_lock(&xprt->bc_pa_lock); if (list_empty(&xprt->bc_pa_list)) { @@ -325,7 +317,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, /* Prepare rqst */ rqst->rq_reply_bytes_recvd = 0; rqst->rq_bytes_sent = 0; - rqst->rq_xid = headerp->rm_xid; + rqst->rq_xid = *p; rqst->rq_private_buf.len = size; set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); @@ -337,9 +329,9 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, buf->len = size; /* The receive buffer has to be hooked to the rpcrdma_req - * so that it can be reposted after the server is done - * parsing it but just before sending the backward - * direction reply. + * so that it is not released while the req is pointing + * to its buffer, and so that it can be reposted after + * the Upper Layer is done decoding it. */ req = rpcr_to_rdmar(rqst); dprintk("RPC: %s: attaching rep %p to req %p\n", @@ -367,13 +359,4 @@ out_overflow: * when the connection is re-established. 
*/ return; - -out_short: - pr_warn("RPC/RDMA short backward direction call\n"); - - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) - xprt_disconnect_done(xprt); - else - pr_warn("RPC: %s: reposting rep %p\n", - __func__, rep); } diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index d3f84bb1d443..6c7151341194 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -177,7 +177,7 @@ fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ -static int +static struct rpcrdma_mr_seg * fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, struct rpcrdma_mw **out) { @@ -188,7 +188,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw = rpcrdma_get_mw(r_xprt); if (!mw) - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ @@ -232,13 +232,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw->mw_offset = dma_pages[0] + pageoff; *out = mw; - return mw->mw_nents; + return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", mw->mw_sg, i); rpcrdma_put_mw(r_xprt, mw); - return -EIO; + return ERR_PTR(-EIO); out_maperr: pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", @@ -247,7 +247,7 @@ out_maperr: ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); - return -EIO; + return ERR_PTR(-EIO); } /* Invalidate all memory regions that were registered for "req". diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 6aea36a38bfd..5a936a6a31a3 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -344,7 +344,7 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* Post a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ -static int +static struct rpcrdma_mr_seg * frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, struct rpcrdma_mw **out) { @@ -364,7 +364,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, rpcrdma_defer_mr_recovery(mw); mw = rpcrdma_get_mw(r_xprt); if (!mw) - return -ENOBUFS; + return ERR_PTR(-ENOBUFS); } while (mw->frmr.fr_state != FRMR_IS_INVALID); frmr = &mw->frmr; frmr->fr_state = FRMR_IS_VALID; @@ -429,25 +429,25 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mw->mw_offset = mr->iova; *out = mw; - return mw->mw_nents; + return seg; out_dmamap_err: pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", mw->mw_sg, i); frmr->fr_state = FRMR_IS_INVALID; rpcrdma_put_mw(r_xprt, mw); - return -EIO; + return ERR_PTR(-EIO); out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frmr->fr_mr, n, mw->mw_nents); rpcrdma_defer_mr_recovery(mw); - return -EIO; + return ERR_PTR(-EIO); out_senderr: pr_err("rpcrdma: FRMR registration ib_post_send returned %i\n", rc); rpcrdma_defer_mr_recovery(mw); - return -ENOTCONN; + return ERR_PTR(-ENOTCONN); } /* Invalidate all memory regions that were registered for "req". 
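Note on the fmr_ops.c and frwr_ops.c hunks above: the ro_map method's contract changes. It no longer returns a positive count of segments coalesced into the MR; on success it returns the segment pointer advanced past the consumed segments, and on failure an ERR_PTR carrying what used to be the negative return value. The per-MR segment count now travels in mw->mw_nents. A minimal sketch of the resulting caller loop; encode_chunk_sketch() is illustrative only and not part of the patch, while the ro_map call and mw_nents bookkeeping mirror the rpc_rdma.c call sites below:

	/* Sketch only: walk a segment array via the new ro_map contract.
	 * encode_chunk_sketch() is a hypothetical caller; the real ones
	 * are the chunk encoders in rpc_rdma.c.
	 */
	static int encode_chunk_sketch(struct rpcrdma_xprt *r_xprt,
				       struct rpcrdma_mr_seg *seg, int nsegs)
	{
		struct rpcrdma_mw *mw;

		do {
			seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs,
							   false, &mw);
			if (IS_ERR(seg))
				return PTR_ERR(seg);	/* was "return n;" */
			/* @seg has been advanced past the segments that were
			 * coalesced into this MR; mw->mw_nents counts them.
			 */
			nsegs -= mw->mw_nents;
		} while (nsegs);
		return 0;
	}

Returning the advanced pointer lets the callers drop their local "seg += n" arithmetic while keeping the same loop-until-exhausted shape.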
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index ca4d6e4528f3..f1889f4d4803 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -169,40 +169,41 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; } -/* Split "vec" on page boundaries into segments. FMR registers pages, - * not a byte range. Other modes coalesce these segments into a single - * MR when they can. +/* Split @vec on page boundaries into SGEs. FMR registers pages, not + * a byte range. Other modes coalesce these SGEs into a single MR + * when they can. + * + * Returns pointer to next available SGE, and bumps the total number + * of SGEs consumed. */ -static int -rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n) +static struct rpcrdma_mr_seg * +rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, + unsigned int *n) { - size_t page_offset; - u32 remaining; + u32 remaining, page_offset; char *base; base = vec->iov_base; page_offset = offset_in_page(base); remaining = vec->iov_len; - while (remaining && n < RPCRDMA_MAX_SEGS) { - seg[n].mr_page = NULL; - seg[n].mr_offset = base; - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); - remaining -= seg[n].mr_len; - base += seg[n].mr_len; - ++n; + while (remaining) { + seg->mr_page = NULL; + seg->mr_offset = base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining); + remaining -= seg->mr_len; + base += seg->mr_len; + ++seg; + ++(*n); page_offset = 0; } - return n; + return seg; } -/* - * Chunk assembly from upper layer xdr_buf. - * - * Prepare the passed-in xdr_buf into representation as RPC/RDMA chunk - * elements. Segments are then coalesced when registered, if possible - * within the selected memreg mode. +/* Convert @xdrbuf into SGEs no larger than a page each. As they + * are registered, these SGEs are then coalesced into RDMA segments + * when the selected memreg mode supports it. * - * Returns positive number of segments converted, or a negative errno. + * Returns positive number of SGEs consumed, or a negative errno. */ static int @@ -210,47 +211,41 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, unsigned int pos, enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg) { - int len, n, p, page_base; + unsigned long page_base; + unsigned int len, n; struct page **ppages; n = 0; - if (pos == 0) { - n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n); - if (n == RPCRDMA_MAX_SEGS) - goto out_overflow; - } + if (pos == 0) + seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n); len = xdrbuf->page_len; ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); - p = 0; - while (len && n < RPCRDMA_MAX_SEGS) { - if (!ppages[p]) { - /* alloc the pagelist for receiving buffer */ - ppages[p] = alloc_page(GFP_ATOMIC); - if (!ppages[p]) + while (len) { + if (unlikely(!*ppages)) { + /* XXX: Certain upper layer operations do + * not provide receive buffer pages. 
+ */ + *ppages = alloc_page(GFP_ATOMIC); + if (!*ppages) return -EAGAIN; } - seg[n].mr_page = ppages[p]; - seg[n].mr_offset = (void *)(unsigned long) page_base; - seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); - if (seg[n].mr_len > PAGE_SIZE) - goto out_overflow; - len -= seg[n].mr_len; + seg->mr_page = *ppages; + seg->mr_offset = (char *)page_base; + seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len); + len -= seg->mr_len; + ++ppages; + ++seg; ++n; - ++p; - page_base = 0; /* page offset only applies to first page */ + page_base = 0; } - /* Message overflows the seg array */ - if (len && n == RPCRDMA_MAX_SEGS) - goto out_overflow; - /* When encoding a Read chunk, the tail iovec contains an * XDR pad and may be omitted. */ if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) - return n; + goto out; /* When encoding a Write chunk, some servers need to see an * extra segment for non-XDR-aligned Write chunks. The upper @@ -258,30 +253,81 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, * for this purpose. */ if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) - return n; + goto out; - if (xdrbuf->tail[0].iov_len) { - n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n); - if (n == RPCRDMA_MAX_SEGS) - goto out_overflow; - } + if (xdrbuf->tail[0].iov_len) + seg = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n); +out: + if (unlikely(n > RPCRDMA_MAX_SEGS)) + return -EIO; return n; +} -out_overflow: - pr_err("rpcrdma: segment array overflow\n"); - return -EIO; +static inline int +encode_item_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_one; + return 0; } -static inline __be32 * +static inline int +encode_item_not_present(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p = xdr_zero; + return 0; +} + +static void xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) { *iptr++ = cpu_to_be32(mw->mw_handle); *iptr++ = cpu_to_be32(mw->mw_length); - return xdr_encode_hyper(iptr, mw->mw_offset); + xdr_encode_hyper(iptr, mw->mw_offset); } -/* XDR-encode the Read list. Supports encoding a list of read +static int +encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + xdr_encode_rdma_segment(p, mw); + return 0; +} + +static int +encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mw *mw, + u32 position) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 6 * sizeof(*p)); + if (unlikely(!p)) + return -EMSGSIZE; + + *p++ = xdr_one; /* Item present */ + *p++ = cpu_to_be32(position); + xdr_encode_rdma_segment(p, mw); + return 0; +} + +/* Register and XDR encode the Read list. Supports encoding a list of read * segments that belong to a single read chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): @@ -290,23 +336,20 @@ xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mw *mw) * N elements, position P (same P for all chunks of same arg!): * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 * - * Returns a pointer to the XDR word in the RDMA header following - * the end of the Read list, or an error pointer. + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single @pos value is currently supported. 
*/ -static __be32 * -rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, struct rpc_rqst *rqst, - __be32 *iptr, enum rpcrdma_chunktype rtype) +static noinline int +rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype rtype) { + struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; unsigned int pos; - int n, nsegs; - - if (rtype == rpcrdma_noch) { - *iptr++ = xdr_zero; /* item not present */ - return iptr; - } + int nsegs; pos = rqst->rq_snd_buf.head[0].iov_len; if (rtype == rpcrdma_areadch) @@ -315,40 +358,33 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos, rtype, seg); if (nsegs < 0) - return ERR_PTR(nsegs); + return nsegs; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - false, &mw); - if (n < 0) - return ERR_PTR(n); + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + false, &mw); + if (IS_ERR(seg)) + return PTR_ERR(seg); rpcrdma_push_mw(mw, &req->rl_registered); - *iptr++ = xdr_one; /* item present */ - - /* All read segments in this chunk - * have the same "position". - */ - *iptr++ = cpu_to_be32(pos); - iptr = xdr_encode_rdma_segment(iptr, mw); + if (encode_read_segment(xdr, mw, pos) < 0) + return -EMSGSIZE; dprintk("RPC: %5u %s: pos %u %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, pos, mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, n < nsegs ? "more" : "last"); + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.read_chunk_count++; - seg += n; - nsegs -= n; + nsegs -= mw->mw_nents; } while (nsegs); - /* Finish Read list */ - *iptr++ = xdr_zero; /* Next item not present */ - return iptr; + return 0; } -/* XDR-encode the Write list. Supports encoding a list containing - * one array of plain segments that belong to a single write chunk. +/* Register and XDR encode the Write list. Supports encoding a list + * containing one array of plain segments that belong to a single + * write chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): * @@ -356,66 +392,65 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, * N elements: * 1 - N - HLOO - HLOO - ... - HLOO - 0 * - * Returns a pointer to the XDR word in the RDMA header following - * the end of the Write list, or an error pointer. + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream. + * + * Only a single Write chunk is currently supported. 
*/ -static __be32 * +static noinline int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - struct rpc_rqst *rqst, __be32 *iptr, - enum rpcrdma_chunktype wtype) + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) { + struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - int n, nsegs, nchunks; + int nsegs, nchunks; __be32 *segcount; - if (wtype != rpcrdma_writech) { - *iptr++ = xdr_zero; /* no Write list present */ - return iptr; - } - seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, rqst->rq_rcv_buf.head[0].iov_len, wtype, seg); if (nsegs < 0) - return ERR_PTR(nsegs); + return nsegs; - *iptr++ = xdr_one; /* Write list present */ - segcount = iptr++; /* save location of segment count */ + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); - if (n < 0) - return ERR_PTR(n); + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (IS_ERR(seg)) + return PTR_ERR(seg); rpcrdma_push_mw(mw, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, mw); + if (encode_rdma_segment(xdr, mw) < 0) + return -EMSGSIZE; dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, n < nsegs ? "more" : "last"); + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; - seg += n; - nsegs -= n; + nsegs -= mw->mw_nents; } while (nsegs); /* Update count of segments in this Write chunk */ *segcount = cpu_to_be32(nchunks); - /* Finish Write list */ - *iptr++ = xdr_zero; /* Next item not present */ - return iptr; + return 0; } -/* XDR-encode the Reply chunk. Supports encoding an array of plain - * segments that belong to a single write (reply) chunk. +/* Register and XDR encode the Reply chunk. Supports encoding an array + * of plain segments that belong to a single write (reply) chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): * @@ -423,58 +458,57 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * N elements: * 1 - N - HLOO - HLOO - ... - HLOO * - * Returns a pointer to the XDR word in the RDMA header following - * the end of the Reply chunk, or an error pointer. + * Returns zero on success, or a negative errno if a failure occurred. + * @xdr is advanced to the next position in the stream.
*/ -static __be32 * -rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, - struct rpcrdma_req *req, struct rpc_rqst *rqst, - __be32 *iptr, enum rpcrdma_chunktype wtype) +static noinline int +rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, enum rpcrdma_chunktype wtype) { + struct xdr_stream *xdr = &req->rl_stream; struct rpcrdma_mr_seg *seg; struct rpcrdma_mw *mw; - int n, nsegs, nchunks; + int nsegs, nchunks; __be32 *segcount; - if (wtype != rpcrdma_replych) { - *iptr++ = xdr_zero; /* no Reply chunk present */ - return iptr; - } - seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) - return ERR_PTR(nsegs); + return nsegs; - *iptr++ = xdr_one; /* Reply chunk present */ - segcount = iptr++; /* save location of segment count */ + if (encode_item_present(xdr) < 0) + return -EMSGSIZE; + segcount = xdr_reserve_space(xdr, sizeof(*segcount)); + if (unlikely(!segcount)) + return -EMSGSIZE; + /* Actual value encoded below */ nchunks = 0; do { - n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mw); - if (n < 0) - return ERR_PTR(n); + seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, + true, &mw); + if (IS_ERR(seg)) + return PTR_ERR(seg); rpcrdma_push_mw(mw, &req->rl_registered); - iptr = xdr_encode_rdma_segment(iptr, mw); + if (encode_rdma_segment(xdr, mw) < 0) + return -EMSGSIZE; dprintk("RPC: %5u %s: %u@0x%016llx:0x%08x (%s)\n", rqst->rq_task->tk_pid, __func__, mw->mw_length, (unsigned long long)mw->mw_offset, - mw->mw_handle, n < nsegs ? "more" : "last"); + mw->mw_handle, mw->mw_nents < nsegs ? "more" : "last"); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += seg->mr_len; nchunks++; - seg += n; - nsegs -= n; + nsegs -= mw->mw_nents; } while (nsegs); /* Update count of segments in the Reply chunk */ *segcount = cpu_to_be32(nchunks); - return iptr; + return 0; } /* Prepare the RPC-over-RDMA header SGE. @@ -651,37 +685,52 @@ rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req) req->rl_mapped_sges = 0; } -/* - * Marshal a request: the primary job of this routine is to choose - * the transfer modes. See comments below. +/** + * rpcrdma_marshal_req - Marshal and send one RPC request + * @r_xprt: controlling transport + * @rqst: RPC request to be marshaled + * + * For the RPC in "rqst", this function: + * - Chooses the transfer mode (eg., RDMA_MSG or RDMA_NOMSG) + * - Registers Read, Write, and Reply chunks + * - Constructs the transport header + * - Posts a Send WR to send the transport header and request * - * Returns zero on success, otherwise a negative errno. + * Returns: + * %0 if the RPC was sent successfully, + * %-ENOTCONN if the connection was lost, + * %-EAGAIN if not enough pages are available for on-demand reply buffer, + * %-ENOBUFS if no MRs are available to register chunks, + * %-EMSGSIZE if the transport header is too small, + * %-EIO if a permanent problem occurred while marshaling. 
*/ - int -rpcrdma_marshal_req(struct rpc_rqst *rqst) +rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - struct rpc_xprt *xprt = rqst->rq_xprt; - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct xdr_stream *xdr = &req->rl_stream; enum rpcrdma_chunktype rtype, wtype; - struct rpcrdma_msg *headerp; bool ddp_allowed; - ssize_t hdrlen; - size_t rpclen; - __be32 *iptr; + __be32 *p; + int ret; #if defined(CONFIG_SUNRPC_BACKCHANNEL) if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) return rpcrdma_bc_marshal_reply(rqst); #endif - headerp = rdmab_to_msg(req->rl_rdmabuf); - /* don't byte-swap XID, it's already done in request */ - headerp->rm_xid = rqst->rq_xid; - headerp->rm_vers = rpcrdma_version; - headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); - headerp->rm_type = rdma_msg; + rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0); + xdr_init_encode(xdr, &req->rl_hdrbuf, + req->rl_rdmabuf->rg_base); + + /* Fixed header fields */ + ret = -EMSGSIZE; + p = xdr_reserve_space(xdr, 4 * sizeof(*p)); + if (!p) + goto out_err; + *p++ = rqst->rq_xid; + *p++ = rpcrdma_version; + *p++ = cpu_to_be32(r_xprt->rx_buf.rb_max_requests); /* When the ULP employs a GSS flavor that guarantees integrity * or privacy, direct data placement of individual data items @@ -721,22 +770,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * by themselves are larger than the inline threshold. */ if (rpcrdma_args_inline(r_xprt, rqst)) { + *p++ = rdma_msg; rtype = rpcrdma_noch; - rpclen = rqst->rq_snd_buf.len; } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + *p++ = rdma_msg; rtype = rpcrdma_readch; - rpclen = rqst->rq_snd_buf.head[0].iov_len + - rqst->rq_snd_buf.tail[0].iov_len; } else { r_xprt->rx_stats.nomsg_call_count++; - headerp->rm_type = htonl(RDMA_NOMSG); + *p++ = rdma_nomsg; rtype = rpcrdma_areadch; - rpclen = 0; } - req->rl_xid = rqst->rq_xid; - rpcrdma_insert_req(&r_xprt->rx_buf, req); - /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * @@ -759,79 +803,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * send a Call message with a Position Zero Read chunk and a * regular Read chunk at the same time. 
*/ - iptr = headerp->rm_body.rm_chunks; - iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); - if (IS_ERR(iptr)) + if (rtype != rpcrdma_noch) { + ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) goto out_err; - iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); - if (IS_ERR(iptr)) + + if (wtype == rpcrdma_writech) { + ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype); + if (ret) + goto out_err; + } + ret = encode_item_not_present(xdr); + if (ret) goto out_err; - iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); - if (IS_ERR(iptr)) + + if (wtype != rpcrdma_replych) + ret = encode_item_not_present(xdr); + else + ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype); + if (ret) goto out_err; - hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", + dprintk("RPC: %5u %s: %s/%s: hdrlen %u\n", rqst->rq_task->tk_pid, __func__, transfertypes[rtype], transfertypes[wtype], - hdrlen, rpclen); + xdr_stream_pos(xdr)); - if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen, + if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, + xdr_stream_pos(xdr), &rqst->rq_snd_buf, rtype)) { - iptr = ERR_PTR(-EIO); + ret = -EIO; goto out_err; } return 0; out_err: - if (PTR_ERR(iptr) != -ENOBUFS) { - pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", - PTR_ERR(iptr)); + if (ret != -ENOBUFS) { + pr_err("rpcrdma: header marshaling failed (%d)\n", ret); r_xprt->rx_stats.failed_marshal_count++; } - return PTR_ERR(iptr); -} - -/* - * Chase down a received write or reply chunklist to get length - * RDMA'd by server. See map at rpcrdma_create_chunks()! :-) - */ -static int -rpcrdma_count_chunks(struct rpcrdma_rep *rep, int wrchunk, __be32 **iptrp) -{ - unsigned int i, total_len; - struct rpcrdma_write_chunk *cur_wchunk; - char *base = (char *)rdmab_to_msg(rep->rr_rdmabuf); - - i = be32_to_cpu(**iptrp); - cur_wchunk = (struct rpcrdma_write_chunk *) (*iptrp + 1); - total_len = 0; - while (i--) { - struct rpcrdma_segment *seg = &cur_wchunk->wc_target; - ifdebug(FACILITY) { - u64 off; - xdr_decode_hyper((__be32 *)&seg->rs_offset, &off); - dprintk("RPC: %s: chunk %d@0x%016llx:0x%08x\n", - __func__, - be32_to_cpu(seg->rs_length), - (unsigned long long)off, - be32_to_cpu(seg->rs_handle)); - } - total_len += be32_to_cpu(seg->rs_length); - ++cur_wchunk; - } - /* check and adjust for properly terminated write chunk */ - if (wrchunk) { - __be32 *w = (__be32 *) cur_wchunk; - if (*w++ != xdr_zero) - return -1; - cur_wchunk = (struct rpcrdma_write_chunk *) w; - } - if ((char *)cur_wchunk > base + rep->rr_len) - return -1; - - *iptrp = (__be32 *) cur_wchunk; - return total_len; + return ret; } /** @@ -949,37 +964,254 @@ rpcrdma_mark_remote_invalidation(struct list_head *mws, } } -#if defined(CONFIG_SUNRPC_BACKCHANNEL) /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field.
*/ static bool -rpcrdma_is_bcall(struct rpcrdma_msg *headerp) +rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + __be32 xid, __be32 proc) +#if defined(CONFIG_SUNRPC_BACKCHANNEL) { - __be32 *p = (__be32 *)headerp; + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; - if (headerp->rm_type != rdma_msg) + if (proc != rdma_msg) return false; - if (headerp->rm_body.rm_chunks[0] != xdr_zero) + + /* Peek at stream contents without advancing. */ + p = xdr_inline_decode(xdr, 0); + + /* Chunk lists */ + if (*p++ != xdr_zero) return false; - if (headerp->rm_body.rm_chunks[1] != xdr_zero) + if (*p++ != xdr_zero) return false; - if (headerp->rm_body.rm_chunks[2] != xdr_zero) + if (*p++ != xdr_zero) return false; - /* sanity */ - if (p[7] != headerp->rm_xid) + /* RPC header */ + if (*p++ != xid) return false; - /* call direction */ - if (p[8] != cpu_to_be32(RPC_CALL)) + if (*p != cpu_to_be32(RPC_CALL)) return false; + /* Now that we are sure this is a backchannel call, + * advance to the RPC header. + */ + p = xdr_inline_decode(xdr, 3 * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + + rpcrdma_bc_receive_call(r_xprt, rep); + return true; + +out_short: + pr_warn("RPC/RDMA short backward direction call\n"); + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) + xprt_disconnect_done(&r_xprt->rx_xprt); return true; } +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +{ + return false; +} #endif /* CONFIG_SUNRPC_BACKCHANNEL */ +static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + ifdebug(FACILITY) { + u64 offset; + u32 handle; + + handle = be32_to_cpup(p++); + *length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + dprintk("RPC: %s: segment %u@0x%016llx:0x%08x\n", + __func__, *length, (unsigned long long)offset, + handle); + } else { + *length = be32_to_cpup(p + 1); + } + + return 0; +} + +static int decode_write_chunk(struct xdr_stream *xdr, u32 *length) +{ + u32 segcount, seglength; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + segcount = be32_to_cpup(p); + while (segcount--) { + if (decode_rdma_segment(xdr, &seglength)) + return -EIO; + *length += seglength; + } + + dprintk("RPC: %s: segcount=%u, %u bytes\n", + __func__, be32_to_cpup(p), *length); + return 0; +} + +/* In RPC-over-RDMA Version One replies, a Read list is never + * expected. This decoder is a stub that returns an error if + * a Read list is present. 
+ */ +static int decode_read_list(struct xdr_stream *xdr) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != xdr_zero)) + return -EIO; + return 0; +} + +/* Supports only one Write chunk in the Write list + */ +static int decode_write_list(struct xdr_stream *xdr, u32 *length) +{ + u32 chunklen; + bool first; + __be32 *p; + + *length = 0; + first = true; + do { + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + if (*p == xdr_zero) + break; + if (!first) + return -EIO; + + if (decode_write_chunk(xdr, &chunklen)) + return -EIO; + *length += chunklen; + first = false; + } while (true); + return 0; +} + +static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) +{ + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + *length = 0; + if (*p != xdr_zero) + if (decode_write_chunk(xdr, length)) + return -EIO; + return 0; +} + +static int +rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk, rpclen; + char *base; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_MSG sanity checks */ + if (unlikely(replychunk)) + return -EIO; + + /* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */ + base = (char *)xdr_inline_decode(xdr, 0); + rpclen = xdr_stream_remaining(xdr); + r_xprt->rx_stats.fixup_copy_count += + rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3); + + r_xprt->rx_stats.total_rdma_reply += writelist; + return rpclen + xdr_align_size(writelist); +} + +static noinline int +rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) +{ + struct xdr_stream *xdr = &rep->rr_stream; + u32 writelist, replychunk; + + /* Decode the chunk lists */ + if (decode_read_list(xdr)) + return -EIO; + if (decode_write_list(xdr, &writelist)) + return -EIO; + if (decode_reply_chunk(xdr, &replychunk)) + return -EIO; + + /* RDMA_NOMSG sanity checks */ + if (unlikely(writelist)) + return -EIO; + if (unlikely(!replychunk)) + return -EIO; + + /* Reply chunk buffer already is the reply vector */ + r_xprt->rx_stats.total_rdma_reply += replychunk; + return replychunk; +} + +static noinline int +rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, + struct rpc_rqst *rqst) +{ + struct xdr_stream *xdr = &rep->rr_stream; + __be32 *p; + + p = xdr_inline_decode(xdr, sizeof(*p)); + if (unlikely(!p)) + return -EIO; + + switch (*p) { + case err_vers: + p = xdr_inline_decode(xdr, 2 * sizeof(*p)); + if (!p) + break; + dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", + rqst->rq_task->tk_pid, __func__, + be32_to_cpup(p), be32_to_cpu(*(p + 1))); + break; + case err_chunk: + dprintk("RPC: %5u: %s: server reports header decoding error\n", + rqst->rq_task->tk_pid, __func__); + break; + default: + dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); + } + + r_xprt->rx_stats.bad_reply_count++; + return -EREMOTEIO; +} + /* Process received RPC/RDMA messages. 
* * Errors must result in the RPC task either being awakened, or @@ -991,51 +1223,48 @@ rpcrdma_reply_handler(struct work_struct *work) struct rpcrdma_rep *rep = container_of(work, struct rpcrdma_rep, rr_work); struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct rpcrdma_msg *headerp; + struct xdr_stream *xdr = &rep->rr_stream; struct rpcrdma_req *req; struct rpc_rqst *rqst; - __be32 *iptr; - int rdmalen, status, rmerr; + __be32 *p, xid, vers, proc; unsigned long cwnd; - struct list_head mws; + int status; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); - if (rep->rr_len == RPCRDMA_BAD_LEN) + if (rep->rr_hdrbuf.head[0].iov_len == 0) goto out_badstatus; - if (rep->rr_len < RPCRDMA_HDRLEN_ERR) + + xdr_init_decode(xdr, &rep->rr_hdrbuf, + rep->rr_hdrbuf.head[0].iov_base); + + /* Fixed transport header fields */ + p = xdr_inline_decode(xdr, 4 * sizeof(*p)); + if (unlikely(!p)) goto out_shortreply; + xid = *p++; + vers = *p++; + p++; /* credits */ + proc = *p++; - headerp = rdmab_to_msg(rep->rr_rdmabuf); -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - if (rpcrdma_is_bcall(headerp)) - goto out_bcall; -#endif + if (rpcrdma_is_bcall(r_xprt, rep, xid, proc)) + return; /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ - spin_lock(&buf->rb_lock); - req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, - headerp->rm_xid); - if (!req) - goto out_nomatch; - if (req->rl_reply) - goto out_duplicate; - - list_replace_init(&req->rl_registered, &mws); - rpcrdma_mark_remote_invalidation(&mws, rep); - - /* Avoid races with signals and duplicate replies - * by marking this req as matched. - */ + spin_lock(&xprt->recv_lock); + rqst = xprt_lookup_rqst(xprt, xid); + if (!rqst) + goto out_norqst; + xprt_pin_rqst(rqst); + spin_unlock(&xprt->recv_lock); + req = rpcr_to_rdmar(rqst); req->rl_reply = rep; - spin_unlock(&buf->rb_lock); dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); + __func__, rep, req, be32_to_cpu(xid)); /* Invalidate and unmap the data payloads before waking the * waiting application. This guarantees the memory regions @@ -1044,99 +1273,42 @@ rpcrdma_reply_handler(struct work_struct *work) * waking the next RPC waits until this RPC has relinquished * all its Send Queue entries. */ - if (!list_empty(&mws)) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &mws); + if (!list_empty(&req->rl_registered)) { + rpcrdma_mark_remote_invalidation(&req->rl_registered, rep); + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, + &req->rl_registered); + } - /* Perform XID lookup, reconstruction of the RPC reply, and - * RPC completion while holding the transport lock to ensure - * the rep, rqst, and rq_task pointers remain stable. - */ - spin_lock_bh(&xprt->transport_lock); - rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); - if (!rqst) - goto out_norqst; xprt->reestablish_timeout = 0; - if (headerp->rm_vers != rpcrdma_version) + if (vers != rpcrdma_version) goto out_badversion; - /* check for expected message types */ - /* The order of some of these tests is important. 
*/ - switch (headerp->rm_type) { + switch (proc) { case rdma_msg: - /* never expect read chunks */ - /* never expect reply chunks (two ways to check) */ - if (headerp->rm_body.rm_chunks[0] != xdr_zero || - (headerp->rm_body.rm_chunks[1] == xdr_zero && - headerp->rm_body.rm_chunks[2] != xdr_zero)) - goto badheader; - if (headerp->rm_body.rm_chunks[1] != xdr_zero) { - /* count any expected write chunks in read reply */ - /* start at write chunk array count */ - iptr = &headerp->rm_body.rm_chunks[2]; - rdmalen = rpcrdma_count_chunks(rep, 1, &iptr); - /* check for validity, and no reply chunk after */ - if (rdmalen < 0 || *iptr++ != xdr_zero) - goto badheader; - rep->rr_len -= - ((unsigned char *)iptr - (unsigned char *)headerp); - status = rep->rr_len + rdmalen; - r_xprt->rx_stats.total_rdma_reply += rdmalen; - /* special case - last chunk may omit padding */ - if (rdmalen &= 3) { - rdmalen = 4 - rdmalen; - status += rdmalen; - } - } else { - /* else ordinary inline */ - rdmalen = 0; - iptr = (__be32 *)((unsigned char *)headerp + - RPCRDMA_HDRLEN_MIN); - rep->rr_len -= RPCRDMA_HDRLEN_MIN; - status = rep->rr_len; - } - - r_xprt->rx_stats.fixup_copy_count += - rpcrdma_inline_fixup(rqst, (char *)iptr, rep->rr_len, - rdmalen); + status = rpcrdma_decode_msg(r_xprt, rep, rqst); break; - case rdma_nomsg: - /* never expect read or write chunks, always reply chunks */ - if (headerp->rm_body.rm_chunks[0] != xdr_zero || - headerp->rm_body.rm_chunks[1] != xdr_zero || - headerp->rm_body.rm_chunks[2] != xdr_one) - goto badheader; - iptr = (__be32 *)((unsigned char *)headerp + - RPCRDMA_HDRLEN_MIN); - rdmalen = rpcrdma_count_chunks(rep, 0, &iptr); - if (rdmalen < 0) - goto badheader; - r_xprt->rx_stats.total_rdma_reply += rdmalen; - /* Reply chunk buffer already is the reply vector - no fixup. */ - status = rdmalen; + status = rpcrdma_decode_nomsg(r_xprt, rep); break; - case rdma_error: - goto out_rdmaerr; - -badheader: + status = rpcrdma_decode_error(r_xprt, rep, rqst); + break; default: - dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", - rqst->rq_task->tk_pid, __func__, - be32_to_cpu(headerp->rm_type)); status = -EIO; - r_xprt->rx_stats.bad_reply_count++; - break; } + if (status < 0) + goto out_badheader; out: + spin_lock(&xprt->recv_lock); cwnd = xprt->cwnd; xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(rqst->rq_task); xprt_complete_rqst(rqst->rq_task, status); - spin_unlock_bh(&xprt->transport_lock); + xprt_unpin_rqst(rqst); + spin_unlock(&xprt->recv_lock); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); return; @@ -1149,72 +1321,38 @@ out_badstatus: } return; -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -out_bcall: - rpcrdma_bc_receive_call(r_xprt, rep); - return; -#endif - /* If the incoming reply terminated a pending RPC, the next * RPC call will post a replacement receive buffer as it is * being marshaled. 
*/ out_badversion: dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(headerp->rm_vers)); + __func__, be32_to_cpu(vers)); status = -EIO; r_xprt->rx_stats.bad_reply_count++; goto out; -out_rdmaerr: - rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err); - switch (rmerr) { - case ERR_VERS: - pr_err("%s: server reports header version error (%u-%u)\n", - __func__, - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low), - be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high)); - break; - case ERR_CHUNK: - pr_err("%s: server reports header decoding error\n", - __func__); - break; - default: - pr_err("%s: server reports unknown error %d\n", - __func__, rmerr); - } - status = -EREMOTEIO; +out_badheader: + dprintk("RPC: %5u %s: invalid rpcrdma reply (type %u)\n", + rqst->rq_task->tk_pid, __func__, be32_to_cpu(proc)); r_xprt->rx_stats.bad_reply_count++; + status = -EIO; goto out; -/* The req was still available, but by the time the transport_lock +/* The req was still available, but by the time the recv_lock * was acquired, the rqst and task had been released. Thus the RPC * has already been terminated. */ out_norqst: - spin_unlock_bh(&xprt->transport_lock); - rpcrdma_buffer_put(req); - dprintk("RPC: %s: race, no rqst left for req %p\n", - __func__, req); - return; + spin_unlock(&xprt->recv_lock); + dprintk("RPC: %s: no match for incoming xid 0x%08x\n", + __func__, be32_to_cpu(xid)); + goto repost; out_shortreply: dprintk("RPC: %s: short/invalid reply\n", __func__); goto repost; -out_nomatch: - spin_unlock(&buf->rb_lock); - dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n", - __func__, be32_to_cpu(headerp->rm_xid), - rep->rr_len); - goto repost; - -out_duplicate: - spin_unlock(&buf->rb_lock); - dprintk("RPC: %s: " - "duplicate reply %p to RPC request %p: xid 0x%08x\n", - __func__, rep, req, be32_to_cpu(headerp->rm_xid)); - /* If no pending RPC transaction was matched, post a replacement * receive buffer before returning. 
*/ diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index c676ed0efb5a..ec37ad83b068 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -52,7 +52,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (src->iov_len < 24) goto out_shortreply; - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); req = xprt_lookup_rqst(xprt, xid); if (!req) goto out_notfound; @@ -69,17 +69,20 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, else if (credits > r_xprt->rx_buf.rb_bc_max_requests) credits = r_xprt->rx_buf.rb_bc_max_requests; + spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = credits << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) xprt_release_rqst_cong(req->rq_task); + spin_unlock_bh(&xprt->transport_lock); + ret = 0; xprt_complete_rqst(req->rq_task, rcvbuf->len); rcvbuf->len = 0; out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); out: return ret; @@ -266,7 +269,7 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static struct rpc_xprt_ops xprt_rdma_bc_procs = { +static const struct rpc_xprt_ops xprt_rdma_bc_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, .alloc_slot = xprt_alloc_slot, diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index d1c458e5ec4d..c84e2b644e13 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -149,7 +149,7 @@ static struct ctl_table sunrpc_table[] = { #endif -static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ +static const struct rpc_xprt_ops xprt_rdma_procs; static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -559,6 +559,7 @@ rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, r_xprt->rx_stats.hardway_register_count += size; req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); return true; } @@ -684,7 +685,6 @@ xprt_rdma_free(struct rpc_task *task) dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - rpcrdma_remove_req(&r_xprt->rx_buf, req); if (!list_empty(&req->rl_registered)) ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); rpcrdma_unmap_sges(ia, req); @@ -730,7 +730,7 @@ xprt_rdma_send_request(struct rpc_task *task) if (unlikely(!list_empty(&req->rl_registered))) r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); - rc = rpcrdma_marshal_req(rqst); + rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; @@ -811,7 +811,7 @@ xprt_rdma_disable_swap(struct rpc_xprt *xprt) * Plumbing for rpc transport switch and kernel module */ -static struct rpc_xprt_ops xprt_rdma_procs = { +static const struct rpc_xprt_ops xprt_rdma_procs = { .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ .alloc_slot = xprt_alloc_slot, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e4171f2abe37..11a1fbf7e59e 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -139,14 +139,11 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) static void rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) { - struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf); struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf; + __be32 *p = rep->rr_rdmabuf->rg_base; u32 credits; - if (rep->rr_len < 
RPCRDMA_HDRLEN_ERR) - return; - - credits = be32_to_cpu(rmsgp->rm_credit); + credits = be32_to_cpup(p + 2); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > buffer->rb_max_requests) @@ -173,21 +170,19 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) goto out_fail; /* status == SUCCESS means all fields in wc are trustworthy */ - if (wc->opcode != IB_WC_RECV) - return; - dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n", __func__, rep, wc->byte_len); - rep->rr_len = wc->byte_len; + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); rep->rr_wc_flags = wc->wc_flags; rep->rr_inv_rkey = wc->ex.invalidate_rkey; ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), rdmab_addr(rep->rr_rdmabuf), - rep->rr_len, DMA_FROM_DEVICE); + wc->byte_len, DMA_FROM_DEVICE); - rpcrdma_update_granted_credits(rep); + if (wc->byte_len >= RPCRDMA_HDRLEN_ERR) + rpcrdma_update_granted_credits(rep); out_schedule: queue_work(rpcrdma_receive_wq, &rep->rr_work); @@ -198,7 +193,7 @@ out_fail: pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); - rep->rr_len = RPCRDMA_BAD_LEN; + rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0); goto out_schedule; } @@ -974,6 +969,8 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(rep->rr_rdmabuf); goto out_free; } + xdr_buf_init(&rep->rr_hdrbuf, rep->rr_rdmabuf->rg_base, + rdmab_length(rep->rr_rdmabuf)); rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; @@ -1004,7 +1001,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mws); INIT_LIST_HEAD(&buf->rb_all); - INIT_LIST_HEAD(&buf->rb_pending); INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index b282d3f8cdd8..e26a97d2f922 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -218,18 +218,17 @@ enum { struct rpcrdma_rep { struct ib_cqe rr_cqe; - unsigned int rr_len; int rr_wc_flags; u32 rr_inv_rkey; + struct rpcrdma_regbuf *rr_rdmabuf; struct rpcrdma_xprt *rr_rxprt; struct work_struct rr_work; + struct xdr_buf rr_hdrbuf; + struct xdr_stream rr_stream; struct list_head rr_list; struct ib_recv_wr rr_recv_wr; - struct rpcrdma_regbuf *rr_rdmabuf; }; -#define RPCRDMA_BAD_LEN (~0U) - /* * struct rpcrdma_mw - external memory region metadata * @@ -341,11 +340,12 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; - __be32 rl_xid; unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; + struct xdr_stream rl_stream; + struct xdr_buf rl_hdrbuf; struct ib_send_wr rl_send_wr; struct ib_sge rl_send_sge[RPCRDMA_MAX_SEND_SGES]; struct rpcrdma_regbuf *rl_rdmabuf; /* xprt header */ @@ -403,7 +403,6 @@ struct rpcrdma_buffer { int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; - struct list_head rb_pending; u32 rb_max_requests; atomic_t rb_credits; /* most recent credit grant */ @@ -440,24 +439,27 @@ struct rpcrdma_create_data_internal { * Statistics for RPCRDMA */ struct rpcrdma_stats { + /* accessed when sending a call */ unsigned long read_chunk_count; unsigned long write_chunk_count; unsigned long reply_chunk_count; - unsigned long long total_rdma_request; - unsigned long long total_rdma_reply; + /* rarely accessed error counters */ unsigned long long pullup_copy_count; - 
unsigned long long fixup_copy_count; unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; - unsigned long nomsg_call_count; - unsigned long bcall_count; unsigned long mrs_recovered; unsigned long mrs_orphaned; unsigned long mrs_allocated; + + /* accessed when receiving a reply */ + unsigned long long total_rdma_reply; + unsigned long long fixup_copy_count; unsigned long local_inv_needed; + unsigned long nomsg_call_count; + unsigned long bcall_count; }; /* @@ -465,7 +467,8 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt; struct rpcrdma_memreg_ops { - int (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg * + (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, @@ -552,34 +555,6 @@ void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); -static inline void -rpcrdma_insert_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) -{ - spin_lock(&buffers->rb_lock); - if (list_empty(&req->rl_list)) - list_add_tail(&req->rl_list, &buffers->rb_pending); - spin_unlock(&buffers->rb_lock); -} - -static inline struct rpcrdma_req * -rpcrdma_lookup_req_locked(struct rpcrdma_buffer *buffers, __be32 xid) -{ - struct rpcrdma_req *pos; - - list_for_each_entry(pos, &buffers->rb_pending, rl_list) - if (pos->rl_xid == xid) - return pos; - return NULL; -} - -static inline void -rpcrdma_remove_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) -{ - spin_lock(&buffers->rb_lock); - list_del(&req->rl_list); - spin_unlock(&buffers->rb_lock); -} - struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); @@ -638,10 +613,16 @@ enum rpcrdma_chunktype { bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *, u32, struct xdr_buf *, enum rpcrdma_chunktype); void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *); -int rpcrdma_marshal_req(struct rpc_rqst *); +int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); void rpcrdma_reply_handler(struct work_struct *work); +static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) +{ + xdr->head[0].iov_len = len; + xdr->len = len; +} + /* RPC/RDMA module init - xprtrdma/transport.c */ extern unsigned int xprt_rdma_max_inline_read; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 4f154d388748..9b5de31aa429 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -969,10 +969,12 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; + xprt_pin_rqst(rovr); + spin_unlock(&xprt->recv_lock); task = rovr->rq_task; copied = rovr->rq_private_buf.buflen; @@ -981,13 +983,16 @@ static void xs_local_data_read_skb(struct rpc_xprt *xprt, if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { dprintk("RPC: sk_buff copy failed\n"); - goto out_unlock; + spin_lock(&xprt->recv_lock); + goto out_unpin; } + spin_lock(&xprt->recv_lock); xprt_complete_rqst(task, copied); - +out_unpin: + xprt_unpin_rqst(rovr); out_unlock: - spin_unlock_bh(&xprt->transport_lock); + 
spin_unlock(&xprt->recv_lock); } static void xs_local_data_receive(struct sock_xprt *transport) @@ -1050,10 +1055,12 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; + xprt_pin_rqst(rovr); + spin_unlock(&xprt->recv_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) @@ -1062,16 +1069,21 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); - goto out_unlock; + spin_lock(&xprt->recv_lock); + goto out_unpin; } __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); + spin_lock_bh(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); + spin_unlock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); xprt_complete_rqst(task, copied); - +out_unpin: + xprt_unpin_rqst(rovr); out_unlock: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) @@ -1277,25 +1289,12 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt, } len = desc->count; - if (len > transport->tcp_reclen - transport->tcp_offset) { - struct xdr_skb_reader my_desc; - - len = transport->tcp_reclen - transport->tcp_offset; - memcpy(&my_desc, desc, sizeof(my_desc)); - my_desc.count = len; - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, - &my_desc, xdr_skb_read_bits); - desc->count -= r; - desc->offset += r; - } else - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, + if (len > transport->tcp_reclen - transport->tcp_offset) + desc->count = transport->tcp_reclen - transport->tcp_offset; + r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, desc, xdr_skb_read_bits); - if (r > 0) { - transport->tcp_copied += r; - transport->tcp_offset += r; - } - if (r != len) { + if (desc->count) { /* Error when copying to the receive buffer, * usually because we weren't able to allocate * additional buffer pages. 
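Each of these socket receive paths now follows the same discipline: look the request up under the new recv_lock, pin it, drop the lock for the (possibly sleeping) copy, then retake the lock to complete and unpin. A hedged schematic of the pattern, with copy_into_rqst() as a hypothetical stand-in for the transport-specific copy:

	static void receive_one_reply(struct rpc_xprt *xprt, __be32 xid,
				      struct sk_buff *skb)
	{
		struct rpc_rqst *req;

		spin_lock(&xprt->recv_lock);
		req = xprt_lookup_rqst(xprt, xid);
		if (!req) {
			spin_unlock(&xprt->recv_lock);
			return;
		}
		xprt_pin_rqst(req);		/* req stays valid after unlock */
		spin_unlock(&xprt->recv_lock);

		copy_into_rqst(req, skb);	/* lockless; may sleep/allocate */

		spin_lock(&xprt->recv_lock);
		xprt_complete_rqst(req->rq_task, req->rq_private_buf.len);
		xprt_unpin_rqst(req);		/* owner may free req now */
		spin_unlock(&xprt->recv_lock);
	}
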
All we can do now @@ -1315,6 +1314,10 @@ static inline void xs_tcp_read_common(struct rpc_xprt *xprt, return; } + transport->tcp_copied += r; + transport->tcp_offset += r; + desc->count = len - r; + dprintk("RPC: XID %08x read %zd bytes\n", ntohl(transport->tcp_xid), r); dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " @@ -1343,21 +1346,24 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); /* Find and lock the request corresponding to this xid */ - spin_lock_bh(&xprt->transport_lock); + spin_lock(&xprt->recv_lock); req = xprt_lookup_rqst(xprt, transport->tcp_xid); if (!req) { dprintk("RPC: XID %08x request not found!\n", ntohl(transport->tcp_xid)); - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&xprt->recv_lock); return -1; } + xprt_pin_rqst(req); + spin_unlock(&xprt->recv_lock); xs_tcp_read_common(xprt, desc, req); + spin_lock(&xprt->recv_lock); if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_rqst(req->rq_task, transport->tcp_copied); - - spin_unlock_bh(&xprt->transport_lock); + xprt_unpin_rqst(req); + spin_unlock(&xprt->recv_lock); return 0; } @@ -1376,11 +1382,9 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, container_of(xprt, struct sock_xprt, xprt); struct rpc_rqst *req; - /* Look up and lock the request corresponding to the given XID */ - spin_lock_bh(&xprt->transport_lock); + /* Look up the request corresponding to the given XID */ req = xprt_lookup_bc_request(xprt, transport->tcp_xid); if (req == NULL) { - spin_unlock_bh(&xprt->transport_lock); printk(KERN_WARNING "Callback slot table overflowed\n"); xprt_force_disconnect(xprt); return -1; @@ -1391,7 +1395,6 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_bc_request(req, transport->tcp_copied); - spin_unlock_bh(&xprt->transport_lock); return 0; } @@ -1516,6 +1519,7 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) .arg.data = xprt, }; unsigned long total = 0; + int loop; int read = 0; mutex_lock(&transport->recv_mutex); @@ -1524,20 +1528,20 @@ static void xs_tcp_data_receive(struct sock_xprt *transport) goto out; /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (;;) { + for (loop = 0; loop < 64; loop++) { lock_sock(sk); read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); if (read <= 0) { clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); release_sock(sk); - if (!test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - break; - } else { - release_sock(sk); - total += read; + break; } + release_sock(sk); + total += read; rd_desc.count = 65536; } + if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + queue_work(xprtiod_workqueue, &transport->recv_worker); out: mutex_unlock(&transport->recv_mutex); trace_xs_tcp_data_ready(xprt, read, total); @@ -2724,7 +2728,7 @@ static void bc_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static struct rpc_xprt_ops xs_local_ops = { +static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, .alloc_slot = xprt_alloc_slot, @@ -2742,7 +2746,7 @@ static struct rpc_xprt_ops xs_local_ops = { .disable_swap = xs_disable_swap, }; -static struct rpc_xprt_ops xs_udp_ops = { +static const struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, @@ -2764,7 +2768,7 @@ static struct rpc_xprt_ops 
xs_udp_ops = { .inject_disconnect = xs_inject_disconnect, }; -static struct rpc_xprt_ops xs_tcp_ops = { +static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, .alloc_slot = xprt_lock_and_alloc_slot, @@ -2795,7 +2799,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { * The rpc_xprt_ops for the server backchannel */ -static struct rpc_xprt_ops bc_tcp_ops = { +static const struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, diff --git a/scripts/selinux/genheaders/genheaders.c b/scripts/selinux/genheaders/genheaders.c index 6a24569c3578..672b069dcfea 100644 --- a/scripts/selinux/genheaders/genheaders.c +++ b/scripts/selinux/genheaders/genheaders.c @@ -129,11 +129,16 @@ int main(int argc, char *argv[]) for (i = 0; secclass_map[i].name; i++) { struct security_class_mapping *map = &secclass_map[i]; for (j = 0; map->perms[j]; j++) { + if (j >= 32) { + fprintf(stderr, "Too many permissions to fit into an access vector at (%s, %s).\n", + map->name, map->perms[j]); + exit(5); + } fprintf(fout, "#define %s__%s", map->name, map->perms[j]); for (k = 0; k < max(1, 40 - strlen(map->name) - strlen(map->perms[j])); k++) fprintf(fout, " "); - fprintf(fout, "0x%08xUL\n", (1<<j)); + fprintf(fout, "0x%08xU\n", (1<<j)); } } diff --git a/scripts/sphinx-pre-install b/scripts/sphinx-pre-install index 677756ae34c9..067459760a7b 100755 --- a/scripts/sphinx-pre-install +++ b/scripts/sphinx-pre-install @@ -40,7 +40,6 @@ my $virtualenv = 1; # my %texlive = ( - 'adjustbox.sty' => 'texlive-adjustbox', 'amsfonts.sty' => 'texlive-amsfonts', 'amsmath.sty' => 'texlive-amsmath', 'amssymb.sty' => 'texlive-amsfonts', diff --git a/security/commoncap.c b/security/commoncap.c index d8e26fb9781d..6bf72b175b49 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -82,8 +82,11 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, if (ns == cred->user_ns) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; - /* Have we tried all of the parent namespaces? */ - if (ns == &init_user_ns) + /* + * If we're already at a lower level than we're looking for, + * we're done searching. + */ + if (ns->level <= cred->user_ns->level) return -EPERM; /* @@ -323,6 +326,209 @@ int cap_inode_killpriv(struct dentry *dentry) return error; } +static bool rootid_owns_currentns(kuid_t kroot) +{ + struct user_namespace *ns; + + if (!uid_valid(kroot)) + return false; + + for (ns = current_user_ns(); ; ns = ns->parent) { + if (from_kuid(ns, kroot) == 0) + return true; + if (ns == &init_user_ns) + break; + } + + return false; +} + +static __u32 sansflags(__u32 m) +{ + return m & ~VFS_CAP_FLAGS_EFFECTIVE; +} + +static bool is_v2header(size_t size, __le32 magic) +{ + __u32 m = le32_to_cpu(magic); + if (size != XATTR_CAPS_SZ_2) + return false; + return sansflags(m) == VFS_CAP_REVISION_2; +} + +static bool is_v3header(size_t size, __le32 magic) +{ + __u32 m = le32_to_cpu(magic); + + if (size != XATTR_CAPS_SZ_3) + return false; + return sansflags(m) == VFS_CAP_REVISION_3; +} + +/* + * getsecurity: We are called for security.* before any attempt to read the + * xattr from the inode itself. + * + * This gives us a chance to read the on-disk value and convert it. If we + * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. 
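For reference while reading the conversion code that follows, these are the two on-disk layouts being mediated, paraphrased (not quoted) from the uapi header include/uapi/linux/capability.h as it stands with this series; XATTR_CAPS_SZ_2 and XATTR_CAPS_SZ_3 are simply the sizes of the two structs, 20 and 24 bytes:

	/* v2: magic/flags word plus permitted+inheritable pairs */
	struct vfs_cap_data {
		__le32 magic_etc;		/* VFS_CAP_REVISION_2 | flags */
		struct {
			__le32 permitted;
			__le32 inheritable;
		} data[VFS_CAP_U32];		/* VFS_CAP_U32 == 2 */
	};

	/* v3: v2 plus the rootid that owns the file capability */
	struct vfs_ns_cap_data {
		__le32 magic_etc;		/* VFS_CAP_REVISION_3 | flags */
		struct {
			__le32 permitted;
			__le32 inheritable;
		} data[VFS_CAP_U32];
		__le32 rootid;			/* uid 0 of the owning userns */
	};
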
+ * + * Note we are not called by vfs_getxattr_alloc(), but that is only called + * by the integrity subsystem, which really wants the unconverted values - + * so that's good. + */ +int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, + bool alloc) +{ + int size, ret; + kuid_t kroot; + uid_t root, mappedroot; + char *tmpbuf = NULL; + struct vfs_cap_data *cap; + struct vfs_ns_cap_data *nscap; + struct dentry *dentry; + struct user_namespace *fs_ns; + + if (strcmp(name, "capability") != 0) + return -EOPNOTSUPP; + + dentry = d_find_alias(inode); + if (!dentry) + return -EINVAL; + + size = sizeof(struct vfs_ns_cap_data); + ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, + &tmpbuf, size, GFP_NOFS); + dput(dentry); + + if (ret < 0) + return ret; + + fs_ns = inode->i_sb->s_user_ns; + cap = (struct vfs_cap_data *) tmpbuf; + if (is_v2header((size_t) ret, cap->magic_etc)) { + /* If this is sizeof(vfs_cap_data) then we're ok with the + * on-disk value, so return that. */ + if (alloc) + *buffer = tmpbuf; + else + kfree(tmpbuf); + return ret; + } else if (!is_v3header((size_t) ret, cap->magic_etc)) { + kfree(tmpbuf); + return -EINVAL; + } + + nscap = (struct vfs_ns_cap_data *) tmpbuf; + root = le32_to_cpu(nscap->rootid); + kroot = make_kuid(fs_ns, root); + + /* If the root kuid maps to a valid uid in current ns, then return + * this as a nscap. */ + mappedroot = from_kuid(current_user_ns(), kroot); + if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { + if (alloc) { + *buffer = tmpbuf; + nscap->rootid = cpu_to_le32(mappedroot); + } else + kfree(tmpbuf); + return size; + } + + if (!rootid_owns_currentns(kroot)) { + kfree(tmpbuf); + return -EOPNOTSUPP; + } + + /* This comes from a parent namespace. Return as a v2 capability */ + size = sizeof(struct vfs_cap_data); + if (alloc) { + *buffer = kmalloc(size, GFP_ATOMIC); + if (*buffer) { + struct vfs_cap_data *cap = *buffer; + __le32 nsmagic, magic; + magic = VFS_CAP_REVISION_2; + nsmagic = le32_to_cpu(nscap->magic_etc); + if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) + magic |= VFS_CAP_FLAGS_EFFECTIVE; + memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); + cap->magic_etc = cpu_to_le32(magic); + } + } + kfree(tmpbuf); + return size; +} + +static kuid_t rootid_from_xattr(const void *value, size_t size, + struct user_namespace *task_ns) +{ + const struct vfs_ns_cap_data *nscap = value; + uid_t rootid = 0; + + if (size == XATTR_CAPS_SZ_3) + rootid = le32_to_cpu(nscap->rootid); + + return make_kuid(task_ns, rootid); +} + +static bool validheader(size_t size, __le32 magic) +{ + return is_v2header(size, magic) || is_v3header(size, magic); +} + +/* + * User requested a write of security.capability. If needed, update the + * xattr to change from v2 to v3, or to fixup the v3 rootid. + * + * If all is ok, we return the new size, on error return < 0. 
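Seen from userspace, the v2/v3 distinction is carried entirely by the xattr size. A minimal, hedged sketch of the read side (headers assumed from a kernel with this series; error handling and byte-order conversion of the capability words elided):

	#include <sys/xattr.h>
	#include <linux/capability.h>
	#include <endian.h>
	#include <stdio.h>

	static int show_caps(const char *path)
	{
		struct vfs_ns_cap_data ns;
		ssize_t n = getxattr(path, "security.capability",
				     &ns, sizeof(ns));

		if (n == XATTR_CAPS_SZ_2)
			printf("%s: v2 capability xattr\n", path);
		else if (n == XATTR_CAPS_SZ_3)
			printf("%s: v3 (namespaced) xattr, rootid %u\n",
			       path, (unsigned int)le32toh(ns.rootid));
		return n < 0 ? -1 : 0;
	}

The corresponding setxattr() of a 20- or 24-byte blob is what arrives at cap_convert_nscap() below.
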
+ */ +int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) +{ + struct vfs_ns_cap_data *nscap; + uid_t nsrootid; + const struct vfs_cap_data *cap = *ivalue; + __u32 magic, nsmagic; + struct inode *inode = d_backing_inode(dentry); + struct user_namespace *task_ns = current_user_ns(), + *fs_ns = inode->i_sb->s_user_ns; + kuid_t rootid; + size_t newsize; + + if (!*ivalue) + return -EINVAL; + if (!validheader(size, cap->magic_etc)) + return -EINVAL; + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) + return -EPERM; + if (size == XATTR_CAPS_SZ_2) + if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) + /* user is privileged, just write the v2 */ + return size; + + rootid = rootid_from_xattr(*ivalue, size, task_ns); + if (!uid_valid(rootid)) + return -EINVAL; + + nsrootid = from_kuid(fs_ns, rootid); + if (nsrootid == -1) + return -EINVAL; + + newsize = sizeof(struct vfs_ns_cap_data); + nscap = kmalloc(newsize, GFP_ATOMIC); + if (!nscap) + return -ENOMEM; + nscap->rootid = cpu_to_le32(nsrootid); + nsmagic = VFS_CAP_REVISION_3; + magic = le32_to_cpu(cap->magic_etc); + if (magic & VFS_CAP_FLAGS_EFFECTIVE) + nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; + nscap->magic_etc = cpu_to_le32(nsmagic); + memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); + + kvfree(*ivalue); + *ivalue = nscap; + return newsize; +} + /* * Calculate the new process capability sets from the capability sets attached * to a file. @@ -376,7 +582,10 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data __u32 magic_etc; unsigned tocopy, i; int size; - struct vfs_cap_data caps; + struct vfs_ns_cap_data data, *nscaps = &data; + struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; + kuid_t rootkuid; + struct user_namespace *fs_ns = inode->i_sb->s_user_ns; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); @@ -384,18 +593,20 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data return -ENODATA; size = __vfs_getxattr((struct dentry *)dentry, inode, - XATTR_NAME_CAPS, &caps, XATTR_CAPS_SZ); + XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; + if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; - cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps.magic_etc); + cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); + rootkuid = make_kuid(fs_ns, 0); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) @@ -407,15 +618,27 @@ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data return -EINVAL; tocopy = VFS_CAP_U32_2; break; + case VFS_CAP_REVISION_3: + if (size != XATTR_CAPS_SZ_3) + return -EINVAL; + tocopy = VFS_CAP_U32_3; + rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); + break; + default: return -EINVAL; } + /* Limit the caps to the mounter of the filesystem + * or the more limited uid specified in the xattr. 
+ */ + if (!rootid_owns_currentns(rootkuid)) + return -ENODATA; CAP_FOR_EACH_U32(i) { if (i >= tocopy) break; - cpu_caps->permitted.cap[i] = le32_to_cpu(caps.data[i].permitted); - cpu_caps->inheritable.cap[i] = le32_to_cpu(caps.data[i].inheritable); + cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted); + cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable); } cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; @@ -453,8 +676,8 @@ static int get_file_caps(struct linux_binprm *bprm, bool *effective, bool *has_c rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) - printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n", - __func__, rc, bprm->filename); + printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", + bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; @@ -633,15 +856,19 @@ skip: int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (!strcmp(name, XATTR_NAME_CAPS)) { - if (!capable(CAP_SETFCAP)) - return -EPERM; + /* Ignore non-security xattrs */ + if (strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0) + return 0; + + /* + * For XATTR_NAME_CAPS the check will be done in + * cap_convert_nscap(), called by setxattr() + */ + if (strcmp(name, XATTR_NAME_CAPS) == 0) return 0; - } - if (!strncmp(name, XATTR_SECURITY_PREFIX, - sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } @@ -659,15 +886,22 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name, */ int cap_inode_removexattr(struct dentry *dentry, const char *name) { - if (!strcmp(name, XATTR_NAME_CAPS)) { - if (!capable(CAP_SETFCAP)) + /* Ignore non-security xattrs */ + if (strncmp(name, XATTR_SECURITY_PREFIX, + sizeof(XATTR_SECURITY_PREFIX) - 1) != 0) + return 0; + + if (strcmp(name, XATTR_NAME_CAPS) == 0) { + /* security.capability gets namespaced */ + struct inode *inode = d_backing_inode(dentry); + if (!inode) + return -EINVAL; + if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; return 0; } - if (!strncmp(name, XATTR_SECURITY_PREFIX, - sizeof(XATTR_SECURITY_PREFIX) - 1) && - !capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } @@ -1054,6 +1288,7 @@ struct security_hook_list capability_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(bprm_set_creds, cap_bprm_set_creds), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), + LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(mmap_file, cap_mmap_file), LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid), diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 28d4c3a528ab..67703dbe29ea 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -2,7 +2,7 @@ * common LSM auditing functions * * Based on code written for SELinux by : - * Stephen Smalley, <sds@epoch.ncsc.mil> + * Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * Author : Etienne Basset, <etienne.basset@ensta.org> * diff --git a/security/security.c b/security/security.c index afc34f46c6c5..4bf0f571b4ef 100644 --- a/security/security.c +++ b/security/security.c @@ -974,11 +974,6 @@ int security_file_open(struct file *file, const struct cred *cred) return fsnotify_perm(file, MAY_OPEN); } -int security_task_create(unsigned long clone_flags) 
-{ - return call_int_hook(task_create, 0, clone_flags); -} - int security_task_alloc(struct task_struct *task, unsigned long clone_flags) { return call_int_hook(task_alloc, 0, task, clone_flags); diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 4b4293194aee..2380b8d72cec 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -1,7 +1,7 @@ /* * Implementation of the kernel access vector cache (AVC). * - * Authors: Stephen Smalley, <sds@epoch.ncsc.mil> + * Authors: Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * * Update: KaiGai, Kohei <kaigai@ak.jp.nec.com> @@ -346,27 +346,26 @@ static struct avc_xperms_decision_node struct avc_xperms_decision_node *xpd_node; struct extended_perms_decision *xpd; - xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, - GFP_ATOMIC | __GFP_NOMEMALLOC); + xpd_node = kmem_cache_zalloc(avc_xperms_decision_cachep, GFP_NOWAIT); if (!xpd_node) return NULL; xpd = &xpd_node->xpd; if (which & XPERMS_ALLOWED) { xpd->allowed = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_ATOMIC | __GFP_NOMEMALLOC); + GFP_NOWAIT); if (!xpd->allowed) goto error; } if (which & XPERMS_AUDITALLOW) { xpd->auditallow = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_ATOMIC | __GFP_NOMEMALLOC); + GFP_NOWAIT); if (!xpd->auditallow) goto error; } if (which & XPERMS_DONTAUDIT) { xpd->dontaudit = kmem_cache_zalloc(avc_xperms_data_cachep, - GFP_ATOMIC | __GFP_NOMEMALLOC); + GFP_NOWAIT); if (!xpd->dontaudit) goto error; } @@ -394,8 +393,7 @@ static struct avc_xperms_node *avc_xperms_alloc(void) { struct avc_xperms_node *xp_node; - xp_node = kmem_cache_zalloc(avc_xperms_cachep, - GFP_ATOMIC|__GFP_NOMEMALLOC); + xp_node = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT); if (!xp_node) return xp_node; INIT_LIST_HEAD(&xp_node->xpd_head); @@ -548,7 +546,7 @@ static struct avc_node *avc_alloc_node(void) { struct avc_node *node; - node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC); + node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT); if (!node) goto out; diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ad3b0f53ede0..f5d304736852 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3,7 +3,7 @@ * * This file contains the SELinux hook function implementations. 
* - * Authors: Stephen Smalley, <sds@epoch.ncsc.mil> + * Authors: Stephen Smalley, <sds@tycho.nsa.gov> * Chris Vance, <cvance@nai.com> * Wayne Salamon, <wsalamon@nai.com> * James Morris <jmorris@redhat.com> @@ -815,7 +815,9 @@ static int selinux_set_mnt_opts(struct super_block *sb, if (!strcmp(sb->s_type->name, "debugfs") || !strcmp(sb->s_type->name, "tracefs") || !strcmp(sb->s_type->name, "sysfs") || - !strcmp(sb->s_type->name, "pstore")) + !strcmp(sb->s_type->name, "pstore") || + !strcmp(sb->s_type->name, "cgroup") || + !strcmp(sb->s_type->name, "cgroup2")) sbsec->flags |= SE_SBGENFS; if (!sbsec->behavior) { @@ -1303,6 +1305,7 @@ static inline u16 socket_type_to_security_class(int family, int type, int protoc case SOCK_SEQPACKET: return SECCLASS_UNIX_STREAM_SOCKET; case SOCK_DGRAM: + case SOCK_RAW: return SECCLASS_UNIX_DGRAM_SOCKET; } break; @@ -2317,6 +2320,7 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS); int nosuid = !mnt_may_suid(bprm->file->f_path.mnt); int rc; + u32 av; if (!nnp && !nosuid) return 0; /* neither NNP nor nosuid */ @@ -2325,24 +2329,40 @@ static int check_nnp_nosuid(const struct linux_binprm *bprm, return 0; /* No change in credentials */ /* - * The only transitions we permit under NNP or nosuid - * are transitions to bounded SIDs, i.e. SIDs that are - * guaranteed to only be allowed a subset of the permissions - * of the current SID. + * If the policy enables the nnp_nosuid_transition policy capability, + * then we permit transitions under NNP or nosuid if the + * policy allows the corresponding permission between + * the old and new contexts. */ - rc = security_bounded_transition(old_tsec->sid, new_tsec->sid); - if (rc) { - /* - * On failure, preserve the errno values for NNP vs nosuid. - * NNP: Operation not permitted for caller. - * nosuid: Permission denied to file. - */ + if (selinux_policycap_nnp_nosuid_transition) { + av = 0; if (nnp) - return -EPERM; - else - return -EACCES; + av |= PROCESS2__NNP_TRANSITION; + if (nosuid) + av |= PROCESS2__NOSUID_TRANSITION; + rc = avc_has_perm(old_tsec->sid, new_tsec->sid, + SECCLASS_PROCESS2, av, NULL); + if (!rc) + return 0; } - return 0; + + /* + * We also permit NNP or nosuid transitions to bounded SIDs, + * i.e. SIDs that are guaranteed to only be allowed a subset + * of the permissions of the current SID. + */ + rc = security_bounded_transition(old_tsec->sid, new_tsec->sid); + if (!rc) + return 0; + + /* + * On failure, preserve the errno values for NNP vs nosuid. + * NNP: Operation not permitted for caller. + * nosuid: Permission denied to file. + */ + if (nnp) + return -EPERM; + return -EACCES; } static int selinux_bprm_set_creds(struct linux_binprm *bprm) diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h index 0999df03af8b..a5004e9de11a 100644 --- a/security/selinux/include/avc.h +++ b/security/selinux/include/avc.h @@ -1,7 +1,7 @@ /* * Access vector cache interface for object managers. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SELINUX_AVC_H_ #define _SELINUX_AVC_H_ diff --git a/security/selinux/include/avc_ss.h b/security/selinux/include/avc_ss.h index d5c328452df0..37d57dadd476 100644 --- a/security/selinux/include/avc_ss.h +++ b/security/selinux/include/avc_ss.h @@ -1,7 +1,7 @@ /* * Access vector cache interface for the security server. 
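Under no_new_privs, an explicitly requested domain transition used to fail unless the target SID was bounded by the caller's; with the nnp_nosuid_transition policy capability, policy can now grant it explicitly. A hedged userspace sketch (the target context is hypothetical; the execve succeeds only if policy allows the process2 nnp_transition permission between the two contexts, or the bounded-transition check still passes):

	#include <selinux/selinux.h>	/* libselinux, link with -lselinux */
	#include <sys/prctl.h>
	#include <unistd.h>

	int main(void)
	{
		prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
		setexeccon("system_u:system_r:new_t:s0");	/* hypothetical */
		execl("/usr/bin/true", "true", (char *)NULL);
		return 1;	/* exec refused: transition denied */
	}
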
* - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SELINUX_AVC_SS_H_ #define _SELINUX_AVC_SS_H_ diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index b9fe3434b036..35ffb29a69cb 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -48,6 +48,8 @@ struct security_class_mapping secclass_map[] = { "setrlimit", "rlimitinh", "dyntransition", "setcurrent", "execmem", "execstack", "execheap", "setkeycreate", "setsockcreate", "getrlimit", NULL } }, + { "process2", + { "nnp_transition", "nosuid_transition", NULL } }, { "system", { "ipc_info", "syslog_read", "syslog_mod", "syslog_console", "module_request", "module_load", NULL } }, diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 6ebc61e370ff..1649cd18eb0b 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -3,7 +3,7 @@ * * This file contains the SELinux security data structures for kernel objects. * - * Author(s): Stephen Smalley, <sds@epoch.ncsc.mil> + * Author(s): Stephen Smalley, <sds@tycho.nsa.gov> * Chris Vance, <cvance@nai.com> * Wayne Salamon, <wsalamon@nai.com> * James Morris <jmorris@redhat.com> diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index e91f08c16c0b..28dfb2f93e4d 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -1,7 +1,7 @@ /* * Security server interface. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> * */ @@ -73,6 +73,7 @@ enum { POLICYDB_CAPABILITY_EXTSOCKCLASS, POLICYDB_CAPABILITY_ALWAYSNETWORK, POLICYDB_CAPABILITY_CGROUPSECLABEL, + POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION, __POLICYDB_CAPABILITY_MAX }; #define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1) @@ -84,6 +85,7 @@ extern int selinux_policycap_openperm; extern int selinux_policycap_extsockclass; extern int selinux_policycap_alwaysnetwork; extern int selinux_policycap_cgroupseclabel; +extern int selinux_policycap_nnp_nosuid_transition; /* * type_datum properties diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index 3628d3a868b6..2c3c7d010d8a 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -1,7 +1,7 @@ /* * Implementation of the access vector table type. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com> diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h index d946c9dc3c9c..725853cadc42 100644 --- a/security/selinux/ss/avtab.h +++ b/security/selinux/ss/avtab.h @@ -5,7 +5,7 @@ * table is used to represent the type enforcement * tables. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com> diff --git a/security/selinux/ss/constraint.h b/security/selinux/ss/constraint.h index 96fd947c494b..33ae2aec4f36 100644 --- a/security/selinux/ss/constraint.h +++ b/security/selinux/ss/constraint.h @@ -10,7 +10,7 @@ * process from labeling an object with a different user * identity. 
* - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SS_CONSTRAINT_H_ #define _SS_CONSTRAINT_H_ diff --git a/security/selinux/ss/context.h b/security/selinux/ss/context.h index 212e3479a0d9..a2c0f37c42ae 100644 --- a/security/selinux/ss/context.h +++ b/security/selinux/ss/context.h @@ -10,7 +10,7 @@ * security server and can be changed without affecting * clients of the security server. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SS_CONTEXT_H_ #define _SS_CONTEXT_H_ diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c index ad38299164c3..fc28149a4f2e 100644 --- a/security/selinux/ss/ebitmap.c +++ b/security/selinux/ss/ebitmap.c @@ -1,7 +1,7 @@ /* * Implementation of the extensible bitmap type. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* * Updated: Hewlett-Packard <paul@paul-moore.com> diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h index 6d5a9ac4251f..da1325dda550 100644 --- a/security/selinux/ss/ebitmap.h +++ b/security/selinux/ss/ebitmap.h @@ -9,7 +9,7 @@ * an explicitly specified starting bit position within * the total bitmap. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SS_EBITMAP_H_ #define _SS_EBITMAP_H_ diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c index 3858706a29fb..686c3917064c 100644 --- a/security/selinux/ss/hashtab.c +++ b/security/selinux/ss/hashtab.c @@ -1,7 +1,7 @@ /* * Implementation of the hash table type. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #include <linux/kernel.h> #include <linux/slab.h> diff --git a/security/selinux/ss/hashtab.h b/security/selinux/ss/hashtab.h index 953872cd84ab..009fb5e06172 100644 --- a/security/selinux/ss/hashtab.h +++ b/security/selinux/ss/hashtab.h @@ -5,7 +5,7 @@ * functions for hash computation and key comparison are * provided by the creator of the table. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SS_HASHTAB_H_ #define _SS_HASHTAB_H_ diff --git a/security/selinux/ss/mls.c b/security/selinux/ss/mls.c index e1088842232c..d9dc34f4fade 100644 --- a/security/selinux/ss/mls.c +++ b/security/selinux/ss/mls.c @@ -1,7 +1,7 @@ /* * Implementation of the multi-level security (MLS) policy. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> diff --git a/security/selinux/ss/mls.h b/security/selinux/ss/mls.h index e4369e3e6366..0f0a1d65b2ce 100644 --- a/security/selinux/ss/mls.h +++ b/security/selinux/ss/mls.h @@ -1,7 +1,7 @@ /* * Multi-level security (MLS) policy operations. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> diff --git a/security/selinux/ss/mls_types.h b/security/selinux/ss/mls_types.h index e93648774137..47f3702cd596 100644 --- a/security/selinux/ss/mls_types.h +++ b/security/selinux/ss/mls_types.h @@ -1,7 +1,7 @@ /* * Type definitions for the multi-level security (MLS) policy. 
* - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index aa6500abb178..6e8c8056d7ad 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -1,7 +1,7 @@ /* * Implementation of the policy database. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h index 5d23eed35fa7..215f8f30ac5a 100644 --- a/security/selinux/ss/policydb.h +++ b/security/selinux/ss/policydb.h @@ -2,7 +2,7 @@ * A policy database (policydb) specifies the * configuration data for the security policy. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 2f02fa67ec2e..e4a1c0dc561a 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -1,7 +1,7 @@ /* * Implementation of the security services. * - * Authors : Stephen Smalley, <sds@epoch.ncsc.mil> + * Authors : Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> @@ -76,7 +76,8 @@ char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX] = { "open_perms", "extended_socket_class", "always_check_network", - "cgroup_seclabel" + "cgroup_seclabel", + "nnp_nosuid_transition" }; int selinux_policycap_netpeer; @@ -84,6 +85,7 @@ int selinux_policycap_openperm; int selinux_policycap_extsockclass; int selinux_policycap_alwaysnetwork; int selinux_policycap_cgroupseclabel; +int selinux_policycap_nnp_nosuid_transition; static DEFINE_RWLOCK(policy_rwlock); @@ -2009,6 +2011,9 @@ static void security_load_policycaps(void) selinux_policycap_cgroupseclabel = ebitmap_get_bit(&policydb.policycaps, POLICYDB_CAPABILITY_CGROUPSECLABEL); + selinux_policycap_nnp_nosuid_transition = + ebitmap_get_bit(&policydb.policycaps, + POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION); for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++) pr_info("SELinux: policy capability %s=%d\n", diff --git a/security/selinux/ss/services.h b/security/selinux/ss/services.h index 6abcd8729ec3..3d9fa9556b4f 100644 --- a/security/selinux/ss/services.h +++ b/security/selinux/ss/services.h @@ -1,7 +1,7 @@ /* * Implementation of the security services. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SS_SERVICES_H_ #define _SS_SERVICES_H_ diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c index c5f436b15d19..6ae08efc5ae7 100644 --- a/security/selinux/ss/sidtab.c +++ b/security/selinux/ss/sidtab.c @@ -1,7 +1,7 @@ /* * Implementation of the SID table type. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #include <linux/kernel.h> #include <linux/slab.h> diff --git a/security/selinux/ss/sidtab.h b/security/selinux/ss/sidtab.h index 84dc154d9389..de5d0ea583d2 100644 --- a/security/selinux/ss/sidtab.h +++ b/security/selinux/ss/sidtab.h @@ -2,7 +2,7 @@ * A security identifier table (sidtab) is a hash table * of security context structures indexed by SID value. 
* - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SS_SIDTAB_H_ #define _SS_SIDTAB_H_ diff --git a/security/selinux/ss/symtab.c b/security/selinux/ss/symtab.c index 160326ee99e5..d1a6745849a7 100644 --- a/security/selinux/ss/symtab.c +++ b/security/selinux/ss/symtab.c @@ -1,7 +1,7 @@ /* * Implementation of the symbol table type. * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #include <linux/kernel.h> #include <linux/string.h> diff --git a/security/selinux/ss/symtab.h b/security/selinux/ss/symtab.h index ca422b42fbc0..0bc12d587d3a 100644 --- a/security/selinux/ss/symtab.h +++ b/security/selinux/ss/symtab.h @@ -4,7 +4,7 @@ * is arbitrary. The symbol table type is implemented * using the hash table type (hashtab). * - * Author : Stephen Smalley, <sds@epoch.ncsc.mil> + * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SS_SYMTAB_H_ #define _SS_SYMTAB_H_ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 2a37ae925d85..140ae638cfd6 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -139,8 +139,9 @@ enum perf_event_sample_format { PERF_SAMPLE_IDENTIFIER = 1U << 16, PERF_SAMPLE_TRANSACTION = 1U << 17, PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, - PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */ }; /* @@ -814,6 +815,7 @@ enum perf_event_type { * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/tools/perf/Documentation/intel-pt.txt b/tools/perf/Documentation/intel-pt.txt index ab1b0825130a..76971d2e4164 100644 --- a/tools/perf/Documentation/intel-pt.txt +++ b/tools/perf/Documentation/intel-pt.txt @@ -873,7 +873,7 @@ amended to take the number of elements as a parameter. $ cat ~/.perfconfig [intel-pt] - mispred-all + mispred-all = on $ perf record -e intel_pt//u ./sort 3000 Bubble sorting array of 3000 elements diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt index 73496320fca3..4be08a1e3f8d 100644 --- a/tools/perf/Documentation/perf-mem.txt +++ b/tools/perf/Documentation/perf-mem.txt @@ -59,6 +59,10 @@ OPTIONS --ldload:: Specify desired latency for loads event. +-p:: +--phys-data:: + Record/Report sample physical addresses + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1] diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 9bdea047c5db..e397453e5a46 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -249,7 +249,10 @@ OPTIONS -d:: --data:: - Record the sample addresses. + Record the sample virtual addresses. + +--phys-data:: + Record the sample physical addresses. 
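All of the tooling changes that follow bottom out in the single new sample_type bit added to perf_event.h above. A minimal sketch of requesting it directly via perf_event_open(2), assuming userspace headers from a kernel that carries this series (error handling trimmed):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <string.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		struct perf_event_attr attr;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period = 100000;
		attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
				   PERF_SAMPLE_PHYS_ADDR;

		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}
		/* PERF_RECORD_SAMPLE records now end with a u64 phys_addr */
		close(fd);
		return 0;
	}

perf record --phys-data, perf mem record -p, the phys_daddr sort key and the phys_addr script field are all wrappers around this one bit.
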
-T:: --timestamp:: diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 9fa84617181e..383a98d992ed 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -137,6 +137,7 @@ OPTIONS - mem: type of memory access for the data at the time of the sample - snoop: type of snoop (if any) for the data at the time of the sample - dcacheline: the cacheline the data address is on at the time of the sample + - phys_daddr: physical address of the data being accessed at the time of the sample And the default sort keys are changed to local_weight, mem, sym, dso, symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'. diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 5ee8796be96e..18dfcfa38454 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -117,7 +117,7 @@ OPTIONS Comma separated list of fields to print. Options are: comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, srcline, period, iregs, brstack, brstacksym, flags, bpf-output, brstackinsn, brstackoff, - callindent, insn, insnlen, synth. + callindent, insn, insnlen, synth, phys_addr. Field list can be prepended with the type, trace, sw or hw, to indicate to which event type the field list applies. e.g., -F sw:comm,tid,time,ip,sym and -F trace:time,cpu,trace diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index c1e3288a2dfb..d53bea6bd571 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -37,7 +37,7 @@ OPTIONS --expr:: --event:: List of syscalls and other perf events (tracepoints, HW cache events, - etc) to show. + etc) to show. Globbing is supported, e.g.: "epoll_*", "*msg*", etc. See 'perf list' for a complete list of events. Prefixing with ! shows all syscalls but the ones specified. You may need to escape it. diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index e001c0290793..0f15634ef82c 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -23,6 +23,7 @@ struct perf_mem { bool hide_unresolved; bool dump_raw; bool force; + bool phys_addr; int operation; const char *cpu_list; DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -101,6 +102,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem) rec_argv[i++] = "-d"; + if (mem->phys_addr) + rec_argv[i++] = "--phys-data"; + for (j = 0; j < PERF_MEM_EVENTS__MAX; j++) { if (!perf_mem_events[j].record) continue; @@ -161,30 +165,60 @@ dump_raw_samples(struct perf_tool *tool, if (al.map != NULL) al.map->dso->hit = 1; - if (symbol_conf.field_sep) { - fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64 - "%s0x%"PRIx64"%s%s:%s\n"; + if (mem->phys_addr) { + if (symbol_conf.field_sep) { + fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s0x%016"PRIx64 + "%s%"PRIu64"%s0x%"PRIx64"%s%s:%s\n"; + } else { + fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x%016"PRIx64 + "%s0x%016"PRIx64"%s%5"PRIu64"%s0x%06"PRIx64 + "%s%s:%s\n"; + symbol_conf.field_sep = " "; + } + + printf(fmt, + sample->pid, + symbol_conf.field_sep, + sample->tid, + symbol_conf.field_sep, + sample->ip, + symbol_conf.field_sep, + sample->addr, + symbol_conf.field_sep, + sample->phys_addr, + symbol_conf.field_sep, + sample->weight, + symbol_conf.field_sep, + sample->data_src, + symbol_conf.field_sep, + al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???", + al.sym ?
al.sym->name : "???"); } else { - fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64 - "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n"; - symbol_conf.field_sep = " "; - } + if (symbol_conf.field_sep) { + fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64 + "%s0x%"PRIx64"%s%s:%s\n"; + } else { + fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x%016"PRIx64 + "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n"; + symbol_conf.field_sep = " "; + } - printf(fmt, - sample->pid, - symbol_conf.field_sep, - sample->tid, - symbol_conf.field_sep, - sample->ip, - symbol_conf.field_sep, - sample->addr, - symbol_conf.field_sep, - sample->weight, - symbol_conf.field_sep, - sample->data_src, - symbol_conf.field_sep, - al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???", - al.sym ? al.sym->name : "???"); + printf(fmt, + sample->pid, + symbol_conf.field_sep, + sample->tid, + symbol_conf.field_sep, + sample->ip, + symbol_conf.field_sep, + sample->addr, + symbol_conf.field_sep, + sample->weight, + symbol_conf.field_sep, + sample->data_src, + symbol_conf.field_sep, + al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???", + al.sym ? al.sym->name : "???"); + } out_put: addr_location__put(&al); return 0; @@ -224,7 +258,10 @@ static int report_raw_events(struct perf_mem *mem) if (ret < 0) goto out_delete; - printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); + if (mem->phys_addr) + printf("# PID, TID, IP, ADDR, PHYS ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); + else + printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n"); ret = perf_session__process_events(session); @@ -254,9 +291,16 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem) * there is no weight (cost) associated with stores, so don't print * the column */ - if (!(mem->operation & MEM_OPERATION_LOAD)) - rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr," - "dso_daddr,tlb,locked"; + if (!(mem->operation & MEM_OPERATION_LOAD)) { + if (mem->phys_addr) + rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr," + "dso_daddr,tlb,locked,phys_daddr"; + else + rep_argv[i++] = "--sort=mem,sym,dso,symbol_daddr," + "dso_daddr,tlb,locked"; + } else if (mem->phys_addr) + rep_argv[i++] = "--sort=local_weight,mem,sym,dso,symbol_daddr," + "dso_daddr,snoop,tlb,locked,phys_daddr"; for (j = 1; j < argc; j++, i++) rep_argv[i] = argv[j]; @@ -373,6 +417,7 @@ int cmd_mem(int argc, const char **argv) "separator for columns, no spaces will be added" " between columns '.'
is reserved."), OPT_BOOLEAN('f', "force", &mem.force, "don't complain, do it"), + OPT_BOOLEAN('p', "phys-data", &mem.phys_addr, "Record/Report sample physical addresses"), OPT_END() }; const char *const mem_subcommands[] = { "record", "report", NULL }; diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 36d7117a7562..56f8142ff97f 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1604,6 +1604,8 @@ static struct option __record_options[] = { OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, "per thread counts"), OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"), + OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr, + "Record the sample physical addresses"), OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"), OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time, &record.opts.sample_time_set, diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 378f76cdf923..3d4c3b5e1868 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -87,6 +87,7 @@ enum perf_output_field { PERF_OUTPUT_BRSTACKINSN = 1U << 23, PERF_OUTPUT_BRSTACKOFF = 1U << 24, PERF_OUTPUT_SYNTH = 1U << 25, + PERF_OUTPUT_PHYS_ADDR = 1U << 26, }; struct output_option { @@ -119,6 +120,7 @@ struct output_option { {.str = "brstackinsn", .field = PERF_OUTPUT_BRSTACKINSN}, {.str = "brstackoff", .field = PERF_OUTPUT_BRSTACKOFF}, {.str = "synth", .field = PERF_OUTPUT_SYNTH}, + {.str = "phys_addr", .field = PERF_OUTPUT_PHYS_ADDR}, }; enum { @@ -175,7 +177,8 @@ static struct { PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP | PERF_OUTPUT_SYM | PERF_OUTPUT_DSO | PERF_OUTPUT_PERIOD | PERF_OUTPUT_ADDR | - PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT, + PERF_OUTPUT_DATA_SRC | PERF_OUTPUT_WEIGHT | + PERF_OUTPUT_PHYS_ADDR, .invalid_fields = PERF_OUTPUT_TRACE | PERF_OUTPUT_BPF_OUTPUT, }, @@ -382,6 +385,11 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel, PERF_OUTPUT_IREGS)) return -EINVAL; + if (PRINT_FIELD(PHYS_ADDR) && + perf_evsel__check_stype(evsel, PERF_SAMPLE_PHYS_ADDR, "PHYS_ADDR", + PERF_OUTPUT_PHYS_ADDR)) + return -EINVAL; + return 0; } @@ -1446,6 +1454,9 @@ static void process_event(struct perf_script *script, if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT)) print_sample_bpf_output(sample); print_insn(sample, attr, thread, machine); + + if (PRINT_FIELD(PHYS_ADDR)) + printf("%16" PRIx64, sample->phys_addr); printf("\n"); } @@ -2729,7 +2740,7 @@ int cmd_script(int argc, const char **argv) "Valid types: hw,sw,trace,raw,synth. 
" "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso," "addr,symoff,period,iregs,brstack,brstacksym,flags," - "bpf-output,callindent,insn,insnlen,brstackinsn,synth", + "bpf-output,callindent,insn,insnlen,brstackinsn,synth,phys_addr", parse_output_fields), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 866da7aa54bf..85e992d9215b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1257,7 +1257,7 @@ static bool collect_data(struct perf_evsel *counter, if (counter->merged_stat) return false; cb(counter, data, true); - if (!no_merge) + if (!no_merge && counter->auto_merge_stats) collect_all_aliases(counter, cb, data); return true; } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index d59cdadf3a79..771ddab94bb0 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -1261,6 +1261,7 @@ static int trace__read_syscall_info(struct trace *trace, int id) static int trace__validate_ev_qualifier(struct trace *trace) { int err = 0, i; + size_t nr_allocated; struct str_node *pos; trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier); @@ -1274,13 +1275,18 @@ static int trace__validate_ev_qualifier(struct trace *trace) goto out; } + nr_allocated = trace->ev_qualifier_ids.nr; i = 0; strlist__for_each_entry(pos, trace->ev_qualifier) { const char *sc = pos->s; - int id = syscalltbl__id(trace->sctbl, sc); + int id = syscalltbl__id(trace->sctbl, sc), match_next = -1; if (id < 0) { + id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next); + if (id >= 0) + goto matches; + if (err == 0) { fputs("Error:\tInvalid syscall ", trace->output); err = -EINVAL; @@ -1290,13 +1296,37 @@ static int trace__validate_ev_qualifier(struct trace *trace) fputs(sc, trace->output); } - +matches: trace->ev_qualifier_ids.entries[i++] = id; + if (match_next == -1) + continue; + + while (1) { + id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next); + if (id < 0) + break; + if (nr_allocated == trace->ev_qualifier_ids.nr) { + void *entries; + + nr_allocated += 8; + entries = realloc(trace->ev_qualifier_ids.entries, + nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0])); + if (entries == NULL) { + err = -ENOMEM; + fputs("\nError:\t Not enough memory for parsing\n", trace->output); + goto out_free; + } + trace->ev_qualifier_ids.entries = entries; + } + trace->ev_qualifier_ids.nr++; + trace->ev_qualifier_ids.entries[i++] = id; + } } if (err < 0) { fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'" "\nHint:\tand: 'man syscalls'\n", trace->output); +out_free: zfree(&trace->ev_qualifier_ids.entries); trace->ev_qualifier_ids.nr = 0; } @@ -2814,7 +2844,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str, struct trace *trace = (struct trace *)opt->value; const char *s = str; char *sep = NULL, *lists[2] = { NULL, NULL, }; - int len = strlen(str) + 1, err = -1, list; + int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; @@ -2831,7 +2861,8 @@ static int trace__parse_events_option(const struct option *opt, const char *str, *sep = '\0'; list = 0; - if (syscalltbl__id(trace->sctbl, s) >= 0) { + if (syscalltbl__id(trace->sctbl, s) >= 0 || + syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { list = 1; } else { path__join(group_name, sizeof(group_name), strace_groups_dir, s); diff --git a/tools/perf/perf.h 
b/tools/perf/perf.h index 2c010dd6a79d..dc442ba21bf6 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -43,6 +43,7 @@ struct record_opts { bool no_samples; bool raw_samples; bool sample_address; + bool sample_phys_addr; bool sample_weight; bool sample_time; bool sample_time_set; diff --git a/tools/perf/pmu-events/arch/powerpc/power9/frontend.json b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json index 7e62c46d7a20..c63a919eda98 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/frontend.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/frontend.json @@ -80,11 +80,6 @@ "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load." }, {, - "EventCode": "0x400F0", - "EventName": "PM_LD_MISS_L1", - "BriefDescription": "Load Missed L1, counted at execution time (can be greater than loads finished). LMQ merges are not included in this count. i.e. if a load instruction misses on an address that is already allocated on the LMQ, this event will not increment for that load). Note that this count is per slice, so if a load spans multiple slices this event will increment multiple times for a single load." - }, - {, "EventCode": "0x2E01A", "EventName": "PM_CMPLU_STALL_LSU_FLUSH_NEXT", "BriefDescription": "Completion stall of one cycle because the LSU requested to flush the next iop in the sequence. It takes 1 cycle for the ISU to process this request before the LSU instruction is allowed to complete" @@ -374,4 +369,4 @@ "EventName": "PM_IPTEG_FROM_L31_ECO_MOD", "BriefDescription": "A Page Table Entry was loaded into the TLB with Modified (M) data from another core's ECO L3 on the same chip due to a instruction side request" } -]
\ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/powerpc/power9/other.json b/tools/perf/pmu-events/arch/powerpc/power9/other.json index 00f3d2a21f31..54cc3be00fc2 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/other.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/other.json @@ -605,11 +605,6 @@ "BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)" }, {, - "EventCode": "0x3689E", - "EventName": "PM_L2_RTY_LD", - "BriefDescription": "RC retries on PB for any load from core (excludes DCBFs)" - }, - {, "EventCode": "0xE08C", "EventName": "PM_LSU0_ERAT_HIT", "BriefDescription": "Primary ERAT hit. There is no secondary ERAT" @@ -715,11 +710,6 @@ "BriefDescription": "Lifetime, sample of RD machine 0 valid" }, {, - "EventCode": "0x468B4", - "EventName": "PM_L3_RD0_BUSY", - "BriefDescription": "Lifetime, sample of RD machine 0 valid" - }, - {, "EventCode": "0x46080", "EventName": "PM_L2_DISP_ALL_L2MISS", "BriefDescription": "All successful Ld/St dispatches for this thread that were an L2 miss (excludes i_l2mru_tch_reqs)" @@ -850,21 +840,11 @@ "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)" }, {, - "EventCode": "0x2608C", - "EventName": "PM_RC0_BUSY", - "BriefDescription": "RC mach 0 Busy. Used by PMU to sample ave RC lifetime (mach0 used as sample point)" - }, - {, "EventCode": "0x36082", "EventName": "PM_L2_LD_DISP", "BriefDescription": "All successful I-or-D side load dispatches for this thread (excludes i_l2mru_tch_reqs)." }, {, - "EventCode": "0x1609E", - "EventName": "PM_L2_LD_DISP", - "BriefDescription": "All successful D side load dispatches for this thread (L2 miss + L2 hits)" - }, - {, "EventCode": "0xF8B0", "EventName": "PM_L3_SW_PREF", "BriefDescription": "L3 load prefetch, sourced from a software prefetch stream, was sent to the nest" @@ -1040,11 +1020,6 @@ "BriefDescription": "L3 castouts in Mepf state for this thread" }, {, - "EventCode": "0x168A0", - "EventName": "PM_L3_CO_MEPF", - "BriefDescription": "L3 CO of line in Mep state (includes casthrough to memory). The Mepf state indicates that a line was brought in to satisfy an L3 prefetch request" - }, - {, "EventCode": "0x460A2", "EventName": "PM_L3_LAT_CI_HIT", "BriefDescription": "L3 Lateral Castins Hit" @@ -1150,11 +1125,6 @@ "BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)" }, {, - "EventCode": "0x4689E", - "EventName": "PM_L2_RTY_ST", - "BriefDescription": "RC retries on PB for any store from core (excludes DCBFs)" - }, - {, "EventCode": "0x24040", "EventName": "PM_INST_FROM_L2_MEPF", "BriefDescription": "The processor's Instruction cache was reloaded from local core's L2 hit without dispatch conflicts on Mepf state. due to an instruction fetch (not prefetch)" @@ -1255,11 +1225,6 @@ "BriefDescription": "CO mach 0 Busy. Used by PMU to sample ave CO lifetime (mach0 used as sample point)" }, {, - "EventCode": "0x4608C", - "EventName": "PM_CO0_BUSY", - "BriefDescription": "CO mach 0 Busy. 
Used by PMU to sample ave CO lifetime (mach0 used as sample point)" - }, - {, "EventCode": "0x2C122", "EventName": "PM_MRK_DATA_FROM_L3_DISP_CONFLICT_CYC", "BriefDescription": "Duration in cycles to reload from local core's L3 with dispatch conflict due to a marked load" @@ -1395,11 +1360,6 @@ "BriefDescription": "A Page Table Entry was loaded into the TLB from the local chip's Memory due to a instruction side request" }, {, - "EventCode": "0x40006", - "EventName": "PM_ISLB_MISS", - "BriefDescription": "Number of ISLB misses for this thread" - }, - {, "EventCode": "0xD8A8", "EventName": "PM_ISLB_MISS", "BriefDescription": "Instruction SLB miss - Total of all segment sizes" @@ -1515,11 +1475,6 @@ "BriefDescription": "All successful I-side dispatches for this thread (excludes i_l2mru_tch reqs)." }, {, - "EventCode": "0x3609E", - "EventName": "PM_L2_INST", - "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)" - }, - {, "EventCode": "0x3504C", "EventName": "PM_IPTEG_FROM_DL4", "BriefDescription": "A Page Table Entry was loaded into the TLB from another chip's L4 on a different Node or Group (Distant) due to a instruction side request" @@ -1690,11 +1645,6 @@ "BriefDescription": "All successful I-or-D side load dispatches for this thread that were L2 hits (excludes i_l2mru_tch_reqs)" }, {, - "EventCode": "0x2609E", - "EventName": "PM_L2_LD_HIT", - "BriefDescription": "All successful D side load dispatches for this thread that were L2 hits for this thread" - }, - {, "EventCode": "0x168AC", "EventName": "PM_L3_CI_USAGE", "BriefDescription": "Rotating sample of 16 CI or CO actives" @@ -1795,21 +1745,11 @@ "BriefDescription": "Rotating sample of 8 WI valid" }, {, - "EventCode": "0x260B6", - "EventName": "PM_L3_WI0_BUSY", - "BriefDescription": "Rotating sample of 8 WI valid (duplicate)" - }, - {, "EventCode": "0x368AC", "EventName": "PM_L3_CO0_BUSY", "BriefDescription": "Lifetime, sample of CO machine 0 valid" }, {, - "EventCode": "0x468AC", - "EventName": "PM_L3_CO0_BUSY", - "BriefDescription": "Lifetime, sample of CO machine 0 valid" - }, - {, "EventCode": "0x2E040", "EventName": "PM_DPTEG_FROM_L2_MEPF", "BriefDescription": "A Page Table Entry was loaded into the TLB from local core's L2 hit without dispatch conflicts on Mepf state. due to a data side request. When using Radix Page Translation, this count excludes PDE reloads. Only PTE reloads are included" @@ -1840,11 +1780,6 @@ "BriefDescription": "L3 PF received retry port 0, every retry counted" }, {, - "EventCode": "0x260AE", - "EventName": "PM_L3_P0_PF_RTY", - "BriefDescription": "L3 PF received retry port 0, every retry counted" - }, - {, "EventCode": "0x268B2", "EventName": "PM_L3_LOC_GUESS_WRONG", "BriefDescription": "Initial scope=node (LNS) but data from out side local node (near or far or rem). Prediction too Low" @@ -1895,11 +1830,6 @@ "BriefDescription": "Lifetime, sample of snooper machine 0 valid" }, {, - "EventCode": "0x460AC", - "EventName": "PM_L3_SN0_BUSY", - "BriefDescription": "Lifetime, sample of snooper machine 0 valid" - }, - {, "EventCode": "0x3005C", "EventName": "PM_BFU_BUSY", "BriefDescription": "Cycles in which all 4 Binary Floating Point units are busy. 
The BFU is running at capacity" @@ -1935,11 +1865,6 @@ "BriefDescription": "Lifetime, sample of PF machine 0 valid" }, {, - "EventCode": "0x460B4", - "EventName": "PM_L3_PF0_BUSY", - "BriefDescription": "Lifetime, sample of PF machine 0 valid" - }, - {, "EventCode": "0xC0B0", "EventName": "PM_LSU_FLUSH_UE", "BriefDescription": "Correctable ECC error on reload data, reported at critical data forward time" @@ -2085,11 +2010,6 @@ "BriefDescription": "L3 CO received retry port 1 (memory only), every retry counted" }, {, - "EventCode": "0x468AE", - "EventName": "PM_L3_P1_CO_RTY", - "BriefDescription": "L3 CO received retry port 3 (memory only), every retry counted" - }, - {, "EventCode": "0xC0AC", "EventName": "PM_LSU_FLUSH_EMSH", "BriefDescription": "An ERAT miss was detected after a set-p hit. Erat tracker indicates fail due to tlbmiss and the instruction gets flushed because the instruction was working on the wrong address" @@ -2195,11 +2115,6 @@ "BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)" }, {, - "EventCode": "0x46886", - "EventName": "PM_L2_SN_M_WR_DONE", - "BriefDescription": "SNP dispatched for a write and was M (true M); for DMA cacheinj this will pulse if rty/push is required (won't pulse if cacheinj is accepted)" - }, - {, "EventCode": "0x489C", "EventName": "PM_BR_CORECT_PRED_TAKEN_CMPL", "BriefDescription": "Conditional Branch Completed in which the HW correctly predicted the direction as taken. Counted at completion time" @@ -2290,21 +2205,11 @@ "BriefDescription": "SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point)" }, {, - "EventCode": "0x26090", - "EventName": "PM_SN0_BUSY", - "BriefDescription": "SN mach 0 Busy. Used by PMU to sample ave SN lifetime (mach0 used as sample point)" - }, - {, "EventCode": "0x360AE", "EventName": "PM_L3_P0_CO_RTY", "BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted" }, {, - "EventCode": "0x460AE", - "EventName": "PM_L3_P0_CO_RTY", - "BriefDescription": "L3 CO received retry port 0 (memory only), every retry counted" - }, - {, "EventCode": "0x168A8", "EventName": "PM_L3_WI_USAGE", "BriefDescription": "Lifetime, sample of Write Inject machine 0 valid" @@ -2340,26 +2245,11 @@ "BriefDescription": "L3 PF received retry port 1, every retry counted" }, {, - "EventCode": "0x268AE", - "EventName": "PM_L3_P1_PF_RTY", - "BriefDescription": "L3 PF received retry port 3, every retry counted" - }, - {, "EventCode": "0x46082", "EventName": "PM_L2_ST_DISP", "BriefDescription": "All successful D-side store dispatches for this thread " }, {, - "EventCode": "0x1689E", - "EventName": "PM_L2_ST_DISP", - "BriefDescription": "All successful D-side store dispatches for this thread (L2 miss + L2 hits)" - }, - {, - "EventCode": "0x36880", - "EventName": "PM_L2_INST_MISS", - "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)" - }, - {, "EventCode": "0x4609E", "EventName": "PM_L2_INST_MISS", "BriefDescription": "All successful I-side dispatches that were an L2 miss for this thread (excludes i_l2mru_tch reqs)" @@ -2430,11 +2320,6 @@ "BriefDescription": "# PPC Dispatched" }, {, - "EventCode": "0x300F2", - "EventName": "PM_INST_DISP", - "BriefDescription": "# PPC Dispatched" - }, - {, "EventCode": "0x4E05E", "EventName": "PM_TM_OUTER_TBEGIN_DISP", "BriefDescription": "Number of outer tbegin instructions dispatched. 
The dispatch unit determines whether the tbegin instruction is outer or nested. This is a speculative count, which includes flushed instructions" @@ -2460,11 +2345,6 @@ "BriefDescription": "All successful D-side store dispatches for this thread that were L2 hits" }, {, - "EventCode": "0x2689E", - "EventName": "PM_L2_ST_HIT", - "BriefDescription": "All successful D-side store dispatches that were L2 hits for this thread" - }, - {, "EventCode": "0x360A8", "EventName": "PM_L3_CO", "BriefDescription": "L3 castout occurring (does not include casthrough or log writes (cinj/dmaw))" diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json index 47a82568a8df..bc2db636dabf 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/pipeline.json @@ -420,11 +420,6 @@ "BriefDescription": "Final Pump Scope (Group) ended up larger than Initial Pump Scope (Chip) for an instruction fetch" }, {, - "EventCode": "0x10016", - "EventName": "PM_DSLB_MISS", - "BriefDescription": "Data SLB Miss - Total of all segment sizes" - }, - {, "EventCode": "0xD0A8", "EventName": "PM_DSLB_MISS", "BriefDescription": "Data SLB Miss - Total of all segment sizes" @@ -554,4 +549,4 @@ "EventName": "PM_MRK_DATA_FROM_L21_SHR_CYC", "BriefDescription": "Duration in cycles to reload with Shared (S) data from another core's L2 on the same chip due to a marked load" } -]
\ No newline at end of file +] diff --git a/tools/perf/pmu-events/arch/powerpc/power9/pmc.json b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json index a2c95a99e168..3ef8a10aac86 100644 --- a/tools/perf/pmu-events/arch/powerpc/power9/pmc.json +++ b/tools/perf/pmu-events/arch/powerpc/power9/pmc.json @@ -5,11 +5,6 @@ "BriefDescription": "Branches that are not strongly biased" }, {, - "EventCode": "0x40036", - "EventName": "PM_BR_2PATH", - "BriefDescription": "Branches that are not strongly biased" - }, - {, "EventCode": "0x40056", "EventName": "PM_MEM_LOC_THRESH_LSU_HIGH", "BriefDescription": "Local memory above threshold for LSU medium" @@ -124,4 +119,4 @@ "EventName": "PM_1FLOP_CMPL", "BriefDescription": "one flop (fadd, fmul, fsub, fcmp, fsel, fabs, fnabs, fres, fsqrte, fneg) operation completed" } -]
\ No newline at end of file +] diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 761c5a448c56..466a462b26d1 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -237,6 +237,11 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, thread__find_addr_map(thread, cpumode, MAP__FUNCTION, addr, &al); if (!al.map || !al.map->dso) { + if (cpumode == PERF_RECORD_MISC_HYPERVISOR) { + pr_debug("Hypervisor address can not be resolved - skipping\n"); + return 0; + } + pr_debug("thread__find_addr_map failed\n"); return -1; } diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 6d028f42b3cf..c3858487159d 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -141,6 +141,9 @@ static bool samples_same(const struct perf_sample *s1, } } + if (type & PERF_SAMPLE_PHYS_ADDR) + COMP(phys_addr); + return true; } @@ -206,6 +209,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) .mask = sample_regs, .regs = regs, }, + .phys_addr = 113, }; struct sample_read_value values[] = {{1, 5}, {9, 3}, {2, 7}, {6, 4},}; struct perf_sample sample_out; @@ -305,7 +309,7 @@ int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_u * were added. Please actually update the test rather than just change * the condition below. */ - if (PERF_SAMPLE_MAX > PERF_SAMPLE_REGS_INTR << 1) { + if (PERF_SAMPLE_MAX > PERF_SAMPLE_PHYS_ADDR << 1) { pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n"); return -1; } diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index ba0aee576a2b..786fecaf578e 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -829,7 +829,8 @@ static int annotate_browser__run(struct annotate_browser *browser, "q/ESC/CTRL+C Exit\n\n" "ENTER Go to target\n" "ESC Exit\n" - "H Cycle thru hottest instructions\n" + "H Go to hottest instruction\n" + "TAB/shift+TAB Cycle thru hottest instructions\n" "j Toggle showing jump to target arrows\n" "J Toggle showing number of jump sources on targets\n" "n Search next string\n" diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index f4bc2462bc2c..13dfb0a0bdeb 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -931,12 +931,8 @@ static int hist_browser__show_callchain_list(struct hist_browser *browser, browser->show_dso); if (symbol_conf.show_branchflag_count) { - if (need_percent) - callchain_list_counts__printf_value(node, chain, NULL, - buf, sizeof(buf)); - else - callchain_list_counts__printf_value(NULL, chain, NULL, - buf, sizeof(buf)); + callchain_list_counts__printf_value(chain, NULL, + buf, sizeof(buf)); if (asprintf(&alloc_str2, "%s%s", str, buf) < 0) str = "Not enough memory!"; diff --git a/tools/perf/ui/stdio/hist.c b/tools/perf/ui/stdio/hist.c index 5c95b8301c67..8bdb7a500181 100644 --- a/tools/perf/ui/stdio/hist.c +++ b/tools/perf/ui/stdio/hist.c @@ -124,12 +124,8 @@ static size_t ipchain__fprintf_graph(FILE *fp, struct callchain_node *node, str = callchain_list__sym_name(chain, bf, sizeof(bf), false); if (symbol_conf.show_branchflag_count) { - if (!period) - callchain_list_counts__printf_value(node, chain, NULL, - buf, sizeof(buf)); - else - callchain_list_counts__printf_value(NULL, chain, NULL, - buf, sizeof(buf)); + callchain_list_counts__printf_value(chain, NULL, + buf, sizeof(buf)); if (asprintf(&alloc_str, 
"%s%s", str, buf) < 0) str = "Not enough memory!"; @@ -313,7 +309,7 @@ static size_t callchain__fprintf_graph(FILE *fp, struct rb_root *root, if (symbol_conf.show_branchflag_count) ret += callchain_list_counts__printf_value( - NULL, chain, fp, NULL, 0); + chain, fp, NULL, 0); ret += fprintf(fp, "\n"); if (++entries_printed == callchain_param.print_limit) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index f320b0777e0d..510b513e0f01 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -588,7 +588,7 @@ fill_node(struct callchain_node *node, struct callchain_cursor *cursor) call->cycles_count = cursor_node->branch_flags.cycles; call->iter_count = cursor_node->nr_loop_iter; - call->samples_count = cursor_node->samples; + call->iter_cycles = cursor_node->iter_cycles; } } @@ -722,7 +722,7 @@ static enum match_result match_chain(struct callchain_cursor_node *node, cnode->cycles_count += node->branch_flags.cycles; cnode->iter_count += node->nr_loop_iter; - cnode->samples_count += node->samples; + cnode->iter_cycles += node->iter_cycles; } } @@ -998,7 +998,7 @@ int callchain_merge(struct callchain_cursor *cursor, int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, struct map *map, struct symbol *sym, bool branch, struct branch_flags *flags, - int nr_loop_iter, int samples, u64 branch_from) + int nr_loop_iter, u64 iter_cycles, u64 branch_from) { struct callchain_cursor_node *node = *cursor->last; @@ -1016,7 +1016,7 @@ int callchain_cursor_append(struct callchain_cursor *cursor, node->sym = sym; node->branch = branch; node->nr_loop_iter = nr_loop_iter; - node->samples = samples; + node->iter_cycles = iter_cycles; if (flags) memcpy(&node->branch_flags, flags, @@ -1306,7 +1306,7 @@ static int branch_to_str(char *bf, int bfsize, static int branch_from_str(char *bf, int bfsize, u64 branch_count, u64 cycles_count, u64 iter_count, - u64 samples_count) + u64 iter_cycles) { int printed = 0, i = 0; u64 cycles; @@ -1318,9 +1318,13 @@ static int branch_from_str(char *bf, int bfsize, bf + printed, bfsize - printed); } - if (iter_count && samples_count) { - printed += count_pri64_printf(i++, "iterations", - iter_count / samples_count, + if (iter_count) { + printed += count_pri64_printf(i++, "iter", + iter_count, + bf + printed, bfsize - printed); + + printed += count_pri64_printf(i++, "avg_cycles", + iter_cycles / iter_count, bf + printed, bfsize - printed); } @@ -1333,7 +1337,7 @@ static int branch_from_str(char *bf, int bfsize, static int counts_str_build(char *bf, int bfsize, u64 branch_count, u64 predicted_count, u64 abort_count, u64 cycles_count, - u64 iter_count, u64 samples_count, + u64 iter_count, u64 iter_cycles, struct branch_type_stat *brtype_stat) { int printed; @@ -1346,7 +1350,7 @@ static int counts_str_build(char *bf, int bfsize, predicted_count, abort_count, brtype_stat); } else { printed = branch_from_str(bf, bfsize, branch_count, - cycles_count, iter_count, samples_count); + cycles_count, iter_count, iter_cycles); } if (!printed) @@ -1358,14 +1362,14 @@ static int counts_str_build(char *bf, int bfsize, static int callchain_counts_printf(FILE *fp, char *bf, int bfsize, u64 branch_count, u64 predicted_count, u64 abort_count, u64 cycles_count, - u64 iter_count, u64 samples_count, + u64 iter_count, u64 iter_cycles, struct branch_type_stat *brtype_stat) { char str[256]; counts_str_build(str, sizeof(str), branch_count, predicted_count, abort_count, cycles_count, - iter_count, samples_count, brtype_stat); + iter_count, iter_cycles, 
brtype_stat); if (fp) return fprintf(fp, "%s", str); @@ -1373,31 +1377,23 @@ static int callchain_counts_printf(FILE *fp, char *bf, int bfsize, return scnprintf(bf, bfsize, "%s", str); } -int callchain_list_counts__printf_value(struct callchain_node *node, - struct callchain_list *clist, +int callchain_list_counts__printf_value(struct callchain_list *clist, FILE *fp, char *bf, int bfsize) { u64 branch_count, predicted_count; u64 abort_count, cycles_count; - u64 iter_count = 0, samples_count = 0; + u64 iter_count, iter_cycles; branch_count = clist->branch_count; predicted_count = clist->predicted_count; abort_count = clist->abort_count; cycles_count = clist->cycles_count; - - if (node) { - struct callchain_list *call; - - list_for_each_entry(call, &node->val, list) { - iter_count += call->iter_count; - samples_count += call->samples_count; - } - } + iter_count = clist->iter_count; + iter_cycles = clist->iter_cycles; return callchain_counts_printf(fp, bf, bfsize, branch_count, predicted_count, abort_count, - cycles_count, iter_count, samples_count, + cycles_count, iter_count, iter_cycles, &clist->brtype_stat); } @@ -1523,7 +1519,8 @@ int callchain_cursor__copy(struct callchain_cursor *dst, rc = callchain_cursor_append(dst, node->ip, node->map, node->sym, node->branch, &node->branch_flags, - node->nr_loop_iter, node->samples, + node->nr_loop_iter, + node->iter_cycles, node->branch_from); if (rc) break; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 97738201464a..1ed6fc61d0a5 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -119,7 +119,7 @@ struct callchain_list { u64 abort_count; u64 cycles_count; u64 iter_count; - u64 samples_count; + u64 iter_cycles; struct branch_type_stat brtype_stat; char *srcline; struct list_head list; @@ -139,7 +139,7 @@ struct callchain_cursor_node { struct branch_flags branch_flags; u64 branch_from; int nr_loop_iter; - int samples; + u64 iter_cycles; struct callchain_cursor_node *next; }; @@ -201,7 +201,7 @@ static inline void callchain_cursor_reset(struct callchain_cursor *cursor) int callchain_cursor_append(struct callchain_cursor *cursor, u64 ip, struct map *map, struct symbol *sym, bool branch, struct branch_flags *flags, - int nr_loop_iter, int samples, u64 branch_from); + int nr_loop_iter, u64 iter_cycles, u64 branch_from); /* Close a cursor writing session. 
Initialize for the reader */ static inline void callchain_cursor_commit(struct callchain_cursor *cursor) @@ -282,8 +282,7 @@ char *callchain_node__scnprintf_value(struct callchain_node *node, int callchain_node__fprintf_value(struct callchain_node *node, FILE *fp, u64 total); -int callchain_list_counts__printf_value(struct callchain_node *node, - struct callchain_list *clist, +int callchain_list_counts__printf_value(struct callchain_list *clist, FILE *fp, char *bf, int bfsize); void free_callchain(struct callchain_root *root); diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 423ac82605f3..ee7bcc898d35 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -200,6 +200,7 @@ struct perf_sample { u32 cpu; u32 raw_size; u64 data_src; + u64 phys_addr; u32 flags; u16 insn_len; u8 cpumode; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d9bd632ed7db..4bb89373eb52 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -955,6 +955,9 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts, if (opts->sample_address) perf_evsel__set_sample_bit(evsel, DATA_SRC); + if (opts->sample_phys_addr) + perf_evsel__set_sample_bit(evsel, PHYS_ADDR); + if (opts->no_buffering) { attr->watermark = 0; attr->wakeup_events = 1; @@ -1464,7 +1467,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value) bit_name(PERIOD), bit_name(STREAM_ID), bit_name(RAW), bit_name(BRANCH_STACK), bit_name(REGS_USER), bit_name(STACK_USER), bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), - bit_name(WEIGHT), + bit_name(WEIGHT), bit_name(PHYS_ADDR), { .name = NULL, } }; #undef bit_name @@ -2206,6 +2209,12 @@ int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event, } } + data->phys_addr = 0; + if (type & PERF_SAMPLE_PHYS_ADDR) { + data->phys_addr = *array; + array++; + } + return 0; } @@ -2311,6 +2320,9 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, } } + if (type & PERF_SAMPLE_PHYS_ADDR) + result += sizeof(u64); + return result; } @@ -2500,6 +2512,11 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, } } + if (type & PERF_SAMPLE_PHYS_ADDR) { + *array = sample->phys_addr; + array++; + } + return 0; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 351d3b2d8887..dd2c4b5112a5 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -131,6 +131,7 @@ struct perf_evsel { bool cmdline_group_boundary; struct list_head config_terms; int bpf_fd; + bool auto_merge_stats; bool merged_stat; const char * metric_expr; const char * metric_name; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 9453b2e27015..e60d8d8ea4c2 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -167,6 +167,10 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) symlen = unresolved_col_width + 4 + 2; hists__set_unres_dso_col_len(hists, HISTC_MEM_DADDR_DSO); } + + hists__new_col_len(hists, HISTC_MEM_PHYS_DADDR, + unresolved_col_width + 4 + 2); + } else { symlen = unresolved_col_width + 4 + 2; hists__new_col_len(hists, HISTC_MEM_DADDR_SYMBOL, symlen); diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index ee3670a388df..e60dda26a920 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -47,6 +47,7 @@ enum hist_column { HISTC_GLOBAL_WEIGHT, HISTC_MEM_DADDR_SYMBOL, HISTC_MEM_DADDR_DSO, + HISTC_MEM_PHYS_DADDR, HISTC_MEM_LOCKED, HISTC_MEM_TLB, HISTC_MEM_LVL, diff --git 
a/tools/perf/util/machine.c b/tools/perf/util/machine.c index 5c8eacaca4f4..df709363ef69 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -1635,10 +1635,12 @@ static void ip__resolve_ams(struct thread *thread, ams->al_addr = al.addr; ams->sym = al.sym; ams->map = al.map; + ams->phys_addr = 0; } static void ip__resolve_data(struct thread *thread, - u8 m, struct addr_map_symbol *ams, u64 addr) + u8 m, struct addr_map_symbol *ams, + u64 addr, u64 phys_addr) { struct addr_location al; @@ -1658,6 +1660,7 @@ static void ip__resolve_data(struct thread *thread, ams->al_addr = al.addr; ams->sym = al.sym; ams->map = al.map; + ams->phys_addr = phys_addr; } struct mem_info *sample__resolve_mem(struct perf_sample *sample, @@ -1669,12 +1672,18 @@ struct mem_info *sample__resolve_mem(struct perf_sample *sample, return NULL; ip__resolve_ams(al->thread, &mi->iaddr, sample->ip); - ip__resolve_data(al->thread, al->cpumode, &mi->daddr, sample->addr); + ip__resolve_data(al->thread, al->cpumode, &mi->daddr, + sample->addr, sample->phys_addr); mi->data_src.val = sample->data_src; return mi; } +struct iterations { + int nr_loop_iter; + u64 cycles; +}; + static int add_callchain_ip(struct thread *thread, struct callchain_cursor *cursor, struct symbol **parent, @@ -1683,11 +1692,12 @@ static int add_callchain_ip(struct thread *thread, u64 ip, bool branch, struct branch_flags *flags, - int nr_loop_iter, - int samples, + struct iterations *iter, u64 branch_from) { struct addr_location al; + int nr_loop_iter = 0; + u64 iter_cycles = 0; al.filtered = 0; al.sym = NULL; @@ -1737,9 +1747,15 @@ static int add_callchain_ip(struct thread *thread, if (symbol_conf.hide_unresolved && al.sym == NULL) return 0; + + if (iter) { + nr_loop_iter = iter->nr_loop_iter; + iter_cycles = iter->cycles; + } + return callchain_cursor_append(cursor, al.addr, al.map, al.sym, - branch, flags, nr_loop_iter, samples, - branch_from); + branch, flags, nr_loop_iter, + iter_cycles, branch_from); } struct branch_info *sample__resolve_bstack(struct perf_sample *sample, @@ -1760,6 +1776,18 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, return bi; } +static void save_iterations(struct iterations *iter, + struct branch_entry *be, int nr) +{ + int i; + + iter->nr_loop_iter = nr; + iter->cycles = 0; + + for (i = 0; i < nr; i++) + iter->cycles += be[i].flags.cycles; +} + #define CHASHSZ 127 #define CHASHBITS 7 #define NO_ENTRY 0xff @@ -1767,7 +1795,8 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample, #define PERF_MAX_BRANCH_DEPTH 127 /* Remove loops. */ -static int remove_loops(struct branch_entry *l, int nr) +static int remove_loops(struct branch_entry *l, int nr, + struct iterations *iter) { int i, j, off; unsigned char chash[CHASHSZ]; @@ -1792,8 +1821,18 @@ static int remove_loops(struct branch_entry *l, int nr) break; } if (is_loop) { - memmove(l + i, l + i + off, - (nr - (i + off)) * sizeof(*l)); + j = nr - (i + off); + if (j > 0) { + save_iterations(iter + i + off, + l + i, off); + + memmove(iter + i, iter + i + off, + j * sizeof(*iter)); + + memmove(l + i, l + i + off, + j * sizeof(*l)); + } + nr -= off; } } @@ -1883,7 +1922,7 @@ static int resolve_lbr_callchain_sample(struct thread *thread, err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip, - branch, flags, 0, 0, + branch, flags, NULL, branch_from); if (err) return (err < 0) ? 
err : 0; @@ -1909,7 +1948,6 @@ static int thread__resolve_callchain_sample(struct thread *thread, int i, j, err, nr_entries; int skip_idx = -1; int first_call = 0; - int nr_loop_iter; if (chain) chain_nr = chain->nr; @@ -1942,6 +1980,7 @@ static int thread__resolve_callchain_sample(struct thread *thread, if (branch && callchain_param.branch_callstack) { int nr = min(max_stack, (int)branch->nr); struct branch_entry be[nr]; + struct iterations iter[nr]; if (branch->nr > PERF_MAX_BRANCH_DEPTH) { pr_warning("corrupted branch chain. skipping...\n"); @@ -1972,38 +2011,21 @@ static int thread__resolve_callchain_sample(struct thread *thread, be[i] = branch->entries[branch->nr - i - 1]; } - nr_loop_iter = nr; - nr = remove_loops(be, nr); - - /* - * Get the number of iterations. - * It's only approximation, but good enough in practice. - */ - if (nr_loop_iter > nr) - nr_loop_iter = nr_loop_iter - nr + 1; - else - nr_loop_iter = 0; + memset(iter, 0, sizeof(struct iterations) * nr); + nr = remove_loops(be, nr, iter); for (i = 0; i < nr; i++) { - if (i == nr - 1) - err = add_callchain_ip(thread, cursor, parent, - root_al, - NULL, be[i].to, - true, &be[i].flags, - nr_loop_iter, 1, - be[i].from); - else - err = add_callchain_ip(thread, cursor, parent, - root_al, - NULL, be[i].to, - true, &be[i].flags, - 0, 0, be[i].from); + err = add_callchain_ip(thread, cursor, parent, + root_al, + NULL, be[i].to, + true, &be[i].flags, + NULL, be[i].from); if (!err) err = add_callchain_ip(thread, cursor, parent, root_al, NULL, be[i].from, true, &be[i].flags, - 0, 0, 0); + &iter[i], 0); if (err == -EINVAL) break; if (err) @@ -2037,7 +2059,7 @@ check_calls: err = add_callchain_ip(thread, cursor, parent, root_al, &cpumode, ip, - false, NULL, 0, 0, 0); + false, NULL, NULL, 0); if (err) return (err < 0) ? err : 0; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f44aeba51d1f..f6257fb4f08c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -310,7 +310,7 @@ static struct perf_evsel * __add_event(struct list_head *list, int *idx, struct perf_event_attr *attr, char *name, struct cpu_map *cpus, - struct list_head *config_terms) + struct list_head *config_terms, bool auto_merge_stats) { struct perf_evsel *evsel; @@ -324,6 +324,7 @@ __add_event(struct list_head *list, int *idx, evsel->cpus = cpu_map__get(cpus); evsel->own_cpus = cpu_map__get(cpus); evsel->system_wide = !!cpus; + evsel->auto_merge_stats = auto_merge_stats; if (name) evsel->name = strdup(name); @@ -339,7 +340,7 @@ static int add_event(struct list_head *list, int *idx, struct perf_event_attr *attr, char *name, struct list_head *config_terms) { - return __add_event(list, idx, attr, name, NULL, config_terms) ? 0 : -ENOMEM; + return __add_event(list, idx, attr, name, NULL, config_terms, false) ? 
0 : -ENOMEM; } static int parse_aliases(char *str, const char *names[][PERF_EVSEL__MAX_ALIASES], int size) @@ -1209,9 +1210,9 @@ int parse_events_add_numeric(struct parse_events_state *parse_state, get_config_name(head_config), &config_terms); } -int parse_events_add_pmu(struct parse_events_state *parse_state, +static int __parse_events_add_pmu(struct parse_events_state *parse_state, struct list_head *list, char *name, - struct list_head *head_config) + struct list_head *head_config, bool auto_merge_stats) { struct perf_event_attr attr; struct perf_pmu_info info; @@ -1232,7 +1233,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, if (!head_config) { attr.type = pmu->type; - evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL); + evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu->cpus, NULL, auto_merge_stats); return evsel ? 0 : -ENOMEM; } @@ -1254,7 +1255,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, evsel = __add_event(list, &parse_state->idx, &attr, get_config_name(head_config), pmu->cpus, - &config_terms); + &config_terms, auto_merge_stats); if (evsel) { evsel->unit = info.unit; evsel->scale = info.scale; @@ -1267,6 +1268,13 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, return evsel ? 0 : -ENOMEM; } +int parse_events_add_pmu(struct parse_events_state *parse_state, + struct list_head *list, char *name, + struct list_head *head_config) +{ + return __parse_events_add_pmu(parse_state, list, name, head_config, false); +} + int parse_events_multi_pmu_add(struct parse_events_state *parse_state, char *str, struct list_head **listp) { @@ -1296,8 +1304,8 @@ int parse_events_multi_pmu_add(struct parse_events_state *parse_state, return -1; list_add_tail(&term->list, head); - if (!parse_events_add_pmu(parse_state, list, - pmu->name, head)) { + if (!__parse_events_add_pmu(parse_state, list, + pmu->name, head, true)) { pr_debug("%s -> %s/%s/\n", str, pmu->name, alias->str); ok++; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index ac863691605f..a7ebd9fe8e40 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1120,6 +1120,9 @@ static void dump_sample(struct perf_evsel *evsel, union perf_event *event, if (sample_type & PERF_SAMPLE_DATA_SRC) printf(" . data_src: 0x%"PRIx64"\n", sample->data_src); + if (sample_type & PERF_SAMPLE_PHYS_ADDR) + printf(" .. phys_addr: 0x%"PRIx64"\n", sample->phys_addr); + if (sample_type & PERF_SAMPLE_TRANSACTION) printf("... 
transaction: %" PRIx64 "\n", sample->transaction); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 12359bd986db..eb3ab902a1c0 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -1316,6 +1316,47 @@ struct sort_entry sort_mem_dcacheline = { }; static int64_t +sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right) +{ + uint64_t l = 0, r = 0; + + if (left->mem_info) + l = left->mem_info->daddr.phys_addr; + if (right->mem_info) + r = right->mem_info->daddr.phys_addr; + + return (int64_t)(r - l); +} + +static int hist_entry__phys_daddr_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + uint64_t addr = 0; + size_t ret = 0; + size_t len = BITS_PER_LONG / 4; + + addr = he->mem_info->daddr.phys_addr; + + ret += repsep_snprintf(bf + ret, size - ret, "[%c] ", he->level); + + ret += repsep_snprintf(bf + ret, size - ret, "%-#.*llx", len, addr); + + ret += repsep_snprintf(bf + ret, size - ret, "%-*s", width - ret, ""); + + if (ret > width) + bf[width] = '\0'; + + return width; +} + +struct sort_entry sort_mem_phys_daddr = { + .se_header = "Data Physical Address", + .se_cmp = sort__phys_daddr_cmp, + .se_snprintf = hist_entry__phys_daddr_snprintf, + .se_width_idx = HISTC_MEM_PHYS_DADDR, +}; + +static int64_t sort__abort_cmp(struct hist_entry *left, struct hist_entry *right) { if (!left->branch_info || !right->branch_info) @@ -1547,6 +1588,7 @@ static struct sort_dimension memory_sort_dimensions[] = { DIM(SORT_MEM_LVL, "mem", sort_mem_lvl), DIM(SORT_MEM_SNOOP, "snoop", sort_mem_snoop), DIM(SORT_MEM_DCACHELINE, "dcacheline", sort_mem_dcacheline), + DIM(SORT_MEM_PHYS_DADDR, "phys_daddr", sort_mem_phys_daddr), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index b7c75597e18f..f36dc4980a6c 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -245,6 +245,7 @@ enum sort_type { SORT_MEM_SNOOP, SORT_MEM_DCACHELINE, SORT_MEM_IADDR_SYMBOL, + SORT_MEM_PHYS_DADDR, }; /* diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index d00a012cfdfb..2bd6a1f01a1c 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -186,6 +186,7 @@ struct addr_map_symbol { struct symbol *sym; u64 addr; u64 al_addr; + u64 phys_addr; }; struct branch_info { diff --git a/tools/perf/util/syscalltbl.c b/tools/perf/util/syscalltbl.c index bbb4c1957578..19e5db90394c 100644 --- a/tools/perf/util/syscalltbl.c +++ b/tools/perf/util/syscalltbl.c @@ -19,6 +19,7 @@ #ifdef HAVE_SYSCALL_TABLE #include <linux/compiler.h> #include <string.h> +#include "string2.h" #include "util.h" #if defined(__x86_64__) @@ -105,6 +106,27 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name) return sc ? 
sc->id : -1; } +int syscalltbl__strglobmatch_next(struct syscalltbl *tbl, const char *syscall_glob, int *idx) +{ + int i; + struct syscall *syscalls = tbl->syscalls.entries; + + for (i = *idx + 1; i < tbl->syscalls.nr_entries; ++i) { + if (strglobmatch(syscalls[i].name, syscall_glob)) { + *idx = i; + return syscalls[i].id; + } + } + + return -1; +} + +int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx) +{ + *idx = -1; + return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx); +} + #else /* HAVE_SYSCALL_TABLE */ #include <libaudit.h> @@ -131,4 +153,15 @@ int syscalltbl__id(struct syscalltbl *tbl, const char *name) { return audit_name_to_syscall(name, tbl->audit_machine); } + +int syscalltbl__strglobmatch_next(struct syscalltbl *tbl __maybe_unused, + const char *syscall_glob __maybe_unused, int *idx __maybe_unused) +{ + return -1; +} + +int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx) +{ + return syscalltbl__strglobmatch_next(tbl, syscall_glob, idx); +} #endif /* HAVE_SYSCALL_TABLE */ diff --git a/tools/perf/util/syscalltbl.h b/tools/perf/util/syscalltbl.h index e2951510484f..e9fb8786da7c 100644 --- a/tools/perf/util/syscalltbl.h +++ b/tools/perf/util/syscalltbl.h @@ -17,4 +17,7 @@ void syscalltbl__delete(struct syscalltbl *tbl); const char *syscalltbl__name(const struct syscalltbl *tbl, int id); int syscalltbl__id(struct syscalltbl *tbl, const char *name); +int syscalltbl__strglobmatch_first(struct syscalltbl *tbl, const char *syscall_glob, int *idx); +int syscalltbl__strglobmatch_next(struct syscalltbl *tbl, const char *syscall_glob, int *idx); + #endif /* __PERF_SYSCALLTBL_H */ diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 4c2fa98ef39d..d20791c3f499 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -1546,8 +1546,8 @@ static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa, else { memcpy(iobuf, mmio->addr.base + dpa, len); - /* give us some some coverage of the mmio_flush_range() API */ - mmio_flush_range(mmio->addr.base + dpa, len); + /* give us some some coverage of the arch_invalidate_pmem() API */ + arch_invalidate_pmem(mmio->addr.base + dpa, len); } nd_region_release_lane(nd_region, lane); diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c index a8df159a8924..ec0f6b45ce8b 100644 --- a/tools/testing/selftests/x86/mpx-mini-test.c +++ b/tools/testing/selftests/x86/mpx-mini-test.c @@ -391,8 +391,7 @@ void handler(int signum, siginfo_t *si, void *vucontext) br_count++; dprintf1("#BR 0x%jx (total seen: %d)\n", status, br_count); -#define __SI_FAULT (3 << 16) -#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ +#define SEGV_BNDERR 3 /* failed address bound checks */ dprintf2("Saw a #BR! 
status 0x%jx at %016lx br_reason: %jx\n", status, ip, br_reason); diff --git a/tools/testing/selftests/x86/protection_keys.c b/tools/testing/selftests/x86/protection_keys.c index 3237bc010e1c..23927845518d 100644 --- a/tools/testing/selftests/x86/protection_keys.c +++ b/tools/testing/selftests/x86/protection_keys.c @@ -212,19 +212,18 @@ void dump_mem(void *dumpme, int len_bytes) } } -#define __SI_FAULT (3 << 16) -#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ -#define SEGV_PKUERR (__SI_FAULT|4) +#define SEGV_BNDERR 3 /* failed address bound checks */ +#define SEGV_PKUERR 4 static char *si_code_str(int si_code) { - if (si_code & SEGV_MAPERR) + if (si_code == SEGV_MAPERR) return "SEGV_MAPERR"; - if (si_code & SEGV_ACCERR) + if (si_code == SEGV_ACCERR) return "SEGV_ACCERR"; - if (si_code & SEGV_BNDERR) + if (si_code == SEGV_BNDERR) return "SEGV_BNDERR"; - if (si_code & SEGV_PKUERR) + if (si_code == SEGV_PKUERR) return "SEGV_PKUERR"; return "UNKNOWN"; }
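The two x86 selftest hunks above replace bitwise tests on si_code with equality tests. The generic SEGV_* si_code values are small enumerated integers, not bit flags, so a test like (si_code & SEGV_MAPERR) is true for every odd code, and a bounds-check fault (SEGV_BNDERR, value 3) would be misreported as SEGV_MAPERR. A standalone illustration, not the selftest itself (the numeric values match the kernel's generic siginfo codes):

#include <stdio.h>

#define SEGV_MAPERR 1	/* address not mapped to object */
#define SEGV_ACCERR 2	/* invalid permissions for mapped object */
#define SEGV_BNDERR 3	/* failed address bound checks */
#define SEGV_PKUERR 4	/* failed protection key checks */

static const char *si_code_str(int si_code)
{
	/* equality, not '&': these are enumerated codes, not flag bits */
	if (si_code == SEGV_MAPERR)
		return "SEGV_MAPERR";
	if (si_code == SEGV_ACCERR)
		return "SEGV_ACCERR";
	if (si_code == SEGV_BNDERR)
		return "SEGV_BNDERR";
	if (si_code == SEGV_PKUERR)
		return "SEGV_PKUERR";
	return "UNKNOWN";
}

int main(void)
{
	/*
	 * With the old bitwise test, (SEGV_BNDERR & SEGV_MAPERR) == 1 was
	 * truthy, so a bound-range fault printed as SEGV_MAPERR.
	 */
	printf("si_code %d -> %s\n", SEGV_BNDERR, si_code_str(SEGV_BNDERR));
	return 0;
}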